Skip to content

Commit

Permalink
batch of fixes for v0.2.3
Browse files Browse the repository at this point in the history
  • Loading branch information
riasc committed Jun 19, 2024
1 parent 6f82fc3 commit f1632ac
Show file tree
Hide file tree
Showing 23 changed files with 5,000,336 additions and 214 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

# [0.2.3]

## Fix

- Consumer/Producer pattern for 'detect' to integrate concurrency
- Write to .fq using SeqAn3 write operations
- Assertion error when writing to .fq (caused by variable initialization outside worker)


# [0.2.2]

## Features
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.22.1)
project(RNAnue VERSION 0.2.2)
project(RNAnue VERSION 0.2.3)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED True)
set(CMAKE_CXX_FLAGS -fopenmp)
Expand Down
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,5 @@ RUN mv seqan3-3.3.0-Source seqan3
WORKDIR /RNAnue/build
RUN cmake .. -DCMAKE_BUILD_TYPE=Release
RUN make
RUN echo 'alias RNAnue="./RNAnue/build/RNAnue"' >> ~/.bashrc
WORKDIR /
# make RNAnue available in the PATH
ENV PATH=$PATH:/RNAnue/build/
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ duplex.
| XR:f | site length ratio |
| XA:Z | alignment of sequence |
| XE:f | hybridization energy |
| XD:f | MFE structure in dot-bracket notation |

### Clustering results

Expand Down Expand Up @@ -182,6 +183,14 @@ each step of the analysis.
In additon, we provide a ready-to-use [Docker container](https://hub.docker.com/repository/docker/cobirna/rnanue) that
has RNAnue preconfigured.

### Singularity

The provided Docker container can also be used with Singularity.
```
singularity pull docker://cobirna/rnanue:latest
singularity exec --bind /path/to/data:/data rnanue_latest.sif RNAnue <subcall> --config /data/params.cfg
```

### Testing

We provide a test dataset in the [test](./test/data/) folder that can be used to test the installation.
Expand Down
4 changes: 0 additions & 4 deletions include/Config.hpp

This file was deleted.

8 changes: 8 additions & 0 deletions include/DataTypes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <seqan3/alphabet/quality/phred42.hpp>
#include <seqan3/io/sequence_file/input.hpp>
#include <seqan3/io/sequence_file/output.hpp>
#include <seqan3/io/sam_file/all.hpp>


namespace fs = boost::filesystem;
Expand Down Expand Up @@ -103,6 +104,13 @@ namespace dtp {
}
};
using IntKey = std::vector<IntPartner>;

using BAMOut = seqan3::sam_file_output<seqan3::fields<seqan3::field::seq,
seqan3::field::id, seqan3::field::ref_id, seqan3::field::ref_offset,
seqan3::field::cigar, seqan3::field::mapq, seqan3::field::qual,
seqan3::field::flag, seqan3::field::mate, seqan3::field::tags,
seqan3::field::header_ptr>, seqan3::type_list<seqan3::format_sam, seqan3::format_bam>,
std::deque<std::__cxx11::basic_string<char>>>;
}

#endif //RNANUE_DATATYPES_HPP
6 changes: 5 additions & 1 deletion include/Node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
class IntervalData {
public:
IntervalData(std::string chrom, char strand, std::string id, std::string name,
std::string biotype, dtp::Interval interval, IntervalData* split);
std::string biotype, std::string product, dtp::Interval interval,
IntervalData* split);
~IntervalData();

// operator overloading
Expand All @@ -31,6 +32,8 @@ class IntervalData {
void setName(std::string name);
std::string getBiotype() const;
void setBiotype(std::string biotype);
std::string getProduct() const;
void setProduct(std::string product);
dtp::Interval getInterval();
void setInterval(dtp::Interval interval);
dtp::SpliceJunctions getJunctions() const;
Expand All @@ -50,6 +53,7 @@ class IntervalData {
std::string id;
std::string name;
std::string biotype;
std::string product;
dtp::Interval interval;
dtp::SpliceJunctions junctions;
IntervalData* split;
Expand Down
3 changes: 0 additions & 3 deletions include/Preproc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,6 @@ class Preproc {
bool filterReads(auto& qual);
double calcAvgPhred(auto& qual); // calculate the average phred score

// helper functions
std::string fastqToString(dtp::FASTQRecord& rec);

private:
po::variables_map params;
StateTransition adpt5Tables;
Expand Down
39 changes: 34 additions & 5 deletions include/SplitReadCalling.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <sstream>
#include <regex>
#include <cstdlib>
#include <condition_variable>

// Boost
#include <boost/program_options.hpp>
Expand Down Expand Up @@ -72,6 +73,7 @@ template <> struct seqan3::sam_tag_type<"XR"_tag> { using type = float; }; // si
template <> struct seqan3::sam_tag_type<"XA"_tag> { using type = std::string; }; // alignment
template <> struct seqan3::sam_tag_type<"XS"_tag> { using type = int32_t; }; // quality
template <> struct seqan3::sam_tag_type<"XE"_tag> { using type = float; }; // end of split
template <> struct seqan3::sam_tag_type<"XD"_tag> { using type = std::string; }; // MFE in dot-bracket notation


using namespace seqan3::literals;
Expand All @@ -81,6 +83,27 @@ namespace po = boost::program_options;
namespace fs = boost::filesystem;
namespace pt = boost::property_tree;

template <typename T>
class SafeQueue {
public:
// constructor
SafeQueue();
~SafeQueue();

// operations
void push(T value);
bool pop(T& result);
void reset();
void setDone();

private:
std::queue<T> q;
std::mutex mtx;
std::condition_variable cv;
bool done = false;
};


class SplitReadCalling {
public:
SplitReadCalling(po::variables_map params);
Expand All @@ -90,8 +113,10 @@ class SplitReadCalling {
void start(pt::ptree sample, pt::ptree condition);
void sort(const std::string& inputFile, const std::string& outputFile); // sort BAM file

void producer(seqan3::sam_file_input<>& inputfile);
void consumer(dtp::BAMOut& singleOut, dtp::BAMOut& splitsOut, dtp::BAMOut& multsplitsOut,
std::mutex& singleOutMutex, std::mutex& splitsOutMutex, std::mutex& multsplitsOutMutex);

void iterate(std::string& matched, std::string& single, std::string& splits, std::string& multsplits);
template <typename T>
void process(std::vector<T>& readrecords, auto& singleOut, auto& splitsOut, auto& multsplitsOut,
auto& singleOutMutex, auto& splitsOutMutex, auto& multsplitsOutMutex);
Expand All @@ -105,11 +130,11 @@ class SplitReadCalling {

// filters
TracebackResult complementarity(dtp::DNASpan &seq1, dtp::DNASpan &seq2);
double hybridization(dtp::DNASpan &seq1, dtp::DNASpan& seq2);
std::pair<double,std::string> hybridization(dtp::DNASpan &seq1, dtp::DNASpan& seq2);

// output
void addComplementarityToSamRecord(SAMrecord &rec1, SAMrecord &rec2, TracebackResult &res);
void addHybEnergyToSamRecord(SAMrecord &rec1, SAMrecord &rec2, double &hyb);
void addHybEnergyToSamRecord(SAMrecord &rec1, SAMrecord &rec2, std::pair<double,std::string> &hyb);
void writeSAMrecordToBAM(auto& bamfile, std::vector<std::pair<SAMrecord, SAMrecord>>& records);
void writeStats();

Expand All @@ -120,8 +145,12 @@ class SplitReadCalling {
std::shared_ptr<Stats> stats;
int replPerCond; // number of replicates per condition
std::string condition; // stores the current condition
std::deque<std::string> refIds; // stores the reference ids
FilterScores filterScores;

// refIds
std::deque<std::string> refIds;

// queue
SafeQueue<std::vector<typename seqan3::sam_file_input<>::record_type>> queue;
};

#endif //RNANUE_DETECT_HPP
6 changes: 3 additions & 3 deletions src/Align.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ void Align::buildIndex() {
std::cout << helper::getTime() << "The reference index " << idx << " already exists\n";
} else {
// create index
std::cout << helper::getTime() << "Create index for " << ref << "\n";
std::cout << helper::getTime() << "Create reference index for " << ref << "\n";
std::string idxCall = "segemehl.x -x " + idx.string() + " -d " + ref;
const char* idxCallChar = idxCall.c_str();
int result = system(idxCallChar);
Expand All @@ -32,7 +32,7 @@ void Align::buildIndex() {

// aligns the reads to the reference genome
void Align::alignReads(std::string query, std::string mate, std::string matched) {
std::cout << helper::getTime() << "Start Alignment\n";
std::cout << helper::getTime() << "Start Alignment on sample: " << query << "\n";
std::string align = "segemehl.x";
align += " -S "; // split mode
align += " -b "; // output in bam format (and sorted)
Expand All @@ -49,7 +49,7 @@ void Align::alignReads(std::string query, std::string mate, std::string matched)
align += " -p " + mate;
}
align += " -o " + matched;
std::cout << align << "\n";
// std::cout << align << "\n";
const char* alignCallChar = align.c_str();
int result = system(alignCallChar);
if(result != 0) {
Expand Down
21 changes: 9 additions & 12 deletions src/Analysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ Analysis::Analysis(po::variables_map _params) : params(_params), condition{""},
this->stats = std::make_shared<Stats>((outdir / "stats.txt").string());
}

Analysis::~Analysis() {
}
Analysis::~Analysis() {}

void Analysis::start(pt::ptree sample, pt::ptree condition) {

this->freq.clear(); // reset freq
// retrieve input and output files
pt::ptree input = sample.get_child("input");
Expand Down Expand Up @@ -98,6 +98,7 @@ void Analysis::processSplits(std::string& splits, std::string& output) {
seqan3::sam_tag_dictionary tags;
std::pair<float, float> filters; // complementarity and hybridization energy
std::vector<std::string> aligns; // alignment of complementarity
std::string mfeStruc = ""; // minimum free energy structure

// segment information (e.g., gene name, product, etc.) - keys are the as or ss
std::map<std::string, std::vector<IntervalData*>> segmentInfo = {};
Expand All @@ -113,6 +114,7 @@ void Analysis::processSplits(std::string& splits, std::string& output) {
// retrieve complementarity and energy
tags = (*rec.begin()).tags();
filters = std::make_pair(std::get<float>(tags["XC"_tag]), std::get<float>(tags["XE"_tag]));
mfeStruc = std::get<std::string>(tags["XD"_tag]);

dtp::IntKey intKey = {}; // key to store the interaction (partner1, orientation1, partner2, orientation2)

Expand Down Expand Up @@ -141,24 +143,18 @@ void Analysis::processSplits(std::string& splits, std::string& output) {
if (strand == ovl.second->getStrand()) { orient = "sense"; } else { orient = "antisense"; }
segmentInfo[orient].push_back(ovl.second); // add to segmentInfo
addToFreqMap(std::make_pair(orient, ovl.second->getName()), 1.0/(double)ovlps.size()); // add to freq


/*
std::string info = ovl.second->getName() + "\t" + ovl.second->getBiotype() + "\t" +
ovl.second->getStrand() + "\t" + "." + "\t" + orient;
segmentInfo[orient].push_back(info);*/
}
// check which one to select
if (segmentInfo["sense"].size() == 1) {
IntervalData* data = segmentInfo["sense"][0];
outLine += "\t" + data->getName() + "\t" + data->getBiotype() + "\t" + data->getStrand();
outLine += "\t.\tsense";
outLine += "\t" + data->getProduct() + "\tsense";
intKey.push_back(dtp::IntPartner(data->getName(),"sense"));
} else {
if (segmentInfo["antisense"].size() == 1) {
IntervalData* data = segmentInfo["antisense"][0];
outLine += "\t" + data->getName() + "\t" + data->getBiotype() + "\t" + data->getStrand();
outLine += "\t.\tsense";
outLine += "\t" + data->getProduct() + "\tsense";
intKey.push_back(dtp::IntPartner(data->getName(),"antisense"));
} else {
skip = 1; // skip if not unique
Expand All @@ -168,7 +164,8 @@ void Analysis::processSplits(std::string& splits, std::string& output) {
}
outLine += "\t" + std::to_string(filters.first); // complementarity
outLine += "\t" + aligns[0] + "\t" + aligns[1]; // alignment
outLine += "\t" + std::to_string(filters.second) + "\n"; // energy
outLine += "\t" + std::to_string(filters.second);
outLine += "\t" + mfeStruc + "\n"; // energy
if (skip == 0) {
outInts << outLine;
addToSuppReads(intKey[0], intKey[1]);
Expand All @@ -184,7 +181,7 @@ void Analysis::writeInteractionsHeader(std::ofstream& fout) {
fout << "qname\tfst_seg_strd\tfst_seg_strt\tfst_seg_end\tfst_seg_ref\tfst_seg_name\tfirst_seg_bt\t";
fout << "fst_seg_anno_strd\tfst_seg_prod\tfst_seg_ori\tsec_seg_strd\tsec_seg_strt\tsec_seg_end\t";
fout << "sec_seg_ref\tsec_seg_name\tsec_seg_bt\tsec_seg_anno_strd\tsec_seg_prod\tsec_seg_ori\t";
fout << "cmpl\tfst_seg_compl_aln\tsec_seg_cmpl_aln\tmfe\n";
fout << "cmpl\tfst_seg_compl_aln\tsec_seg_cmpl_aln\tmfe\tmfe_struc\n";
}

void Analysis::writeAllIntsHeader(std::vector<int> condLastFlag, std::ofstream& fout) {
Expand Down
4 changes: 3 additions & 1 deletion src/Clustering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ void Clustering::overlaps(std::vector<Cluster> &clusterlist) {
// refid needs to match
if((clusterlist[i].elements[0].refid == clusterlist[j].elements[0].refid) &&
(clusterlist[i].elements[0].refid == clusterlist[j].elements[0].refid)) {
// .. same with strand
// same with strand
if((clusterlist[i].elements[0].flag == clusterlist[j].elements[0].flag) &&
(clusterlist[i].elements[1].flag == clusterlist[j].elements[1].flag)) {
Cluster ncl = clusterlist[j];
Expand All @@ -170,5 +170,7 @@ void Clustering::overlaps(std::vector<Cluster> &clusterlist) {
void Clustering::start(pt::ptree sample) {
pt::ptree input = sample.get_child("input");
std::string splits = input.get<std::string>("splits");

std::cout << helper::getTime() << "Process sample: " << splits << "\n";
iterate(splits);
}
4 changes: 2 additions & 2 deletions src/Data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -401,14 +401,14 @@ void Data::detect() {
}

void Data::clustering() {
std::cout << helper::getTime() << "Start the Clustering\n";
std::cout << helper::getTime() << "Start the Clustering of the Split Reads\n";
Clustering clu(params);
callInAndOut(std::bind(&Clustering::start, &clu, std::placeholders::_1));
clu.sumup();
}

void Data::analysis() {
std::cout << helper::getTime() << "Start the Analysis\n";
std::cout << helper::getTime() << "Start the Analysis of the Split Reads\n";
Analysis ana(params);
callInAndOut(std::bind(&Analysis::start, &ana, std::placeholders::_1, std::placeholders::_2));
// write file output files
Expand Down
7 changes: 4 additions & 3 deletions src/IBPTree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,14 @@ void IBPTree::iterateFeatures(std::string featuresFile) {
std::string id = getTag(attr, std::vector<std::string>{"ID"});
std::string name = getTag(attr, std::vector<std::string>{"gene_name", "Name"});
std::string biotype = getTag(attr, std::vector<std::string>{"gene_biotype", "gene_type"});
std::string product = getTag(attr, std::vector<std::string>{"product"});

if(attr.find("Parent") == attr.end()) { // new gene/transcript indicated by missing 'Parent' attribute
if(intvl != nullptr) { // there exists and interval object (add to tree)
insert(*intvl);
intvl = nullptr;
}
intvl = new IntervalData(fields.seqid, fields.strand, id, name, biotype,
intvl = new IntervalData(fields.seqid, fields.strand, id, name, biotype, product,
std::make_pair(fields.start, fields.end), nullptr);
} else {
// only if the feature is an exon (needed for detect step)
Expand Down Expand Up @@ -159,7 +160,7 @@ void IBPTree::iterateClusters(std::string clusterFile) {
intvl1 = res.second;
}
} else {
intvl1 = new IntervalData(chroms.second, strand.second, name, name, "cluster",
intvl1 = new IntervalData(chroms.second, strand.second, name, name, "cluster", ".",
std::make_pair(start.second, end.second), nullptr);
insert(*intvl1);
}
Expand All @@ -184,7 +185,7 @@ void IBPTree::iterateClusters(std::string clusterFile) {
intvl2 = res.second;
}
} else {
intvl2 = new IntervalData(chroms.second, strand.second, name, name, "cluster",
intvl2 = new IntervalData(chroms.second, strand.second, name, name, "cluster", ".",
std::make_pair(start.second, end.second), nullptr);
insert(*intvl2);
}
Expand Down
7 changes: 5 additions & 2 deletions src/Node.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#include "Node.hpp"

IntervalData::IntervalData(std::string chrom, char strand, std::string id, std::string name,
std::string biotype, dtp::Interval interval, IntervalData* split) :
chrom(chrom), strand(strand), id(id), name(name), biotype(biotype), interval(interval), split{split} {}
std::string biotype, std::string product, dtp::Interval interval, IntervalData* split) :
chrom(chrom), strand(strand), id(id), name(name), biotype(biotype), product(product),
interval(interval), split{split} {}
IntervalData::~IntervalData() {}
bool IntervalData::operator>(const dtp::Interval& other) const { return this->interval.first > other.first; }
bool IntervalData::operator<(const dtp::Interval& other) const { return this->interval.first < other.first; }
Expand All @@ -18,6 +19,8 @@ std::string IntervalData::getName() const { return name; }
void IntervalData::setName(std::string name) { this->name = name; }
std::string IntervalData::getBiotype() const { return biotype; }
void IntervalData::setBiotype(std::string biotype) { this->biotype = biotype; }
std::string IntervalData::getProduct() const { return product; }
void IntervalData::setProduct(std::string product) { this->product = product; }
dtp::Interval IntervalData::getInterval() { return interval; }
void IntervalData::setInterval(dtp::Interval interval) { this->interval = interval; }
dtp::SpliceJunctions IntervalData::getJunctions() const { return junctions; }
Expand Down
Loading

0 comments on commit f1632ac

Please sign in to comment.