From 4877916ec5053569c4a733d4af53f50e848c791f Mon Sep 17 00:00:00 2001 From: riasc Date: Thu, 13 Jun 2024 03:23:37 -0500 Subject: [PATCH] fixing boost anycast error --- CMakeLists.txt | 2 +- Dockerfile | 12 +- README.md | 2 +- include/Analysis.hpp | 46 ++- include/Clustering.hpp | 8 +- include/Config.hpp | 2 +- include/DataTypes.hpp | 20 ++ include/IBPTree.hpp | 6 +- include/Node.hpp | 6 +- include/Utility.hpp | 7 +- src/Analysis.cpp | 659 ++++++++++++++++++--------------------- src/Clustering.cpp | 6 +- src/Data.cpp | 3 +- src/IBPTree.cpp | 109 ++++++- src/Node.cpp | 12 +- src/SplitReadCalling.cpp | 4 +- src/Utility.cpp | 12 + 17 files changed, 507 insertions(+), 409 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eae5d7bd..9cb9e9e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.22.1) -project(RNAnue VERSION 0.2.0) +project(RNAnue VERSION 0.2.1) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED True) set(CMAKE_CXX_FLAGS -fopenmp) diff --git a/Dockerfile b/Dockerfile index 4ecd85a5..2bb79792 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,7 +15,7 @@ LABEL authors="Richard A. Schaefer" RUN apt-get -y update && apt-get -y upgrade RUN apt-get install -y curl build-essential cmake git pkg-config RUN apt-get install -y libbz2-dev zlib1g-dev libncurses5-dev liblzma-dev -RUN apt-get install -y libboost-all-dev gdb +RUN apt-get install -y libboost-all-dev gdb libmpfr-dev liblapacke-dev # install htslib WORKDIR / @@ -25,24 +25,30 @@ WORKDIR /htslib-1.20 RUN ./configure RUN make RUN make install +RUN export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig # install segemehl WORKDIR / RUN curl -L http://legacy.bioinf.uni-leipzig.de/Software/segemehl/downloads/segemehl-0.3.4.tar.gz -o segemehl-0.3.4.tar.gz RUN tar -xvf segemehl-0.3.4.tar.gz && rm segemehl-0.3.4.tar.gz WORKDIR /segemehl-0.3.4 -RUN export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig RUN make all RUN cp segemehl.x /usr/local/bin # install ViennaRNA +WORKDIR / +RUN curl -L https://www.tbi.univie.ac.at/RNA/download/sourcecode/2_6_x/ViennaRNA-2.6.4.tar.gz -o ViennaRNA-2.6.4.tar.gz +RUN tar -xvf ViennaRNA-2.6.4.tar.gz && rm ViennaRNA-2.6.4.tar.gz +WORKDIR /ViennaRNA-2.6.4 +RUN ./configure +RUN make && make install # retrieve RNAnue WORKDIR / RUN git clone https://github.com/Ibvt/RNAnue.git WORKDIR /RNAnue -# retrieve SeqAn +# retrieve SeqAn3 RUN curl -L https://github.com/seqan/seqan3/releases/download/3.3.0/seqan3-3.3.0-Source.tar.xz -o seqan3-3.3.0-Source.tar.xz RUN tar -xvf seqan3-3.3.0-Source.tar.xz && rm seqan3-3.3.0-Source.tar.xz RUN mv seqan3-3.3.0-Source seqan3 diff --git a/README.md b/README.md index 5937ab45..31c2b8b7 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ located in $PATH. * [Boost C++ Libraries](https://www.boost.org/) (v1.7.2) * [SeqAn](https://github.com/seqan/seqan3) (v3.3.0) * [Segemehl](http://www.bioinf.uni-leipzig.de/Software/segemehl/) (v0.3.4) -* [Vienna Package](https://www.tbi.univie.ac.at/RNA/#binary_packages) (v2.4.17) +* [Vienna Package](https://www.tbi.univie.ac.at/RNA/#binary_packages) (>= v2.4.17) ### CMake RNAnue is build using CMake. At first, clone the repository and change into the source directory. diff --git a/include/Analysis.hpp b/include/Analysis.hpp index a600f56b..9890e2b9 100644 --- a/include/Analysis.hpp +++ b/include/Analysis.hpp @@ -5,6 +5,9 @@ #include #include #include +#include +#include +#include // Boost #include @@ -12,26 +15,63 @@ #include #include #include +#include + + +// SeqAn3 +#include +#include // Class #include "IBPTree.hpp" +// define tags +using seqan3::operator""_tag; + namespace po = boost::program_options; namespace fs = boost::filesystem; namespace pt = boost::property_tree; +namespace ma = boost::math; class Analysis { public: Analysis(po::variables_map params); ~Analysis(); - void start(pt::ptree sample); - void process(std::string& single, std::string& splits); + void start(pt::ptree sample, pt::ptree condition); + void processSingle(std::string& single); + void processSplits(std::string& splits, std::string& output); + void normalize(); // normalize the frequencies to 1 + + // write output files (of the analysis) + void writeInteractionsHeader(std::ofstream& fout); + void writeAllIntsHeader(std::ofstream& fout); + void addToAllIntsHeader(std::ofstream& fout, std::string key); + void writeAllInts(); + + // other operations + void addToFreqMap(std::pair key, double value); + void addToSuppReads(dtp::IntPartner p1, dtp::IntPartner p2); + void addToCompl(dtp::IntPartner p1, dtp::IntPartner p2, double complementarity); + void addToHybEnergies(dtp::IntPartner p1, dtp::IntPartner p2, double hybenergy); + + // calculate filters + double calcGCS(std::vector& complementarities); + double calcGHS(std::vector& hybenergies); + double calcStat(dtp::IntKey key, int x); private: po::variables_map params; IBPTree features; - std::map, double> pdf; + std::map,double> freq; // strand, name + std::string condition; // buffers the current condition + // maps for storing filters and suppreads + std::map> suppreads; + std::map>> complementarities; + std::map>> hybenergies; + + int repcount; // counter for current replicate + int readcount; // total number of reads }; #endif //RNANUE_ANALYSIS_HPP \ No newline at end of file diff --git a/include/Clustering.hpp b/include/Clustering.hpp index 1bdaa895..7ad241ae 100644 --- a/include/Clustering.hpp +++ b/include/Clustering.hpp @@ -24,13 +24,13 @@ namespace fs = boost::filesystem; struct Segment { std::string refid; - uint32_t flag; + seqan3::sam_flag flag; uint32_t start; uint32_t end; - Segment() : refid(""), flag(0), start(0), end(0) {}; - Segment(std::string _refid, uint32_t _flag, unsigned int _start, unsigned int _end) : - refid(_refid), flag(_flag), start(_start), end(_end) {}; + Segment() : refid(""), flag(seqan3::sam_flag{}), start(0), end(0) {}; + Segment(std::string refid, seqan3::sam_flag flag, unsigned int start, unsigned int end) : + refid(refid), flag(flag), start(start), end(end) {}; }; struct Cluster { diff --git a/include/Config.hpp b/include/Config.hpp index 42f7d01e..0d9ab233 100644 --- a/include/Config.hpp +++ b/include/Config.hpp @@ -1,4 +1,4 @@ // the configured options and settings for Tutorial #define RNAnue_VERSION_MAJOR 0 #define RNAnue_VERSION_MINOR 2 -#define RNAnue_VERSION_PATCH 0 +#define RNAnue_VERSION_PATCH 1 diff --git a/include/DataTypes.hpp b/include/DataTypes.hpp index 11a917a4..fbcfd058 100644 --- a/include/DataTypes.hpp +++ b/include/DataTypes.hpp @@ -79,6 +79,26 @@ namespace dtp { }; using StatsMap = std::map; using SpliceJunctions = std::map>>; + + // Analysis + using PdfMap = std::map, double>; + struct IntPartner { + std::string partner; + std::string orientation; + IntPartner() : partner(""), orientation("") {} + IntPartner(std::string partner, std::string orientation) : partner(partner), orientation(orientation) {} + + bool operator<(const IntPartner& other) const { + if(partner < other.partner) { + return true; + } else if(partner == other.partner) { + return orientation < other.orientation; + } else { + return false; + } + } + }; + using IntKey = std::vector; } #endif //RNANUE_DATATYPES_HPP diff --git a/include/IBPTree.hpp b/include/IBPTree.hpp index cbd25602..f1044cf5 100644 --- a/include/IBPTree.hpp +++ b/include/IBPTree.hpp @@ -33,10 +33,12 @@ class IBPTree { // tree operations Node* getRoot(IntervalData& data); void insert(IntervalData& data); + void update(Node* node); // updates the keys in parent void insertIter(Node* node, IntervalData& data); void splitNode(Node* node, int index); - std::vector search(std::string chrom, dtp::Interval interval); - void searchIter(Node* node, const dtp::Interval& interval, std::vector& results); + std::vector> search(std::string chrom, dtp::Interval interval); + void searchIter(Node* node, const dtp::Interval& interval, + std::vector>& results); bool isOverlapping(dtp::Interval intvl1, dtp::Interval intvl2); diff --git a/include/Node.hpp b/include/Node.hpp index 9c1e974c..888db20e 100644 --- a/include/Node.hpp +++ b/include/Node.hpp @@ -13,7 +13,7 @@ class IntervalData { public: IntervalData(std::string chrom, char strand, std::string id, std::string name, - std::string biotype, dtp::Interval interval); + std::string biotype, dtp::Interval interval, IntervalData* split); ~IntervalData(); // operator overloading @@ -35,6 +35,8 @@ class IntervalData { void setInterval(dtp::Interval interval); dtp::SpliceJunctions getJunctions() const; void setJunctions(dtp::SpliceJunctions junctions); + IntervalData* getSplit() const; + void setSplit(IntervalData* split); // operations void addJunction(std::string name, std::pair junction); @@ -50,6 +52,7 @@ class IntervalData { std::string biotype; dtp::Interval interval; dtp::SpliceJunctions junctions; + IntervalData* split; }; class Node { @@ -60,7 +63,6 @@ class Node { std::vector children; Node* next; // link to the next node Node* parent; // link to the parent node - std::vector splits; // link to the corresponding split node bool isLeaf; // constructor diff --git a/include/Utility.hpp b/include/Utility.hpp index 28ce3095..968e3b76 100644 --- a/include/Utility.hpp +++ b/include/Utility.hpp @@ -41,12 +41,13 @@ namespace helper { // general operations bool withinRange(int a, int b, int range); std::string removeNonPrintable(const std::string str); - - - std::string getTime(); // reports the current time } +namespace stats { + double median(std::vector& values); +} + // sequence input/output namespace seqIO { bool filterReads(auto& qual, int quality, int minlen); diff --git a/src/Analysis.cpp b/src/Analysis.cpp index ad7d377b..969b27dd 100644 --- a/src/Analysis.cpp +++ b/src/Analysis.cpp @@ -1,7 +1,7 @@ #include "Analysis.hpp" // -Analysis::Analysis(po::variables_map _params) : params(_params) { +Analysis::Analysis(po::variables_map _params) : params(_params), repcount{0}, readcount{0} { int k = 3; // can be later extracted from config file // extract features (e.g., IBPTree) this->features = IBPTree(params, k); // create IBPT w/ annotations @@ -11,423 +11,358 @@ Analysis::Analysis(po::variables_map _params) : params(_params) { fs::path clusterFile = outDir / fs::path("clustering") / fs::path("clusters.tab"); this->features.iterateClusters(clusterFile.string()); } - this->pdf = std::map, double>(); - + this->freq = std::map, double>(); + this->suppreads = std::map>(); + this->complementarities = std::map>>(); + this->hybenergies = std::map>>(); } -Analysis::~Analysis() {} +Analysis::~Analysis() { +} -void Analysis::start(pt::ptree sample) { - this->pdf.clear(); // reset pdf +void Analysis::start(pt::ptree sample, pt::ptree condition) { + this->freq.clear(); // reset freq // retrieve input and output files pt::ptree input = sample.get_child("input"); std::string single = input.get("single"); std::string splits = input.get("splits"); - //iterate(single, splits); -} - -void Analysis::process(std::string& single, std::string& splits) { - -} - - + pt::ptree output = sample.get_child("output"); + std::string interactions = output.get("interactions"); + // + this->freq = std::map, double>(); + this->condition = condition.get("condition"); // store current condition + this->readcount = 0; + processSingle(single); + processSplits(splits, interactions); -/* -std::string line; -std::ifstream anno; + this->repcount++; // increase replicate count -anno.open(params["features"].as()); -// parse annotations -if(!anno.is_open()) { - perror("Error open"); - exit(EXIT_FAILURE); + normalize(); } -while(getline(anno, line)) { - if(line[0] == '#') { continue; } - - std::vector tokens; - std::istringstream iss(line); - std::string token; - while(std::getline(iss, token, '\t')) { - tokens.push_back(token); - } - - // boundaries - std::pair bnds = std::make_pair(std::stoi(tokens[3]),std::stoi(tokens[4])); - std::pair,std::string> con = std::make_pair(bnds,"strand="+tokens[6]+";"+tokens[8]); - std::size_t start_position; - if(features.count(tokens[0]) == 0) { // check of RefName is in map - if(start_position != std::string::npos) { - features.insert(std::pair,std::string>>>(tokens[0],{con})); +void Analysis::processSingle(std::string& single) { + seqan3::sam_file_input inputSingle{single}; // match reads + std::vector ref_lengths{}; // header + for(auto &info : inputSingle.header().ref_id_info) { + ref_lengths.push_back(std::get<0>(info)); + } + std::deque refIds = inputSingle.header().ref_ids(); + std::optional refOffset = std::nullopt; + std::string orient = ""; // e.g., sense (ss) or antisense (as) + char strand = ' '; + uint32_t start = 0, end = 0; + for(auto& rec : inputSingle) { + refOffset = rec.reference_position(); + start = rec.reference_position().value(); + end = rec.reference_position().value() + rec.sequence().size(); + if(static_cast(rec.flag() & seqan3::sam_flag::on_reverse_strand)) { strand = '-'; } else { strand = '+'; } + + std::vector> ovlps = features.search(refIds[rec.reference_id().value()], + {start, end}); + if(ovlps.size() > 0) { + for(auto& ovl : ovlps) { + if(strand == ovl.second->getStrand()) { orient = "sense"; } else { orient = "antisense"; } + addToFreqMap(std::make_pair(orient, ovl.second->getName()), 1.0/(double)ovlps.size()); // add to freq + } } - } else { - features[tokens[0]].push_back(con); + readcount++; } } -// read file -fs::path freqFile = fs::path(params["outdir"].as()) / fs::path("frequency.txt"); -std::ifstream freqHandle; -freqHandle.open(freqFile.string()); - -// iterate and store into frquency map -while(getline(freqHandle, line)) { - std::vector tokens; - std::istringstream iss(line); - std::string token; - while(std::getline(iss, token, '\t')) { - tokens.push_back(token); - } - frequency.insert(std::pair(tokens[0],std::stoi(tokens[1]))); -}*/ - - - - - -/* -// -void Analysis::createCountTable() { - std::map,std::vector,std::vector>>> counts; - - std::ifstream intsfile; - for(unsigned i=0;i tokens; - std::istringstream iss(line); - std::string token; - while(std::getline(iss, token, '\t')) { - tokens.push_back(token); - } - - std::tuple key1; - std::tuple key2; - key1 = std::make_tuple(tokens[5],tokens[8],tokens[13],tokens[16]); - key2 = std::make_tuple(tokens[5],tokens[8],tokens[13],tokens[16]); + std::vector ref_lengths{}; // header + for(auto &info : inputSplits.header().ref_id_info) { + ref_lengths.push_back(std::get<0>(info)); + } + std::deque refIds = inputSplits.header().ref_ids(); - // - if(counts.count(key1) == 0 && counts.count(key2) == 0) { - std::vector,std::vector>> content; + // variables + std::string outLine = ""; // buffers the output file for interaction entry + std::bitset<1> skip = 0; // skip if no match + char strand = ' '; // strand of segment + std::string orient = ""; // orientation of segment (ss or as) + uint32_t start = 0, end = 0; // start and end of segment + std::string refName = ""; // reference name - for(unsigned j=0; j,std::vector> vals{0,{},{}}; - content.push_back(vals); + // information from the tags + seqan3::sam_tag_dictionary tags; + std::pair filters; // complementarity and hybridization energy + std::vector aligns; // alignment of complementarity + + // segment information (e.g., gene name, product, etc.) - keys are the as or ss + std::map> segmentInfo = {}; + + for(auto&& rec : inputSplits | seqan3::views::chunk(2)) { + outLine = (*rec.begin()).id(); // QNAME + // reset + skip = 0; // reset skip + segmentInfo.clear(); // reset segment info + filters = std::make_pair(0.0, 0.0); // reset filters + aligns.clear(); + + // retrieve complementarity and energy + tags = (*rec.begin()).tags(); + filters = std::make_pair(std::get(tags["XC"_tag]), std::get(tags["XE"_tag])); + + dtp::IntKey intKey = {}; // key to store the interaction (partner1, orientation1, partner2, orientation2) + + for (auto &split: rec) { + if (static_cast(split.flag() & seqan3::sam_flag::on_reverse_strand)) { + strand = '-'; + } else { strand = '+'; } + outLine += "\t" + std::string(1, strand); // strand + start = split.reference_position().value(); + end = split.reference_position().value() + split.sequence().size(); + outLine += "\t" + std::to_string(start) + "\t" + std::to_string(end); // start & end + refName = refIds[split.reference_id().value()]; + outLine += "\t" + refName; // reference name + + tags = split.tags(); // retrieve the alignment information + aligns.push_back(std::get(tags["XA"_tag])); + + // match with features + std::vector> ovlps = features.search(refIds[split.reference_id().value()], + {start, end}); + segmentInfo = {}; // reset segmentInfo + if(ovlps.size() > 0) { + segmentInfo.insert({"sense", {}}); + segmentInfo.insert({"antisense", {}}); + for (auto &ovl: ovlps) { + if (strand == ovl.second->getStrand()) { orient = "sense"; } else { orient = "antisense"; } + segmentInfo[orient].push_back(ovl.second); // add to segmentInfo + addToFreqMap(std::make_pair(orient, ovl.second->getName()), 1.0/(double)ovlps.size()); // add to freq + + + /* + std::string info = ovl.second->getName() + "\t" + ovl.second->getBiotype() + "\t" + + ovl.second->getStrand() + "\t" + "." + "\t" + orient; + segmentInfo[orient].push_back(info);*/ } - std::get<0>(content[i]) = 1; - std::vector en = std::get<1>(content[i]); - std::vector cm = std::get<2>(content[i]); - - en.push_back(std::stof(tokens[17])); - cm.push_back(std::stof(tokens[18])); - - std::get<1>(content[i]) = en; - std::get<2>(content[i]) = cm; - - counts.insert(std::pair,std::vector,std::vector>>>(key1,content)); - - } else { - if(counts.count(key1) > 0) { - std::tuple,std::vector> oldVal; - oldVal = counts[key1][i]; - std::get<0>(oldVal) = std::get<0>(oldVal) + 1; - - std::vector oldValEnergy = std::get<1>(oldVal); - std::vector oldValCmpl = std::get<2>(oldVal); - - oldValEnergy.push_back(std::stof(tokens[17])); - oldValCmpl.push_back(std::stof(tokens[18])); - - std::get<1>(oldVal) = oldValEnergy; - std::get<2>(oldVal) = oldValCmpl; - - counts[key1][i] = oldVal; + // check which one to select + if (segmentInfo["sense"].size() == 1) { + IntervalData* data = segmentInfo["sense"][0]; + outLine += "\t" + data->getName() + "\t" + data->getBiotype() + "\t" + data->getStrand(); + outLine += "\t.\tsense"; + intKey.push_back(dtp::IntPartner(data->getName(),"sense")); } else { - if(counts.count(key2) > 0) { - std::tuple,std::vector> oldVal; - oldVal = counts[key2][i]; - std::get<0>(oldVal) = std::get<0>(oldVal) + 1; - - std::vector oldValEnergy = std::get<1>(oldVal); - std::vector oldValCmpl = std::get<2>(oldVal); - - oldValEnergy.push_back(std::stof(tokens[17])); - oldValCmpl.push_back(std::stof(tokens[18])); - - std::get<1>(oldVal) = oldValEnergy; - std::get<2>(oldVal) = oldValCmpl; - - counts[key2][i] = oldVal; + if (segmentInfo["antisense"].size() == 1) { + IntervalData* data = segmentInfo["antisense"][0]; + outLine += "\t" + data->getName() + "\t" + data->getBiotype() + "\t" + data->getStrand(); + outLine += "\t.\tsense"; + intKey.push_back(dtp::IntPartner(data->getName(),"antisense")); + } else { + skip = 1; // skip if not unique } } - } + } else { skip = 1; } // skip if no match } + outLine += "\t" + std::to_string(filters.first); // complementarity + outLine += "\t" + aligns[0] + "\t" + aligns[1]; // alignment + outLine += "\t" + std::to_string(filters.second) + "\n"; // energy + if (skip == 0) { + outInts << outLine; + addToSuppReads(intKey[0], intKey[1]); + addToCompl(intKey[0], intKey[1], filters.first); + addToHybEnergies(intKey[0], intKey[1], filters.second); + } // write to file (if not skipped) + readcount++; } + outInts.close(); +} +void Analysis::writeInteractionsHeader(std::ofstream& fout) { + fout << "qname\tfst_seg_strd\tfst_seg_strt\tfst_seg_end\tfst_seg_ref\tfst_seg_name\tfirst_seg_bt\t"; + fout << "fst_seg_anno_strd\tfst_seg_prod\tfst_seg_ori\tsec_seg_strd\tsec_seg_strt\tsec_seg_end\t"; + fout << "sec_seg_ref\tsec_seg_name\tsec_seg_bt\tsec_seg_anno_strd\tsec_seg_prod\tsec_seg_ori\t"; + fout << "cmpl\tfst_seg_compl_aln\tsec_seg_cmpl_aln\tmfe\n"; +} - // write back to file - std::string outfile = params["outdir"].as(); - fs::path outPath = fs::path(outfile) / "allints.txt"; - - std::ofstream outFileHandle; - outFileHandle.open(outPath.string()); +void Analysis::writeAllIntsHeader(std::ofstream& fout) { fout << "fst_rna\tsec_rna\tfst_rna_ori\tsec_rna_ori";} +void Analysis::addToAllIntsHeader(std::ofstream &fout, std::string key) { fout << "\t" << key; } - outFileHandle << "RNA1\tRNA2\tRNA1orientation\tRNA2orientation\t"; +void Analysis::normalize() { + // sum up all frequencies + double sum = 0.0; + for(auto& f : this->freq) { + sum += f.second; + } - for(unsigned i=0;ifreq) { + f.second = f.second/sum; } - outFileHandle << "\n"; - for(auto it=counts.begin(); it!=counts.end(); ++it) { - outFileHandle << std::get<0>(it->first) << "\t"; - outFileHandle << std::get<2>(it->first) << "\t"; - outFileHandle << std::get<1>(it->first) << "\t"; - outFileHandle << std::get<3>(it->first) << "\t"; + // print + /* + for(auto& f : freq) { + std::cout << f.first.first << " " << f.first.second << " " << f.second << std::endl; + }*/ +} - // - for(unsigned z=0;zsecond.size();++z) { - outFileHandle << std::get<0>(it->second[z]) << "\t"; +void Analysis::addToFreqMap(std::pair key, double value) { +if(freq.count(key) == 0) { + freq.insert(std::make_pair(key, value)); + } else { + freq[key] += value; + } +} - std::vector vecNrg = std::get<1>(it->second[z]); - std::vector vecCpl = std::get<2>(it->second[z]); +bool operator<(const dtp::IntKey& lhs, const dtp::IntKey& rhs) { + return std::lexicographical_compare(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +} - // determine ges - float ges = 0.0; - if(vecNrg.size() == 1) { - ges = vecNrg[0]; +void Analysis::addToSuppReads(dtp::IntPartner p1, dtp::IntPartner p2) { + dtp::IntKey key; + if(p1 < p2) { key = {p1, p2}; } else { key = {p2, p1}; } + // check if key in map + if(suppreads.find(key) == suppreads.end()) { + std::vector suppReadsRepl = {}; // new list for all replicates (up to the current one) + for(int i=0;i& suppReadsRepl = this->suppreads[key]; + if(suppReadsRepl.size() < repcount) { // not yet at the current replicate (probably skipped a few) + for(int i=suppReadsRepl.size(); i repcount) { + suppReadsRepl[repcount] += 1.0; } else { - if(vecNrg.size() > 1) { - auto m = vecNrg.begin() + vecNrg.size()/2; - std::nth_element(vecNrg.begin(), m, vecNrg.end()); - double min = *min_element(vecNrg.begin(), vecNrg.end()); - ges = vecNrg[vecNrg.size()/2]; - } + suppReadsRepl.push_back(1.0); } - outFileHandle << ges << "\t"; + } + } +} - // determine gcs - float gcs = 0.0; - if(vecCpl.size() == 1) { - gcs = vecCpl[0]; +void Analysis::addToCompl(dtp::IntPartner p1, dtp::IntPartner p2, double complementarity) { + dtp::IntKey key; + if(p1 < p2) { key = {p1, p2}; } else { key = {p2, p1}; } + // check if key in map + if(complementarities.find(key) == complementarities.end()) { + std::vector> CmplsRepl = {}; // new list for all replicates (up to the current one) + for(int i=0;i>& cmplsRepl = this->complementarities[key]; + if(cmplsRepl.size() < repcount) { // not yet at the current replicate (probably skipped a few) + for(int i=cmplsRepl.size(); i repcount) { + cmplsRepl[repcount].push_back(complementarity); } else { - if(vecCpl.size() > 1) { - auto m = vecCpl.begin() + vecCpl.size()/2; - std::nth_element(vecCpl.begin(), m, vecCpl.end()); - double min = *min_element(vecCpl.begin(), vecCpl.end()); - gcs = vecCpl[vecCpl.size()/2]; - } + cmplsRepl.push_back({complementarity}); } - outFileHandle << gcs << "\t"; } - outFileHandle << "\n"; } - outFileHandle.close(); } - - -// -std::string Analysis::retrieveTagValue(std::string tags, std::string tagName, std::string oldValue) { - std::size_t start_position = tags.find(tagName+"="); - // gene name - if(start_position != std::string::npos) { - std::string sub = tags.substr(start_position+tagName.size()+1,tags.length()); - std::size_t end_position = sub.find(";"); - oldValue = sub.substr(0,end_position); +void Analysis::addToHybEnergies(dtp::IntPartner p1, dtp::IntPartner p2, double hybenergy) { + dtp::IntKey key; + if(p1 < p2) { key = {p1, p2}; } else { key = {p2, p1}; } + // check if key in map + if(hybenergies.find(key) == hybenergies.end()) { + std::vector> hybRepl = {}; // new list for all replicates (up to the current one) + for(int i=0;i>& hybRepl = this->hybenergies[key]; + if(hybRepl.size() < repcount) { // not yet at the current replicate (probably skipped a few) + for(int i=hybRepl.size(); i repcount) { + hybRepl[repcount].push_back(hybenergy); + } else { + hybRepl.push_back({hybenergy}); + } + } } - return oldValue; } +double Analysis::calcGCS(std::vector& complementarities) { + if(complementarities.size() == 0) { return 0.0; } + double median = stats::median(complementarities); + double maximum = *std::max_element(complementarities.begin(), complementarities.end()); + return median * maximum; +} -float Analysis::calc_pvalue(int x, int n, float p) { - // simulate draw from binomial distribution - std::binomial_distribution binomial(n, p); - // assign p value - float pval = 1.0 - cdf(binomial, x); - return pval; -}*/ - - - /* - // retrieve input and output files - pt::ptree input = sample.get_child("input"); - std::string splits = input.get("splits"); - pt::ptree output = sample.get_child("output"); - std::string interactions = output.get("interactions"); - - - interPaths.push_back(interactions); - - // input .sam record - seqan3::alignment_file_input fin{splits, - seqan3::fields{}}; - - std::vector flags; - std::vector refIDs; - std::vector> refOffsets; - - std::vector ref_lengths{}; - for(auto &info : fin.header().ref_id_info) { - ref_lengths.push_back(std::get<0>(info)); +double Analysis::calcGHS(std::vector& hybenergies) { + if(hybenergies.size() == 0) { return 1000.0; } + double median = stats::median(hybenergies); + double minimum = *std::min_element(hybenergies.begin(), hybenergies.end()); + if(median < 0 && minimum < 0) { + return -sqrt(median * minimum); + } else { + if(median > 0 && minimum > 0) { + return sqrt(median * minimum); + } else { + return sqrt(abs(median * minimum)); + } } - std::deque ref_ids = fin.header().ref_ids(); - -// uint32_t flag, start, end; - - // open file - std::ofstream outInts; - outInts.open(interactions); - - std::string entry; // stores output to write to file - - // variables for interactions file - std::string qNAME, flag; - uint32_t start, end; - std::string geneID, geneName, product, annoStrand; - float hybnrg, cmpl; +} - seqan3::sam_tag_dictionary tags; +double Analysis::calcStat(dtp::IntKey key, int x) { + std::pair p1 = {key[0].orientation, key[0].partner}; + std::pair p2 = {key[1].orientation, key[1].partner}; + double p; + if(freq.find(p1) == freq.end() || freq.find(p2) == freq.end()) { + p = 0.0; + } else { + if(p1 == p2) { + p = freq[p1]*freq[p2]; + } else { + p = 2*freq[p1]*freq[p2]; + } + } + ma::binomial_distribution binomial(this->readcount, p); + double pval = 1.0 - ma::cdf(binomial, x); + return pval; +} - outInts << "#QNAME\tSegment1Strand\tSegment1Start\tSegment1End\tSegment1RefName" - outInts << "\tSegment1Name\tSegment1AnnoStrand\tSegment1Product\tSegment1Orientation" - outInts << "\tSegment2Strand\tSegment2Start\tSegment2End\tSegment2RefName\tSegment2Name" - outInts << "\tSegment2AnnoStrand\tSegment2Product\tSegment1Orientation\tenergy\tcomplementarity\n"; - - int segCnt = 0; - int segCntMatch = 0; - int segFound = 0; - - for(auto && rec : fin | seqan3::views::chunk(2)) { - entry = ""; - hybnrg = 0.0; - cmpl = 0.0; - for(auto & split : rec) { - // seqan3::debug_stream << split << std::endl; - - qNAME = seqan3::get(split); - flag = "+"; - if(static_cast(seqan3::get(split) - & seqan3::sam_flag::on_reverse_strand)) { - flag = "-"; +void Analysis::writeAllInts() { + std::string outDirPath = params["outdir"].as(); + fs::path outDirDetectPath = fs::path(outDirPath) / fs::path("analysis"); + fs::path allIntsPath = fs::path(outDirDetectPath) / fs::path("allints.txt"); + std::ofstream fout(allIntsPath.string(), std::ios::out); + + if(fout.is_open()) { + for(auto& entry : suppreads) { + fout << entry.first[0].partner << "\t" << entry.first[1].partner; + fout << "\t" << entry.first[0].orientation << "\t" << entry.first[1].orientation; + for(int i=0;i(split).value(); - end = start + seqan3::get(split).size()-1; - - // refID - std::optional refIDidx = seqan3::get(split); - std::string refID = ref_ids[refIDidx.value()]; - - tags = seqan3::get(split); - auto nrg = tags["XE"_tag]; - auto cpl = tags["XC"_tag]; - hybnrg = std::get(nrg); - cmpl = std::get(cpl); - - std::vector,std::string>> feat; - if(features.count(refID) > 0) { - feat = features[refID]; - - std::pair geneNames; - geneID = "."; - geneName = "."; - product = "."; - annoStrand = "."; - for(unsigned i=0;i= feat[i].first.first && start <= feat[i].first.second) || - (end >= feat[i].first.first && end <= feat[i].first.second)) { - - // make sure that overlap is in exon - if(params["splicing"].as() && feat[i].second.find("exon") == std::string::npos) { - continue; - } - geneID = retrieveTagValue(feat[i].second, "ID", geneName); - geneName = retrieveTagValue(feat[i].second, "gene", geneName); - product = retrieveTagValue(feat[i].second, "product", product); - annoStrand = retrieveTagValue(feat[i].second, "strand", annoStrand); - - segFound = 1; // found a match (in annotation) for segment - } - } - - // search for geneNames in frequency map - geneNames = std::make_pair(retrieveTagValue(feat[0].second, "gene", geneName), retrieveTagValue(feat[1].second, "gene", geneName)); - - float value; - // check if geneNames first in frequency map - if(frequency.count(geneNames.first) > 0 && frequency.count(geneNames.second) > 0) { - if (geneNames.first == geneNames.second) { - value = frequency[geneNames.first] * frequency[geneNames.second]; - } else { - value = 2 * (frequency[geneNames.first] * frequency[geneNames.second]); - } - } else{ - value = 0; - } - - // calculate p-value - int draws = frequency[geneNames.first] + frequency[geneNames.second]; - float pval = calc_pvalue(segCntMatch, draws, value); - - - if(segFound == 1) { - if(segCnt == 0) { entry += qNAME + "\t";} - entry += flag + "\t"; - entry += std::to_string(start) + "\t"; - entry += std::to_string(end) + "\t"; - entry += refID + "\t"; - entry += geneName + "\t"; - entry += annoStrand + "\t"; - entry += "\"" + product + "\"\t"; - - if(flag == annoStrand) { - entry += "sense\t"; - } else { - entry += "antisense\t"; - } - - entry += pval + "\t"; - - segCntMatch++; - segFound = 0; + if(entry.second.size() < repcount) { + for(int i=entry.second.size();i refId = split.reference_id(); - uint32_t flag{0}; // SAMFLAG + seqan3::sam_flag flag = split.flag(); // SAMFLAG uint32_t start = split.reference_position().value(); uint32_t end = start + split.sequence().size()-1; Segment seg(ref_ids[refId.value()], flag, start, end); @@ -77,7 +77,7 @@ void Clustering::sumup() { if(outputFile.is_open()) { outputFile << "cluster" << clustID++ << "\t"; outputFile << result[i].elements[0].refid << "\t"; - if(result[i].elements[0].flag == 0) { + if(!static_cast(result[i].elements[0].flag & seqan3::sam_flag::on_reverse_strand)) { outputFile << "+" << "\t"; } else { outputFile << "-" << "\t"; @@ -86,7 +86,7 @@ void Clustering::sumup() { outputFile << result[i].elements[0].end << "\t"; outputFile << result[i].elements[1].refid << "\t"; - if(result[i].elements[1].flag == 0) { + if(!static_cast(result[i].elements[0].flag & seqan3::sam_flag::on_reverse_strand)) { outputFile << "+" << "\t"; } else { outputFile << "-" << "\t"; diff --git a/src/Data.cpp b/src/Data.cpp index cbfb701f..d882847f 100644 --- a/src/Data.cpp +++ b/src/Data.cpp @@ -409,6 +409,7 @@ void Data::clustering() { void Data::analysis() { std::cout << helper::getTime() << "Start the Analysis\n"; Analysis ana(params); - callInAndOut(std::bind(&Analysis::start, ana, std::placeholders::_1)); + callInAndOut(std::bind(&Analysis::start, &ana, std::placeholders::_1, std::placeholders::_2)); + ana.writeAllInts(); } diff --git a/src/IBPTree.cpp b/src/IBPTree.cpp index 3e9a0aa4..f551ee82 100644 --- a/src/IBPTree.cpp +++ b/src/IBPTree.cpp @@ -19,10 +19,7 @@ IBPTree::~IBPTree() {} // constructs the IBPTree (either from annotations/clusters or both) void IBPTree::construct() { std::string subcall = params["subcall"].as(); - // fill tree with annotation - if(subcall == "detect") { // in the case of detect only annotation is needed - iterateFeatures(params["features"].as()); - } + iterateFeatures(params["features"].as()); } void IBPTree::iterateFeatures(std::string featuresFile) { @@ -77,7 +74,7 @@ void IBPTree::iterateFeatures(std::string featuresFile) { intvl = nullptr; } intvl = new IntervalData(fields.seqid, fields.strand, id, name, biotype, - std::make_pair(fields.start, fields.end)); + std::make_pair(fields.start, fields.end), nullptr); } else { // only if the feature is an exon (needed for detect step) if(params["subcall"].as() == "detect") { @@ -102,12 +99,16 @@ void IBPTree::iterateFeatures(std::string featuresFile) { } void IBPTree::iterateClusters(std::string clusterFile) { - // iterate over clusters std::ifstream clusters(clusterFile); if(!clusters) { std::cout << helper::getTime() << " Cluster file " << clusterFile << " could not be opened!\n"; EXIT_FAILURE; } + + IntervalData* firstSegment = nullptr; + IntervalData* secondSegment = nullptr; + + // variables to buffer the current cluster std::string name = ""; std::pair chroms = std::make_pair("", ""); std::pair start = std::make_pair(0,0); @@ -115,6 +116,10 @@ void IBPTree::iterateClusters(std::string clusterFile) { std::pair strand = std::make_pair(' ', ' '); std::pair counts = std::make_pair(0,0); std::pair size = std::make_pair(0,0); + int nosplits = 0; + + IntervalData* intvl1 = nullptr; + IntervalData* intvl2 = nullptr; std::string line; if(getline(clusters, line)) {} // ignore the first line (header) @@ -130,13 +135,62 @@ void IBPTree::iterateClusters(std::string clusterFile) { start = std::make_pair(std::stoi(tokens[3]), std::stoi(tokens[7])); end = std::make_pair(std::stoi(tokens[4]), std::stoi(tokens[8])); strand = std::make_pair(tokens[2][0], tokens[6][0]); + nosplits = std::stoi(tokens[9]); + + if(nosplits < 2) { continue; } // ignore clusters with less than 2 splits reads + std::vector> fstOvlps = search(chroms.first, + {start.first, end.first}); + + if(fstOvlps.size() > 0) { + for(auto& res : fstOvlps) { + dtp::Interval tmpIntvl = res.second->getInterval(); + if(start.first < tmpIntvl.first && strand.first == res.second->getStrand()) { + res.second->setId(name + "/" + res.second->getId()); + res.second->setName(name + "/" + res.second->getName()); + res.second->setInterval({start.first, res.second->getInterval().second}); + update(res.first->parent); + } + if(end.first > tmpIntvl.second && strand.first == res.second->getStrand()) { + res.second->setId(res.second->getId() + "/" + name); + res.second->setName(res.second->getName() + "/" + name); + res.second->setInterval({res.second->getInterval().first, end.first}); + update(res.first->parent); + } + intvl1 = res.second; + } + } else { + intvl1 = new IntervalData(chroms.second, strand.second, name, name, "cluster", + std::make_pair(start.second, end.second), nullptr); + insert(*intvl1); + } + + std::vector> secOvlps = search(chroms.second, + {start.second, end.second}); + if(secOvlps.size() > 0) { + for(auto& res : secOvlps) { + dtp::Interval tmpIntvl = res.second->getInterval(); + if(start.second < tmpIntvl.first && strand.second == res.second->getStrand()) { + res.second->setId(name + "/" + res.second->getId()); + res.second->setName(name + "/" + res.second->getName()); + res.second->setInterval({start.second, res.second->getInterval().second}); + update(res.first->parent); + } + if(end.second > tmpIntvl.second && strand.second == res.second->getStrand()) { + res.second->setId(res.second->getId() + "/" + name); + res.second->setName(res.second->getName() + "/" + name); + res.second->setInterval({res.second->getInterval().first, end.second}); + update(res.first->parent); + } + intvl2 = res.second; + } + } else { + intvl2 = new IntervalData(chroms.second, strand.second, name, name, "cluster", + std::make_pair(start.second, end.second), nullptr); + insert(*intvl2); + } + intvl1->setSplit(intvl2); + intvl2->setSplit(intvl1); - IntervalData* intvl1 = new IntervalData(chroms.first, strand.first, name, "", "", - std::make_pair(start.first, end.first)); - IntervalData* intvl2 = new IntervalData(chroms.second, strand.second, name, "", "", - std::make_pair(start.second, end.second)); - insert(*intvl1); - insert(*intvl2); } } @@ -204,9 +258,9 @@ void IBPTree::splitNode(Node* parent, int index) { } } -std::vector IBPTree::search(std::string chrom, dtp::Interval interval) { +std::vector> IBPTree::search(std::string chrom, dtp::Interval interval) { Node* root = this->rootnodes[chrom]; // search for the root node - std::vector result; + std::vector> result; searchIter(root, interval, result); std::sort(result.begin(), result.end()); @@ -215,11 +269,12 @@ std::vector IBPTree::search(std::string chrom, dtp::Interval inte return result; } -void IBPTree::searchIter(Node* node, const dtp::Interval& interval, std::vector& results) { +void IBPTree::searchIter(Node* node, const dtp::Interval& interval, + std::vector>& results) { if(node->isLeaf) { for(int i=0; ikeys.size();++i) { if(isOverlapping(interval, node->keys[i].first)) { - results.push_back(node->keys[i].second); + results.push_back(std::make_pair(node, node->keys[i].second)); } } @@ -241,6 +296,28 @@ void IBPTree::searchIter(Node* node, const dtp::Interval& interval, std::vector< } } +void IBPTree::update(Node* node) { + // update parent of node (until root is reached) + if(node == nullptr) { + return; + } else { + for(int i=0; ikeys.size(); ++i) { + dtp::Interval intvl = node->keys[i].first; + std::cout << node->children[i]->keys.size() << "\n"; + for(int j=0; jchildren[i]->keys.size(); ++j) { + if(node->children[i]->keys[j].first < intvl) { + intvl.first = node->children[i]->keys[j].first.first; + } + if(node->children[i]->keys[j].first.second > intvl.second) { + intvl.second = node->children[i]->keys[j].first.second; + } + } + node->keys[i].first = intvl; + } + update(node->parent); + } +} + bool IBPTree::isOverlapping(dtp::Interval intvl1, dtp::Interval intvl2) { dtp::Interval intvl = {std::max(intvl1.first, intvl2.first), std::min(intvl1.second, intvl2.second)}; diff --git a/src/Node.cpp b/src/Node.cpp index 1ab6e95f..247e6694 100644 --- a/src/Node.cpp +++ b/src/Node.cpp @@ -1,9 +1,8 @@ #include "Node.hpp" IntervalData::IntervalData(std::string chrom, char strand, std::string id, std::string name, - std::string biotype, dtp::Interval interval) : - chrom(chrom), strand(strand), id(id), name(name), biotype(biotype), interval(interval) { -} + std::string biotype, dtp::Interval interval, IntervalData* split) : + chrom(chrom), strand(strand), id(id), name(name), biotype(biotype), interval(interval), split{split} {} IntervalData::~IntervalData() {} bool IntervalData::operator>(const dtp::Interval& other) const { return this->interval.first > other.first; } bool IntervalData::operator<(const dtp::Interval& other) const { return this->interval.first < other.first; } @@ -13,6 +12,8 @@ std::string IntervalData::getChrom() const { return chrom; } void IntervalData::setChrom(std::string chrom) { this->chrom = chrom; } char IntervalData::getStrand() const { return strand; } void IntervalData::setStrand(char strand) { this->strand = strand; } +std::string IntervalData::getId() const { return id; } +void IntervalData::setId(std::string id) { this->id = id; } std::string IntervalData::getName() const { return name; } void IntervalData::setName(std::string name) { this->name = name; } std::string IntervalData::getBiotype() const { return biotype; } @@ -21,6 +22,8 @@ dtp::Interval IntervalData::getInterval() { return interval; } void IntervalData::setInterval(dtp::Interval interval) { this->interval = interval; } dtp::SpliceJunctions IntervalData::getJunctions() const { return junctions; } void IntervalData::setJunctions(dtp::SpliceJunctions junctions) { this->junctions = junctions; } +IntervalData* IntervalData::getSplit() const { return split; } +void IntervalData::setSplit(IntervalData* split) { this->split = split; } // operations void IntervalData::addJunction(std::string name, std::pair junction) { @@ -44,9 +47,8 @@ bool IntervalData::isSubset(int start, int end) { return (start >= this->interval.first && end <= this->interval.second); } - // create new node -Node::Node(int k) : order{k}, keys{}, children{}, next{nullptr}, parent{nullptr}, splits{}, isLeaf{false} {} +Node::Node(int k) : order{k}, keys{}, children{}, next{nullptr}, parent{nullptr}, isLeaf{false} {} // operations void Node::addInterval(IntervalData& data) { diff --git a/src/SplitReadCalling.cpp b/src/SplitReadCalling.cpp index fec3dc47..7a8855fe 100644 --- a/src/SplitReadCalling.cpp +++ b/src/SplitReadCalling.cpp @@ -334,9 +334,9 @@ bool SplitReadCalling::filter(auto& sequence, uint32_t cigarmatch) { bool SplitReadCalling::matchSpliceSites(dtp::Interval& spliceSites, std::optional refId) { if(params["splicing"].as>() == std::bitset<1>("1")) { - std::vector ovlps = features.search(this->refIds[refId.value()], spliceSites); + std::vector> ovlps = features.search(this->refIds[refId.value()], spliceSites); for(int i=0;igetJunctions(); + dtp::SpliceJunctions junctions = ovlps[i].second->getJunctions(); for(auto& [name, junc] : junctions) { for(int j=1;j& values){ + std::sort(values.begin(), values.end()); + size_t size = values.size(); + if(size % 2 == 0) { + return (values[size / 2 - 1] + values[size / 2]) / 2; + } else { + return values[size / 2]; + } +} + + // ############ SEQIO ############ bool seqIO::filterReads(auto& qual, int quality, int minlen) { // determine the length