Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prediction #69

Open
wants to merge 10 commits into
base: prediction
Choose a base branch
from
2 changes: 2 additions & 0 deletions src/Common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ struct Bustools_opt {
std::string predict_input; //specified the same way as the output for count - count and histogram filenames will be created from this
double predict_t = 0.0; //this is how far to predict, t=10 means that we will predict the change in expression at 10 times the number of reads

/* clusterhist */
std::string cluster_input_file;

/* project */
std::string map;
Expand Down
227 changes: 227 additions & 0 deletions src/bustools_clusterhist.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
#include <iostream>
#include <fstream>
#include <cstring>

#include "Common.hpp"
#include "BUSData.h"

#include "bustools_clusterhist.h"
#include <random>
#include <map>

void bustools_clusterhist(Bustools_opt& opt) {
BUSHeader h;
size_t nr = 0;
size_t N = 100000;
uint32_t bclen = 0;
BUSData* p = new BUSData[N];

// read and parse the equivelence class files

std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
std::vector<std::vector<int32_t>> ecmap;

std::unordered_map<std::string, int32_t> txnames;
parseTranscripts(opt.count_txp, txnames);
std::vector<int32_t> genemap(txnames.size(), -1);
std::unordered_map<std::string, int32_t> genenames;
parseGenes(opt.count_genes, txnames, genemap, genenames);
parseECs(opt.count_ecs, h);
ecmap = std::move(h.ecs);
ecmapinv.reserve(ecmap.size());
for (int32_t ec = 0; ec < ecmap.size(); ec++) {
ecmapinv.insert({ ecmap[ec], ec });
}
std::vector<std::vector<int32_t>> ec2genes;
create_ec2genes(ecmap, genemap, ec2genes);


std::string gene_ofn = opt.output + "genes.txt";
std::string histDir = opt.output + "cluster_hists/";


std::vector<BUSData> v;
v.reserve(N);
uint64_t current_bc = 0xFFFFFFFFFFFFFFFFULL;
//temporary data
std::vector<int32_t> ecs;
std::vector<int32_t> glist;
ecs.reserve(100);
std::vector<int32_t> u;
u.reserve(100);

//Read the cluster file
std::vector<std::string> clusterNames;
std::unordered_map<uint64_t, size_t> bcClusters;
{
std::ifstream ifs(opt.cluster_input_file);
uint32_t flag = 0;
while (!ifs.eof()) {
std::string bc, cluster;
ifs >> bc >> cluster;
if (!cluster.empty()) {
uint64_t bcBin = stringToBinary(bc, flag);
std::size_t clust = -1;
auto it = std::find(clusterNames.begin(), clusterNames.end(), cluster);
if (it != clusterNames.end()) {
clust = (it - clusterNames.begin());
}
else {
clust = clusterNames.size();
clusterNames.push_back(cluster);
}
bcClusters[bcBin] = clust;
}
}
}
// for (size_t i = 0; i < 5; ++i) {
// std::cout << "rc: " << rc << "\n";
// }

//Allocate histograms

//Indexed as gene*histmax + histIndex
size_t n_genes = genenames.size();
const uint32_t histmax = 100;//set molecules with more than histmax copies to histmax
typedef std::vector<double> Histograms;
std::vector<Histograms> clusterHistograms;
for (std::size_t i = 0; i < clusterNames.size(); ++i) {
clusterHistograms.push_back(std::vector<double>(n_genes * histmax, 0));
}


//barcodes
auto write_barcode = [&](const std::vector<BUSData>& v) {
if (v.empty()) {
return;
}
auto it = bcClusters.find(v[0].barcode);
if (it == bcClusters.end()) {
return; //This is ok, could be a cell that has been filtered in quality check
}

auto& hist = clusterHistograms[it->second];

size_t n = v.size();

for (size_t i = 0; i < n; ) {
size_t j = i + 1;
for (; j < n; j++) {
if (v[i].UMI != v[j].UMI) {
break;
}
}

// v[i..j-1] share the same UMI
ecs.resize(0);
uint32_t counts = 0;
for (size_t k = i; k < j; k++) {
ecs.push_back(v[k].ec);
counts += v[k].count;
}

intersect_genes_of_ecs(ecs, ec2genes, glist);
if (glist.size() == 1) {
//Fill in histograms for prediction.
if (glist[0] < n_genes) { //crasches with an invalid gene file otherwise
hist[glist[0] * histmax + std::min(counts - 1, histmax - 1)] += 1.0; //histmax-1 since histograms[g][0] is the histogram value for 1 copy and so forth
}
else {
std::cerr << "Mismatch between gene file and bus file, the bus file contains gene indices that is outside the gene range!\n";
}
}
i = j; // increment
}
};

for (const auto& infn : opt.files) {
std::streambuf* inbuf;
std::ifstream inf;
if (!opt.stream_in) {
inf.open(infn.c_str(), std::ios::binary);
inbuf = inf.rdbuf();
}
else {
inbuf = std::cin.rdbuf();
}
std::istream in(inbuf);

parseHeader(in, h);
bclen = h.bclen;

int rc = 0;
while (true) {
in.read((char*)p, N * sizeof(BUSData));
size_t rc = in.gcount() / sizeof(BUSData);
//std::cout << "rc: " << rc << "\n";
nr += rc;
if (rc == 0) {
break;
}


for (size_t i = 0; i < rc; i++) {
if (p[i].barcode != current_bc) {
// output whatever is in v
if (!v.empty()) {
write_barcode(v);
}
v.clear();
current_bc = p[i].barcode;
}
v.push_back(p[i]);

}
}
if (!v.empty()) {
write_barcode(v);
}

if (!opt.stream_in) {
inf.close();
}
}
delete[] p; p = nullptr;



// write genes file
writeGenes(gene_ofn, genenames);

//write histogram files
for (std::size_t f = 0; f < clusterNames.size(); ++f) {

auto& histograms = clusterHistograms[f];
std::string hist_ofn = histDir + clusterNames[f] + ".txt";

std::ofstream histof;
histof.open(hist_ofn);

for (std::size_t g = 0; g < genenames.size(); ++g) {
//Indexed as gene*histmax + histIndex
std::size_t offs = g * histmax;

//first figure out the length of the histogram, don't write that to make the file smaller
std::size_t histEnd = histmax - 1;
for (; histEnd != 0; --histEnd) {
if (histograms[offs + histEnd] != 0) {
break;
}
}
for (size_t c = 0; c <= histEnd; ++c) {
if (c != 0) {
histof << '\t';
}
histof << histograms[offs + c];
}

histof << "\n";
}
histof.close();
}


//std::cerr << "bad counts = " << bad_count <<", rescued =" << rescued << ", compacted = " << compacted << std::endl;

//std::cerr << "Read in " << nr << " BUS records" << std::endl;
}
3 changes: 3 additions & 0 deletions src/bustools_clusterhist.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#include "Common.hpp"

void bustools_clusterhist(Bustools_opt &opt);
Loading