diff --git a/news.org b/news.org index 31dc2ed..50ebf90 100644 --- a/news.org +++ b/news.org @@ -1,4 +1,12 @@ #+title: News and Changes +* v0.6.0 +- add =BcfReader::getStatus= +- fix =BcfReader::getVariantsCount= +- improve memory safety + +* v0.5.2 +- throw errors if a query region doesn't exist + * v0.5.1 - add =BcfHeader::updateSamples= diff --git a/test/bcf-reader.cpp b/test/bcf-reader.cpp index 78fe96c..11fe014 100644 --- a/test/bcf-reader.cpp +++ b/test/bcf-reader.cpp @@ -1,5 +1,6 @@ #include "../vcfpp.h" #include "catch.hh" +#include using namespace vcfpp; using namespace std; @@ -68,7 +69,7 @@ TEST_CASE("parse PL in vcf - vector", "[bcf-reader]") { string vcffile{"test-GL.vcf.gz"}; BcfWriter bw(vcffile, "VCF4.2"); - bw.header.addContig("chr20"); + bw.header.addContig("chr20"); bw.header.addINFO("AF", "A", "Float", "Estimated allele frequency in the range (0,1)"); bw.header.addFORMAT("GT", "1", "String", "Genotype"); bw.header.addFORMAT("PL", "G", "Integer", "List of Phred-scaled genotype likelihoods"); @@ -78,8 +79,8 @@ TEST_CASE("parse PL in vcf - vector", "[bcf-reader]") BcfReader br(vcffile); BcfRecord var(br.header); vector pl; - REQUIRE(br.getNextVariant(var)==true); - var.getFORMAT("PL",pl); + REQUIRE(br.getNextVariant(var) == true); + var.getFORMAT("PL", pl); for(auto g : pl) cout << g << endl; } @@ -87,18 +88,19 @@ TEST_CASE("parse GL in vcf - vector", "[bcf-reader]") { string vcffile{"test-GL.vcf.gz"}; BcfWriter bw(vcffile, "VCF4.2"); - bw.header.addContig("chr20"); + bw.header.addContig("chr20"); bw.header.addINFO("AF", "A", "Float", "Estimated allele frequency in the range (0,1)"); bw.header.addFORMAT("GT", "1", "String", "Genotype"); bw.header.addFORMAT("GL", "G", "Float", "List of log scale genotype likelihoods"); for(auto & s : {"id01", "id02"}) bw.header.addSample(s); // add 3 samples - bw.writeLine("chr20\t2006060\trs146931526\tG\tA\t100\tPASS\tAF=0.02\tGT:GL\t0/1:-323.03,-99.29,-802.53\t1/1:-133.03,-299.29,-902.53"); + bw.writeLine("chr20\t2006060\trs146931526\tG\tA\t100\tPASS\tAF=0.02\tGT:GL\t0/" + "1:-323.03,-99.29,-802.53\t1/1:-133.03,-299.29,-902.53"); bw.close(); BcfReader br(vcffile); BcfRecord var(br.header); vector gl; - REQUIRE(br.getNextVariant(var)==true); - var.getFORMAT("GL",gl); + REQUIRE(br.getNextVariant(var) == true); + var.getFORMAT("GL", gl); for(auto g : gl) cout << g << endl; } @@ -106,7 +108,7 @@ TEST_CASE("parse vcf with multialleles - vector", "[bcf-reader]") { string vcffile{"test-multialleles.vcf.gz"}; BcfWriter bw(vcffile, "VCF4.2"); - bw.header.addContig("chr20"); + bw.header.addContig("chr20"); bw.header.addINFO("AF", "A", "Float", "Estimated allele frequency in the range (0,1)"); bw.header.addFORMAT("GT", "1", "String", "Genotype"); for(auto & s : {"id01", "id02", "id03"}) bw.header.addSample(s); // add 3 samples @@ -116,7 +118,7 @@ TEST_CASE("parse vcf with multialleles - vector", "[bcf-reader]") BcfReader br("test-multialleles.vcf.gz"); BcfRecord var(br.header); vector gt; - REQUIRE(br.getNextVariant(var)==true); + REQUIRE(br.getNextVariant(var) == true); auto l2 = var.asString(); REQUIRE(l2 == l1 + "\n"); var.getGenotypes(gt); @@ -127,7 +129,7 @@ TEST_CASE("parse EV in vcf - vector", "[bcf-reader]") { string vcffile{"test-GL.vcf.gz"}; BcfWriter bw(vcffile, "VCF4.2"); - bw.header.addContig("chr20"); + bw.header.addContig("chr20"); bw.header.addINFO("AF", "A", "Float", "Estimated allele frequency in the range (0,1)"); bw.header.addFORMAT("GT", "1", "String", "Genotype"); bw.header.addFORMAT("EV", "G", "String", "Classes of evidence supporting final genotype"); @@ -137,10 +139,10 @@ TEST_CASE("parse EV in vcf - vector", "[bcf-reader]") BcfReader br(vcffile); BcfRecord var(br.header); vector ev; - REQUIRE(br.getNextVariant(var)==true); - var.getFORMAT("EV",ev); - REQUIRE(ev[0]=="RD"); - REQUIRE(ev[1]=="SR,PE"); + REQUIRE(br.getNextVariant(var) == true); + var.getFORMAT("EV", ev); + REQUIRE(ev[0] == "RD"); + REQUIRE(ev[1] == "SR,PE"); } TEST_CASE("throw error when file is not valid", "[bcf-reader]") @@ -150,3 +152,43 @@ TEST_CASE("throw error when file is not valid", "[bcf-reader]") BcfReader bw; CHECK_THROWS(bw.open("ff://no-access.vcf.gz")); } + +TEST_CASE("throw error if the region is not valid", "[bcf-reader]") +{ + BcfReader * br = nullptr; + try + { + br = new BcfReader("test.vcf.gz", "XXXX"); + delete br; + } + catch(exception & e) + { + cout << e.what(); + } +} + +TEST_CASE("can check the status of a region", "[bcf-reader]") +{ + int status; + BcfReader br("test.vcf.gz"); + status = br.getStatus("chr21:5030089-5030090"); + REQUIRE(status == 0); // valid but empty + status = br.getStatus("chr21:5030089-"); + REQUIRE(status == 1); // valid and not empty + status = br.getStatus("chr22:5030089"); + REQUIRE(status == -2); // not valid or not found in the VCF + BcfReader br2("test-no-index.vcf.gz"); + status = br2.getStatus("chr21"); + REQUIRE(status == -1); // no index file found +} + +TEST_CASE("can count the number of variants in a valid region", "[bcf-reader]") +{ + // BcfReader br("test.vcf.gz", "chr21:5030089-5030090"); + int nVariants = -1; + BcfReader br("test.vcf.gz"); + nVariants = br.getVariantsCount("chr21:5030089-5030090"); + REQUIRE(nVariants == 0); + nVariants = br.getVariantsCount("chr21:5030089-"); + REQUIRE(nVariants == 13); +} diff --git a/test/test-no-index.vcf.gz b/test/test-no-index.vcf.gz new file mode 100644 index 0000000..9438b0c Binary files /dev/null and b/test/test-no-index.vcf.gz differ diff --git a/vcfpp.h b/vcfpp.h index fd9acf5..8d12822 100644 --- a/vcfpp.h +++ b/vcfpp.h @@ -2,7 +2,7 @@ * @file https://github.com/Zilong-Li/vcfpp/vcfpp.h * @author Zilong Li * @email zilong.dk@gmail.com - * @version v0.5.2 + * @version v0.6.0 * @breif a single C++ file for manipulating VCF * Copyright (C) 2022-2023.The use of this code is governed by the LICENSE file. ******************************************************************************/ @@ -49,6 +49,7 @@ // make sure you have htslib installed extern "C" { +# include # include # include # include @@ -94,6 +95,9 @@ using isFormatVector = typename std::enable_if>::value, bool>::type; +namespace details +{ + template isScalar isMissing(T const & v) { @@ -151,7 +155,7 @@ inline std::vector split_string(const std::string & s, const std::s } // deleter for htsFile -struct htsFile_close +struct hts_file_close { void operator()(htsFile * x) { @@ -159,8 +163,35 @@ struct htsFile_close } }; +// deleter for hts iterator +struct hts_iter_close +{ + void operator()(hts_itr_t * x) + { + if(x) hts_itr_destroy(x); + } +}; + +// deleter for hts idx +struct hts_idx_close +{ + void operator()(hts_idx_t * x) + { + if(x) hts_idx_destroy(x); + } +}; + +// deleter for tabix idx +struct tabix_idx_close +{ + void operator()(tbx_t * x) + { + if(x) tbx_destroy(x); + } +}; + // deleter for variant -struct variant_close +struct bcf_line_close { void operator()(bcf1_t * x) { @@ -177,6 +208,8 @@ struct bcf_hdr_close } }; +} // namespace details + /** * @class BcfHeader * @brief Object represents header of the VCF, offering methods to access and modify the tags @@ -380,7 +413,7 @@ class BcfHeader * */ void updateSamples(const std::string & samples) { - auto ss = split_string(samples, ","); + auto ss = details::split_string(samples, ","); const int nsamples = nSamples(); if(nsamples != (int)ss.size()) throw std::runtime_error("You provide either too few or too many samples"); @@ -466,7 +499,7 @@ class BcfRecord private: BcfHeader * header; - std::shared_ptr line = std::shared_ptr(bcf_init(), variant_close()); // variant + std::shared_ptr line = std::shared_ptr(bcf_init(), details::bcf_line_close()); // variant bcf_hdr_t * hdr_d = NULL; // a dup header by bcf_hdr_dup(header->hdr) bcf_fmt_t * fmt = NULL; bcf_info_t * info = NULL; @@ -1517,9 +1550,9 @@ class BcfReader { private: std::shared_ptr fp; // hts file - hts_idx_t * hidx = NULL; // hts index file - tbx_t * tidx = NULL; // .tbi .csi index file for vcf files - hts_itr_t * itr = NULL; // tabix iterator + std::shared_ptr hidx; // hts index file + std::shared_ptr tidx; // .tbi .csi index file for vcf files + std::shared_ptr itr; // hts iterator kstring_t s = {0, 0, NULL}; // kstring std::string fname; bool isBcf; // if the input file is bcf or vcf; @@ -1529,7 +1562,7 @@ class BcfReader BcfHeader header; /// number of samples in the VCF int nsamples; - /// number of samples in the VCF + /// a vector of samples name in the VCF std::vector SamplesName; /// Construct an empty BcfReader @@ -1575,24 +1608,25 @@ class BcfReader ~BcfReader() { - close(); + if(s.s) free(s.s); } - /// close the BcfReader object. + /// Close the VCF file and its associated files void close() { if(fp) fp.reset(); - if(itr) hts_itr_destroy(itr); - if(hidx) hts_idx_destroy(hidx); - if(tidx) tbx_destroy(tidx); - if(s.s) free(s.s); + if(hidx) hidx.reset(); + if(itr) itr.reset(); + if(tidx) tidx.reset(); } - /// open a VCF/BCF/STDIN file for streaming in + /// Open a VCF/BCF/STDIN file for streaming in void open(const std::string & file) { + if(!fname.empty() && fname != file) + throw std::runtime_error("does not support re-open a file yet. " + fname); fname = file; - fp = std::shared_ptr(hts_open(file.c_str(), "r"), htsFile_close()); + fp = std::shared_ptr(hts_open(fname.c_str(), "r"), details::hts_file_close()); if(!fp) throw std::invalid_argument("I/O error: input file is invalid"); header.hdr = bcf_hdr_read(fp.get()); nsamples = bcf_hdr_nsamples(header.hdr); @@ -1611,12 +1645,44 @@ class BcfReader return header; } - /** @brief get the number of records of given region */ - uint64_t getVariantsCount(BcfRecord & r, const std::string & region) + /** + * @brief query the status of a given region in the VCF + * @return -2: the region is not a valid bcftools-like format, + or it is not presenting in the VCF even though it's bcftols-like format. + -1: there is no index file found. + 0: the region is valid but empty. + 1: vaild and not empty. + */ + int getStatus(const std::string & region) + { + try + { + setRegion(region); + BcfRecord v(header); + if(!getNextVariant(v)) return 0; + } + catch(const std::invalid_argument & e) + { + return -1; + } + catch(const std::runtime_error & e) + { + return -2; + } + return 1; + } + + /** + * @brief count the number of variants by iterating through a given region. + * @note If you want to continue work on that region, remember to reset the region by setRegion! + Also, check the status of the region first to handle the different cases + */ + int getVariantsCount(const std::string & region) { - uint64_t c{0}; - while(getNextVariant(r)) c++; - setRegion(region); // reset the region + int c{0}; + setRegion(region); + BcfRecord v(header); + while(getNextVariant(v)) c++; return c; } @@ -1633,54 +1699,61 @@ class BcfReader } /** - * @brief explicitly stream to specific region - * @param region the string is samtools-like format which is chr:start-end + * @brief explicitly stream to specific region. throw invalid_argument error if index file not found. + * throw runtime_error if the region was not a valid bcftools-like format or was not presenting in the + * VCF. + * @param region the string for region is samtools-like format, which can be 'chr', 'chr:start' and + * 'chr:start-end' * */ void setRegion(const std::string & region) { // 1. check and load index first // 2. query iterval region // 3. if region is empty, use "." - if(isEndWith(fname, "bcf") || isEndWith(fname, "bcf.gz")) + if(details::isEndWith(fname, "bcf") || details::isEndWith(fname, "bcf.gz")) { isBcf = true; - hidx = bcf_index_load(fname.c_str()); - if(itr) bcf_itr_destroy(itr); // reset current region. + hidx = std::shared_ptr(bcf_index_load(fname.c_str()), details::hts_idx_close()); + if(itr) itr.reset(); // reset current region. if(region.empty()) - itr = bcf_itr_querys(hidx, header.hdr, "."); + itr = std::shared_ptr(bcf_itr_querys(hidx.get(), header.hdr, "."), + details::hts_iter_close()); else - itr = bcf_itr_querys(hidx, header.hdr, region.c_str()); + itr = std::shared_ptr(bcf_itr_querys(hidx.get(), header.hdr, region.c_str()), + details::hts_iter_close()); } else { isBcf = false; - tidx = tbx_index_load(fname.c_str()); - if(tidx == NULL) throw std::runtime_error("error in loading tabix index!"); - if(itr) tbx_itr_destroy(itr); // reset current region. + tidx = std::shared_ptr(tbx_index_load(fname.c_str()), details::tabix_idx_close()); + if(tidx.get() == NULL) throw std::invalid_argument(" no tabix index found!"); + if(itr) itr.reset(); // reset if(region.empty()) - itr = tbx_itr_querys(tidx, "."); + itr = std::shared_ptr(tbx_itr_querys(tidx.get(), "."), details::hts_iter_close()); else - itr = tbx_itr_querys(tidx, region.c_str()); - if(itr == NULL) throw std::runtime_error("no interval region found!"); + itr = std::shared_ptr(tbx_itr_querys(tidx.get(), region.c_str()), + details::hts_iter_close()); } + if(itr.get() == NULL) + throw std::runtime_error("region was not found! make sure the region format is correct"); } /** @brief read in the next variant - * @param r r is a BcfRecord object to be filled in. */ + * @param r the BcfRecord object to be filled in. */ bool getNextVariant(BcfRecord & r) { int ret = -1; - if(itr != NULL) + if(itr.get() != NULL) { if(isBcf) { - ret = bcf_itr_next(fp.get(), itr, r.line.get()); + ret = bcf_itr_next(fp.get(), itr.get(), r.line.get()); bcf_unpack(r.line.get(), BCF_UN_ALL); return (ret >= 0); } else { - int slen = tbx_itr_next(fp.get(), tidx, itr, &s); + int slen = tbx_itr_next(fp.get(), tidx.get(), itr.get(), &s); if(slen > 0) { ret = vcf_parse1(&s, r.header->hdr, r.line.get()); // ret > 0, error @@ -1707,7 +1780,7 @@ class BcfWriter { private: std::shared_ptr fp; // hts file - std::shared_ptr b = std::shared_ptr(bcf_init(), variant_close()); // variant + std::shared_ptr b = std::shared_ptr(bcf_init(), details::bcf_line_close()); // variant int ret; bool isHeaderWritten = false; const BcfHeader * hp; @@ -1783,8 +1856,8 @@ class BcfWriter */ void open(const std::string & fname) { - auto mode = getMode(fname, "w"); - fp = std::shared_ptr(hts_open(fname.c_str(), mode.c_str()), htsFile_close()); + auto mode = details::getMode(fname, "w"); + fp = std::shared_ptr(hts_open(fname.c_str(), mode.c_str()), details::hts_file_close()); if(!fp) throw std::invalid_argument("I/O error: input file is invalid"); } @@ -1799,7 +1872,7 @@ class BcfWriter */ void open(const std::string & fname, const std::string & mode) { - fp = std::shared_ptr(hts_open(fname.c_str(), mode.c_str()), htsFile_close()); + fp = std::shared_ptr(hts_open(fname.c_str(), mode.c_str()), details::hts_file_close()); if(!fp) throw std::invalid_argument("I/O error: input file is invalid"); } @@ -1872,7 +1945,7 @@ class BcfWriter } /// streams out the given variant of BcfRecord type - inline bool writeRecord(BcfRecord & v) + bool writeRecord(BcfRecord & v) { if(!isHeaderWritten) writeHeader(); if(bcf_write(fp.get(), v.header->hdr, v.line.get()) < 0) return false;