Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove VariantContexts from most fgbio tools #836

Open
wants to merge 7 commits into
base: ks_remove_variantcontextsetbuilder
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
package com.fulcrumgenomics.bam

import java.util

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.SamSource
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
Expand All @@ -36,6 +35,7 @@ import com.fulcrumgenomics.fasta.SequenceDictionary
import com.fulcrumgenomics.sopt._
import com.fulcrumgenomics.util.Metric.Count
import com.fulcrumgenomics.util.{Metric, ProgressLogger, Rscript, Sequences}
import com.fulcrumgenomics.vcf.api.VcfSource
import com.fulcrumgenomics.vcf.{ByIntervalListVariantContextIterator, VariantMask}
import htsjdk.samtools.SAMRecord
import htsjdk.samtools.filter.{DuplicateReadFilter, FailsVendorReadQualityFilter, SamRecordFilter, SecondaryOrSupplementaryFilter}
Expand Down Expand Up @@ -197,9 +197,9 @@ class ErrorRateByReadPosition
case None =>
new VariantMask(Iterator.empty, dict)
case Some(path) =>
val reader = new VCFFileReader(path.toFile, this.intervals.isDefined)
val reader = VcfSource(path)
intervals match {
case None => new VariantMask(reader.iterator(), dict)
case None => new VariantMask(reader.iterator, dict)
case Some(i) => new VariantMask(ByIntervalListVariantContextIterator(reader, i), dict)
}
}
Expand Down
155 changes: 81 additions & 74 deletions src/main/scala/com/fulcrumgenomics/vcf/AssessPhasing.scala

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,10 @@
package com.fulcrumgenomics.vcf

import java.util.NoSuchElementException

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.fasta.SequenceDictionary
import com.fulcrumgenomics.vcf.api.{Variant, VcfSource}
import htsjdk.samtools.util._
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.vcf.VCFFileReader

import scala.annotation.tailrec

Expand All @@ -42,9 +40,9 @@ object ByIntervalListVariantContextIterator {
*
* All variants will be read in and compared to the intervals.
*/
def apply(iterator: Iterator[VariantContext],
def apply(iterator: Iterator[Variant],
intervalList: IntervalList,
dict: SequenceDictionary): Iterator[VariantContext] = {
dict: SequenceDictionary): Iterator[Variant] = {
new OverlapDetectionVariantContextIterator(iterator, intervalList, dict)
}

Expand All @@ -56,29 +54,29 @@ object ByIntervalListVariantContextIterator {
* The VCF will be queried when moving to the next variant context, and so may be quite slow
* if we jump around the VCF a lot.
*/
def apply(reader: VCFFileReader,
intervalList: IntervalList): Iterator[VariantContext] = {
def apply(reader: VcfSource,
intervalList: IntervalList): Iterator[Variant] = {
new IndexQueryVariantContextIterator(reader, intervalList)
}
}

private class OverlapDetectionVariantContextIterator(val iter: Iterator[VariantContext],
private class OverlapDetectionVariantContextIterator(val iter: Iterator[Variant],
val intervalList: IntervalList,
val dict: SequenceDictionary)
extends Iterator[VariantContext] {
extends Iterator[Variant] {

require(dict != null)
private val intervals = intervalList.uniqued(false).iterator().buffered
private var nextVariantContext: Option[VariantContext] = None
private var nextVariant: Option[Variant] = None

this.advance()

def hasNext: Boolean = this.nextVariantContext.isDefined
def hasNext: Boolean = this.nextVariant.isDefined

def next(): VariantContext = {
this.nextVariantContext match {
def next(): Variant = {
this.nextVariant match {
case None => throw new NoSuchElementException("Called next when hasNext is false")
case Some(ctx) => yieldAndThen(ctx) { this.nextVariantContext = None; this.advance() }
case Some(ctx) => yieldAndThen(ctx) { this.nextVariant = None; this.advance() }
}
}

Expand Down Expand Up @@ -108,16 +106,16 @@ private class OverlapDetectionVariantContextIterator(val iter: Iterator[VariantC
}

if (intervals.isEmpty) { } // no more intervals
else if (overlaps(ctx, intervals.head)) { nextVariantContext = Some(ctx) } // overlaps!!!
else if (overlaps(ctx, intervals.head)) { nextVariant = Some(ctx) } // overlaps!!!
else if (iter.isEmpty) { } // no more variants
else { this.advance() } // move to the next context
}
}

/** NB: if a variant overlaps multiple intervals, only returns it once. */
private class IndexQueryVariantContextIterator(private val reader: VCFFileReader, intervalList: IntervalList)
extends Iterator[VariantContext] {
private var iter: Option[Iterator[VariantContext]] = None
private class IndexQueryVariantContextIterator(private val reader: VcfSource, intervalList: IntervalList)
extends Iterator[Variant] {
private var iter: Option[Iterator[Variant]] = None
private val intervals = intervalList.iterator()
private var previousInterval: Option[Interval] = None

Expand All @@ -127,7 +125,7 @@ private class IndexQueryVariantContextIterator(private val reader: VCFFileReader
this.iter.exists(_.hasNext)
}

def next(): VariantContext = {
def next(): Variant = {
this.iter match {
case None => throw new NoSuchElementException("Called next when hasNext is false")
case Some(i) => yieldAndThen(i.next()) { advance() }
Expand All @@ -136,10 +134,10 @@ private class IndexQueryVariantContextIterator(private val reader: VCFFileReader

def remove(): Unit = throw new UnsupportedOperationException

private def overlapsInterval(ctx: VariantContext, interval: Interval): Boolean = {
if (!ctx.getContig.equals(interval.getContig)) false // different contig
else if (interval.getStart <= ctx.getStart && ctx.getStart <= interval.getEnd) true // start falls within this interval, count it
else if (ctx.getStart < interval.getStart && interval.getEnd <= ctx.getEnd) true // the variant encloses the interval
private def overlapsInterval(variant: Variant, interval: Interval): Boolean = {
if (!variant.getContig.equals(interval.getContig)) false // different contig
else if (interval.getStart <= variant.getStart && variant.getStart <= interval.getEnd) true // start falls within this interval, count it
else if (variant.getStart < interval.getStart && interval.getEnd <= variant.getEnd) true // the variant encloses the interval
else false
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@
package com.fulcrumgenomics.vcf

import com.fulcrumgenomics.fasta.SequenceDictionary
import htsjdk.variant.variantcontext.{VariantContext, VariantContextComparator}
import com.fulcrumgenomics.vcf.api.Variant

object JointVariantContextIterator {
def apply(iters: Seq[Iterator[VariantContext]],
def apply(iters: Seq[Iterator[Variant]],
dict: SequenceDictionary
): JointVariantContextIterator = {
new JointVariantContextIterator(
Expand All @@ -37,8 +37,8 @@ object JointVariantContextIterator {
)
}

def apply(iters: Seq[Iterator[VariantContext]],
comp: VariantContextComparator
def apply(iters: Seq[Iterator[Variant]],
comp: VariantComparator
): JointVariantContextIterator = {
new JointVariantContextIterator(
iters=iters,
Expand All @@ -51,25 +51,25 @@ object JointVariantContextIterator {
* Iterates over multiple variant context iterators such that we return a list of contexts for the union of sites
* across the iterators. If samples is given, we subset each variant context to just that sample.
*/
class JointVariantContextIterator private(iters: Seq[Iterator[VariantContext]],
dictOrComp: Either[SequenceDictionary, VariantContextComparator]
class JointVariantContextIterator private(iters: Seq[Iterator[Variant]],
dictOrComp: Either[SequenceDictionary, VariantComparator]
)
extends Iterator[Seq[Option[VariantContext]]] {
import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceDictionary
extends Iterator[Seq[Option[Variant]]] {

if (iters.isEmpty) throw new IllegalArgumentException("No iterators given")

private val iterators = iters.map(_.buffered)
private val comparator = dictOrComp match {
case Left(dict) => new VariantContextComparator(dict.asSam)
case Left(dict) => VariantComparator(dict)
case Right(comp) => comp
}

def hasNext: Boolean = iterators.exists(_.nonEmpty)

def next(): Seq[Option[VariantContext]] = {
def next(): Seq[Option[Variant]] = {
val minCtx = iterators.filter(_.nonEmpty).map(_.head).sortWith {
case (left: VariantContext, right: VariantContext) => comparator.compare(left, right) < 0
// this just checks that the variants are the the same position? Shouldn't be difficult to replace.
case (left: Variant, right: Variant) => comparator.compare(left, right) < 0
}.head
// TODO: could use a TreeSet to store the iterators, examine the head of each iterator, then pop the iterator with the min,
// and add that iterator back in.
Expand All @@ -79,3 +79,21 @@ extends Iterator[Seq[Option[VariantContext]]] {
}
}
}

private object VariantComparator {
def apply(dict: SequenceDictionary): VariantComparator = {
new VariantComparator(dict)
}
}

/** A class for comparing Variants using a sequence dictionary */
private class VariantComparator(dict: SequenceDictionary) {
/** Function for comparing two variants. Returns negative if left < right, and positive if right > left
* To mimic the VariantContextComparator, throws an exception if the contig isn't found. */
def compare(left: Variant, right: Variant): Int = {
val idx1 = this.dict(name=left.chrom).index
val idx2 = this.dict(name=right.chrom).index
if (idx1 - idx2 == 0) left.pos - right.pos
else idx1 - idx2
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@
package com.fulcrumgenomics.vcf

import java.util

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.util.Io
import com.fulcrumgenomics.vcf.MakeTwoSampleMixtureVcf._
import com.fulcrumgenomics.sopt._
import com.fulcrumgenomics.vcf.api.VcfSource
import htsjdk.samtools.util.IntervalList
import htsjdk.variant.variantcontext._
import htsjdk.variant.vcf.{VCFFilterHeaderLine, _}
Expand Down Expand Up @@ -165,9 +165,13 @@ class MakeTwoSampleMixtureVcf

/** Generates an iterator over the whole file, or over the intervals if provided. */
def iterator(in: VCFFileReader, intervals: Option[PathToIntervals]): Iterator[VariantContext] = {
// val tmpFile = Io.makeTempFile("temp patch", ".vcf")
// MakeMixtureVcf.makeWriter(tmpFile, in.getFileHeader, lines=Seq.empty, samples=in.getFileHeader.getSampleNamesInOrder.toSeq:_*)
// val tmpIn = VcfSource(tmpFile)
intervals match {
case None => in.iterator()
case Some(path) => ByIntervalListVariantContextIterator(in, IntervalList.fromFile(path.toFile).uniqued(false))
case Some(path) => in.iterator()
// case Some(path) => ByIntervalListVariantContextIterator(in, IntervalList.fromFile(path.toFile).uniqued(false))
}
}
}
28 changes: 14 additions & 14 deletions src/main/scala/com/fulcrumgenomics/vcf/UpdateVcfContigNames.scala
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.fasta.SequenceDictionary
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.{Io, ProgressLogger}
import com.fulcrumgenomics.vcf.api.{VcfContigHeader, VcfHeader, VcfSource, VcfWriter}
import htsjdk.variant.variantcontext.VariantContextBuilder
import htsjdk.variant.variantcontext.writer.{Options, VariantContextWriterBuilder}
import htsjdk.variant.vcf.{VCFFileReader, VCFHeader}
Expand All @@ -55,20 +56,19 @@ class UpdateVcfContigNames

override def execute(): Unit = {
val dict = SequenceDictionary(this.dict)
val reader = new VCFFileReader(this.input)
val reader = VcfSource(this.input)
val header = {
import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceDictionary
val h: VCFHeader = new VCFHeader(reader.getFileHeader)
h.setSequenceDictionary(dict.asSam)
h
VcfHeader(
contigs=dict.map{l => VcfContigHeader(index=l.index, name=l.name, length=Some(l.length))}.toIndexedSeq,
infos=reader.header.infos,
formats=reader.header.formats,
filters=reader.header.filters,
others=reader.header.others,
samples=reader.header.samples
)
}
val writer = {
new VariantContextWriterBuilder()
.setOutputPath(this.output)
.setOption(Options.INDEX_ON_THE_FLY)
.build()
}
writer.writeHeader(header)

val writer = VcfWriter(this.output, header)

// go through all the records
val progress = ProgressLogger(logger, noun = "variants", verb = "written")
Expand All @@ -78,9 +78,9 @@ class UpdateVcfContigNames
if (skipMissing) logger.warning(s"Did not find contig ${v.getContig} in the sequence dictionary.")
else throw new IllegalStateException(s"Did not find contig ${v.getContig} in the sequence dictionary.")
case Some(info) =>
val newV = new VariantContextBuilder(v).chr(info.name).make()
val newV = v.copy(chrom=info.name)
progress.record(newV.getContig, newV.getStart)
writer.add(newV)
writer.write(newV)
}
}
progress.logLast()
Expand Down
19 changes: 9 additions & 10 deletions src/main/scala/com/fulcrumgenomics/vcf/VariantMask.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,25 +26,24 @@ package com.fulcrumgenomics.vcf

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.fasta.SequenceDictionary
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.vcf.VCFFileReader
import com.fulcrumgenomics.vcf.api.{Variant, VcfSource}

import scala.collection.mutable

object VariantMask {
/** Generate a variant mask from the provided VCF. */
def apply(path: PathToVcf): VariantMask = apply(new VCFFileReader(path.toFile))
def apply(path: PathToVcf): VariantMask = apply(VcfSource(path))

/** Generate a variant mask from the provided VCF reader. */
def apply(reader: VCFFileReader): VariantMask = {
import com.fulcrumgenomics.fasta.Converters.FromSAMSequenceDictionary
val dict = reader.getFileHeader.getSequenceDictionary.fromSam
def apply(reader: VcfSource): VariantMask = {
val dict = reader.header.dict
// val dict = reader.getFileHeader.getSequenceDictionary.fromSam
require(dict != null && dict.length > 0, "Generating a VariantMask requires VCFs contain contig lines.")
apply(reader.iterator(), dict)
apply(reader.iterator, dict)
}

/** Generates a VariantMask from the variants in the provided iterator. */
def apply(variants: Iterator[VariantContext], dict: SequenceDictionary) = new VariantMask(variants, dict)
def apply(variants: Iterator[Variant], dict: SequenceDictionary) = new VariantMask(variants, dict)
}

/**
Expand All @@ -55,7 +54,7 @@ object VariantMask {
* @param variants a coordinate sorted iterator of variants
* @param dict the sequence dictionary for the reference
*/
class VariantMask(variants: Iterator[VariantContext], val dict: SequenceDictionary) {
class VariantMask(variants: Iterator[Variant], val dict: SequenceDictionary) {
private val iterator = variants.bufferBetter
private var currentMask: mutable.BitSet = new mutable.BitSet(0)
private var currentIndex: Int = -1
Expand All @@ -74,7 +73,7 @@ class VariantMask(variants: Iterator[VariantContext], val dict: SequenceDictiona
val bits = new mutable.BitSet(ref.length)
iterator.dropWhile(v => dict(v.getContig).index < refIndex)
.takeWhile(v => dict(v.getContig).index == refIndex)
.filterNot(v => v.isFiltered)
// .filterNot(v => v.isFiltered) TODO fix this
.foreach { v =>
forloop (from=v.getStart, until=v.getEnd+1) { i => bits(i-1) = true }
}
Expand Down
Loading