diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/Alphabet.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/Alphabet.scala
index 63d80619a0..0daa117564 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/models/Alphabet.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/Alphabet.scala
@@ -20,10 +20,7 @@ package org.bdgenomics.adam.models
 import scala.util.Try
 
 /**
- * Created by bryan on 4/17/15.
- *
  * An alphabet of symbols and related operations
- *
  */
 trait Alphabet {
 
@@ -45,11 +42,16 @@ trait Alphabet {
     symbols.flatMap(symbol => Seq(symbol.label.toLower -> symbol, symbol.label.toUpper -> symbol)).toMap
 
   /**
+   * Reverses the string and complements each residue.
+   *
+   * Fails if a residue has no complement.
    *
    * @param s Each char in this string represents a symbol on the alphabet.
    *          If the char is not in the alphabet then a NoSuchElementException is thrown
    * @return the reversed complement of the given string.
    * @throws IllegalArgumentException if the string contains a symbol which is not in the alphabet
+   *
+   * @see reverseComplement
    */
   def reverseComplementExact(s: String): String = {
     reverseComplement(
@@ -59,18 +61,26 @@ trait Alphabet {
   }
 
   /**
+   * Reverses the string and complements each residue.
+   *
+   * If a residue has no known complement, that residue is replaced with a
+   * placeholder "not-found" value.
    *
    * @param s Each char in this string represents a symbol on the alphabet.
    * @param notFound If the char is not in the alphabet then this function is called.
    *                 default behavior is to return a new Symbol representing the unknown character,
    *                 so that the unknown char is treated as the complement
    * @return the reversed complement of the given string.
+   *
+   * @see reverseComplementExact
    */
   def reverseComplement(s: String, notFound: (Char => Symbol) = ((c: Char) => Symbol(c, c))) = {
     s.map(x => Try(apply(x)).getOrElse(notFound(x)).complement).reverse
   }
 
-  /** number of symbols in the alphabet */
+  /**
+   * The number of symbols in the alphabet.
+   */
   def size = symbols.size
 
   /**
@@ -78,11 +88,11 @@ trait Alphabet {
    * @return the given symbol
    */
   def apply(c: Char): Symbol = symbolMap(c)
-
 }
 
 /**
- * A symbol in an alphabet
+ * A symbol in an alphabet.
+ *
  * @param label a character which represents the symbol
 * @param complement acharacter which represents the complement of the symbol
  */
@@ -103,6 +113,9 @@ class DNAAlphabet extends Alphabet {
   )
 }
 
+/**
+ * Singleton object with references to all supported alphabets.
+ */
 object Alphabet {
   val dna = new DNAAlphabet
 }
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/Attribute.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/Attribute.scala
index 04cb5848df..fc4386f3b8 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/models/Attribute.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/Attribute.scala
@@ -18,17 +18,20 @@
 package org.bdgenomics.adam.models
 
 /**
- * A wrapper around the attrTuple (key) and value pair. Includes the attrTuple-type explicitly, rather than
- * embedding the corresponding information in the type of 'value', because otherwise it'd be difficult
- * to extract the correct type for Byte and NumericSequence values.
+ * A wrapper around the attrTuple (key) and value pair seen in many formats.
 *
- * Roughly analogous to Picards SAMTagAndValue.
+ * Includes the attrTuple-type explicitly, rather than embedding the
+ * corresponding information in the type of 'value', because otherwise it'd be
+ * difficult to extract the correct type for Byte and NumericSequence values.
+ *
+ * This class is roughly analogous to htsjdk's SAMTagAndValue.
 *
 * @param tag The string key associated with this pair.
 * @param tagType An enumerated value representing the type of the 'value' parameter.
 * @param value The 'value' half of the pair.
 */
 case class Attribute(tag: String, tagType: TagType.Value, value: Any) {
+
   override def toString: String = {
     val byteSequenceTypes = Array(TagType.NumericByteSequence, TagType.NumericUnsignedByteSequence)
     val intSequenceTypes = Array(TagType.NumericIntSequence, TagType.NumericUnsignedIntSequence)
@@ -47,25 +50,82 @@ case class Attribute(tag: String, tagType: TagType.Value, value: Any) {
   }
 }
 
+/**
+ * An enumeration that describes the different data types that can be stored in
+ * an attribute.
+ */
 object TagType extends Enumeration {
+
+  /**
+   * A representation of the type of data stored in a tagged field.
+   *
+   * @param abbreviation A string describing the data type underlying the
+   *   attribute. The string values that are stored with the attribute come from
+   *   the SAM file format spec: http://samtools.sourceforge.net/SAMv1.pdf
+   */
   class TypeVal(val abbreviation: String) extends Val(nextId, abbreviation) {
     override def toString(): String = abbreviation
   }
-  def TypeValue(abbreviation: String): Val = new TypeVal(abbreviation)
 
-  // These String values come from the SAM file format spec: http://samtools.sourceforge.net/SAMv1.pdf
+  private def TypeValue(abbreviation: String): Val = new TypeVal(abbreviation)
+
+  /**
+   * An attribute storing a character. SAM "A".
+   */
   val Character = TypeValue("A")
+
+  /**
+   * An attribute storing an integer. SAM "i".
+   */
   val Integer = TypeValue("i")
+
+  /**
+   * An attribute storing a floating point value. SAM "f".
+   */
   val Float = TypeValue("f")
+
+  /**
+   * An attribute storing a string. SAM "Z".
+   */
   val String = TypeValue("Z")
+
+  /**
+   * An attribute storing hex formatted bytes. SAM "H".
+   */
   val ByteSequence = TypeValue("H")
+
+  /**
+   * An attribute storing a numeric array of signed bytes. SAM "B:c".
+   */
   val NumericByteSequence = TypeValue("B:c")
+
+  /**
+   * An attribute storing a numeric array of signed ints. SAM "B:i".
+   */
   val NumericIntSequence = TypeValue("B:i")
+
+  /**
+   * An attribute storing a numeric array of signed short ints. SAM "B:s".
+   */
   val NumericShortSequence = TypeValue("B:s")
+
+  /**
+   * An attribute storing a numeric array of unsigned bytes. SAM "B:C".
+   */
   val NumericUnsignedByteSequence = TypeValue("B:C")
+
+  /**
+   * An attribute storing a numeric array of unsigned ints. SAM "B:I".
+   */
   val NumericUnsignedIntSequence = TypeValue("B:I")
+
+  /**
+   * An attribute storing a numeric array of unsigned short ints. SAM "B:S".
+   */
   val NumericUnsignedShortSequence = TypeValue("B:S")
-  val NumericFloatSequence = TypeValue("B:f")
 
+  /**
+   * An attribute storing a numeric array of floats. SAM "B:f".
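+   *
+   * An illustrative sketch (the tag name and values here are hypothetical,
+   * not from this codebase):
+   *
+   * {{{
+   * // a user-defined float-array attribute
+   * val attr = Attribute("XF", TagType.NumericFloatSequence, Array(0.5f, 1.5f))
+   * }}}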
+   */
+  val NumericFloatSequence = TypeValue("B:f")
 }
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/Coverage.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/Coverage.scala
index e414816d78..99ea6ff38c 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/models/Coverage.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/Coverage.scala
@@ -21,9 +21,9 @@ import org.apache.spark.rdd.RDD
 import org.bdgenomics.formats.avro.Feature
 
 /**
- * Converts from avro Feature to Coverage.
+ * Singleton object for converting from Avro Feature to Coverage.
 */
-object Coverage {
+private[adam] object Coverage {
 
   /**
    * Creates Coverage from ReferenceRegion and coverage count in that ReferenceRegion.
@@ -32,7 +32,7 @@ object Coverage {
    * @param count Coverage count for each base pair in region
    * @return Coverage spanning the specified ReferenceRegion
    */
-  private[adam] def apply(region: ReferenceRegion, count: Double): Coverage = {
+  def apply(region: ReferenceRegion, count: Double): Coverage = {
     Coverage(region.referenceName, region.start, region.end, count)
   }
 
@@ -42,8 +42,11 @@ object Coverage {
    * @param feature Feature to create coverage from
    * @return Coverage spanning the specified feature
    */
-  private[adam] def apply(feature: Feature): Coverage = {
-    Coverage(feature.getContigName, feature.getStart, feature.getEnd, feature.getScore)
+  def apply(feature: Feature): Coverage = {
+    Coverage(feature.getContigName,
+      feature.getStart,
+      feature.getEnd,
+      feature.getScore)
   }
 
   /**
@@ -52,20 +55,23 @@ object Coverage {
    * @param rdd RDD of Features to extract Coverage from
    * @return RDD of Coverage spanning all features in rdd
    */
-  private[adam] def apply(rdd: RDD[Feature]): RDD[Coverage] = {
+  def apply(rdd: RDD[Feature]): RDD[Coverage] = {
     rdd.map(f => Coverage(f))
   }
 }
 
 /**
  * Coverage record for CoverageRDD.
- * Contains Region indexed by contig name, start and end, as well as count of coverage at
- * each base pair in that region.
 *
- * @param contigName Specifies chromosomal location of coverage
- * @param start Specifies start position of coverage
- * @param end Specifies end position of coverage
- * @param count Specifies count of coverage at location
+ * Contains a region indexed by contig name, start, and end, as well as the
+ * average coverage at each base pair in that region.
+ *
+ * @param contigName The chromosome that this coverage was observed on.
+ * @param start The start coordinate of the region where this coverage value was
+ *   observed.
+ * @param end The end coordinate of the region where this coverage value was
+ *   observed.
+ * @param count The average coverage across this region.
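+ *
+ * An illustrative sketch (the contig name and values are hypothetical):
+ *
+ * {{{
+ * val cov = Coverage("chr1", 100L, 200L, 31.5)
+ * val feature = cov.toFeature // Avro Feature with score 31.5
+ * }}}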
*/ case class Coverage(contigName: String, start: Long, end: Long, count: Double) { @@ -75,12 +81,12 @@ case class Coverage(contigName: String, start: Long, end: Long, count: Double) { * @return Feature built from Coverage */ def toFeature: Feature = { - val fb = Feature.newBuilder() - fb.setContigName(contigName) - fb.setStart(start) - fb.setEnd(end) - fb.setScore(count) - fb.build() + Feature.newBuilder() + .setContigName(contigName) + .setStart(start) + .setEnd(end) + .setScore(count) + .build() } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/NonoverlappingRegions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/NonoverlappingRegions.scala index 68822e22e5..c1a6523aef 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/NonoverlappingRegions.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/NonoverlappingRegions.scala @@ -32,7 +32,7 @@ package org.bdgenomics.adam.models * * @param regions The input-set of regions. */ -class NonoverlappingRegions(regions: Iterable[ReferenceRegion]) extends Serializable { +private[adam] class NonoverlappingRegions(regions: Iterable[ReferenceRegion]) extends Serializable { assert(regions != null, "regions parameter cannot be null") @@ -43,14 +43,20 @@ class NonoverlappingRegions(regions: Iterable[ReferenceRegion]) extends Serializ assert(regions.size > 0, "regions list must be non-empty") assert(regions.head != null, "regions must have at least one non-null entry") + /** + * The name of the reference contig this covers. + */ val referenceName: String = regions.head.referenceName // invariant: all the values in the 'regions' list have the same referenceId assert(regions.forall(_.referenceName == referenceName)) - // We represent the distinct unions, the 'nonoverlapping-set' of regions, as a set of endpoints, - // so that we can do reasonably-fast binary searching on them to determine the slice of nonoverlapping-set - // regions that are overlapped by a new, query region (see findOverlappingRegions, below). + /** + * We represent the distinct unions, the 'nonoverlapping-set' of regions, as a + * set of endpoints, so that we can do reasonably-fast binary searching on + * them to determine the slice of nonoverlapping-set regions that are + * overlapped by a new, query region (see findOverlappingRegions, below). + */ val endpoints: Array[Long] = mergeRegions(regions.toSeq.sortBy(r => r.start)).flatMap(r => Seq(r.start, r.end)).distinct.sorted.toArray @@ -71,13 +77,13 @@ class NonoverlappingRegions(regions: Iterable[ReferenceRegion]) extends Serializ } } - def mergeRegions(regs: Seq[(ReferenceRegion)]): List[ReferenceRegion] = + private def mergeRegions(regs: Seq[(ReferenceRegion)]): List[ReferenceRegion] = regs.aggregate(List[ReferenceRegion]())( (lst: List[ReferenceRegion], p: (ReferenceRegion)) => updateListWithRegion(lst, p), (a, b) => a ++ b ) - def binaryPointSearch(pos: Long, lessThan: Boolean): Int = { + private def binaryPointSearch(pos: Long, lessThan: Boolean): Int = { var i = 0 var j = endpoints.size - 1 @@ -94,6 +100,12 @@ class NonoverlappingRegions(regions: Iterable[ReferenceRegion]) extends Serializ if (lessThan) i else j } + /** + * Finds all regions that a query region overlaps. + * + * @param query The region to check for overlaps. + * @return All the regions that overlap the query region. 
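+   *
+   * An illustrative sketch (regions and coordinates are hypothetical):
+   *
+   * {{{
+   * val nr = new NonoverlappingRegions(Seq(
+   *   ReferenceRegion("chr1", 0L, 100L),
+   *   ReferenceRegion("chr1", 50L, 200L)))
+   * // returns the nonoverlapping slices touched by chr1:75-125
+   * nr.findOverlappingRegions(ReferenceRegion("chr1", 75L, 125L))
+   * }}}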
+   */
  def findOverlappingRegions(query: ReferenceRegion): Seq[ReferenceRegion] = {
 
     assert(query != null, "query region was null")
@@ -153,9 +165,8 @@ class NonoverlappingRegions(regions: Iterable[ReferenceRegion]) extends Serializ
    * completely outside the hull of all the input-set regions.
    *
    * @param regionable The input value
-   * @tparam U
-   * @return a boolean -- the input value should only participate in the regionJoin if the return value
-   *   here is 'true'.
+   * @tparam U The type of the value in the key-value tuple.
+   * @return True if we have a region that overlaps the region key in a tuple.
    */
   def hasRegionsFor[U](regionable: (ReferenceRegion, U)): Boolean = {
     !(regionable._1.end <= endpoints.head || regionable._1.start >= endpoints.last)
@@ -165,11 +176,20 @@ class NonoverlappingRegions(regions: Iterable[ReferenceRegion]) extends Serializ
     "%s:%d-%d (%s)".format(referenceName, endpoints.head, endpoints.last, endpoints.mkString(","))
 }
 
-object NonoverlappingRegions {
+private[adam] object NonoverlappingRegions {
+
+  /**
+   * Builds a non-overlapping contig map from a seq of region tagged data.
+   *
+   * @param values A seq of data which is keyed by the region it overlaps.
+   * @return Returns a nonoverlapping region map.
+   *
+   * @tparam T The type of the value in the input seq.
+   */
   def apply[T](values: Seq[(ReferenceRegion, T)]) =
     new NonoverlappingRegions(values.map(_._1))
 
-  def alternating[T](seq: Seq[T], includeFirst: Boolean): Seq[T] = {
+  private[models] def alternating[T](seq: Seq[T], includeFirst: Boolean): Seq[T] = {
     val inds = if (includeFirst) { 0 until seq.size } else { 1 until seq.size + 1 }
     seq.zip(inds).filter(p => p._2 % 2 == 0).map(_._1)
   }
@@ -186,23 +206,56 @@ object NonoverlappingRegions {
 *                be valid reference names with respect to the sequence
 *                dictionary.
 */
-class MultiContigNonoverlappingRegions(regions: Seq[(String, Iterable[ReferenceRegion])]) extends Serializable {
+private[adam] class MultiContigNonoverlappingRegions(
+    regions: Seq[(String, Iterable[ReferenceRegion])]) extends Serializable {
 
   assert(
     regions != null,
     "Regions was set to null"
   )
 
-  val regionMap: Map[String, NonoverlappingRegions] =
+  private val regionMap: Map[String, NonoverlappingRegions] =
     Map(regions.map(r => (r._1, new NonoverlappingRegions(r._2))): _*)
 
+  /**
+   * Given a "regionable" value (corresponds to a ReferenceRegion through an implicit ReferenceMapping),
+   * return the set of nonoverlapping-regions to be used as partitions for the input value in a
+   * region-join. Basically, return the set of any non-empty nonoverlapping-regions that overlap the
+   * region corresponding to this input.
+   *
+   * @param regionable The input, which corresponds to a region
+   * @tparam U The type of the input
+   * @return An Iterable[ReferenceRegion], where each element of the Iterable is a nonoverlapping-region
+   *   defined by 1 or more input-set regions.
+   */
   def regionsFor[U](regionable: (ReferenceRegion, U)): Iterable[ReferenceRegion] =
     regionMap.get(regionable._1.referenceName).fold(Iterable[ReferenceRegion]())(_.regionsFor(regionable))
 
+  /**
+   * A quick filter, to find out if we even need to examine a particular input value for keying by
+   * nonoverlapping-regions. Basically, reject the input value if its corresponding region is
+   * completely outside the hull of all the input-set regions.
+   *
+   * @param value The input value
+   * @tparam U The type of the value in the key-value tuple.
+   * @return True if the input value should participate in the regionJoin.
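+   *
+   * An illustrative sketch (regions and the value "v" are hypothetical):
+   *
+   * {{{
+   * val mcnr = MultiContigNonoverlappingRegions(
+   *   Seq((ReferenceRegion("chr1", 0L, 100L), "v")))
+   * mcnr.filter((ReferenceRegion("chr2", 10L, 20L), "v")) // false, no chr2 regions
+   * }}}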
+   */
   def filter[U](value: (ReferenceRegion, U)): Boolean =
     regionMap.get(value._1.referenceName).fold(false)(_.hasRegionsFor(value))
 }
 
-object MultiContigNonoverlappingRegions {
+private[adam] object MultiContigNonoverlappingRegions {
+
+  /**
+   * Builds a non-overlapping region map from a seq of region tagged data.
+   *
+   * Can cover multiple contigs.
+   *
+   * @param values A seq of data which is keyed by the region it overlaps.
+   * @return Returns a nonoverlapping region map.
+   *
+   * @tparam T The type of the value in the input seq.
+   */
   def apply[T](values: Seq[(ReferenceRegion, T)]): MultiContigNonoverlappingRegions = {
     new MultiContigNonoverlappingRegions(
       values.map(kv => (kv._1.referenceName, kv._1))
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ProgramRecord.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ProgramRecord.scala
index 858518fe67..029ab6e736 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ProgramRecord.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ProgramRecord.scala
@@ -20,8 +20,14 @@ package org.bdgenomics.adam.models
 import htsjdk.samtools.SAMProgramRecord
 import org.bdgenomics.adam.rdd.ADAMContext._
 
-object ProgramRecord {
+private[models] object ProgramRecord {
 
+  /**
+   * Builds a program record model from a SAM program record.
+   *
+   * @param pr The SAM program record to build from.
+   * @return Returns a serializable program record model.
+   */
   def apply(pr: SAMProgramRecord): ProgramRecord = {
     // ID is a required field
     val id: String = pr.getId
@@ -36,13 +42,26 @@ object ProgramRecord {
   }
 }
 
-case class ProgramRecord(
+/**
+ * A serializable equivalent to the htsjdk SAMProgramRecord.
+ *
+ * @param id The ID of the program record line.
+ * @param commandLine An optional command line that was run.
+ * @param name An optional name for the command/tool that was run.
+ * @param version An optional version for the command/tool that was run.
+ * @param previousID An optional ID for the previous stage that was run.
+ */
+private[models] case class ProgramRecord(
     id: String,
     commandLine: Option[String],
     name: Option[String],
    version: Option[String],
    previousID: Option[String]) {
 
+  /**
+   * @return Returns this program record as an htsjdk SAMProgramRecord.
+   */
   def toSAMProgramRecord(): SAMProgramRecord = {
     val pr = new SAMProgramRecord(id)
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReadBucket.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReadBucket.scala
deleted file mode 100644
index a80d3cb736..0000000000
--- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReadBucket.scala
+++ /dev/null
@@ -1,129 +0,0 @@
-/**
- * Licensed to Big Data Genomics (BDG) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The BDG licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ -package org.bdgenomics.adam.models - -import com.esotericsoftware.kryo.{ Kryo, Serializer } -import com.esotericsoftware.kryo.io.{ Input, Output } -import org.bdgenomics.adam.serialization.AvroSerializer -import org.bdgenomics.formats.avro.AlignmentRecord - -/** - * This class is similar to SingleReadBucket, except it breaks the reads down further. - * - * Rather than stopping at primary/secondary/unmapped, this will break it down further into whether they are paired - * or unpaired, and then whether they are the first or second of the pair. - * - * This is useful as this will usually map a single read in any of the sequences. - */ -case class ReadBucket( - unpairedPrimaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, - pairedFirstPrimaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, - pairedSecondPrimaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, - unpairedSecondaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, - pairedFirstSecondaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, - pairedSecondSecondaryMappedReads: Iterable[AlignmentRecord] = Seq.empty, - unmappedReads: Iterable[AlignmentRecord] = Seq.empty) { - def allReads(): Iterable[AlignmentRecord] = - unpairedPrimaryMappedReads ++ - pairedFirstPrimaryMappedReads ++ - pairedSecondPrimaryMappedReads ++ - unpairedSecondaryMappedReads ++ - pairedFirstSecondaryMappedReads ++ - pairedSecondSecondaryMappedReads ++ - unmappedReads -} - -class ReadBucketSerializer extends Serializer[ReadBucket] { - val recordSerializer = new AvroSerializer[AlignmentRecord]() - - def writeArray(kryo: Kryo, output: Output, reads: Iterable[AlignmentRecord]): Unit = { - output.writeInt(reads.size, true) - for (read <- reads) { - recordSerializer.write(kryo, output, read) - } - } - - def readArray(kryo: Kryo, input: Input): Seq[AlignmentRecord] = { - val numReads = input.readInt(true) - (0 until numReads).foldLeft(List[AlignmentRecord]()) { - (a, b) => recordSerializer.read(kryo, input, classOf[AlignmentRecord]) :: a - } - } - - def write(kryo: Kryo, output: Output, bucket: ReadBucket) = { - writeArray(kryo, output, bucket.unpairedPrimaryMappedReads) - writeArray(kryo, output, bucket.pairedFirstPrimaryMappedReads) - writeArray(kryo, output, bucket.pairedSecondPrimaryMappedReads) - writeArray(kryo, output, bucket.unpairedSecondaryMappedReads) - writeArray(kryo, output, bucket.pairedFirstSecondaryMappedReads) - writeArray(kryo, output, bucket.pairedSecondSecondaryMappedReads) - writeArray(kryo, output, bucket.unmappedReads) - } - - def read(kryo: Kryo, input: Input, klazz: Class[ReadBucket]): ReadBucket = { - val unpairedPrimaryReads = readArray(kryo, input) - val pairedFirstPrimaryMappedReads = readArray(kryo, input) - val pairedSecondPrimaryMappedReads = readArray(kryo, input) - val unpairedSecondaryReads = readArray(kryo, input) - val pairedFirstSecondaryMappedReads = readArray(kryo, input) - val pairedSecondSecondaryMappedReads = readArray(kryo, input) - val unmappedReads = readArray(kryo, input) - new ReadBucket( - unpairedPrimaryReads, - pairedFirstPrimaryMappedReads, - pairedSecondPrimaryMappedReads, - unpairedSecondaryReads, - pairedFirstSecondaryMappedReads, - pairedSecondSecondaryMappedReads, - unmappedReads - ) - } -} - -object ReadBucket { - implicit def singleReadBucketToReadBucket(bucket: SingleReadBucket): ReadBucket = { - // check that reads are either first or second read from fragment - bucket.primaryMapped.foreach(r => require( - r.getReadInFragment >= 0 && r.getReadInFragment <= 1, - "Read %s is not first or 
second read from pair (num = %d).".format(r, r.getReadInFragment)
-    ))
-    bucket.secondaryMapped.foreach(r => require(
-      r.getReadInFragment >= 0 && r.getReadInFragment <= 1,
-      "Read %s is not first or second read from pair (num = %d).".format(r, r.getReadInFragment)
-    ))
-    bucket.unmapped.foreach(r => require(
-      r.getReadInFragment >= 0 && r.getReadInFragment <= 1,
-      "Read %s is not first or second read from pair (num = %d).".format(r, r.getReadInFragment)
-    ))
-
-    val (pairedPrimary, unpairedPrimary) = bucket.primaryMapped.partition(_.getReadPaired)
-    val (pairedFirstPrimary, pairedSecondPrimary) = pairedPrimary.partition(_.getReadInFragment == 0)
-    val (pairedSecondary, unpairedSecondary) = bucket.secondaryMapped.partition(_.getReadPaired)
-    val (pairedFirstSecondary, pairedSecondSecondary) = pairedSecondary.partition(_.getReadInFragment == 0)
-
-    new ReadBucket(
-      unpairedPrimary,
-      pairedFirstPrimary,
-      pairedSecondPrimary,
-      unpairedSecondary,
-      pairedFirstSecondary,
-      pairedSecondSecondary,
-      bucket.unmapped
-    )
-  }
-}
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/RecordGroupDictionary.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/RecordGroupDictionary.scala
index b39bf1316e..a5548c2109 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/models/RecordGroupDictionary.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/RecordGroupDictionary.scala
@@ -17,11 +17,14 @@
 */
 package org.bdgenomics.adam.models
 
-import htsjdk.samtools.{ SAMFileHeader, SamReader, SAMReadGroupRecord }
+import htsjdk.samtools.{ SAMFileHeader, SAMReadGroupRecord }
 import java.util.Date
 import org.bdgenomics.formats.avro.{ RecordGroupMetadata, Sample }
 import scala.collection.JavaConversions._
 
+/**
+ * Singleton object for creating dictionaries of record groups.
+ */
 object RecordGroupDictionary {
 
   /**
@@ -36,31 +39,30 @@ object RecordGroupDictionary {
   }
 
   /**
-   * Builds a record group dictionary from a SAM file reader by collecting the header, and the
-   * read groups attached to the header.
-   *
-   * @param samReader SAM file header with attached read groups.
-   * @return Returns a new record group dictionary with the read groups attached to the file header.
+   * @return Returns a record group dictionary that contains no record groups.
    */
-  def fromSAMReader(samReader: SamReader): RecordGroupDictionary = {
-    fromSAMHeader(samReader.getFileHeader)
-  }
-
   def empty: RecordGroupDictionary = new RecordGroupDictionary(Seq.empty)
 }
 
 /**
- * Builds a dictionary containing record groups. Record groups must have a unique name across all
- * samples in the dictionary. This dictionary provides numerical IDs for each group; these IDs
- * are only consistent when referencing a single dictionary.
+ * Builds a dictionary containing record groups.
+ *
+ * Record groups must have a unique name across all samples in the dictionary.
+ * This dictionary provides numerical IDs for each group; these IDs are only
+ * consistent when referencing a single dictionary.
 *
 * @param recordGroups A seq of record groups to popualate the dictionary.
 *
- * @throws AssertionError Throws an assertion error if there are multiple record groups with the
- *   same name.
+ * @throws IllegalArgumentException Throws if there are multiple record
+ *   groups with the same name.
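+ *
+ * An illustrative sketch (sample and group names are hypothetical, and we
+ * assume the optional RecordGroup metadata fields default to None):
+ *
+ * {{{
+ * val dict = RecordGroupDictionary(Seq(RecordGroup("mySample", "myReadGroup")))
+ * val merged = dict ++ RecordGroupDictionary.empty
+ * }}}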
*/ case class RecordGroupDictionary(recordGroups: Seq[RecordGroup]) { - lazy val recordGroupMap = recordGroups.map(v => (v.recordGroupName, v)) + + /** + * A representation of this record group dictionary that can be indexed into + * by record group name. + */ + val recordGroupMap = recordGroups.map(v => (v.recordGroupName, v)) .sortBy(_._1) .zipWithIndex .map(kv => { @@ -68,11 +70,17 @@ case class RecordGroupDictionary(recordGroups: Seq[RecordGroup]) { (name, (group, index)) }).toMap - assert( + require( recordGroupMap.size == recordGroups.length, "Read group dictionary contains multiple samples with identical read group names." ) + /** + * Merges together two record group dictionaries. + * + * @param that The record group dictionary to merge with. + * @return The merged record group dictionary. + */ def ++(that: RecordGroupDictionary): RecordGroupDictionary = { new RecordGroupDictionary(recordGroups ++ that.recordGroups) } @@ -118,7 +126,11 @@ case class RecordGroupDictionary(recordGroups: Seq[RecordGroup]) { } } +/** + * Singleton object for creating RecordGroups. + */ object RecordGroup { + /** * Converts a SAMReadGroupRecord into a RecordGroup. * @@ -126,9 +138,9 @@ object RecordGroup { * @return Returns an equivalent ADAM format record group. */ def apply(samRGR: SAMReadGroupRecord): RecordGroup = { - assert( + require( samRGR.getSample != null, - "Sample ID is not set for read group " + samRGR.getReadGroupId + "Sample ID is not set for read group %s".format(samRGR.getReadGroupId) ) new RecordGroup( samRGR.getSample, @@ -184,6 +196,29 @@ object RecordGroup { } } +/** + * A record group represents a set of reads that were + * sequenced/processed/prepped/analyzed together. + * + * @param sample The sample these reads are from. + * @param recordGroupName The name of this record group. + * @param sequencingCenter The optional name of the place where these reads + * were sequenced. + * @param description An optional description for this record group. + * @param runDateEpoch An optional Unix epoch timestamp for when these reads + * were run through the sequencer. + * @param flowOrder An optional string of nucleotides that were used for each + * flow of each read. + * @param keySequence An optional string of nucleotides that are the key for + * this read. + * @param library An optional library name. + * @param predictedMedianInsertSize An optional prediction of the read insert + * size for this library prep. + * @param platform An optional description for the platform this group was + * sequenced on. + * @param platformUnit An optional ID for the sequencer this group was sequenced + * on. + */ case class RecordGroup( sample: String, recordGroupName: String, diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala index e5995268c4..080f30213a 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePosition.scala @@ -21,12 +21,27 @@ import com.esotericsoftware.kryo.{ Kryo, Serializer } import com.esotericsoftware.kryo.io.{ Input, Output } import org.bdgenomics.formats.avro._ +/** + * A sort order that orders all positions lexicographically by contig and + * numerically within a single contig. 
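+ *
+ * An illustrative sketch (contig names and coordinates are hypothetical):
+ *
+ * {{{
+ * Seq(ReferencePosition("chr2", 10L), ReferencePosition("chr1", 5L))
+ *   .sorted(PositionOrdering) // yields chr1:5, then chr2:10
+ * }}}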
+ */
 object PositionOrdering extends ReferenceOrdering[ReferencePosition] {
 }
+
+/**
+ * A sort order that orders all given positions lexicographically by contig and
+ * numerically within a single contig, and puts all non-provided positions at
+ * the end. An extension of PositionOrdering to Optional data.
+ *
+ * @see PositionOrdering
+ */
 object OptionalPositionOrdering extends OptionalReferenceOrdering[ReferencePosition] {
   val baseOrdering = PositionOrdering
 }
 
+/**
+ * Companion object for creating and sorting ReferencePositions.
+ */
 object ReferencePosition extends Serializable {
   implicit def orderingForPositions = PositionOrdering
 
@@ -86,15 +101,35 @@ object ReferencePosition extends Serializable {
     new ReferencePosition(contigNameSet.head, startSet.head)
   }
 
+  /**
+   * Convenience method for building a ReferencePosition.
+   *
+   * @param referenceName The name of the reference contig this locus exists on.
+   * @param pos The position of this locus.
+   */
   def apply(referenceName: String, pos: Long): ReferencePosition = {
     new ReferencePosition(referenceName, pos)
   }
 
+  /**
+   * Convenience method for building a ReferencePosition.
+   *
+   * @param referenceName The name of the reference contig this locus exists on.
+   * @param pos The position of this locus.
+   * @param orientation The strand that this locus is on.
+   */
   def apply(referenceName: String, pos: Long, orientation: Strand): ReferencePosition = {
     new ReferencePosition(referenceName, pos, orientation)
   }
 }
 
+/**
+ * A single genomic locus.
+ *
+ * @param referenceName The name of the reference contig this locus exists on.
+ * @param pos The position of this locus.
+ * @param orientation The strand that this locus is on.
+ */
 class ReferencePosition(
     override val referenceName: String,
     val pos: Long,
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala
index 00cb0b8285..5688af8306 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala
@@ -21,7 +21,6 @@ import com.esotericsoftware.kryo.io.{ Input, Output }
 import com.esotericsoftware.kryo.{ Kryo, Serializer }
 import org.bdgenomics.formats.avro._
 import org.bdgenomics.utils.intervaltree.Interval
-
 import scala.math.{ max, min }
 
 trait ReferenceOrdering[T <: ReferenceRegion] extends Ordering[T] {
@@ -61,12 +60,29 @@ trait OptionalReferenceOrdering[T <: ReferenceRegion] extends Ordering[Option[T]
   }
 }
 
+/**
+ * A sort order that orders all given regions lexicographically by contig and
+ * numerically within a single contig. Regions are compared by start position
+ * first. If start positions are equal, then we compare by end position.
+ */
 object RegionOrdering extends ReferenceOrdering[ReferenceRegion] {
 }
+
+/**
+ * A sort order that orders all given regions lexicographically by contig and
+ * numerically within a single contig, and puts all non-provided regions at
+ * the end. An extension of RegionOrdering to Optional data.
+ *
+ * @see RegionOrdering
+ */
 object OptionalRegionOrdering extends OptionalReferenceOrdering[ReferenceRegion] {
   val baseOrdering = RegionOrdering
 }
 
+/**
+ * A companion object for creating and ordering ReferenceRegions.
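+ *
+ * An illustrative sketch (coordinates are hypothetical):
+ *
+ * {{{
+ * val region = ReferenceRegion("chr1", 100L, 200L)
+ * region.contains(ReferencePosition("chr1", 150L)) // true
+ * region.overlaps(ReferenceRegion("chr1", 150L, 250L)) // true
+ * }}}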
+ */ object ReferenceRegion { implicit def orderingForPositions = RegionOrdering @@ -156,10 +172,13 @@ object ReferenceRegion { } /** - * Builds a reference region from an alignment record. + * Builds a reference region for an aligned read. + * + * @throws IllegalArgumentException If this read is not aligned or alignment + * data is null. * - * @param record AlignmentRecord to extract region from. - * @return The site where the alignment record aligns. + * @param record The read to extract the reference region from. + * @return Returns the reference region covered by this read's alignment. */ def apply(record: AlignmentRecord): ReferenceRegion = { require(record.getReadMapped, @@ -239,8 +258,14 @@ case class ReferenceRegion( extends Comparable[ReferenceRegion] with Interval { - assert(start >= 0 && end >= start, "Failed when trying to create region %s %d %d on %s strand.".format(referenceName, start, end, orientation)) + assert(start >= 0 && end >= start, + "Failed when trying to create region %s %d %d on %s strand.".format( + referenceName, start, end, orientation)) + /** + * @return Returns a copy of this reference region that is on the independent + * strand. + */ def disorient: ReferenceRegion = new ReferenceRegion(referenceName, start, end) /** @@ -287,7 +312,9 @@ case class ReferenceRegion( } /** - * Returns whether two regions are adjacent. Adjacent regions do not overlap, but have no separation between start/end. + * Returns whether two regions are adjacent. + * + * Adjacent regions do not overlap, but have no separation between start/end. * * @param region Region to compare against. * @return True if regions are adjacent. @@ -296,28 +323,35 @@ case class ReferenceRegion( distance(region).exists(_ == 1) /** - * Returns the distance between this reference region and another region in the reference space. + * Returns the distance between this reference region and another region in + * the reference space. * - * @note Distance here is defined as the minimum distance between any point within this region, and - * any point within the other region we are measuring against. If the two sets overlap, the distance - * will be 0. If the sets abut, the distance will be 1. Else, the distance will be greater. + * @note Distance here is defined as the minimum distance between any point + * within this region, and any point within the other region we are measuring + * against. If the two sets overlap, the distance will be 0. If the sets abut, + * the distance will be 1. Else, the distance will be greater. * * @param other Region to compare against. - * @return Returns an option containing the distance between two points. If the point is not in - * our reference space, we return an empty option (None). + * @return Returns an option containing the distance between two points. If + * the point is not in our reference space, we return an empty option. */ - def distance(other: ReferenceRegion): Option[Long] = - if (referenceName == other.referenceName && orientation == other.orientation) - if (overlaps(other)) + def distance(other: ReferenceRegion): Option[Long] = { + if (referenceName == other.referenceName && orientation == other.orientation) { + if (overlaps(other)) { Some(0) - else if (other.start >= end) + } else if (other.start >= end) { Some(other.start - end + 1) - else + } else { Some(start - other.end + 1) - else + } + } else { None + } + } /** + * Extends the current reference region at both the start and end. 
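+   *
+   * An illustrative sketch (this method is shown as pad, with hypothetical
+   * coordinates):
+   *
+   * {{{
+   * ReferenceRegion("chr1", 100L, 200L).pad(10L) // covers chr1:90-210
+   * }}}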
+   *
    * @param by The number of bases to extend the region by from both the start
    *   and the end.
    * @return Returns a new reference region where the start and end have been
@@ -328,6 +362,9 @@ case class ReferenceRegion(
   }
 
   /**
+   * Extends the current reference region at both the start and end, but by
+   * different numbers of bases.
+   *
    * @param byStart The number of bases to move the start position forward by.
    * @param byEnd The number of bases to move the end position back by.
    * @return Returns a new reference region where the start and/or end have been
@@ -340,28 +377,55 @@ case class ReferenceRegion(
       orientation)
   }
 
+  /**
+   * Checks if a position is wholly within our region.
+   *
+   * @param other The reference position to compare against.
+   * @return True if the position is within our region.
+   */
   def contains(other: ReferencePosition): Boolean = {
     orientation == other.orientation &&
       referenceName == other.referenceName &&
       start <= other.pos && end > other.pos
   }
 
+  /**
+   * Checks if another region is wholly within our region.
+   *
+   * @param other The region to compare against.
+   * @return True if the region is wholly contained within our region.
+   */
   def contains(other: ReferenceRegion): Boolean = {
     orientation == other.orientation &&
       referenceName == other.referenceName &&
       start <= other.start && end >= other.end
   }
 
+  /**
+   * Checks if our region overlaps (wholly or partially) another region.
+   *
+   * @param other The region to compare against.
+   * @return True if any section of the two regions overlap.
+   */
   def overlaps(other: ReferenceRegion): Boolean = {
     orientation == other.orientation &&
       referenceName == other.referenceName &&
       end > other.start && start < other.end
   }
 
+  /**
+   * Compares two regions using the RegionOrdering.
+   *
+   * @param that The region to compare against.
+   * @return A comparison value indicating which region comes first.
+   */
   def compareTo(that: ReferenceRegion): Int = {
     RegionOrdering.compare(this, that)
   }
 
+  /**
+   * @return The length of this region in bases.
+   */
   def length(): Long = {
     end - start
   }
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala
index 63759a1e9a..ecf9d7bfc0 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala
@@ -21,29 +21,47 @@ import htsjdk.samtools.{ SAMFileHeader, SAMProgramRecord }
 import scala.collection.JavaConversions._
 
 private[adam] object SAMFileHeaderWritable {
+
+  /**
+   * Creates a serializable representation of a SAM file header.
+   *
+   * @param header SAMFileHeader to create serializable representation of.
+   * @return A serializable representation of the header that can be transformed
+   *   back into the htsjdk representation on the executor.
+   */
   def apply(header: SAMFileHeader): SAMFileHeaderWritable = {
     new SAMFileHeaderWritable(header)
   }
 }
 
-class SAMFileHeaderWritable(hdr: SAMFileHeader) extends Serializable {
+/**
+ * Wrapper for the SAM file header to get around serialization issues.
+ *
+ * The SAM file header is not serialized and is instead recreated on demand.
+ *
+ * @param hdr A SAM file header to extract metadata from.
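+ *
+ * An illustrative sketch (the empty header here is hypothetical):
+ *
+ * {{{
+ * val writable = SAMFileHeaderWritable(new SAMFileHeader())
+ * val hdr = writable.header // htsjdk header, rebuilt on access
+ * }}}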
+ */ +private[adam] class SAMFileHeaderWritable private (hdr: SAMFileHeader) extends Serializable { + // extract fields that are needed in order to recreate the SAMFileHeader - protected val text = { + private val text = { val txt: String = hdr.getTextHeader Option(txt) } - protected val sd = SequenceDictionary(hdr.getSequenceDictionary) - protected val pgl = { + private val sd = SequenceDictionary(hdr.getSequenceDictionary) + private val pgl = { val pgs = hdr.getProgramRecords pgs.map(ProgramRecord(_)) } - protected val comments = { + private val comments = { val cmts = hdr.getComments cmts.flatMap(Option(_)) // don't trust samtools to return non-nulls } - protected val rgs = RecordGroupDictionary.fromSAMHeader(hdr) + private val rgs = RecordGroupDictionary.fromSAMHeader(hdr) - // recreate header when requested to get around header not being serializable + /** + * Recreate header when requested to get around header not being serializable. + */ @transient lazy val header = { val h = new SAMFileHeader() diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala index 315ae2c168..1b0400c339 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala @@ -17,7 +17,7 @@ */ package org.bdgenomics.adam.models -import htsjdk.samtools.{ SamReader, SAMFileHeader, SAMSequenceRecord, SAMSequenceDictionary } +import htsjdk.samtools.{ SAMFileHeader, SAMSequenceRecord, SAMSequenceDictionary } import htsjdk.variant.vcf.VCFHeader import org.apache.avro.generic.IndexedRecord import org.bdgenomics.formats.avro.{ AlignmentRecord, NucleotideContigFragment, Contig } @@ -25,25 +25,41 @@ import scala.collection._ import scala.collection.JavaConversions._ /** - * SequenceDictionary contains the (bijective) map between Ints (the referenceId) and Strings (the referenceName) - * from the header of a BAM file, or the combined result of multiple such SequenceDictionaries. + * Singleton object for creating SequenceDictionaries. */ - object SequenceDictionary { + /** * @return Creates a new, empty SequenceDictionary. */ def empty: SequenceDictionary = new SequenceDictionary() + /** + * Builds a sequence dictionary for a variable length collection of records. + * + * @param records Records to include in the dictionary. + * @return A sequence dictionary containing these records. + */ def apply(records: SequenceRecord*): SequenceDictionary = new SequenceDictionary(records.toVector) + + /** + * Builds a sequence dictionary from an htsjdk SAMSequenceDictionary. + * + * @param dict Htsjdk sequence dictionary to build from. + * @return A SequenceDictionary with populated sequence records. + */ def apply(dict: SAMSequenceDictionary): SequenceDictionary = { new SequenceDictionary(dict.getSequences.map(SequenceRecord.fromSAMSequenceRecord).toVector) } - def apply(header: SAMFileHeader): SequenceDictionary = SequenceDictionary(header.getSequenceDictionary) - def apply(reader: SamReader): SequenceDictionary = SequenceDictionary(reader.getFileHeader) - def toSAMSequenceDictionary(dictionary: SequenceDictionary): SAMSequenceDictionary = { - new SAMSequenceDictionary(dictionary.records.map(SequenceRecord.toSAMSequenceRecord).toList) + /** + * Makes a SequenceDictionary from a SAMFileHeader. + * + * @param header htsjdk SAMFileHeader to extract sequences from. 
+ * @return A SequenceDictionary with populated sequence records. + */ + def apply(header: SAMFileHeader): SequenceDictionary = { + SequenceDictionary(header.getSequenceDictionary) } /** @@ -56,21 +72,6 @@ object SequenceDictionary { new SequenceDictionary(contigs.map(SequenceRecord.fromADAMContig).toVector) } - /** - * Extracts a SAM sequence dictionary from a SAM file header and returns an - * ADAM sequence dictionary. - * - * @see fromSAMSequenceDictionary - * - * @param header SAM file header. - * @return Returns an ADAM style sequence dictionary. - */ - def fromSAMHeader(header: SAMFileHeader): SequenceDictionary = { - val samDict = header.getSequenceDictionary - - fromSAMSequenceDictionary(samDict) - } - /** * Extracts a SAM sequence dictionary from a VCF header and returns an * ADAM sequence dictionary. @@ -100,11 +101,16 @@ object SequenceDictionary { val samDictRecords = samDict.getSequences new SequenceDictionary(samDictRecords.map(SequenceRecord.fromSAMSequenceRecord).toVector) } - - def fromSAMReader(samReader: SamReader): SequenceDictionary = - fromSAMHeader(samReader.getFileHeader) } +/** + * A SequenceDictionary contains metadata about the reference build genomic data + * is aligned against. + * + * @see SequenceRecord + * + * @param records The individual reference contigs. + */ class SequenceDictionary(val records: Vector[SequenceRecord]) extends Serializable { def this() = this(Vector.empty[SequenceRecord]) @@ -113,6 +119,10 @@ class SequenceDictionary(val records: Vector[SequenceRecord]) extends Serializab private val hasSequenceOrdering = records.forall(_.referenceIndex.isDefined) + /** + * @param that Sequence dictionary to compare against. + * @return True if each record in this dictionary exists in the other dictionary. + */ def isCompatibleWith(that: SequenceDictionary): Boolean = { for (record <- that.records) { val myRecord = byName.get(record.name) @@ -122,10 +132,37 @@ class SequenceDictionary(val records: Vector[SequenceRecord]) extends Serializab true } + /** + * @param name The name of the contig to extract. + * @return If available, the sequence record for this contig. + */ def apply(name: String): Option[SequenceRecord] = byName.get(name) + + /** + * Checks to see if we have a contig with a given name. + * + * @param name The name of the contig to extract. + * @return True if we have a sequence record for this contig. + */ def containsRefName(name: String): Boolean = byName.containsKey(name) + /** + * Adds a sequence record to this dictionary. + * + * @param record The sequence record to add. + * @return A new sequence dictionary with the new record added. + */ def +(record: SequenceRecord): SequenceDictionary = this ++ SequenceDictionary(record) + + /** + * Merges two sequence dictionaries. + * + * Filters any sequence records that exist in both dictionaries. + * + * @param that The sequence dictionary to add. + * @return A new sequence dictionary that contains a record per contig in each + * input dictionary. + */ def ++(that: SequenceDictionary): SequenceDictionary = { new SequenceDictionary(records ++ that.records.filter(r => !byName.contains(r.name))) } @@ -185,14 +222,17 @@ class SequenceDictionary(val records: Vector[SequenceRecord]) extends Serializab } private[adam] def toAvro: Seq[Contig] = { - records.map(SequenceRecord.toADAMContig) + records.map(_.toADAMContig) .toSeq } + /** + * @return True if this dictionary contains no sequence records. 
+ */ def isEmpty: Boolean = records.isEmpty } -object SequenceOrderingByName extends Ordering[SequenceRecord] { +private object SequenceOrderingByName extends Ordering[SequenceRecord] { def compare( a: SequenceRecord, b: SequenceRecord): Int = { @@ -200,7 +240,7 @@ object SequenceOrderingByName extends Ordering[SequenceRecord] { } } -object SequenceOrderingByRefIdx extends Ordering[SequenceRecord] { +private object SequenceOrderingByRefIdx extends Ordering[SequenceRecord] { def compare( a: SequenceRecord, b: SequenceRecord): Int = { @@ -216,8 +256,19 @@ object SequenceOrderingByRefIdx extends Ordering[SequenceRecord] { } /** - * Utility class within the SequenceDictionary; represents unique reference name-to-id correspondence + * Metadata about a single reference contig. * + * @param name The name of the contig. + * @param length The length of the contig. + * @param url If available, the URL the contig is accessible from. + * @param md5 If available, the MD5 checksum for the contig. + * @param refseq If available, the REFSEQ ID for the contig. + * @param genbank If available, the Genbank ID for the contig. + * @param assembly If available, the assembly name for the assembly this contig + * is from. + * @param species If available, the species this contig was assembled from. + * @param referenceIndex If available, the number of this contig in a set of + * contigs. */ case class SequenceRecord( name: String, @@ -295,12 +346,47 @@ case class SequenceRecord( case (Some(c1), Some(c2)) => c1 == c2 case _ => true } + + /** + * @return Builds an Avro contig representation from this record. + */ + def toADAMContig: Contig = { + val builder = Contig.newBuilder() + .setContigName(name) + .setContigLength(length) + md5.foreach(builder.setContigMD5) + url.foreach(builder.setReferenceURL) + assembly.foreach(builder.setAssembly) + species.foreach(builder.setSpecies) + referenceIndex.foreach(builder.setReferenceIndex(_)) + builder.build + } } +/** + * Companion object for creating Sequence Records. + */ object SequenceRecord { - val REFSEQ_TAG = "REFSEQ" - val GENBANK_TAG = "GENBANK" + private val REFSEQ_TAG = "REFSEQ" + private val GENBANK_TAG = "GENBANK" + /** + * Java friendly apply method that wraps null strings. + * + * @param name The name of the contig. + * @param length The length of the contig. + * @param url If available, the URL the contig is accessible from. + * @param md5 If available, the MD5 checksum for the contig. + * @param refseq If available, the REFSEQ ID for the contig. + * @param genbank If available, the Genbank ID for the contig. + * @param assembly If available, the assembly name for the assembly this contig + * is from. + * @param species If available, the species this contig was assembled from. + * @param referenceIndex If available, the number of this contig in a set of + * contigs. + * @return Returns a new SequenceRecord where all strings except for name are + * wrapped in Options to check for null values. 
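+   *
+   * An illustrative sketch (contig name and length are hypothetical, and we
+   * assume the optional parameters default to null/None):
+   *
+   * {{{
+   * val chr1 = SequenceRecord("chr1", 248956422L)
+   * }}}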
+ */ def apply( name: String, length: Long, @@ -342,15 +428,14 @@ object SequenceRecord { species = record.getAttribute(SAMSequenceRecord.SPECIES_TAG), referenceIndex = if (record.getSequenceIndex == -1) None else Some(record.getSequenceIndex) ) - - } - def toSAMSequenceRecord(record: SequenceRecord): SAMSequenceRecord = { - val sam = new SAMSequenceRecord(record.name, record.length.toInt) - record.md5.foreach(v => sam.setAttribute(SAMSequenceRecord.MD5_TAG, v)) - record.url.foreach(v => sam.setAttribute(SAMSequenceRecord.URI_TAG, v)) - sam } + /** + * Builds a sequence record from an Avro Contig. + * + * @param contig Contig record to build from. + * @return This Contig record as a SequenceRecord. + */ def fromADAMContig(contig: Contig): SequenceRecord = { SequenceRecord( contig.getContigName, @@ -363,37 +448,14 @@ object SequenceRecord { ) } - def toADAMContig(record: SequenceRecord): Contig = { - val builder = Contig.newBuilder() - .setContigName(record.name) - .setContigLength(record.length) - record.md5.foreach(builder.setContigMD5) - record.url.foreach(builder.setReferenceURL) - record.assembly.foreach(builder.setAssembly) - record.species.foreach(builder.setSpecies) - record.referenceIndex.foreach(builder.setReferenceIndex(_)) - builder.build - } - + /** + * Extracts the contig metadata from a nucleotide fragment. + * + * @param fragment The assembly fragment to extract a SequenceRecord from. + * @return The sequence record metadata from a single assembly fragment. + */ def fromADAMContigFragment(fragment: NucleotideContigFragment): SequenceRecord = { fromADAMContig(fragment.getContig) } - - def fromSpecificRecord(rec: IndexedRecord): SequenceRecord = { - val schema = rec.getSchema - if (schema.getField("referenceId") != null) { - SequenceRecord( - rec.get(schema.getField("referenceName").pos()).toString, - rec.get(schema.getField("referenceLength").pos()).asInstanceOf[Long], - url = rec.get(schema.getField("referenceUrl").pos()).toString - ) - } else if (schema.getField("contig") != null) { - val pos = schema.getField("contig").pos() - fromADAMContig(rec.get(pos).asInstanceOf[Contig]) - } else { - throw new AssertionError("Missing information to generate SequenceRecord") - } - } - } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/SnpTable.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/SnpTable.scala index ae8a39e375..f687b5afec 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/SnpTable.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/SnpTable.scala @@ -23,10 +23,17 @@ import org.bdgenomics.utils.misc.Logging import org.apache.spark.rdd.RDD import scala.collection.immutable._ import scala.collection.mutable -import java.io.File +/** + * A table containing all of the SNPs in a known variation dataset. + * + * @param table A map between a contig name and a set containing all coordinates + * where a point variant is known to exist. + */ class SnpTable(private val table: Map[String, Set[Long]]) extends Serializable with Logging { - log.info("SNP table has %s contigs and %s entries".format(table.size, table.values.map(_.size).sum)) + log.info("SNP table has %s contigs and %s entries".format( + table.size, + table.values.map(_.size).sum)) /** * Is there a known SNP at the reference location of this Residue? @@ -58,41 +65,26 @@ class SnpTable(private val table: Map[String, Set[Long]]) extends Serializable w } } +/** + * Companion object with helper functions for building SNP tables. 
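+ *
+ * An illustrative sketch (knownVariants is a hypothetical RDD[RichVariant]):
+ *
+ * {{{
+ * val emptyTable = SnpTable()
+ * val table = SnpTable(knownVariants)
+ * }}}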
+ */ object SnpTable { + + /** + * Creates an empty SNP Table. + * + * @return An empty SNP table. + */ def apply(): SnpTable = { new SnpTable(Map[String, Set[Long]]()) } - // `knownSnpsFile` is expected to be a sites-only VCF - def apply(knownSnpsFile: File): SnpTable = { - // parse into tuples of (contig, position) - val snpsSource = scala.io.Source.fromFile(knownSnpsFile) - try { - val lines = snpsSource.getLines() - val tuples = lines.filter(line => !line.startsWith("#")).flatMap(line => { - val split = line.split("\t") - val contig = split(0) - val pos = split(1).toLong - 1 - val ref = split(3) - assert(pos >= 0) - assert(!ref.isEmpty) - ref.zipWithIndex.map { - case (base, idx) => - assert(Seq('A', 'C', 'T', 'G', 'N').contains(base)) - (contig, pos + idx) - } - }) - // construct map from contig to set of positions - // this is done in-place to reduce overhead - val table = new mutable.HashMap[String, mutable.HashSet[Long]] - tuples.foreach(tup => table.getOrElseUpdate(tup._1, { new mutable.HashSet[Long] }) += tup._2) - // construct SnpTable from immutable copy of `table` - new SnpTable(table.mapValues(_.toSet).toMap) - } finally { - snpsSource.close() - } - } - + /** + * Creates a SNP Table from an RDD of RichVariants. + * + * @param variants The variants to populate the table from. + * @return Returns a new SNPTable containing the input variants. + */ def apply(variants: RDD[RichVariant]): SnpTable = { val positions = variants.map(variant => (variant.variant.getContigName, variant.variant.getStart)).collect() diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/VCFHeaderWritable.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/VCFHeaderWritable.scala index 27c7724448..98c108c794 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/VCFHeaderWritable.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/VCFHeaderWritable.scala @@ -19,5 +19,10 @@ package org.bdgenomics.adam.models import htsjdk.variant.vcf.VCFHeader -case class VCFHeaderWritable(header: VCFHeader) { +/** + * Serializable wrapper for the VCF header. + * + * @param header A VCF header to serialize. + */ +private[adam] case class VCFHeaderWritable(header: VCFHeader) { } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/VariantContext.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/VariantContext.scala index 16ad7d52c2..edd9fec4f9 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/VariantContext.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/VariantContext.scala @@ -21,8 +21,7 @@ import org.bdgenomics.formats.avro.{ Genotype, Variant, VariantAnnotation } import org.bdgenomics.adam.rich.RichVariant /** - * Note: VariantContext inherits its name from the Picard VariantContext, and is not related to the SparkContext object. - * If you're looking for the latter, see [[org.bdgenomics.adam.rdd.ADAMContext]]. + * Singleton object for building VariantContexts. */ object VariantContext { @@ -66,8 +65,9 @@ object VariantContext { * from the left record. * @return Returns the union of these two annotations. 
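+   *
+   * An illustrative sketch (ann1 and ann2 are hypothetical VariantAnnotation
+   * records for the same variant):
+   *
+   * {{{
+   * val merged = VariantContext.mergeAnnotations(ann1, ann2)
+   * }}}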
 */
-  def mergeAnnotations(leftRecord: VariantAnnotation,
-                       rightRecord: VariantAnnotation): VariantAnnotation = {
+  private[adam] def mergeAnnotations(
+    leftRecord: VariantAnnotation,
+    rightRecord: VariantAnnotation): VariantAnnotation = {
     val mergedAnnotation = VariantAnnotation.newBuilder(leftRecord)
       .build()
     val numFields = VariantAnnotation.getClassSchema.getFields.size
@@ -139,9 +139,22 @@ object VariantContext {
   }
 }
 
+/**
+ * A representation of all variation data at a single variant.
+ *
+ * This class represents an equivalent to a single allele from a VCF line, and
+ * is the ADAM equivalent to htsjdk.variant.variantcontext.VariantContext.
+ *
+ * @param position The locus that the variant is at.
+ * @param variant The variant allele that is contained in this VariantContext.
+ * @param genotypes An iterable collection of Genotypes where this allele was
+ *   called. Equivalent to the per-sample FORMAT fields in a VCF.
+ * @param annotations An optional record annotating this variant. Equivalent to
+ *   the per-allele split of the INFO field in a VCF.
+ */
 class VariantContext(
     val position: ReferencePosition,
     val variant: RichVariant,
     val genotypes: Iterable[Genotype],
-    val annotations: Option[VariantAnnotation] = None) {
+    val annotations: Option[VariantAnnotation] = None) extends Serializable {
 }
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala
index 5e5fd221ed..f32921cd2a 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala
@@ -638,7 +638,7 @@ sealed trait AlignmentRecordRDD extends AvroReadGroupGenomicRDD[AlignmentRecord,
    *
    * @return SingleReadBuckets with primary, secondary and unmapped reads
    */
-  def groupReadsByFragment(): RDD[SingleReadBucket] = {
+  private[read] def groupReadsByFragment(): RDD[SingleReadBucket] = {
     SingleReadBucket(rdd)
   }
 
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala
index 522be3e2a2..ed90c02e39 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala
@@ -54,7 +54,7 @@ trait AnySAMInFormatterCompanion[T <: AnySAMInFormatter[T]] extends InFormatterC
     header.setSortOrder(SAMFileHeader.SortOrder.coordinate)
 
     // construct the in formatter
-    makeFormatter(new SAMFileHeaderWritable(header), gRdd.recordGroups, arc)
+    makeFormatter(SAMFileHeaderWritable(header), gRdd.recordGroups, arc)
   }
 }
 
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala
index 6170aec56a..a13784b0f2 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala
@@ -22,9 +22,7 @@ import org.apache.spark.rdd.RDD
 import org.bdgenomics.adam.instrumentation.Timers._
 import org.bdgenomics.adam.models.{
   RecordGroupDictionary,
-  ReferencePosition,
-  ReferencePositionPair,
-  SingleReadBucket
+  ReferencePosition
 }
 import org.bdgenomics.adam.rdd.ADAMContext._
 import org.bdgenomics.formats.avro.AlignmentRecord
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePositionPair.scala 
b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ReferencePositionPair.scala similarity index 73% rename from adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePositionPair.scala rename to adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ReferencePositionPair.scala index 50f31085be..bfc9e5ad7d 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferencePositionPair.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ReferencePositionPair.scala @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.bdgenomics.adam.models +package org.bdgenomics.adam.rdd.read import com.esotericsoftware.kryo.{ Kryo, Serializer } import com.esotericsoftware.kryo.io.{ Input, Output } @@ -23,10 +23,25 @@ import Ordering.Option import org.bdgenomics.utils.misc.Logging import org.bdgenomics.adam.instrumentation.Timers.CreateReferencePositionPair import org.bdgenomics.adam.models.ReferenceRegion._ +import org.bdgenomics.adam.models.{ + ReferencePosition, + ReferencePositionSerializer +} import org.bdgenomics.adam.rich.RichAlignmentRecord import org.bdgenomics.formats.avro.AlignmentRecord -object ReferencePositionPair extends Logging { +/** + * A singleton object for creating reference position pairs. + */ +private[read] object ReferencePositionPair extends Logging { + + /** + * Extracts the reference positions from a bucket of reads from a single fragment. + * + * @param singleReadBucket A bucket of reads from a single DNA fragment. + * @return Returns the start position pair for the primary aligned first and + * second of pair reads. + */ def apply(singleReadBucket: SingleReadBucket): ReferencePositionPair = CreateReferencePositionPair.time { val firstOfPair = (singleReadBucket.primaryMapped.filter(_.getReadInFragment == 0) ++ singleReadBucket.unmapped.filter(_.getReadInFragment == 0)).toSeq @@ -56,14 +71,21 @@ object ReferencePositionPair extends Logging { } } -case class ReferencePositionPair( - read1refPos: Option[ReferencePosition], - read2refPos: Option[ReferencePosition]) +/** + * The start positions for a fragment sequenced with a paired protocol. + * + * @param read1refPos The start position of the first-of-pair read, if aligned. + * @param read2refPos The start position of the second-of-pair read, if aligned. 
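+ *
+ * For example, a fragment whose first-of-pair read aligned at position 100
+ * of chr1 while its mate went unaligned could be represented as follows
+ * (a sketch; assumes the two-argument ReferencePosition constructor):
+ *
+ * {{{
+ * val pair = ReferencePositionPair(
+ *   Some(ReferencePosition("chr1", 100L)),
+ *   None)
+ * }}}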
+ */
+private[adam] case class ReferencePositionPair(
+ read1refPos: Option[ReferencePosition],
+ read2refPos: Option[ReferencePosition]) {
+}
class ReferencePositionPairSerializer extends Serializer[ReferencePositionPair] { val rps = new ReferencePositionSerializer()
- def writeOptionalReferencePos(kryo: Kryo, output: Output, optRefPos: Option[ReferencePosition]) = {
+ private def writeOptionalReferencePos(kryo: Kryo, output: Output, optRefPos: Option[ReferencePosition]) = { optRefPos match { case None => output.writeBoolean(false) @@ -73,7 +95,7 @@ class ReferencePositionPairSerializer extends Serializer[ReferencePositionPair] } }
- def readOptionalReferencePos(kryo: Kryo, input: Input): Option[ReferencePosition] = {
+ private def readOptionalReferencePos(kryo: Kryo, input: Input): Option[ReferencePosition] = { val exists = input.readBoolean() if (exists) { Some(rps.read(kryo, input, classOf[ReferencePosition]))
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/SingleReadBucket.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/SingleReadBucket.scala
similarity index 72%
rename from adam-core/src/main/scala/org/bdgenomics/adam/models/SingleReadBucket.scala
rename to adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/SingleReadBucket.scala
index 37e1e86ec7..6ead0da7c2 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/models/SingleReadBucket.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/SingleReadBucket.scala
@@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */
-package org.bdgenomics.adam.models
+package org.bdgenomics.adam.rdd.read
import com.esotericsoftware.kryo.{ Kryo, Serializer } import com.esotericsoftware.kryo.io.{ Output, Input } @@ -29,7 +29,18 @@ import org.bdgenomics.formats.avro.{ } import scala.collection.JavaConversions._
-object SingleReadBucket extends Logging {
+/**
+ * Companion object for building SingleReadBuckets.
+ */
+private[read] object SingleReadBucket extends Logging {
+
+ /**
+ * Builds an RDD of SingleReadBuckets from an RDD of AlignmentRecords.
+ *
+ * @param rdd The RDD of AlignmentRecords to build the RDD of single read
+ * buckets from.
+ * @return Returns an RDD of SingleReadBuckets.
+ */ def apply(rdd: RDD[AlignmentRecord]): RDD[SingleReadBucket] = { rdd.groupBy(p => (p.getRecordGroupName, p.getReadName)) .map(kv => { @@ -39,21 +50,40 @@
val (mapped, unmapped) = reads.partition(_.getReadMapped) val (primaryMapped, secondaryMapped) = mapped.partition(_.getPrimaryAlignment)
- // TODO: consider doing validation here (e.g. read says mate mapped but it doesn't exist)
+ // TODO: consider doing validation here
+ // (e.g. read says mate mapped but it doesn't exist)
new SingleReadBucket(primaryMapped, secondaryMapped, unmapped) }) } }
-case class SingleReadBucket(
- primaryMapped: Iterable[AlignmentRecord] = Seq.empty,
- secondaryMapped: Iterable[AlignmentRecord] = Seq.empty,
- unmapped: Iterable[AlignmentRecord] = Seq.empty) {
- // Note: not a val in order to save serialization/memory cost
+/**
+ * A representation of all the read alignments that came from a single sequenced
+ * fragment.
+ *
+ * @param primaryMapped All read alignments that are primary alignments.
+ * @param secondaryMapped All read alignments that are non-primary (i.e.,
+ * secondary or supplementary alignments).
+ * @param unmapped All reads from the fragment that are unmapped.
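+ *
+ * A short usage sketch, where `read1` and `read2` stand in for prebuilt
+ * primary-aligned AlignmentRecords (hypothetical values):
+ *
+ * {{{
+ * val bucket = SingleReadBucket(primaryMapped = Seq(read1, read2))
+ *
+ * // allReads is the union of the primary, secondary, and unmapped buckets
+ * assert(bucket.allReads.size == 2)
+ * }}}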
+ */
+private[adam] case class SingleReadBucket(
+ primaryMapped: Iterable[AlignmentRecord] = Iterable.empty,
+ secondaryMapped: Iterable[AlignmentRecord] = Iterable.empty,
+ unmapped: Iterable[AlignmentRecord] = Iterable.empty) {
+
+ /**
+ * @return The union of the primary, secondary, and unmapped buckets.
+ */ def allReads = { primaryMapped ++ secondaryMapped ++ unmapped }
+ /**
+ * Converts to an Avro Fragment record.
+ *
+ * @return This bucket as a Fragment, which does not have the various
+ * alignment buckets, but is otherwise equivalent.
+ */ def toFragment: Fragment = { // take union of all reads, as we will need this for building and // want to pay the cost exactly once
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala b/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala
index 9fb93ee615..dd1b7af748 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala
@@ -134,13 +134,9 @@ class ADAMKryoRegistrator extends KryoRegistrator { kryo.register(classOf[org.bdgenomics.adam.models.ReferencePosition], new org.bdgenomics.adam.models.ReferencePositionSerializer) kryo.register(classOf[org.bdgenomics.adam.models.ReferenceRegion])
- kryo.register(classOf[org.bdgenomics.adam.models.ReferencePositionPair],
- new org.bdgenomics.adam.models.ReferencePositionPairSerializer)
kryo.register(classOf[org.bdgenomics.adam.models.SAMFileHeaderWritable]) kryo.register(classOf[org.bdgenomics.adam.models.SequenceDictionary]) kryo.register(classOf[org.bdgenomics.adam.models.SequenceRecord])
- kryo.register(classOf[org.bdgenomics.adam.models.SingleReadBucket],
- new org.bdgenomics.adam.models.SingleReadBucketSerializer)
kryo.register(classOf[org.bdgenomics.adam.models.SnpTable]) kryo.register(classOf[org.bdgenomics.adam.models.VariantContext]) @@ -150,6 +146,10 @@ class ADAMKryoRegistrator extends KryoRegistrator { // org.bdgenomics.adam.rdd.read kryo.register(classOf[org.bdgenomics.adam.rdd.read.FlagStatMetrics]) kryo.register(classOf[org.bdgenomics.adam.rdd.read.DuplicateMetrics])
+ kryo.register(classOf[org.bdgenomics.adam.rdd.read.SingleReadBucket],
+ new org.bdgenomics.adam.rdd.read.SingleReadBucketSerializer)
+ kryo.register(classOf[org.bdgenomics.adam.rdd.read.ReferencePositionPair],
+ new org.bdgenomics.adam.rdd.read.ReferencePositionPairSerializer)
// org.bdgenomics.adam.rdd.read.realignment kryo.register(classOf[org.bdgenomics.adam.rdd.read.realignment.IndelRealignmentTarget],
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/models/NonoverlappingRegionsSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/models/NonoverlappingRegionsSuite.scala
new file mode 100644
index 0000000000..a256025a6d
--- /dev/null
+++ b/adam-core/src/test/scala/org/bdgenomics/adam/models/NonoverlappingRegionsSuite.scala
@@ -0,0 +1,120 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.models
+
+import org.bdgenomics.formats.avro.{ AlignmentRecord, Contig }
+import org.scalatest.FunSuite
+
+class NonoverlappingRegionsSuite extends FunSuite {
+
+ test("alternating returns an alternating seq of items") {
+ assert(NonoverlappingRegions.alternating(Seq(),
+ includeFirst = true) === Seq())
+ assert(NonoverlappingRegions.alternating(Seq(1),
+ includeFirst = true) === Seq(1))
+ assert(NonoverlappingRegions.alternating(Seq(1, 2),
+ includeFirst = true) === Seq(1))
+ assert(NonoverlappingRegions.alternating(Seq(1, 2, 3),
+ includeFirst = true) === Seq(1, 3))
+ assert(NonoverlappingRegions.alternating(Seq(1, 2, 3, 4),
+ includeFirst = true) === Seq(1, 3))
+ assert(NonoverlappingRegions.alternating(Seq(1, 2, 3, 4, 5),
+ includeFirst = true) === Seq(1, 3, 5))
+
+ assert(NonoverlappingRegions.alternating(Seq(),
+ includeFirst = false) === Seq())
+ assert(NonoverlappingRegions.alternating(Seq(1),
+ includeFirst = false) === Seq())
+ assert(NonoverlappingRegions.alternating(Seq(1, 2),
+ includeFirst = false) === Seq(2))
+ assert(NonoverlappingRegions.alternating(Seq(1, 2, 3),
+ includeFirst = false) === Seq(2))
+ assert(NonoverlappingRegions.alternating(Seq(1, 2, 3, 4),
+ includeFirst = false) === Seq(2, 4))
+ assert(NonoverlappingRegions.alternating(Seq(1, 2, 3, 4, 5),
+ includeFirst = false) === Seq(2, 4))
+ assert(NonoverlappingRegions.alternating(Seq(1, 2, 3, 4, 5, 6),
+ includeFirst = false) === Seq(2, 4, 6))
+ }
+
+ test("Single region returns itself") {
+ val region = new ReferenceRegion("chr1", 1, 2)
+ val regions = new NonoverlappingRegions(Seq(region))
+ val result = regions.findOverlappingRegions(region)
+ assert(result.size === 1)
+ assert(result.head === region)
+ }
+
+ test("Two adjacent regions will be merged") {
+ val regions = new NonoverlappingRegions(Seq(
+ ReferenceRegion("chr1", 10, 20),
+ ReferenceRegion("chr1", 20, 30)))
+
+ assert(regions.endpoints === Array(10L, 30L))
+ }
+
+ test("Nonoverlapping regions will all be returned") {
+ val region1 = new ReferenceRegion("chr1", 1, 2)
+ val region2 = new ReferenceRegion("chr1", 3, 5)
+ val testRegion3 = new ReferenceRegion("chr1", 1, 4)
+ val testRegion1 = new ReferenceRegion("chr1", 4, 5)
+ val regions = new NonoverlappingRegions(Seq(region1, region2))
+
+ // this should be 2, not 3, because binaryRegionSearch no longer returns
+ // ReferenceRegions in which no original regions were placed (i.e., the 'gaps').
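+ // testRegion3 = [1,4) overlaps the cells built from both original
+ // regions, while testRegion1 = [4,5) only overlaps the cell for region2.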
+ assert(regions.findOverlappingRegions(testRegion3).size === 2)
+
+ assert(regions.findOverlappingRegions(testRegion1).size === 1)
+ }
+
+ test("Many overlapping regions will all be merged") {
+ val region1 = new ReferenceRegion("chr1", 1, 3)
+ val region2 = new ReferenceRegion("chr1", 2, 4)
+ val region3 = new ReferenceRegion("chr1", 3, 5)
+ val testRegion = new ReferenceRegion("chr1", 1, 4)
+ val regions = new NonoverlappingRegions(Seq(region1, region2, region3))
+ assert(regions.findOverlappingRegions(testRegion).size === 1)
+ }
+
+ test("ADAMRecords return proper references") {
+ val contig = Contig.newBuilder
+ .setContigName("chr1")
+ .setContigLength(5L)
+ .setReferenceURL("test://chrom1")
+ .build
+
+ val built = AlignmentRecord.newBuilder()
+ .setContigName(contig.getContigName)
+ .setStart(1L)
+ .setReadMapped(true)
+ .setCigar("1M")
+ .setEnd(2L)
+ .build()
+
+ val record1 = built
+ val record2 = AlignmentRecord.newBuilder(built).setStart(3L).setEnd(4L).build()
+ val baseRecord = AlignmentRecord.newBuilder(built).setCigar("4M").setEnd(5L).build()
+
+ val baseMapping = new NonoverlappingRegions(Seq(ReferenceRegion(baseRecord)))
+ val regions1 = baseMapping.findOverlappingRegions(ReferenceRegion(record1))
+ val regions2 = baseMapping.findOverlappingRegions(ReferenceRegion(record2))
+ assert(regions1.size === 1)
+ assert(regions2.size === 1)
+ assert(regions1.head === regions2.head)
+ }
+}
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/models/RecordGroupDictionarySuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/models/RecordGroupDictionarySuite.scala
index 0a1b5b621b..6ac3be96c7 100644
--- a/adam-core/src/test/scala/org/bdgenomics/adam/models/RecordGroupDictionarySuite.scala
+++ b/adam-core/src/test/scala/org/bdgenomics/adam/models/RecordGroupDictionarySuite.scala
@@ -40,7 +40,7 @@ class RecordGroupDictionarySuite extends FunSuite { test("sample name must be set") { val samRGR = new SAMReadGroupRecord("myId")
- intercept[AssertionError] {
+ intercept[IllegalArgumentException] {
RecordGroup(samRGR) } } }
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala
index a1d299e457..a07dc6fc79 100644
--- a/adam-core/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala
+++ b/adam-core/src/test/scala/org/bdgenomics/adam/models/SequenceDictionarySuite.scala
@@ -39,7 +39,7 @@ class SequenceDictionarySuite extends ADAMFunSuite { assert(asASR.length === 1000L) assert(asASR.url === Some("http://bigdatagenomics.github.io/1"))
- val asPSR: SAMSequenceRecord = SequenceRecord.toSAMSequenceRecord(asASR)
+ val asPSR: SAMSequenceRecord = asASR.toSAMSequenceRecord
assert(sr.isSameSequence(asPSR)) } @@ -75,7 +75,7 @@ class SequenceDictionarySuite extends ADAMFunSuite { val path = testFile("dict_with_accession.dict") val ssd = SAMSequenceDictionaryExtractor.extractDictionary(new File(path)) val asd = SequenceDictionary(ssd)
- ssd.assertSameDictionary(SequenceDictionary.toSAMSequenceDictionary(asd))
+ ssd.assertSameDictionary(asd.toSAMSequenceDictionary)
} test("Can retrieve sequence by name") {
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/InnerBroadcastRegionJoinSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/InnerBroadcastRegionJoinSuite.scala
index b314d13061..a5cf3fc9d2 100644
--- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/InnerBroadcastRegionJoinSuite.scala
+++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/InnerBroadcastRegionJoinSuite.scala
@@ -18,97 +18,12 @@ package org.bdgenomics.adam.rdd import org.apache.spark.SparkContext._
-import org.bdgenomics.adam.models.{ NonoverlappingRegions, ReferenceRegion }
+import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro.{ AlignmentRecord, Contig } class InnerBroadcastRegionJoinSuite extends ADAMFunSuite {
- test("alternating returns an alternating seq of items") {
- import NonoverlappingRegions._
-
- assert(alternating(Seq(), includeFirst = true) === Seq())
- assert(alternating(Seq(1), includeFirst = true) === Seq(1))
- assert(alternating(Seq(1, 2), includeFirst = true) === Seq(1))
- assert(alternating(Seq(1, 2, 3), includeFirst = true) === Seq(1, 3))
- assert(alternating(Seq(1, 2, 3, 4), includeFirst = true) === Seq(1, 3))
- assert(alternating(Seq(1, 2, 3, 4, 5), includeFirst = true) === Seq(1, 3, 5))
-
- assert(alternating(Seq(), includeFirst = false) === Seq())
- assert(alternating(Seq(1), includeFirst = false) === Seq())
- assert(alternating(Seq(1, 2), includeFirst = false) === Seq(2))
- assert(alternating(Seq(1, 2, 3), includeFirst = false) === Seq(2))
- assert(alternating(Seq(1, 2, 3, 4), includeFirst = false) === Seq(2, 4))
- assert(alternating(Seq(1, 2, 3, 4, 5), includeFirst = false) === Seq(2, 4))
- assert(alternating(Seq(1, 2, 3, 4, 5, 6), includeFirst = false) === Seq(2, 4, 6))
- }
-
- test("Single region returns itself") {
- val region = new ReferenceRegion("chr1", 1, 2)
- val regions = new NonoverlappingRegions(Seq(region))
- val result = regions.findOverlappingRegions(region)
- assert(result.size === 1)
- assert(result.head === region)
- }
-
- test("Two adjacent regions will be merged") {
- val regions = new NonoverlappingRegions(Seq(
- ReferenceRegion("chr1", 10, 20),
- ReferenceRegion("chr1", 20, 30)))
-
- assert(regions.endpoints === Array(10L, 30L))
- }
-
- test("Nonoverlapping regions will all be returned") {
- val region1 = new ReferenceRegion("chr1", 1, 2)
- val region2 = new ReferenceRegion("chr1", 3, 5)
- val testRegion3 = new ReferenceRegion("chr1", 1, 4)
- val testRegion1 = new ReferenceRegion("chr1", 4, 5)
- val regions = new NonoverlappingRegions(Seq(region1, region2))
-
- // this should be 2, not 3, because binaryRegionSearch is (now) no longer returning
- // ReferenceRegions in which no original RR's were placed (i.e. the 'gaps').
- assert(regions.findOverlappingRegions(testRegion3).size === 2)
-
- assert(regions.findOverlappingRegions(testRegion1).size === 1)
- }
-
- test("Many overlapping regions will all be merged") {
- val region1 = new ReferenceRegion("chr1", 1, 3)
- val region2 = new ReferenceRegion("chr1", 2, 4)
- val region3 = new ReferenceRegion("chr1", 3, 5)
- val testRegion = new ReferenceRegion("chr1", 1, 4)
- val regions = new NonoverlappingRegions(Seq(region1, region2, region3))
- assert(regions.findOverlappingRegions(testRegion).size === 1)
- }
-
- test("ADAMRecords return proper references") {
- val contig = Contig.newBuilder
- .setContigName("chr1")
- .setContigLength(5L)
- .setReferenceURL("test://chrom1")
- .build
-
- val built = AlignmentRecord.newBuilder()
- .setContigName(contig.getContigName)
- .setStart(1L)
- .setReadMapped(true)
- .setCigar("1M")
- .setEnd(2L)
- .build()
-
- val record1 = built
- val record2 = AlignmentRecord.newBuilder(built).setStart(3L).setEnd(4L).build()
- val baseRecord = AlignmentRecord.newBuilder(built).setCigar("4M").setEnd(5L).build()
-
- val baseMapping = new NonoverlappingRegions(Seq(ReferenceRegion(baseRecord)))
- val regions1 = baseMapping.findOverlappingRegions(ReferenceRegion(record1))
- val regions2 = baseMapping.findOverlappingRegions(ReferenceRegion(record2))
- assert(regions1.size === 1)
- assert(regions2.size === 1)
- assert(regions1.head === regions2.head)
- }
-
sparkTest("Ensure same reference regions get passed together") { val contig = Contig.newBuilder .setContigName("chr1")
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/models/SingleReadBucketSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/SingleReadBucketSuite.scala
similarity index 99%
rename from adam-core/src/test/scala/org/bdgenomics/adam/models/SingleReadBucketSuite.scala
rename to adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/SingleReadBucketSuite.scala
index 758872b7a6..f1eeeb4c1f 100644
--- a/adam-core/src/test/scala/org/bdgenomics/adam/models/SingleReadBucketSuite.scala
+++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/SingleReadBucketSuite.scala
@@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License.
*/ -package org.bdgenomics.adam.models +package org.bdgenomics.adam.rdd.read import org.bdgenomics.formats.avro._ import org.scalatest.FunSuite diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibrationSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibrationSuite.scala index 4dddab20da..7c6a218517 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibrationSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibrationSuite.scala @@ -27,24 +27,6 @@ import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro.AlignmentRecord class BaseQualityRecalibrationSuite extends ADAMFunSuite { - sparkTest("BQSR Test Input #1") { - val readsFilepath = testFile("bqsr1.sam") - val snpsFilepath = testFile("bqsr1.snps") - val obsFilepath = testFile("bqsr1-ref.observed") - - val reads: RDD[AlignmentRecord] = sc.loadAlignments(readsFilepath).rdd - val snps = sc.broadcast(SnpTable(new File(snpsFilepath))) - - val bqsr = new BaseQualityRecalibration(cloy(reads), snps) - - // Sanity checks - assert(bqsr.result.count == reads.count) - - // Compare the ObservationTables - val referenceObs: Seq[String] = scala.io.Source.fromFile(new File(obsFilepath)).getLines().filter(_.length > 0).toSeq.sortWith((kv1, kv2) => kv1.compare(kv2) < 0) - val testObs: Seq[String] = bqsr.observed.toCSV.split('\n').filter(_.length > 0).toSeq.sortWith((kv1, kv2) => kv1.compare(kv2) < 0) - referenceObs.zip(testObs).foreach(p => assert(p._1 === p._2)) - } sparkTest("BQSR Test Input #1 w/ VCF Sites") { val readsFilepath = testFile("bqsr1.sam")