diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala index adefbed6ff..697e3897e6 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala @@ -27,6 +27,7 @@ import org.bdgenomics.adam.models.{ SequenceDictionary } import org.bdgenomics.adam.rdd.{ AvroGenomicRDD, JavaSaveArgs } +import org.bdgenomics.adam.util.ReferenceFile import org.bdgenomics.formats.avro.{ AlignmentRecord, NucleotideContigFragment } import scala.collection.JavaConversions._ import scala.math.max @@ -53,9 +54,18 @@ object NucleotideContigFragmentRDD extends Serializable { } } +/** + * A wrapper class for RDD[NucleotideContigFragment]. + * NucleotideContigFragmentRDD extends ReferenceFile. To specifically access a ReferenceFile within an RDD, + * refer to: + * @see ReferenceContigMap + * + * @param rdd Underlying RDD + * @param sequences Sequence dictionary computed from rdd + */ case class NucleotideContigFragmentRDD( rdd: RDD[NucleotideContigFragment], - sequences: SequenceDictionary) extends AvroGenomicRDD[NucleotideContigFragment, NucleotideContigFragmentRDD] { + sequences: SequenceDictionary) extends AvroGenomicRDD[NucleotideContigFragment, NucleotideContigFragmentRDD] with ReferenceFile { /** * Converts an RDD of nucleotide contig fragments into reads. Adjacent contig fragments are @@ -169,7 +179,7 @@ case class NucleotideContigFragmentRDD( * @param region Reference region over which to get sequence. * @return String of bases corresponding to reference sequence. */ - def getReferenceString(region: ReferenceRegion): String = { + def extract(region: ReferenceRegion): String = { def getString(fragment: (ReferenceRegion, NucleotideContigFragment)): (ReferenceRegion, String) = { val trimStart = max(0, region.start - fragment._1.start).toInt val trimEnd = max(0, fragment._1.end - region.end).toInt diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala index f216f05be5..f1e19a4862 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala @@ -20,10 +20,15 @@ package org.bdgenomics.adam.util import org.apache.spark.rdd.RDD // NOTE(ryan): this is necessary for Spark <= 1.2.1. import org.apache.spark.SparkContext._ -import org.bdgenomics.adam.models.ReferenceRegion +import org.bdgenomics.adam.models.{ ReferenceRegion, SequenceDictionary, SequenceRecord } import org.bdgenomics.formats.avro.NucleotideContigFragment case class ReferenceContigMap(contigMap: Map[String, Seq[NucleotideContigFragment]]) extends ReferenceFile { + + // create sequence dictionary + val sequences: SequenceDictionary = new SequenceDictionary(contigMap.map(r => + SequenceRecord(r._1, r._2.map(_.getFragmentEndPosition).max)).toVector) + /** * Extract reference sequence from the file. * diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceFile.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceFile.scala index a205d027a2..ef6e5f9956 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceFile.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceFile.scala @@ -18,7 +18,7 @@ package org.bdgenomics.adam.util -import org.bdgenomics.adam.models.ReferenceRegion +import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion } /** * File that contains a reference assembly that can be broadcasted @@ -31,4 +31,10 @@ trait ReferenceFile extends Serializable { * @return The reference sequence at the desired locus. */ def extract(region: ReferenceRegion): String + + /* + * Stores SequenceDictionary for ReferenceFile + */ + def sequences: SequenceDictionary + } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala index 218c443577..9b081e37cc 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala @@ -22,7 +22,7 @@ import java.nio.{ ByteOrder, ByteBuffer } import com.esotericsoftware.kryo.io.{ Output, Input } import com.esotericsoftware.kryo.{ Kryo, Serializer } import org.bdgenomics.utils.io.{ ByteArrayByteAccess, ByteAccess } -import org.bdgenomics.adam.models.{ NonoverlappingRegions, ReferencePosition, ReferenceRegion } +import org.bdgenomics.adam.models._ object TwoBitFile { val MAGIC_NUMBER: Int = 0x1A412743 @@ -67,7 +67,10 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile { indexRecordStart += TwoBitFile.NAME_SIZE_SIZE + tup._1.length + TwoBitFile.OFFSET_SIZE tup }).toMap - val seqRecords = seqRecordStarts.map(tup => tup._1 -> TwoBitRecord(bytes, tup._1, tup._2)).toMap + val seqRecords = seqRecordStarts.map(tup => tup._1 -> TwoBitRecord(bytes, tup._1, tup._2)) + + // create sequence dictionary + val sequences = new SequenceDictionary(seqRecords.toVector.map(r => SequenceRecord(r._1, r._2.dnaSize))) private def readHeader(): Int = { // figure out proper byte order diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDSuite.scala index 93ae5035ce..56c1600215 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDSuite.scala @@ -17,13 +17,13 @@ */ package org.bdgenomics.adam.rdd.contig -import com.google.common.io.Files import java.io.File -import org.apache.spark.rdd.RDD + +import com.google.common.io.Files import org.bdgenomics.adam.models._ -import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro._ + import scala.collection.mutable.ListBuffer class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { @@ -76,7 +76,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment))) - assert(rdd.getReferenceString(region) === "ACTGTAC") + assert(rdd.extract(region) === "ACTGTAC") } sparkTest("recover trimmed reference string from a single contig fragment") { @@ -97,7 +97,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment))) - assert(rdd.getReferenceString(region) === "CTGTA") + assert(rdd.extract(region) === "CTGTA") } sparkTest("recover reference string from multiple contig fragments") { @@ -143,8 +143,8 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { fragment1, fragment2))) - assert(rdd.getReferenceString(region0) === "ACTGTAC") - assert(rdd.getReferenceString(region1) === "GTACTCTCATG") + assert(rdd.extract(region0) === "ACTGTAC") + assert(rdd.extract(region1) === "GTACTCTCATG") } sparkTest("recover trimmed reference string from multiple contig fragments") { @@ -190,8 +190,8 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { fragment1, fragment2))) - assert(rdd.getReferenceString(region0) === "CTGTA") - assert(rdd.getReferenceString(region1) === "CTCTCA") + assert(rdd.extract(region0) === "CTGTA") + assert(rdd.extract(region1) === "CTCTCA") } sparkTest("testing nondeterminism from reduce when recovering referencestring") { @@ -213,7 +213,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { var passed = true val rdd = NucleotideContigFragmentRDD(sc.parallelize(fragments.toList)) try { - val result = rdd.getReferenceString(new ReferenceRegion("chr1", 0L, 1000L)) + val result = rdd.extract(new ReferenceRegion("chr1", 0L, 1000L)) } catch { case e: AssertionError => passed = false } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala index 840c35b764..da4c691d65 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala @@ -18,9 +18,8 @@ package org.bdgenomics.adam.rdd.read import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.util.{ ReferenceContigMap, ADAMFunSuite } -import org.bdgenomics.formats.avro.{ AlignmentRecord, NucleotideContigFragment, Contig } -import org.bdgenomics.utils.misc.SparkFunSuite +import org.bdgenomics.adam.util.{ ADAMFunSuite, ReferenceContigMap } +import org.bdgenomics.formats.avro.{ AlignmentRecord, Contig, NucleotideContigFragment } class MDTaggingSuite extends ADAMFunSuite { val chr1 = @@ -34,7 +33,13 @@ class MDTaggingSuite extends ADAMFunSuite { sc.parallelize( for { (contig, start, seq) <- frags - } yield NucleotideContigFragment.newBuilder().setContig(contig).setFragmentStartPosition(start.toLong).setFragmentSequence(seq).build() + } yield ( + NucleotideContigFragment.newBuilder() + .setContig(contig) + .setFragmentStartPosition(start.toLong) + .setFragmentEndPosition(start.toLong + seq.length) + .setFragmentSequence(seq).build() + ) ) def makeReads(reads: ((Contig, Int, Int, String, String), String)*): (Map[Int, String], RDD[AlignmentRecord]) = { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/util/TwoBitSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/util/TwoBitSuite.scala index 2f35f0dbcd..d7994ac477 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/util/TwoBitSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/util/TwoBitSuite.scala @@ -19,9 +19,8 @@ package org.bdgenomics.adam.util import java.io.File -import org.bdgenomics.utils.io.LocalFileByteAccess import org.bdgenomics.adam.models.ReferenceRegion -import org.scalatest.FunSuite +import org.bdgenomics.utils.io.LocalFileByteAccess class TwoBitSuite extends ADAMFunSuite { test("correctly read sequence from .2bit file") { @@ -49,4 +48,13 @@ class TwoBitSuite extends ADAMFunSuite { val twoBitFile = new TwoBitFile(byteAccess) assert(twoBitFile.extract(ReferenceRegion("1", 9990, 10010), true) == "NNNNNNNNNNTAACCCTAAC") } + + test("correctly calculates sequence dictionary") { + val file = new File(resourcePath("hg19.chrM.2bit")) + val byteAccess = new LocalFileByteAccess(file) + val twoBitFile = new TwoBitFile(byteAccess) + val dict = twoBitFile.sequences + assert(dict.records.length == 1) + assert(dict.records.head.length == 16571) + } }