-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
298 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
160 changes: 160 additions & 0 deletions
160
cli/src/main/scala/org/bdgenomics/cannoli/cli/Snap.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
/** | ||
* Licensed to Big Data Genomics (BDG) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The BDG licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.bdgenomics.cannoli.cli | ||
|
||
import htsjdk.samtools.ValidationStringency | ||
import org.apache.hadoop.fs.{ FileSystem, Path, PathFilter } | ||
import org.apache.spark.SparkContext | ||
import org.bdgenomics.adam.rdd.ADAMContext._ | ||
import org.bdgenomics.adam.rdd.ADAMSaveAnyArgs | ||
import org.bdgenomics.adam.rdd.fragment.{ FragmentRDD, InterleavedFASTQInFormatter } | ||
import org.bdgenomics.adam.rdd.read.{ AlignmentRecordRDD, AnySAMOutFormatter } | ||
import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct } | ||
import org.bdgenomics.cannoli.builder.CommandBuilders | ||
import org.bdgenomics.cannoli.util.AppendSuffixes | ||
import org.bdgenomics.formats.avro.AlignmentRecord | ||
import org.bdgenomics.utils.cli._ | ||
import org.bdgenomics.utils.misc.Logging | ||
import org.kohsuke.args4j.{ Argument, Option => Args4jOption } | ||
import scala.collection.JavaConversions._ | ||
|
||
/**
 * Snap function arguments.
 *
 * Shared by the [[SnapFn]] function API and the [[Snap]] command line wrapper.
 */
class SnapFnArgs extends Args4jBase {
  @Args4jOption(required = false, name = "-executable", usage = "Path to the Snap executable. Defaults to snap-aligner.")
  var executable: String = "snap-aligner"

  @Args4jOption(required = false, name = "-image", usage = "Container image to use. Defaults to quay.io/ucsc_cgl/snap:1.0beta.18--07475052cd34f17055991cec339898ff0e8bd07d")
  var image: String = "quay.io/ucsc_cgl/snap:1.0beta.18--07475052cd34f17055991cec339898ff0e8bd07d"

  @Args4jOption(required = false, name = "-sudo", usage = "Run via sudo.")
  var sudo: Boolean = false

  @Args4jOption(required = false, name = "-add_files", usage = "If true, use the SparkFiles mechanism to distribute files to executors.")
  var addFiles: Boolean = false

  @Args4jOption(required = false, name = "-use_docker", usage = "If true, uses Docker to launch Snap.")
  var useDocker: Boolean = false

  @Args4jOption(required = false, name = "-use_singularity", usage = "If true, uses Singularity to launch Snap.")
  var useSingularity: Boolean = false

  // usage text was a placeholder ("Index directory?"); describe the option properly
  @Args4jOption(required = true, name = "-index", usage = "Path to the SNAP index directory.")
  var indexPath: String = null
}
|
||
/**
 * Snap wrapper as a function FragmentRDD → AlignmentRecordRDD,
 * for use in cannoli-shell or notebooks.
 *
 * @param args Snap function arguments.
 * @param sc Spark context.
 */
class SnapFn(
    val args: SnapFnArgs,
    sc: SparkContext) extends CannoliFn[FragmentRDD, AlignmentRecordRDD](sc) with Logging {

  override def apply(fragments: FragmentRDD): AlignmentRecordRDD = {

    // "$0" is presumably substituted by the command builder with the local path
    // of the first added file when -add_files is set — confirm against
    // CommandBuilders; otherwise pass the absolute index path through.
    val indexArgument = if (args.addFiles) "$0" else absolute(args.indexPath)

    val builder = CommandBuilders.create(args.useDocker, args.useSingularity)
      .setExecutable(args.executable)

    // snap-aligner paired-end invocation reading interleaved FASTQ from stdin
    // and writing BAM to stdout; single aligner thread per partition.
    Seq(
      "paired", indexArgument,
      "-t", "1",
      "-pairedInterleavedFastq", "-",
      "-o", "-bam", "-",
      "-map",
      "-pre").foreach(arg => builder.add(arg))

    if (args.addFiles) builder.addFile(args.indexPath)

    if (args.useDocker || args.useSingularity) {
      builder
        .setImage(args.image)
        .setSudo(args.sudo)
        .addMount(if (args.addFiles) "$root" else root(args.indexPath))
    }

    log.info("Piping {} to snap with command: {} files: {}",
      fragments, builder.build(), builder.getFiles())

    implicit val tFormatter = InterleavedFASTQInFormatter
    implicit val uFormatter = new AnySAMOutFormatter

    fragments.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, InterleavedFASTQInFormatter](
      cmd = builder.build(),
      files = builder.getFiles()
    )
  }
}
|
||
/**
 * Companion providing the command name, description, and factory used by the
 * CLI dispatcher.
 */
object Snap extends BDGCommandCompanion {
  val commandName = "snap"
  val commandDescription = "ADAM Pipe API wrapper for Snap."

  /** Parse the command line and construct a Snap command. */
  def apply(cmdLine: Array[String]) = {
    val parsedArgs = Args4j[SnapArgs](cmdLine)
    new Snap(parsedArgs)
  }
}
|
||
/**
 * Snap command line arguments.
 *
 * Extends [[SnapFnArgs]] with input/output locations and save options for the
 * command line wrapper.
 */
class SnapArgs extends SnapFnArgs with ADAMSaveAnyArgs with ParquetArgs {
  @Argument(required = true, metaVar = "INPUT", usage = "Location to pipe fragments from (e.g. interleaved FASTQ format, .ifq). If extension is not detected, Parquet is assumed.", index = 0)
  var inputPath: String = null

  @Argument(required = true, metaVar = "OUTPUT", usage = "Location to pipe alignments to (e.g. .bam, .cram, .sam). If extension is not detected, Parquet is assumed.", index = 1)
  var outputPath: String = null

  @Args4jOption(required = false, name = "-single", usage = "Saves OUTPUT as single file.")
  var asSingleFile: Boolean = false

  @Args4jOption(required = false, name = "-defer_merging", usage = "Defers merging single file output.")
  var deferMerging: Boolean = false

  @Args4jOption(required = false, name = "-disable_fast_concat", usage = "Disables the parallel file concatenation engine.")
  var disableFastConcat: Boolean = false

  // parsed into an htsjdk ValidationStringency by the Snap command
  @Args4jOption(required = false, name = "-stringency", usage = "Stringency level for various checks; can be SILENT, LENIENT, or STRICT. Defaults to STRICT.")
  var stringency: String = "STRICT"

  // must be defined due to ADAMSaveAnyArgs, but unused here
  var sortFastqOutput: Boolean = false
}
|
||
/**
 * Snap command line wrapper.
 *
 * Loads fragments from the input path, appends read-number suffixes, pipes
 * them through the Snap aligner, and saves the resulting alignments.
 */
class Snap(protected val args: SnapArgs) extends BDGSparkCommand[SnapArgs] with Logging {
  val companion = Snap
  val stringency: ValidationStringency = ValidationStringency.valueOf(args.stringency)

  def run(sc: SparkContext) {
    // ensure read names carry /1, /2 suffixes before piping to the aligner
    val suffixedFragments = sc.loadFragments(args.inputPath, stringency = stringency)
      .transform(rdd => AppendSuffixes(rdd))

    val alignFn = new SnapFn(args, sc)
    alignFn.apply(suffixedFragments).save(args)
  }
}
55 changes: 55 additions & 0 deletions
55
core/src/main/scala/org/bdgenomics/cannoli/util/AppendSuffixes.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
/** | ||
* Licensed to Big Data Genomics (BDG) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The BDG licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.bdgenomics.cannoli.util | ||
|
||
import org.apache.spark.rdd.RDD | ||
import org.bdgenomics.formats.avro.{ AlignmentRecord, Fragment } | ||
import scala.collection.JavaConversions._ | ||
|
||
/**
 * Appends read-number suffixes (e.g. "/1", "/2") to the read names of the
 * alignments within fragments, as expected by paired-end aligners.
 */
object AppendSuffixes {

  /**
   * Append read-number suffixes to all reads in each fragment of the RDD.
   *
   * @param rdd RDD of fragments to process.
   * @return Returns an RDD of fragments whose read names end in
   *   "/«readInFragment + 1»".
   */
  def apply(rdd: RDD[Fragment]): RDD[Fragment] = {
    rdd.map(addSuffixesToFragment)
  }

  /**
   * Append read-number suffixes to every alignment in a single fragment.
   *
   * @param fragment Fragment to process.
   * @return Returns a copy of the fragment with suffixed read names.
   */
  def addSuffixesToFragment(fragment: Fragment): Fragment = {
    Fragment.newBuilder(fragment)
      .setAlignments(fragment.getAlignments.map(addSuffixToRead))
      .build
  }

  /**
   * Append a read-number suffix to a single read, unless already present.
   *
   * @param read Read to process; must have a non-null, non-negative
   *   readInFragment number.
   * @return Returns the read unchanged if its name already ends in the
   *   expected suffix, else a copy with the suffix appended.
   * @throws IllegalArgumentException if readInFragment is null or negative.
   */
  def addSuffixToRead(read: AlignmentRecord): AlignmentRecord = {
    require(read.getReadInFragment != null,
      "Cannot process read with null read in fragment number (%s)".format(read))
    // the check accepts 0, so the message must say non-negative, not positive
    require(read.getReadInFragment >= 0,
      "Read in fragment must be non-negative (%s)".format(read))

    val suffix = "/%d".format(read.getReadInFragment + 1)

    // don't modify the read if it already is suffixed properly;
    // leftover debug println statements removed
    if (read.getReadName.endsWith(suffix)) {
      read
    } else {
      AlignmentRecord.newBuilder(read)
        .setReadName("%s%s".format(read.getReadName, suffix))
        .build
    }
  }
}
82 changes: 82 additions & 0 deletions
82
core/src/test/scala/org/bdgenomics/cannoli/util/AppendSuffixesSuite.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
/** | ||
* Licensed to Big Data Genomics (BDG) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The BDG licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.bdgenomics.cannoli.util | ||
|
||
import org.scalatest.FunSuite | ||
import org.bdgenomics.formats.avro.{ AlignmentRecord, Fragment } | ||
import scala.collection.JavaConversions._ | ||
|
||
/**
 * Unit tests for [[AppendSuffixes]].
 */
class AppendSuffixesSuite extends FunSuite {

  test("add a suffix to a read that has no suffix") {
    val unsuffixed = AlignmentRecord.newBuilder
      .setReadName("myRead")
      .setReadInFragment(0)
      .build

    assert(AppendSuffixes.addSuffixToRead(unsuffixed).getReadName === "myRead/1")
  }

  test("don't add a suffix to a read that has a suffix") {
    val alreadySuffixed = AlignmentRecord.newBuilder
      .setReadName("myRead/2")
      .setReadInFragment(1)
      .build

    assert(AppendSuffixes.addSuffixToRead(alreadySuffixed).getReadName === "myRead/2")
  }

  test("read number must exist") {
    val readWithoutNumber = AlignmentRecord.newBuilder
      .setReadName("myRead")
      .setReadInFragment(null)
      .build

    intercept[IllegalArgumentException] {
      AppendSuffixes.addSuffixToRead(readWithoutNumber)
    }
  }

  test("read number must be positive") {
    val readWithNegativeNumber = AlignmentRecord.newBuilder
      .setReadName("myRead")
      .setReadInFragment(-1)
      .build

    intercept[IllegalArgumentException] {
      AppendSuffixes.addSuffixToRead(readWithNegativeNumber)
    }
  }

  test("add suffixes to fragment") {
    val firstRead = AlignmentRecord.newBuilder
      .setReadName("myRead/1")
      .setReadInFragment(0)
      .build
    val secondRead = AlignmentRecord.newBuilder
      .setReadName("myRead")
      .setReadInFragment(1)
      .build
    val fragment = Fragment.newBuilder
      .setAlignments(Seq(firstRead, secondRead))
      .build

    val suffixedNames = AppendSuffixes.addSuffixesToFragment(fragment)
      .getAlignments
      .map(_.getReadName)
      .toSet

    assert(suffixedNames("myRead/1"))
    assert(suffixedNames("myRead/2"))
  }
}