diff --git a/build.gradle b/build.gradle index 40bf373d40e..769dc80c3d6 100644 --- a/build.gradle +++ b/build.gradle @@ -64,7 +64,7 @@ final sparkVersion = System.getProperty('spark.version', '2.4.3') final scalaVersion = System.getProperty('scala.version', '2.11') final hadoopVersion = System.getProperty('hadoop.version', '2.8.2') final disqVersion = System.getProperty('disq.version','0.3.3') -final genomicsdbVersion = System.getProperty('genomicsdb.version','1.1.0.1') +final genomicsdbVersion = System.getProperty('genomicsdb.version','1.1.2.2') final testNGVersion = '6.11' // Using the shaded version to avoid conflicts between its protobuf dependency // and that of Hadoop/Spark (either the one we reference explicitly, or the one diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java index 6c8216b903e..db7d40e6f4f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java @@ -729,6 +729,8 @@ public void traverse() { GenomicsDBImporter importer; try { importer = new GenomicsDBImporter(importConfig); + // Modify importer directly from updateImportProtobufVidMapping. 
+ org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.updateImportProtobufVidMapping(importer); importer.executeImport(maxNumIntervalsToImportInParallel); } catch (final IOException e) { throw new UserException("Error initializing GenomicsDBImporter", e); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java index 516191c9d30..a4f852d64dd 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java @@ -6,6 +6,7 @@ import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils; import org.broadinstitute.hellbender.utils.io.IOUtils; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; +import org.genomicsdb.importer.GenomicsDBImporter; import org.genomicsdb.model.GenomicsDBExportConfiguration; import org.genomicsdb.model.GenomicsDBVidMapProto; @@ -19,24 +20,70 @@ /** * Utility class containing various methods for working with GenomicsDB - * Contains code to modify the GenomicsDB query output format using the Protobuf API + * Contains code to modify the GenomicsDB import input using the Protobuf API * * References: - * GenomicsDB Protobuf structs: https://github.com/Intel-HLS/GenomicsDB/blob/master/src/resources/genomicsdb_vid_mapping.proto + * GenomicsDB Protobuf structs: https://github.com/GenomicsDB/GenomicsDB/blob/master/src/resources/genomicsdb_vid_mapping.proto * Protobuf generated Java code guide: * https://developers.google.com/protocol-buffers/docs/javatutorial#the-protocol-buffer-api * https://developers.google.com/protocol-buffers/docs/reference/java-generated */ public class GenomicsDBUtils { + private static final String SUM = "sum"; private static final String ELEMENT_WISE_SUM = "element_wise_sum"; private static final String ELEMENT_WISE_FLOAT_SUM = 
"element_wise_float_sum"; - private static final String SUM = "sum"; + private static final String ELEMENT_WISE_INT_SUM = "element_wise_int_sum"; private static final String HISTOGRAM_SUM = "histogram_sum"; - private static final String STRAND_BIAS_TABLE_COMBINE = "strand_bias_table"; + private static final String MOVE_TO_FORMAT = "move_to_FORMAT"; + private static final String GDB_TYPE_FLOAT = "float"; private static final String GDB_TYPE_INT = "int"; + + /** + * Info and Allele-specific fields that need to be treated differently + * will have to be explicitly overridden in this method during import. + * Note that the recommendation is to perform this operation during the import phase + * as only a limited set of mappings can be changed during export. + * + * @param importer importer whose protobuf vid mapping is fetched, given the combine operations set below, and written back via updateProtobufVidMapping; a UserException is thrown if it returns no vid mapping + */ + public static void updateImportProtobufVidMapping(GenomicsDBImporter importer) { + //Get the in-memory Protobuf structure representing the vid information. + GenomicsDBVidMapProto.VidMappingPB vidMapPB = importer.getProtobufVidMapping(); + if (vidMapPB == null) { + throw new UserException("Could not get protobuf vid mapping object from GenomicsDBImporter"); + } + + // In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects. + // Each GenomicsDBFieldInfo object contains information about a specific field in the + // GenomicsDB store and this list is iterated to create a field name to list index map. 
+ final HashMap fieldNameToIndexInVidFieldsList = + getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB); + + vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, + GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, ELEMENT_WISE_SUM); + + vidMapPB = updateAlleleSpecificINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, + GATKVCFConstants.AS_RAW_RMS_MAPPING_QUALITY_KEY, ELEMENT_WISE_FLOAT_SUM); + + //Update combine operations for GnarlyGenotyper + //Note that this MQ format is deprecated, but was used by the prototype version of ReblockGVCF + vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, + GATKVCFConstants.MAPPING_QUALITY_DEPTH_DEPRECATED, SUM); + vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, + GATKVCFConstants.RAW_QUAL_APPROX_KEY, SUM); + vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, + GATKVCFConstants.VARIANT_DEPTH_KEY, SUM); + vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, + GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY, ELEMENT_WISE_SUM); + vidMapPB = updateAlleleSpecificINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, + GATKVCFConstants.AS_RAW_QUAL_APPROX_KEY, ELEMENT_WISE_INT_SUM); + + importer.updateProtobufVidMapping(vidMapPB); + } + /** * * @param workspace path to the GenomicsDB workspace @@ -87,47 +134,7 @@ public static GenomicsDBExportConfiguration.ExportConfiguration createExportConf exportConfigurationBuilder.setGenerateArrayNameFromPartitionBounds(true); } - //Parse the vid json and create an in-memory Protobuf structure representing the information in the JSON file - GenomicsDBVidMapProto.VidMappingPB vidMapPB; - try { - vidMapPB = getProtobufVidMappingFromJsonFile(vidmapJson); - } catch (final IOException e) { - throw new UserException("Could not open vid json file " + vidmapJson, e); - } - //In vidMapPB, fields is a 
list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects - //Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store - //We iterate over the list and create a field name to list index map - final HashMap fieldNameToIndexInVidFieldsList = - getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB); - - vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, - GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, ELEMENT_WISE_SUM); - - vidMapPB = updateAlleleSpecificINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, - GATKVCFConstants.AS_RAW_RMS_MAPPING_QUALITY_KEY, ELEMENT_WISE_FLOAT_SUM); - - //Update combine operations for GnarlyGenotyper - //Note that this MQ format is deprecated, but was used by the prototype version of ReblockGVCF - vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, - GATKVCFConstants.MAPPING_QUALITY_DEPTH_DEPRECATED, SUM); - vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, - GATKVCFConstants.RAW_QUAL_APPROX_KEY, SUM); - vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, - GATKVCFConstants.VARIANT_DEPTH_KEY, SUM); - vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, - GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY, ELEMENT_WISE_SUM); - vidMapPB = updateAlleleSpecificINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, - GATKVCFConstants.AS_RAW_QUAL_APPROX_KEY, ELEMENT_WISE_FLOAT_SUM); - - - if (vidMapPB != null) { - //Use rebuilt vidMap in exportConfiguration - //NOTE: this does NOT update the JSON file, the vidMapPB is a temporary structure that's passed to - //C++ modules of GenomicsDB for this specific query. 
Other queries will continue to use the information - //in the JSON file - exportConfigurationBuilder.setVidMapping(vidMapPB); - } return exportConfigurationBuilder.build(); } @@ -156,7 +163,7 @@ public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFi /** * In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects - * Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store + * Each GenomicsDBFieldInfo object contains information about a specific field in the GenomicsDB store * We iterate over the list and create a field name to list index map * * @param vidMapPB Protobuf vid mapping object @@ -227,6 +234,11 @@ public static GenomicsDBVidMapProto.VidMappingPB updateAlleleSpecificINFOFieldCo GenomicsDBVidMapProto.FieldLengthDescriptorComponentPB.Builder lengthDescriptorComponentBuilder = GenomicsDBVidMapProto.FieldLengthDescriptorComponentPB.newBuilder(); + + infoBuilder.clearLength(); + infoBuilder.clearVcfDelimiter(); + infoBuilder.clearType(); + lengthDescriptorComponentBuilder.setVariableLengthDescriptor("R"); infoBuilder.addLength(lengthDescriptorComponentBuilder.build()); lengthDescriptorComponentBuilder.setVariableLengthDescriptor("var"); //ignored - can set anything here @@ -243,7 +255,7 @@ public static GenomicsDBVidMapProto.VidMappingPB updateAlleleSpecificINFOFieldCo infoBuilder.setVCFFieldCombineOperation(ELEMENT_WISE_SUM); if (newCombineOperation.equals(ELEMENT_WISE_FLOAT_SUM)) { infoBuilder.addType(GDB_TYPE_FLOAT); - } else if (newCombineOperation.equals(STRAND_BIAS_TABLE_COMBINE)) { + } else { infoBuilder.addType(GDB_TYPE_INT); } } diff --git a/src/test/java/org/broadinstitute/hellbender/engine/GenomicsDBIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/engine/GenomicsDBIntegrationTest.java index 0fa85bb8a75..5e792fd8e82 100644 --- a/src/test/java/org/broadinstitute/hellbender/engine/GenomicsDBIntegrationTest.java +++ 
b/src/test/java/org/broadinstitute/hellbender/engine/GenomicsDBIntegrationTest.java @@ -33,6 +33,15 @@ public void testGenomicsDBInClassPath(){ Assert.assertNotNull(GenomicsDBLibLoader.class.getResource(path), "Could not find the genomicsdb binary at " + path); } + @Test + public void testGenomicsDBJarForNativeLibraries() { + final String libraryResourceBase = "/libtiledbgenomicsdb"; + final String linuxSuffix = ".so"; + final String macSuffix = ".dylib"; + Assert.assertNotNull(GenomicsDBLibLoader.class.getResource(libraryResourceBase + linuxSuffix), "Shared Library for Linux not found"); + Assert.assertNotNull(GenomicsDBLibLoader.class.getResource(libraryResourceBase + macSuffix), "Shared Library for Mac OSX not found"); + } + @Test public void testAsDrivingVariants() throws IOException { final File workspace = GenomicsDBTestUtils.createTempGenomicsDB(TINY_GVCF, INTERVAL); diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample2.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample2.vcf index 3ad79ac051c..9fe20403402 100755 --- a/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample2.vcf +++ b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample2.vcf @@ -1,6 +1,7 @@ ##fileformat=VCFv4.2 ##FORMAT= ##FORMAT= +##FORMAT= ##INFO= ##contig= ##contig= diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample2.vcf.idx b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample2.vcf.idx index b14208bd21e..23c16463d77 100644 Binary files a/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample2.vcf.idx and b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample2.vcf.idx differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample3.vcf 
b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample3.vcf index 9299813f52d..8e3a5000d41 100755 --- a/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample3.vcf +++ b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample3.vcf @@ -1,6 +1,7 @@ ##fileformat=VCFv4.2 ##FORMAT= ##FORMAT= +##FORMAT= ##INFO= ##contig= ##contig= diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample3.vcf.idx b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample3.vcf.idx index 649cd5e660d..8e4f7435f8c 100644 Binary files a/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample3.vcf.idx and b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample3.vcf.idx differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample4.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample4.vcf index d1bc84efddb..98134ddf20e 100755 --- a/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample4.vcf +++ b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample4.vcf @@ -1,6 +1,7 @@ ##fileformat=VCFv4.2 ##FORMAT= ##FORMAT= +##FORMAT= ##INFO= ##contig= ##contig= diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample4.vcf.idx b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample4.vcf.idx index 3783b1d1ecb..bc5f103c7cc 100644 Binary files a/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample4.vcf.idx and b/src/test/resources/org/broadinstitute/hellbender/tools/mutect/createpon/sample4.vcf.idx differ