Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update to Genomicsdb 1.1.2.2 with Linux and MacOS shared libraries packaged #6206

Merged
merged 3 commits into from
Oct 24, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ final sparkVersion = System.getProperty('spark.version', '2.4.3')
final scalaVersion = System.getProperty('scala.version', '2.11')
final hadoopVersion = System.getProperty('hadoop.version', '2.8.2')
final disqVersion = System.getProperty('disq.version','0.3.3')
final genomicsdbVersion = System.getProperty('genomicsdb.version','1.1.0.1')
final genomicsdbVersion = System.getProperty('genomicsdb.version','1.1.2.2')
final testNGVersion = '6.11'
// Using the shaded version to avoid conflicts between its protobuf dependency
// and that of Hadoop/Spark (either the one we reference explicitly, or the one
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,8 @@ public void traverse() {
GenomicsDBImporter importer;
try {
importer = new GenomicsDBImporter(importConfig);
// Update the importer's in-memory vid mapping in place via updateImportProtobufVidMapping before running the import.
org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.updateImportProtobufVidMapping(importer);
importer.executeImport(maxNumIntervalsToImportInParallel);
} catch (final IOException e) {
throw new UserException("Error initializing GenomicsDBImporter", e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.genomicsdb.importer.GenomicsDBImporter;
import org.genomicsdb.model.GenomicsDBExportConfiguration;
import org.genomicsdb.model.GenomicsDBVidMapProto;

Expand All @@ -19,24 +20,70 @@

/**
* Utility class containing various methods for working with GenomicsDB
* Contains code to modify the GenomicsDB query output format using the Protobuf API
* Contains code to modify the GenomicsDB import input using the Protobuf API
*
* References:
* GenomicsDB Protobuf structs: https://github.com/Intel-HLS/GenomicsDB/blob/master/src/resources/genomicsdb_vid_mapping.proto
* GenomicsDB Protobuf structs: https://github.com/GenomicsDB/GenomicsDB/blob/master/src/resources/genomicsdb_vid_mapping.proto
* Protobuf generated Java code guide:
* https://developers.google.com/protocol-buffers/docs/javatutorial#the-protocol-buffer-api
* https://developers.google.com/protocol-buffers/docs/reference/java-generated
*/
public class GenomicsDBUtils {

private static final String SUM = "sum";
private static final String ELEMENT_WISE_SUM = "element_wise_sum";
private static final String ELEMENT_WISE_FLOAT_SUM = "element_wise_float_sum";
private static final String SUM = "sum";
private static final String ELEMENT_WISE_INT_SUM = "element_wise_int_sum";
private static final String HISTOGRAM_SUM = "histogram_sum";
private static final String STRAND_BIAS_TABLE_COMBINE = "strand_bias_table";
private static final String MOVE_TO_FORMAT = "move_to_FORMAT";

private static final String GDB_TYPE_FLOAT = "float";
private static final String GDB_TYPE_INT = "int";


/**
 * Overrides the combine operations of selected INFO and allele-specific INFO fields in the
 * importer's in-memory Protobuf vid mapping, then writes the updated mapping back to the importer.
 *
 * Info and allele-specific fields that need to be treated differently
 * have to be explicitly overridden in this method during import.
 * Note that the recommendation is to perform this operation during the import phase
 * as only a limited set of mappings can be changed during export.
 *
 * @param importer importer whose Protobuf vid mapping is read via {@code getProtobufVidMapping()},
 *                 updated, and handed back via {@code updateProtobufVidMapping(...)}
 * @throws UserException if the importer does not return a Protobuf vid mapping object
 */
public static void updateImportProtobufVidMapping(GenomicsDBImporter importer) {
    // Get the in-memory Protobuf structure representing the vid information.
    GenomicsDBVidMapProto.VidMappingPB vidMapPB = importer.getProtobufVidMapping();
    if (vidMapPB == null) {
        throw new UserException("Could not get protobuf vid mapping object from GenomicsDBImporter");
    }

    // In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects.
    // Each GenomicsDBFieldInfo object contains information about a specific field in the
    // GenomicsDB store and this list is iterated to create a field name to list index map.
    final HashMap<String, Integer> fieldNameToIndexInVidFieldsList =
            getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB);

    // Each helper call returns a vid mapping object that is fed into the next call,
    // so the reassignments below must stay chained.
    vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
            GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, ELEMENT_WISE_SUM);

    vidMapPB = updateAlleleSpecificINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
            GATKVCFConstants.AS_RAW_RMS_MAPPING_QUALITY_KEY, ELEMENT_WISE_FLOAT_SUM);

    // Update combine operations for GnarlyGenotyper.
    // Note that this MQ format is deprecated, but was used by the prototype version of ReblockGVCF.
    vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
            GATKVCFConstants.MAPPING_QUALITY_DEPTH_DEPRECATED, SUM);
    vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
            GATKVCFConstants.RAW_QUAL_APPROX_KEY, SUM);
    vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
            GATKVCFConstants.VARIANT_DEPTH_KEY, SUM);
    vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
            GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY, ELEMENT_WISE_SUM);
    vidMapPB = updateAlleleSpecificINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
            GATKVCFConstants.AS_RAW_QUAL_APPROX_KEY, ELEMENT_WISE_INT_SUM);

    // Hand the rebuilt mapping back to the importer so the import uses the overridden operations.
    importer.updateProtobufVidMapping(vidMapPB);
}

/**
*
* @param workspace path to the GenomicsDB workspace
Expand Down Expand Up @@ -87,47 +134,7 @@ public static GenomicsDBExportConfiguration.ExportConfiguration createExportConf
exportConfigurationBuilder.setGenerateArrayNameFromPartitionBounds(true);
}

//Parse the vid json and create an in-memory Protobuf structure representing the information in the JSON file
GenomicsDBVidMapProto.VidMappingPB vidMapPB;
try {
vidMapPB = getProtobufVidMappingFromJsonFile(vidmapJson);
} catch (final IOException e) {
throw new UserException("Could not open vid json file " + vidmapJson, e);
}

//In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects
//Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store
//We iterate over the list and create a field name to list index map
final HashMap<String, Integer> fieldNameToIndexInVidFieldsList =
getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB);

vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, ELEMENT_WISE_SUM);

vidMapPB = updateAlleleSpecificINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
GATKVCFConstants.AS_RAW_RMS_MAPPING_QUALITY_KEY, ELEMENT_WISE_FLOAT_SUM);

//Update combine operations for GnarlyGenotyper
//Note that this MQ format is deprecated, but was used by the prototype version of ReblockGVCF
vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
GATKVCFConstants.MAPPING_QUALITY_DEPTH_DEPRECATED, SUM);
vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
GATKVCFConstants.RAW_QUAL_APPROX_KEY, SUM);
vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
GATKVCFConstants.VARIANT_DEPTH_KEY, SUM);
vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY, ELEMENT_WISE_SUM);
vidMapPB = updateAlleleSpecificINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
GATKVCFConstants.AS_RAW_QUAL_APPROX_KEY, ELEMENT_WISE_FLOAT_SUM);


if (vidMapPB != null) {
//Use rebuilt vidMap in exportConfiguration
//NOTE: this does NOT update the JSON file, the vidMapPB is a temporary structure that's passed to
//C++ modules of GenomicsDB for this specific query. Other queries will continue to use the information
//in the JSON file
exportConfigurationBuilder.setVidMapping(vidMapPB);
}

return exportConfigurationBuilder.build();
}
Expand Down Expand Up @@ -156,7 +163,7 @@ public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFi

/**
* In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects
* Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store
* Each GenomicsDBFieldInfo object contains information about a specific field in the GenomicsDB store
* We iterate over the list and create a field name to list index map
*
* @param vidMapPB Protobuf vid mapping object
Expand Down Expand Up @@ -227,6 +234,11 @@ public static GenomicsDBVidMapProto.VidMappingPB updateAlleleSpecificINFOFieldCo

GenomicsDBVidMapProto.FieldLengthDescriptorComponentPB.Builder lengthDescriptorComponentBuilder =
GenomicsDBVidMapProto.FieldLengthDescriptorComponentPB.newBuilder();

infoBuilder.clearLength();
infoBuilder.clearVcfDelimiter();
infoBuilder.clearType();

lengthDescriptorComponentBuilder.setVariableLengthDescriptor("R");
infoBuilder.addLength(lengthDescriptorComponentBuilder.build());
lengthDescriptorComponentBuilder.setVariableLengthDescriptor("var"); //ignored - can set anything here
Expand All @@ -243,7 +255,7 @@ public static GenomicsDBVidMapProto.VidMappingPB updateAlleleSpecificINFOFieldCo
infoBuilder.setVCFFieldCombineOperation(ELEMENT_WISE_SUM);
if (newCombineOperation.equals(ELEMENT_WISE_FLOAT_SUM)) {
infoBuilder.addType(GDB_TYPE_FLOAT);
} else if (newCombineOperation.equals(STRAND_BIAS_TABLE_COMBINE)) {
} else {
infoBuilder.addType(GDB_TYPE_INT);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,15 @@ public void testGenomicsDBInClassPath(){
Assert.assertNotNull(GenomicsDBLibLoader.class.getResource(path), "Could not find the genomicsdb binary at " + path);
}

@Test
public void testGenomicsDBJarForNativeLibraries() {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test for Linux and MacOS shared native libraries in genomicsdb jar.

final String GENOMICSDB_LIBRARY_NAME = "/libtiledbgenomicsdb";
final String LINUX_DL_SUFFIX = ".so";
final String MACOSX_DL_SUFFIX = ".dylib";
Assert.assertNotNull(GenomicsDBLibLoader.class.getResource(GENOMICSDB_LIBRARY_NAME+LINUX_DL_SUFFIX), "Shared Library for Linux not found");
Assert.assertNotNull(GenomicsDBLibLoader.class.getResource(GENOMICSDB_LIBRARY_NAME+MACOSX_DL_SUFFIX), "Shared Library for Mac OSX not found");
}

@Test
public void testAsDrivingVariants() throws IOException {
final File workspace = GenomicsDBTestUtils.createTempGenomicsDB(TINY_GVCF, INTERVAL);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
##fileformat=VCFv4.2
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele fractions of alternate alleles in the tumor">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=TLOD,Number=A,Type=Float,Description="Log odds ratio score for variant">
##contig=<ID=20,length=63025520,assembly=GRCh37>
##contig=<ID=21,length=48129895,assembly=GRCh37>
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
##fileformat=VCFv4.2
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele fractions of alternate alleles in the tumor">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=TLOD,Number=A,Type=Float,Description="Log odds ratio score for variant">
##contig=<ID=20,length=63025520,assembly=GRCh37>
##contig=<ID=21,length=48129895,assembly=GRCh37>
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
##fileformat=VCFv4.2
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele fractions of alternate alleles in the tumor">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=TLOD,Number=A,Type=Float,Description="Log odds ratio score for variant">
##contig=<ID=20,length=63025520,assembly=GRCh37>
##contig=<ID=21,length=48129895,assembly=GRCh37>
Expand Down
Binary file not shown.