Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cloud-enable IndexFeatureFile and change input arg name from -F to -I. #6246

Merged
merged 3 commits into from
Nov 8, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Cloud-enable IndexFeatureFile.
  • Loading branch information
cmnbroad committed Nov 6, 2019
commit d0c424252d8f5197dcf1c21040b48a972cdadab0
Original file line number Diff line number Diff line change
Expand Up @@ -421,32 +421,11 @@ private <T extends Feature> FeatureDataSource<T> lookupDataSource( final Feature
* an unsupported format), or if more than one codec claims to be able to decode the file (this is
* a configuration error on the codec authors' part).
*
* @param featureFile file for which to find the right codec
* @param featurePath path for which to find the right codec
* @return the codec suitable for decoding the provided file
*/
public static FeatureCodec<? extends Feature, ?> getCodecForFile( final File featureFile ) {
return getCodecForFile(featureFile.toPath(), null);
}

/**
* Utility method that determines the correct codec to use to read Features from the provided file,
* optionally considering only codecs that produce a particular type of Feature.
*
* Codecs MUST correctly implement the {@link FeatureCodec#canDecode(String)} method
* in order to be considered as candidates for decoding the file, and must produce
* Features of the specified type if featureType is non-null.
*
* Throws an exception if no suitable codecs are found (this is a user error, since the file is of
* an unsupported format), or if more than one codec claims to be able to decode the file (this is
* a configuration error on the codec authors' part).
*
* @param featureFile file for which to find the right codec
* @param featureType If specified, consider only codecs that produce Features of this type. May be null,
* in which case all codecs are considered.
* @return the codec suitable for decoding the provided file
*/
public static FeatureCodec<? extends Feature, ?> getCodecForFile( final File featureFile, final Class<? extends Feature> featureType ) {
return getCodecForFile(featureFile.toPath(), featureType);
public static FeatureCodec<? extends Feature, ?> getCodecForFile( final Path featurePath ) {
return getCodecForFile(featurePath, null);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ protected final void onStartup() {
@SuppressWarnings("unchecked")
private void initializeDrivingFeatures() {
final File drivingFile = getDrivingFeatureFile();
final FeatureCodec<? extends Feature, ?> codec = FeatureManager.getCodecForFile(drivingFile);
final FeatureCodec<? extends Feature, ?> codec = FeatureManager.getCodecForFile(drivingFile.toPath());
if (isAcceptableFeatureType(codec.getFeatureType())) {
drivingFeatures = new FeatureDataSource<>(new FeatureInput<>(drivingFile.getAbsolutePath()), FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES, null, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, referenceArguments.getReferencePath());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -444,9 +444,9 @@ public HardwareFeatureException(String message, Exception e){
public static final class CouldNotIndexFile extends UserException {
private static final long serialVersionUID = 0L;

public CouldNotIndexFile(final File file, final Exception e) {
public CouldNotIndexFile(final Path path, final Exception e) {
super(String.format("Error while trying to create index for %s. Error was: %s: %s",
file.getAbsolutePath(), e.getClass().getCanonicalName(), e.getMessage()), e);
path.toString(), e.getClass().getCanonicalName(), e.getMessage()), e);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,16 @@
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPathSpecifier;
import picard.cmdline.programgroups.OtherProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureManager;
import org.broadinstitute.hellbender.engine.ProgressMeter;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.codecs.ProgressReportingDelegatingCodec;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

/**
* This tool creates an index file for the various kinds of feature-containing files supported by GATK (such as VCF
Expand All @@ -47,78 +49,78 @@ public final class IndexFeatureFile extends CommandLineProgram {
@Argument(shortName = "F",
fullName = "feature-file",
doc = "Feature file (eg., VCF or BED file) to index. Must be in a tribble-supported format")
public File featureFile;
public GATKPathSpecifier featurePath;

@Argument(shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
doc = "The output index file. If missing, the tool will create an index file in the same directory " +
"as the input file.",
optional = true)
public File outputFile;
public GATKPathSpecifier outputPath;

public static final int OPTIMAL_GVCF_INDEX_BIN_SIZE = 128000;
public static final String GVCF_FILE_EXTENSION = ".g.vcf";

@Override
protected Object doWork() {
if (!featureFile.canRead()) {
throw new UserException.CouldNotReadInputFile(featureFile);
if (!Files.isReadable(featurePath.toPath()) ) {
throw new UserException.CouldNotReadInputFile(featurePath.toPath());
}

// Get the right codec for the file to be indexed. This call will throw an appropriate exception
// if featureFile is not in a supported format or is unreadable.
final FeatureCodec<? extends Feature, ?> codec = new ProgressReportingDelegatingCodec<>(FeatureManager.getCodecForFile(featureFile), ProgressMeter.DEFAULT_SECONDS_BETWEEN_UPDATES);
final FeatureCodec<? extends Feature, ?> codec = new ProgressReportingDelegatingCodec<>(
FeatureManager.getCodecForFile(featurePath.toPath()), ProgressMeter.DEFAULT_SECONDS_BETWEEN_UPDATES);

final Index index = createAppropriateIndexInMemory(codec);
final File indexFile = determineFileName(index);
final Path indexPath = determineFileName(index);

try {
index.write(indexFile);
index.write(indexPath);
} catch (final IOException e) {
throw new UserException.CouldNotCreateOutputFile("Could not write index to file " + indexFile.getAbsolutePath(), e);
throw new UserException.CouldNotCreateOutputFile("Could not write index to file " + indexPath.toAbsolutePath(), e);
}

logger.info("Successfully wrote index to " + indexFile.getAbsolutePath());
return indexFile.getAbsolutePath();
logger.info("Successfully wrote index to " + indexPath.toAbsolutePath());
return indexPath.toAbsolutePath().toString();
}

private File determineFileName(final Index index) {
if (outputFile != null) {
return outputFile;
private Path determineFileName(final Index index) {
if (outputPath != null) {
return outputPath.toPath();
} else if (index instanceof TabixIndex) {
return Tribble.tabixIndexFile(featureFile);
return Tribble.tabixIndexPath(featurePath.toPath());
} else {
return Tribble.indexFile(featureFile);
return Tribble.indexPath(featurePath.toPath());
}
}

private Index createAppropriateIndexInMemory(final FeatureCodec<? extends Feature, ?> codec) {
try {
// For block-compression files, write a Tabix index
if (IOUtil.hasBlockCompressedExtension(featureFile)) {
if (IOUtil.hasBlockCompressedExtension(featurePath.toPath())) {
// Creating tabix indices with a non standard extensions can cause problems so we disable it
if (outputFile != null && !outputFile.getAbsolutePath().endsWith(FileExtensions.TABIX_INDEX)) {
throw new UserException("The index for " + featureFile + " must be written to a file with a \"" + FileExtensions.TABIX_INDEX + "\" extension");
if (outputPath != null && !outputPath.getURIString().endsWith(FileExtensions.TABIX_INDEX)) {
throw new UserException("The index for " + featurePath + " must be written to a file with a \"" + FileExtensions.TABIX_INDEX + "\" extension");
}

// TODO: this could benefit from provided sequence dictionary from reference
// TODO: this can be an optional parameter for the tool
return IndexFactory.createIndex(featureFile, codec, IndexFactory.IndexType.TABIX, null);

return IndexFactory.createIndex(featurePath.toPath(), codec, IndexFactory.IndexType.TABIX, null);
}
// TODO: detection of GVCF files should not be file-extension-based. Need to come up with canonical
// TODO: way of detecting GVCFs based on the contents (may require changes to the spec!)
else if (featureFile.getName().endsWith(GVCF_FILE_EXTENSION)) {
else if (featurePath.getURIString().endsWith(GVCF_FILE_EXTENSION)) {
// Optimize GVCF indices for the use case of having a large number of GVCFs open simultaneously
return IndexFactory.createLinearIndex(featureFile, codec, OPTIMAL_GVCF_INDEX_BIN_SIZE);
return IndexFactory.createLinearIndex(featurePath.toPath(), codec, OPTIMAL_GVCF_INDEX_BIN_SIZE);
} else {
// Optimize indices for other kinds of files for seek time / querying
return IndexFactory.createDynamicIndex(featureFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
return IndexFactory.createDynamicIndex(featurePath.toPath(), codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
}
} catch (TribbleException e) {
// Underlying cause here is usually a malformed file, but can also be things like
// "codec does not support tabix"
throw new UserException.CouldNotIndexFile(featureFile, e);
throw new UserException.CouldNotIndexFile(featurePath.toPath(), e);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ private FeatureInput<VariantContext> getVariantFeatureInputWithCachedCodec() {
final FeatureInput<VariantContext> featureInput = new FeatureInput<>(inputVCFFile.getAbsolutePath());
Assert.assertNull(featureInput.getFeatureCodecClass());

final FeatureCodec<? extends Feature, ?> codec = FeatureManager.getCodecForFile(new File(featureInput.getFeaturePath()));
final FeatureCodec<? extends Feature, ?> codec = FeatureManager.getCodecForFile(featureInput.toPath());
featureInput.setFeatureCodecClass((Class<FeatureCodec<VariantContext, ?>>)codec.getClass());

return featureInput;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,13 @@ public Object[][] getDetectCorrectFileFormatTestData() {

@Test(dataProvider = "DetectCorrectFileFormatTestData")
public void testDetectCorrectFileFormat( final File file, final Class<? extends FeatureCodec<? extends Feature, ?>> expectedCodecClass ) throws Exception {
Assert.assertEquals(FeatureManager.getCodecForFile(file).getClass(), expectedCodecClass,
Assert.assertEquals(FeatureManager.getCodecForFile(file.toPath()).getClass(), expectedCodecClass,
"Wrong codec selected for file " + file.getAbsolutePath());

// We should also get the correct codec if we pass in the explicit expected Feature type to getCodecForFile()
@SuppressWarnings("unchecked")
final Class<? extends Feature> expectedCodecFeatureType = expectedCodecClass.getDeclaredConstructor().newInstance().getFeatureType();
Assert.assertEquals(FeatureManager.getCodecForFile(file, expectedCodecFeatureType).getClass(), expectedCodecClass,
Assert.assertEquals(FeatureManager.getCodecForFile(file.toPath(), expectedCodecFeatureType).getClass(), expectedCodecClass,
"Wrong codec selected for file " + file.getAbsolutePath() + " after subsetting to the expected Feature type");
}

Expand All @@ -62,15 +62,15 @@ public void testDetectUnsupportedFileFormat() {
Assert.assertTrue(unsupportedFile.canRead(), "Cannot test detection of unsupported file formats on an unreadable file");

// Should throw, since the file exists and is readable, but is in an unsupported format
FeatureManager.getCodecForFile(unsupportedFile);
FeatureManager.getCodecForFile(unsupportedFile.toPath());
}

@Test(expectedExceptions = UserException.WrongFeatureType.class)
public void testRestrictCodecSelectionToWrongFeatureType() {
final File vcf = new File(FEATURE_MANAGER_TEST_DIRECTORY + "minimal_vcf4_file.vcf");

// If we require BED Features from this vcf file, we should get a type mismatch exception
FeatureManager.getCodecForFile(vcf, BEDFeature.class);
FeatureManager.getCodecForFile(vcf.toPath(), BEDFeature.class);
}

@DataProvider(name = "IsFeatureFileTestData")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,18 @@
import htsjdk.tribble.index.tabix.TabixIndex;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.Main;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
Expand All @@ -32,6 +35,7 @@ public void testVCFIndex() {
"--feature-file" , ORIG_FILE.getAbsolutePath(),
"-O" , outName.getAbsolutePath()
};

final Object res = this.runCommandLine(args);
Assert.assertEquals(res, outName.getAbsolutePath());

Expand All @@ -42,6 +46,27 @@ public void testVCFIndex() {
checkIndex(index, Arrays.asList("1", "2", "3", "4"));
}

@Test(groups={"bucket"})
public void testVCFIndexOnCloud() throws IOException {
final File testFile = getTestFile("test_variants_for_index.vcf");
final String vcfOnGCS = BucketUtils.getTempFilePath(
getGCPTestStaging() +"testIndexOnCloud", ".vcf");
BucketUtils.copyFile(testFile.getAbsolutePath(), vcfOnGCS);

final String[] args = new String[] {
"IndexFeatureFile", "--feature-file", vcfOnGCS
};

new Main().instanceMain(args);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit unconventional, most tests just use runCommandLine there's nothing wrong with this though.


Assert.assertTrue(BucketUtils.fileExists(vcfOnGCS + ".idx"));

final Index index = IndexFactory.loadIndex(vcfOnGCS + ".idx");
Assert.assertTrue(index instanceof LinearIndex);
Assert.assertEquals(index.getSequenceNames(), Arrays.asList("1", "2", "3", "4"));
checkIndex(index, Arrays.asList("1", "2", "3", "4"));
}

@Test
public void testVCFIndex_inferredName() {
final File ORIG_FILE = getTestFile("test_variants_for_index.vcf");
Expand All @@ -50,9 +75,9 @@ public void testVCFIndex_inferredName() {
"--feature-file" , ORIG_FILE.getAbsolutePath(),
};
final Object res = this.runCommandLine(args);
final File tribbleIndex = Tribble.indexFile(ORIG_FILE);
Assert.assertEquals(res, tribbleIndex.getAbsolutePath());
tribbleIndex.deleteOnExit();
final Path tribbleIndex = Tribble.indexPath(ORIG_FILE.toPath());
Assert.assertEquals(res, tribbleIndex.toAbsolutePath().toString());
tribbleIndex.toFile().deleteOnExit();

final Index index = IndexFactory.loadIndex(res.toString());
Assert.assertTrue(index instanceof LinearIndex);
Expand Down Expand Up @@ -382,4 +407,5 @@ public void testVCFWithNoRecords() {
Assert.assertTrue(output.exists());
Assert.assertTrue(output.length() > 0);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -1660,7 +1660,7 @@ public void testCreateVCFWriterWithOptions(
private void verifyFileType(
final File resultVCFFile,
final String outputExtension) {
final FeatureCodec<? extends Feature, ?> featureCodec = FeatureManager.getCodecForFile(resultVCFFile);
final FeatureCodec<? extends Feature, ?> featureCodec = FeatureManager.getCodecForFile(resultVCFFile.toPath());

if (outputExtension.equals(".vcf") ||
outputExtension.equals(".vcf.bgz") ||
Expand Down