Cloud-enable IndexFeatureFile and change input arg name from -F to -I. (#6246)

* Cloud-enable IndexFeatureFile.
* Change IndexFeatureFile to use -I instead of -F.
* Fixes #6161
cmnbroad authored and lbergelson committed Nov 8, 2019
1 parent aae646c commit a632a05
Showing 9 changed files with 94 additions and 85 deletions.
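For orientation (this note and the snippets interleaved below are editorial sketches, not part of the commit): the tool now takes its input through the standard -I/--input argument instead of the old -F/--feature-file argument, and because the input is resolved as an NIO path it can also point at cloud storage. The gs:// bucket below is hypothetical, and reading it assumes a Google Cloud Storage NIO provider is available on the classpath:

    # Before this commit
    gatk IndexFeatureFile -F cohort.vcf.gz

    # After this commit: a local file, or a (hypothetical) cloud path
    gatk IndexFeatureFile -I cohort.vcf.gz
    gatk IndexFeatureFile -I gs://my-bucket/cohort.vcf.gz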
@@ -421,32 +421,11 @@ private <T extends Feature> FeatureDataSource<T> lookupDataSource( final Feature
     * an unsupported format), or if more than one codec claims to be able to decode the file (this is
     * a configuration error on the codec authors' part).
     *
-     * @param featureFile file for which to find the right codec
+     * @param featurePath path for which to find the right codec
     * @return the codec suitable for decoding the provided file
     */
-    public static FeatureCodec<? extends Feature, ?> getCodecForFile( final File featureFile ) {
-        return getCodecForFile(featureFile.toPath(), null);
-    }
-
-    /**
-     * Utility method that determines the correct codec to use to read Features from the provided file,
-     * optionally considering only codecs that produce a particular type of Feature.
-     *
-     * Codecs MUST correctly implement the {@link FeatureCodec#canDecode(String)} method
-     * in order to be considered as candidates for decoding the file, and must produce
-     * Features of the specified type if featureType is non-null.
-     *
-     * Throws an exception if no suitable codecs are found (this is a user error, since the file is of
-     * an unsupported format), or if more than one codec claims to be able to decode the file (this is
-     * a configuration error on the codec authors' part).
-     *
-     * @param featureFile file for which to find the right codec
-     * @param featureType If specified, consider only codecs that produce Features of this type. May be null,
-     *                    in which case all codecs are considered.
-     * @return the codec suitable for decoding the provided file
-     */
-    public static FeatureCodec<? extends Feature, ?> getCodecForFile( final File featureFile, final Class<? extends Feature> featureType ) {
-        return getCodecForFile(featureFile.toPath(), featureType);
+    public static FeatureCodec<? extends Feature, ?> getCodecForFile( final Path featurePath ) {
+        return getCodecForFile(featurePath, null);
    }

/**
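For reference (an editorial sketch, not part of the commit), this is how a caller uses the remaining Path-based overload; the file name is hypothetical and any Tribble-supported format behaves the same way:

    import java.nio.file.Path;
    import java.nio.file.Paths;

    import htsjdk.tribble.Feature;
    import htsjdk.tribble.FeatureCodec;
    import org.broadinstitute.hellbender.engine.FeatureManager;

    public class CodecLookupSketch {
        public static void main(final String[] args) {
            // Hypothetical input; getCodecForFile throws a UserException if no codec can decode
            // the file, or if more than one codec claims it.
            final Path featurePath = Paths.get("cohort.vcf.gz");
            final FeatureCodec<? extends Feature, ?> codec = FeatureManager.getCodecForFile(featurePath);
            System.out.println("Selected codec: " + codec.getClass().getSimpleName());
        }
    }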
@@ -56,7 +56,7 @@ protected final void onStartup() {
    @SuppressWarnings("unchecked")
    private void initializeDrivingFeatures() {
        final File drivingFile = getDrivingFeatureFile();
-        final FeatureCodec<? extends Feature, ?> codec = FeatureManager.getCodecForFile(drivingFile);
+        final FeatureCodec<? extends Feature, ?> codec = FeatureManager.getCodecForFile(drivingFile.toPath());
        if (isAcceptableFeatureType(codec.getFeatureType())) {
            drivingFeatures = new FeatureDataSource<>(new FeatureInput<>(drivingFile.getAbsolutePath()), FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES, null, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, referenceArguments.getReferencePath());

@@ -444,9 +444,9 @@ public HardwareFeatureException(String message, Exception e){
    public static final class CouldNotIndexFile extends UserException {
        private static final long serialVersionUID = 0L;

-        public CouldNotIndexFile(final File file, final Exception e) {
+        public CouldNotIndexFile(final Path path, final Exception e) {
            super(String.format("Error while trying to create index for %s. Error was: %s: %s",
-                    file.getAbsolutePath(), e.getClass().getCanonicalName(), e.getMessage()), e);
+                    path.toString(), e.getClass().getCanonicalName(), e.getMessage()), e);
        }
    }

@@ -14,14 +14,16 @@
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.engine.GATKPathSpecifier;
import picard.cmdline.programgroups.OtherProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureManager;
import org.broadinstitute.hellbender.engine.ProgressMeter;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.codecs.ProgressReportingDelegatingCodec;

-import java.io.File;
import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;

/**
 * This tool creates an index file for the various kinds of feature-containing files supported by GATK (such as VCF
@@ -30,7 +32,7 @@
 * <h3>Usage example</h3>
 * <pre>
 * gatk IndexFeatureFile \
- *     -F cohort.vcf.gz
+ *     -I cohort.vcf.gz
 * </pre>
 * This produces the corresponding index, cohort.vcf.gz.tbi.
 */
@@ -44,81 +46,81 @@
public final class IndexFeatureFile extends CommandLineProgram {
    private static final Logger logger = LogManager.getLogger(IndexFeatureFile.class);

-    @Argument(shortName = "F",
-              fullName = "feature-file",
+    @Argument(shortName =StandardArgumentDefinitions.INPUT_SHORT_NAME,
+              fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
              doc = "Feature file (eg., VCF or BED file) to index. Must be in a tribble-supported format")
-    public File featureFile;
+    public GATKPathSpecifier featurePath;

    @Argument(shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
              fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
              doc = "The output index file. If missing, the tool will create an index file in the same directory " +
                    "as the input file.",
              optional = true)
-    public File outputFile;
+    public GATKPathSpecifier outputPath;

    public static final int OPTIMAL_GVCF_INDEX_BIN_SIZE = 128000;
    public static final String GVCF_FILE_EXTENSION = ".g.vcf";

    @Override
    protected Object doWork() {
-        if (!featureFile.canRead()) {
-            throw new UserException.CouldNotReadInputFile(featureFile);
+        if (!Files.isReadable(featurePath.toPath()) ) {
+            throw new UserException.CouldNotReadInputFile(featurePath.toPath());
        }

        // Get the right codec for the file to be indexed. This call will throw an appropriate exception
        // if featureFile is not in a supported format or is unreadable.
-        final FeatureCodec<? extends Feature, ?> codec = new ProgressReportingDelegatingCodec<>(FeatureManager.getCodecForFile(featureFile), ProgressMeter.DEFAULT_SECONDS_BETWEEN_UPDATES);
+        final FeatureCodec<? extends Feature, ?> codec = new ProgressReportingDelegatingCodec<>(
+                FeatureManager.getCodecForFile(featurePath.toPath()), ProgressMeter.DEFAULT_SECONDS_BETWEEN_UPDATES);

        final Index index = createAppropriateIndexInMemory(codec);
-        final File indexFile = determineFileName(index);
+        final Path indexPath = determineFileName(index);

        try {
-            index.write(indexFile);
+            index.write(indexPath);
        } catch (final IOException e) {
-            throw new UserException.CouldNotCreateOutputFile("Could not write index to file " + indexFile.getAbsolutePath(), e);
+            throw new UserException.CouldNotCreateOutputFile("Could not write index to file " + indexPath.toAbsolutePath(), e);
        }

-        logger.info("Successfully wrote index to " + indexFile.getAbsolutePath());
-        return indexFile.getAbsolutePath();
+        logger.info("Successfully wrote index to " + indexPath.toAbsolutePath());
+        return indexPath.toAbsolutePath().toString();
    }

-    private File determineFileName(final Index index) {
-        if (outputFile != null) {
-            return outputFile;
+    private Path determineFileName(final Index index) {
+        if (outputPath != null) {
+            return outputPath.toPath();
        } else if (index instanceof TabixIndex) {
-            return Tribble.tabixIndexFile(featureFile);
+            return Tribble.tabixIndexPath(featurePath.toPath());
        } else {
-            return Tribble.indexFile(featureFile);
+            return Tribble.indexPath(featurePath.toPath());
        }
    }

    private Index createAppropriateIndexInMemory(final FeatureCodec<? extends Feature, ?> codec) {
        try {
            // For block-compression files, write a Tabix index
-            if (IOUtil.hasBlockCompressedExtension(featureFile)) {
+            if (IOUtil.hasBlockCompressedExtension(featurePath.toPath())) {
                // Creating tabix indices with a non standard extensions can cause problems so we disable it
-                if (outputFile != null && !outputFile.getAbsolutePath().endsWith(FileExtensions.TABIX_INDEX)) {
-                    throw new UserException("The index for " + featureFile + " must be written to a file with a \"" + FileExtensions.TABIX_INDEX + "\" extension");
+                if (outputPath != null && !outputPath.getURIString().endsWith(FileExtensions.TABIX_INDEX)) {
+                    throw new UserException("The index for " + featurePath + " must be written to a file with a \"" + FileExtensions.TABIX_INDEX + "\" extension");
                }

                // TODO: this could benefit from provided sequence dictionary from reference
                // TODO: this can be an optional parameter for the tool
-                return IndexFactory.createIndex(featureFile, codec, IndexFactory.IndexType.TABIX, null);
-
+                return IndexFactory.createIndex(featurePath.toPath(), codec, IndexFactory.IndexType.TABIX, null);
            }
            // TODO: detection of GVCF files should not be file-extension-based. Need to come up with canonical
            // TODO: way of detecting GVCFs based on the contents (may require changes to the spec!)
-            else if (featureFile.getName().endsWith(GVCF_FILE_EXTENSION)) {
+            else if (featurePath.getURIString().endsWith(GVCF_FILE_EXTENSION)) {
                // Optimize GVCF indices for the use case of having a large number of GVCFs open simultaneously
-                return IndexFactory.createLinearIndex(featureFile, codec, OPTIMAL_GVCF_INDEX_BIN_SIZE);
+                return IndexFactory.createLinearIndex(featurePath.toPath(), codec, OPTIMAL_GVCF_INDEX_BIN_SIZE);
            } else {
                // Optimize indices for other kinds of files for seek time / querying
-                return IndexFactory.createDynamicIndex(featureFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
+                return IndexFactory.createDynamicIndex(featurePath.toPath(), codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
            }
        } catch (TribbleException e) {
            // Underlying cause here is usually a malformed file, but can also be things like
            // "codec does not support tabix"
-            throw new UserException.CouldNotIndexFile(featureFile, e);
+            throw new UserException.CouldNotIndexFile(featurePath.toPath(), e);
        }
    }
}
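As a sketch of what the switch to GATKPathSpecifier buys (editorial, not part of the commit; it assumes GATKPathSpecifier has a String constructor, which the Barclay argument framework relies on to populate the -I/-O fields), the same field now accepts either a local file name or a URI, and toPath() resolves both through NIO. The gs:// bucket is hypothetical and needs a GCS NIO provider on the classpath:

    import java.nio.file.Path;

    import org.broadinstitute.hellbender.engine.GATKPathSpecifier;

    public class PathSpecifierSketch {
        public static void main(final String[] args) {
            final GATKPathSpecifier local = new GATKPathSpecifier("cohort.vcf.gz");                // local file
            final GATKPathSpecifier cloud = new GATKPathSpecifier("gs://my-bucket/cohort.vcf.gz"); // hypothetical bucket
            final Path localPath = local.toPath(); // default file system
            final Path cloudPath = cloud.toPath(); // resolved via the gs: NIO file system provider
            System.out.println(local.getURIString() + " -> " + localPath);
            System.out.println(cloud.getURIString() + " -> " + cloudPath);
        }
    }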
@@ -255,7 +255,7 @@ private FeatureInput<VariantContext> getVariantFeatureInputWithCachedCodec() {
        final FeatureInput<VariantContext> featureInput = new FeatureInput<>(inputVCFFile.getAbsolutePath());
        Assert.assertNull(featureInput.getFeatureCodecClass());

-        final FeatureCodec<? extends Feature, ?> codec = FeatureManager.getCodecForFile(new File(featureInput.getFeaturePath()));
+        final FeatureCodec<? extends Feature, ?> codec = FeatureManager.getCodecForFile(featureInput.toPath());
        featureInput.setFeatureCodecClass((Class<FeatureCodec<VariantContext, ?>>)codec.getClass());

        return featureInput;
@@ -44,13 +44,13 @@ public Object[][] getDetectCorrectFileFormatTestData() {

    @Test(dataProvider = "DetectCorrectFileFormatTestData")
    public void testDetectCorrectFileFormat( final File file, final Class<? extends FeatureCodec<? extends Feature, ?>> expectedCodecClass ) throws Exception {
-        Assert.assertEquals(FeatureManager.getCodecForFile(file).getClass(), expectedCodecClass,
+        Assert.assertEquals(FeatureManager.getCodecForFile(file.toPath()).getClass(), expectedCodecClass,
                            "Wrong codec selected for file " + file.getAbsolutePath());

        // We should also get the correct codec if we pass in the explicit expected Feature type to getCodecForFile()
        @SuppressWarnings("unchecked")
        final Class<? extends Feature> expectedCodecFeatureType = expectedCodecClass.getDeclaredConstructor().newInstance().getFeatureType();
-        Assert.assertEquals(FeatureManager.getCodecForFile(file, expectedCodecFeatureType).getClass(), expectedCodecClass,
+        Assert.assertEquals(FeatureManager.getCodecForFile(file.toPath(), expectedCodecFeatureType).getClass(), expectedCodecClass,
                            "Wrong codec selected for file " + file.getAbsolutePath() + " after subsetting to the expected Feature type");
    }

@@ -62,15 +62,15 @@ public void testDetectUnsupportedFileFormat() {
        Assert.assertTrue(unsupportedFile.canRead(), "Cannot test detection of unsupported file formats on an unreadable file");

        // Should throw, since the file exists and is readable, but is in an unsupported format
-        FeatureManager.getCodecForFile(unsupportedFile);
+        FeatureManager.getCodecForFile(unsupportedFile.toPath());
    }

    @Test(expectedExceptions = UserException.WrongFeatureType.class)
    public void testRestrictCodecSelectionToWrongFeatureType() {
        final File vcf = new File(FEATURE_MANAGER_TEST_DIRECTORY + "minimal_vcf4_file.vcf");

        // If we require BED Features from this vcf file, we should get a type mismatch exception
-        FeatureManager.getCodecForFile(vcf, BEDFeature.class);
+        FeatureManager.getCodecForFile(vcf.toPath(), BEDFeature.class);
    }

    @DataProvider(name = "IsFeatureFileTestData")
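The type-restricted overload exercised above can also be called directly; a minimal sketch (editorial, not part of the commit) with a hypothetical VCF path, using VariantContext as the expected Feature type:

    import java.nio.file.Path;
    import java.nio.file.Paths;

    import htsjdk.tribble.Feature;
    import htsjdk.tribble.FeatureCodec;
    import htsjdk.variant.variantcontext.VariantContext;
    import org.broadinstitute.hellbender.engine.FeatureManager;

    public class TypedCodecLookupSketch {
        public static void main(final String[] args) {
            final Path vcf = Paths.get("minimal_vcf4_file.vcf"); // hypothetical local copy of a small VCF
            // Restricting to VariantContext succeeds for a VCF; restricting to BEDFeature would throw
            // UserException.WrongFeatureType, as the test above expects.
            final FeatureCodec<? extends Feature, ?> codec =
                    FeatureManager.getCodecForFile(vcf, VariantContext.class);
            System.out.println("Codec: " + codec.getClass().getSimpleName());
        }
    }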
@@ -150,7 +150,7 @@ public void testEnableDisableGVCFWriting(boolean writeGvcf, String extension) th
            public String getTestedToolName(){
                return IndexFeatureFile.class.getSimpleName();
            }
-        }.runCommandLine(new String[]{"-F", output.getAbsolutePath()});
+        }.runCommandLine(new String[]{"-I", output.getAbsolutePath()});

        final List<VariantContext> writtenVcs = readVariants(output.toString());
        //if we are actually writing a gvcf, all the variant blocks will be merged into a single homref block with