-
Notifications
You must be signed in to change notification settings - Fork 597
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added ExcessiveEndClippedReadFilter (#7638)
* Added ExcessiveEndClippedReadFilter - Added ExcessiveEndClippedReadFilter - a filter that will keep reads that have fewer than the specified number of clipped bases on either end. This is designed for long reads, and has a default value of 1k.
- Loading branch information
1 parent
5b87367
commit caa48f9
Showing
3 changed files
with
167 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
88 changes: 88 additions & 0 deletions
88
...main/java/org/broadinstitute/hellbender/engine/filters/ExcessiveEndClippedReadFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
package org.broadinstitute.hellbender.engine.filters; | ||
|
||
import com.google.common.annotations.VisibleForTesting; | ||
import htsjdk.samtools.CigarElement; | ||
import htsjdk.samtools.CigarOperator; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.help.DocumentedFeature; | ||
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions; | ||
import org.broadinstitute.hellbender.utils.help.HelpConstants; | ||
import org.broadinstitute.hellbender.utils.read.GATKRead; | ||
|
||
import java.util.Collections; | ||
import java.util.List; | ||
|
||
/** | ||
* Filter out reads where the number of soft-/hard-clipped bases on either end is above a certain threshold. | ||
* | ||
* This filter will filter reads where the number of soft clips on one end of the read is too large. It does NOT | ||
* check the total number of bases clipped on BOTH ends. For the purposes of this filter, the count of soft and hard | ||
* clipped bases on either end of a read are combined. | ||
* | ||
* For example, a read with the following cigar strings will be filtered out (given maxClippedBases=1000): | ||
* | ||
* 1500S0000M | ||
* 900H300S50000M | ||
* 1001H50000M | ||
* | ||
* However, this read will NOT be filtered out (given maxClippedBases=1000): | ||
* | ||
* 900S50000M900S | ||
* 150H800S50000M900S150H | ||
* 1000S500M | ||
* 500M1000S | ||
* | ||
* Note: This is designed for use with (unpaired) long reads, but can be used with any reads. | ||
*/ | ||
@DocumentedFeature( | ||
groupName= HelpConstants.DOC_CAT_READFILTERS, | ||
groupSummary=HelpConstants.DOC_CAT_READFILTERS_SUMMARY, | ||
summary = "Filter out reads that have too many clipped bases on either end." | ||
) | ||
public final class ExcessiveEndClippedReadFilter extends ReadFilter { | ||
static final long serialVersionUID = 1L; | ||
|
||
@Argument(fullName = ReadFilterArgumentDefinitions.MAXIMUM_END_CLIPPED_BASES, | ||
doc = "Maximum number of clipped bases on either end of a given read", | ||
optional = true) | ||
private int maxClippedBases = 1000; | ||
|
||
// Command line parser requires a no-arg constructor | ||
public ExcessiveEndClippedReadFilter() {} | ||
|
||
@VisibleForTesting | ||
ExcessiveEndClippedReadFilter(final int maxClippedBases) { | ||
this.maxClippedBases = maxClippedBases; | ||
} | ||
|
||
@Override | ||
public boolean test(final GATKRead read) { | ||
final List<CigarElement> cigarElements = read.getCigarElements(); | ||
|
||
// Check front end of the read: | ||
int clippedBases = 0; | ||
for ( final CigarElement element : cigarElements) { | ||
if ( element.getOperator() != CigarOperator.S && element.getOperator() != CigarOperator.H) { | ||
break; | ||
} | ||
clippedBases += element.getLength(); | ||
} | ||
// If we fail on the front end we can quit now: | ||
if (clippedBases > maxClippedBases) { | ||
return false; | ||
} | ||
|
||
// Check back end of the read: | ||
clippedBases = 0; | ||
for ( int i = cigarElements.size() - 1; i >= 0; --i ) { | ||
final CigarElement element = cigarElements.get(i); | ||
if ( element.getOperator() != CigarOperator.S && element.getOperator() != CigarOperator.H) { | ||
break; | ||
} | ||
clippedBases += element.getLength(); | ||
} | ||
return clippedBases <= maxClippedBases; | ||
} | ||
} |
77 changes: 77 additions & 0 deletions
77
...a/org/broadinstitute/hellbender/engine/filters/ExcessiveEndClippedReadFilterUnitTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
package org.broadinstitute.hellbender.engine.filters; | ||
|
||
import htsjdk.samtools.Cigar; | ||
import htsjdk.samtools.SAMFileHeader; | ||
import htsjdk.samtools.TextCigarCodec; | ||
import org.broadinstitute.hellbender.GATKBaseTest; | ||
import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; | ||
import org.broadinstitute.hellbender.utils.read.GATKRead; | ||
import org.testng.Assert; | ||
import org.testng.annotations.DataProvider; | ||
import org.testng.annotations.Test; | ||
|
||
|
||
public class ExcessiveEndClippedReadFilterUnitTest extends GATKBaseTest { | ||
|
||
private static final int CHR_COUNT = 1; | ||
private static final int CHR_START = 1; | ||
private static final int CHR_SIZE = 1000; | ||
private static final int GROUP_COUNT = 5; | ||
|
||
private final SAMFileHeader header= ArtificialReadUtils.createArtificialSamHeaderWithGroups(CHR_COUNT, CHR_START, CHR_SIZE, GROUP_COUNT); | ||
|
||
private GATKRead buildSAMRead(final String cigarString) { | ||
final Cigar cigar = TextCigarCodec.decode(cigarString); | ||
return ArtificialReadUtils.createArtificialRead(header, cigar); | ||
} | ||
|
||
@Test(dataProvider= "provideForTestExcessiveEndClipFilter") | ||
public void testExcessiveEndClipFilter(final int maxClippedBases, | ||
final String cigarString, | ||
final boolean expectedResult) { | ||
|
||
final ExcessiveEndClippedReadFilter filter = new ExcessiveEndClippedReadFilter(maxClippedBases); | ||
final GATKRead read = buildSAMRead(cigarString); | ||
Assert.assertEquals(filter.test(read), expectedResult, cigarString); | ||
} | ||
|
||
@DataProvider(name = "provideForTestExcessiveEndClipFilter") | ||
public Object[][] overclippedDataProvider() { | ||
return new Object[][] { | ||
// Failing test cases: | ||
{ 50, "1S10M", true }, | ||
{ 50, "10M1S", true }, | ||
{ 50, "49S10M", true }, | ||
{ 50, "50S10M", true }, | ||
{ 50, "10M49S", true }, | ||
{ 50, "10M50S", true }, | ||
{ 50, "49S10M1S", true }, | ||
{ 50, "50S10M1S", true }, | ||
{ 50, "1S10M49S", true }, | ||
{ 50, "1S10M50S", true }, | ||
{ 50, "49H10M1S", true }, | ||
{ 50, "50H10M1S", true }, | ||
{ 50, "1H10M49S", true }, | ||
{ 50, "1H10M50S", true }, | ||
{ 50, "49H10M49H", true }, | ||
{ 50, "50H10M50H", true }, | ||
{ 50, "40H9S10M", true }, | ||
{ 50, "40H10S10M", true }, | ||
{ 50, "10M40H9S", true }, | ||
{ 50, "10M40H10S", true }, | ||
|
||
// Passing test cases: | ||
{ 50, "10M51S", false }, | ||
{ 50, "51S10M", false }, | ||
{ 50, "51S10M51S", false }, | ||
|
||
{ 50, "10M51H", false }, | ||
{ 50, "51H10M", false }, | ||
{ 50, "51H10M51H", false }, | ||
|
||
{ 50, "10M26S26H", false }, | ||
{ 50, "26H26S10M", false }, | ||
{ 50, "26H26S10M26S26H", false }, | ||
}; | ||
} | ||
} |