Skip to content

Commit

Permalink
Merge pull request #109 from epifluidlab/frag-bed-file-support
Browse files Browse the repository at this point in the history
Frag bed file support
  • Loading branch information
ravibandaru-lab authored Nov 18, 2024
2 parents 3db6fe4 + c82048b commit 38fe104
Show file tree
Hide file tree
Showing 11 changed files with 61 additions and 17 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,17 @@ The format is based on
and this project adheres to
[Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.7.4] - 2024-08-24
## [0.7.6] - 2024-11-18

### Fixed
- indexing issue in region_end_motifs that would misread strand
information when calculating end motifs on forward-strand only.

### Changed
- frag_generator now accepts fragment coordinates in bed.gz files


## [0.7.5] - 2024-10-10

### Fixed
- `delfi` accepts `gap_file=None`
Expand Down
Binary file modified docs/_build/doctrees/documentation/api_reference/fragfile.doctree
Binary file not shown.
Binary file modified docs/_build/doctrees/environment.pickle
Binary file not shown.
8 changes: 5 additions & 3 deletions docs/_build/html/documentation/api_reference/fragfile.html
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,8 @@ <h1>Frag File Utilities<a class="headerlink" href="#frag-file-utilities" title="
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>input_file</strong> (<em>str</em><em> or </em><em>AlignmentFile</em>) – </p></li>
<li><p><strong>input_file</strong> (<em>str</em><em>, </em><em>pathlike</em><em>, or </em><em>AlignmentFile</em>) – Fragment coordinates stored as a SAM, BAM, CRAM, tabix-indexed
bed.gz, or tabix-indexed FinaleDB fragment file.</p></li>
<li><p><strong>contig</strong> (<em>str</em>) – </p></li>
<li><p><strong>quality_threshold</strong> (<em>int</em><em>, </em><em>optional</em>) – </p></li>
<li><p><strong>start</strong> (<em>int</em><em>, </em><em>optional</em>) – </p></li>
Expand All @@ -514,8 +515,9 @@ <h1>Frag File Utilities<a class="headerlink" href="#frag-file-utilities" title="
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p><strong>frag_ends</strong> – Generator that yields tuples containing the region covered by
each fragment in input_file.</p>
<dd class="field-even"><p><strong>frag_ends</strong> – Generator that yields tuples:
(contig: str, read_start: int, read_stop: int, mapq: int,
read_on_plus: boolean)</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>Generator</p>
Expand Down
2 changes: 1 addition & 1 deletion docs/_build/html/searchindex.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/finaletoolkit/frag/end_motifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,7 +607,7 @@ def region_end_motifs(
continue
else:
for frag in frag_ends:
if frag[3]: # is on forward strand or not
if frag[4]: # is on forward strand or not
# py2bit uses 0-based for start, 1-based for end
# forward end-motif
forward_kmer = refseq.sequence(
Expand Down
31 changes: 21 additions & 10 deletions src/finaletoolkit/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,9 @@ def frag_generator(
Parameters
----------
input_file : str or AlignmentFile
input_file : str, pathlike, or AlignmentFile
Fragment coordinates stored as a SAM, BAM, CRAM, tabix-indexed
bed.gz, or tabix-indexed FinaleDB fragment file.
contig : str
quality_threshold : int, optional
start : int, optional
Expand All @@ -231,28 +233,31 @@ def frag_generator(
Returns
-------
frag_ends : Generator
Generator that yields tuples containing the region covered by
each fragment in input_file.
Generator that yields tuples:
(contig: str, read_start: int, read_stop: int, mapq: int,
read_on_plus: boolean)
"""
try:
# check type of input and open if needed
input_file_is_path = False
if isinstance(input_file, str) or isinstance(input_file, Path):
input_file_is_path == True
# check file type
if (
if ( # AlignmentFile
str(input_file).endswith('.sam')
or str(input_file).endswith('.bam')
or str(input_file).endswith('.cram')
):
is_sam = True
sam_file = pysam.AlignmentFile(input_file, 'r')
elif (
elif ( # Tabix indexed file
str(input_file).endswith('frag.gz')
or str(input_file).endswith('bed.gz')
):
tbx = pysam.TabixFile(str(input_file), 'r')
is_sam = False
# check if input is FinaleDB Fragment file
is_frag = str(input_file).endswith('frag.gz')
else:
raise ValueError(
"Unaccepted interval file type. Only SAM, CRAM, BAM"
Expand All @@ -265,6 +270,7 @@ def frag_generator(
input_file_is_path = False
is_sam = False
tbx = input_file
is_frag = tbx.filename.endswith('frag.gz')
else:
raise TypeError(
f'{type(input_file)} is invalid type for input_file.'
Expand Down Expand Up @@ -292,7 +298,7 @@ def frag_generator(
else:
raise ValueError("contig should be specified if start or stop given.")

if is_sam:
if is_sam: # AlignmentFile
for read in sam_file.fetch(contig, start, stop):
# Only select read1 and filter out non-paired-end
# reads and low-quality reads
Expand Down Expand Up @@ -336,22 +342,27 @@ def frag_generator(
"Skipping interval.\n",
f"Error: {e}\n"])

else:
else: # Tabix Indexed
for line in tbx.fetch(
contig, start, stop, parser=pysam.asTuple()
):
read_start = int(line[1])
read_stop = int(line[2])
mapq = int(line[3])
frag_length = read_stop - read_start
read_on_plus = '+' in line[4]
if is_frag:
mapq = int(line[3])
read_on_plus = '+' in line[4]
else:
mapq = int(line[4])
read_on_plus = '+' in line[5]

try:
if (frag_length >= fraction_low
and frag_length <= fraction_high
and mapq >= quality_threshold
):
yield contig, read_start, read_stop, mapq, read_on_plus
# HACK: for some reason read_length is sometimes None
# HACK: read_length is sometimes None
except TypeError:
continue

Expand Down
2 changes: 1 addition & 1 deletion src/finaletoolkit/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
Single-source module for the package version number.
"""

__version__ = "0.7.5"
__version__ = "0.7.6"

Binary file added tests/data/12.3444.b37.frag.bed.gz
Binary file not shown.
Binary file added tests/data/12.3444.b37.frag.bed.gz.tbi
Binary file not shown.
21 changes: 21 additions & 0 deletions tests/test_frag_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,27 @@ def test_frag_gz(self, request):

assert len(frags) == 17, "Incorrect number of frags"

def test_bed_gz(self, request):
"""
See if frag_generator runs when opening a bed.gz file and reads the
right number of reads
"""
path = request.path.parent / 'data' / '12.3444.b37.frag.bed.gz'
frag_gen = frag_generator(
path, "12", quality_threshold=0, fraction_low=0, fraction_high=9999
)
frags = [frag for frag in frag_gen]

chroms = np.array([chrom for chrom, *_ in frags])
starts = np.array([start for _, start, *_ in frags])
stops = np.array([stop for _, _, stop, *_ in frags])

in_region = np.any(overlaps(np.array(['12']), np.array([34442500]),
np.array([34446500]), chroms, starts, stops))
assert in_region, "Some fragments are outside of region"

assert len(frags) == 17, "Incorrect number of frags"




Expand Down

0 comments on commit 38fe104

Please sign in to comment.