Skip to content

Commit

Permalink
test v0.1.4
Browse files Browse the repository at this point in the history
add 'collapse' function
  • Loading branch information
andyjslee committed Jan 31, 2024
1 parent d1daedf commit b677025
Show file tree
Hide file tree
Showing 11 changed files with 937 additions and 5 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "vstolib"
version = "0.1.3"
version = "0.1.4"
edition = "2021"

[package.metadata.maturin]
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ vstol [-h] [--version] {annotate,diff,filter,intersect,merge,overlap,vcf2tsv}
| Command | Description |
| ------- |-----------------------------------------------------------------------------------------------------------------------------|
| annotate | Annotate variant calls using [pyensembl](https://github.com/openvax/pyensembl) or [gencode](https://www.gencodegenes.org/). |
| collapse | Collapse a variants list into unique variants. |
| diff | Identify variant calls specific to a list. |
| filter | Filter variant calls (can be used to identify somatic variants). |
| intersect | Identify intersecting variant calls. |
Expand Down
751 changes: 751 additions & 0 deletions examples/outputs/hg002_merged_variants_collapsed.tsv

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions examples/run_vstol_collapse.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Example: collapse the merged HG002 variants TSV so that each variant keeps
# a single representative variant call for sample HG002.
# NOTE(review): the paths are relative, so this presumably has to be run from
# the examples/ directory -- confirm.
vstol collapse \
--tsv-file ../test/data/hg002_merged_variants.tsv \
--sample-id HG002 \
--output-tsv-file outputs/hg002_merged_variants_collapsed.tsv
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ build-backend = "maturin"

[project]
name = "vstol"
version = "0.1.3"
version = "0.1.4"
requires-python = ">=3.10"
keywords = [
"somatic variants",
Expand Down
107 changes: 107 additions & 0 deletions python/vstolib/cli/cli_collapse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
The purpose of this python3 script is to create the argument parser for
the 'collapse' command and to run that command.
"""


import argparse
from ..constants import *
from ..logging import get_logger
from ..main import collapse
from ..variants_list import VariantsList


logger = get_logger(__name__)


def add_cli_collapse_arg_parser(
        sub_parsers: argparse._SubParsersAction
) -> argparse._SubParsersAction:
    """
    Add 'collapse' parser.

    Parameters:
        sub_parsers     :   argparse sub-parsers action on which the
                            'collapse' sub-command is registered.

    Returns:
        The same sub_parsers object that was passed in.
    """
    parser = sub_parsers.add_parser(
        'collapse',
        help='Collapses a VSTOL TSV file (with multiple VariantCall rows per Variant) '
             'into a TSV file with one representative VariantCall row per Variant '
             'to a TSV file.')
    # Drop the last built-in argparse group so that the help output is
    # organized by the explicit 'required' / 'optional' groups created below.
    parser._action_groups.pop()

    # Required arguments
    parser_required = parser.add_argument_group('required arguments')
    parser_required.add_argument(
        "--tsv-file", '-i',
        dest="tsv_file",
        type=str,
        required=True,
        help="Input TSV file."
    )
    parser_required.add_argument(
        "--sample-id", '-a',
        dest="sample_id",
        type=str,
        required=True,
        help="Sample ID to retain."
    )
    parser_required.add_argument(
        "--output-tsv-file", '-o',
        dest="output_tsv_file",
        type=str,
        required=True,
        help="Output TSV file."
    )

    # Optional arguments
    parser_optional = parser.add_argument_group('optional arguments')
    # '--strategy' is not required, so it belongs to the optional group
    # (it was previously registered on the required group by mistake, which
    # made the help output list it under 'required arguments').
    parser_optional.add_argument(
        "--strategy", '-s',
        dest="strategy",
        type=str,
        required=False,
        default=CollapseStrategies.MAX_ALTERNATE_ALLELE_READ_COUNT,
        choices=CollapseStrategies.ALL,
        help="Collapse (summarization) strategy. "
             "Allowed options: %s. Default: %s"
             % (', '.join(CollapseStrategies.ALL),
                CollapseStrategies.MAX_ALTERNATE_ALLELE_READ_COUNT)
    )

    # 'which' lets the CLI entry point dispatch to the matching run function.
    parser.set_defaults(which='collapse')
    return sub_parsers


def run_cli_collapse_from_parsed_args(args: argparse.Namespace):
    """
    Execute the 'collapse' command with the values held by parsed arguments.

    Parameters:
        args    :   argparse.Namespace object with the following variables:
                    tsv_file
                    sample_id
                    output_tsv_file
                    strategy
    """
    # Load the input variants from the TSV file.
    source_variants = VariantsList.read_tsv_file(tsv_file=args.tsv_file)

    # Reduce them with the requested strategy for the requested sample.
    collapsed_variants = collapse(
        variants_list=source_variants,
        sample_id=args.sample_id,
        strategy=args.strategy
    )

    # Write the collapsed list as a tab-separated file without the index.
    df_collapsed = collapsed_variants.to_dataframe()
    df_collapsed.to_csv(args.output_tsv_file, sep='\t', index=False)
4 changes: 4 additions & 0 deletions python/vstolib/cli/cli_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import vstolib
from typing import Tuple
from .cli_annotate import *
from .cli_collapse import *
from .cli_diff import *
from .cli_filter import *
from .cli_intersect import *
Expand Down Expand Up @@ -56,6 +57,7 @@ def run():
# Step 1. Initialize argument parser
arg_parser, sub_parsers = init_arg_parser()
sub_parsers = add_cli_annotate_arg_parser(sub_parsers=sub_parsers) # annotate
sub_parsers = add_cli_collapse_arg_parser(sub_parsers=sub_parsers) # collapse
sub_parsers = add_cli_diff_arg_parser(sub_parsers=sub_parsers) # diff
sub_parsers = add_cli_filter_arg_parser(sub_parsers=sub_parsers) # filter
sub_parsers = add_cli_intersect_arg_parser(sub_parsers=sub_parsers) # intersect
Expand All @@ -67,6 +69,8 @@ def run():
# Step 2. Execute function based on CLI arguments
if args.which == 'annotate':
run_cli_annotate_from_parsed_args(args=args)
elif args.which == 'collapse':
run_cli_collapse_from_parsed_args(args=args)
elif args.which == 'diff':
run_cli_diff_from_parsed_args(args=args)
elif args.which == 'filter':
Expand Down
7 changes: 7 additions & 0 deletions python/vstolib/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ class Annotators:
]


class CollapseStrategies:
    """Names of the strategies that can be selected when collapsing variants."""
    # Strategy identifier for selecting by alternate allele read count.
    MAX_ALTERNATE_ALLELE_READ_COUNT = 'max_alternate_allele_read_count'
    # Every supported strategy name (used as the CLI 'choices' list).
    ALL = [MAX_ALTERNATE_ALLELE_READ_COUNT]


class GenomicRegionTypes:
EXONIC = 'exonic'
INTRONIC = 'intronic'
Expand Down
48 changes: 46 additions & 2 deletions python/vstolib/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
import copy
import pandas as pd
from collections import defaultdict
from typing import List, Tuple
from typing import List, Literal, Tuple
from .annotator import Annotator
from .constants import VariantCallingMethods, VariantCallTags
from .constants import CollapseStrategies, VariantCallingMethods, VariantCallTags
from .default import *
from .genomic_ranges_list import GenomicRangesList
from .logging import get_logger
Expand Down Expand Up @@ -61,6 +61,50 @@ def annotate(
return annotator.annotate(variants_list=variants_list)


def collapse(
        variants_list: VariantsList,
        sample_id: str,
        strategy: str = CollapseStrategies.MAX_ALTERNATE_ALLELE_READ_COUNT
) -> VariantsList:
    """
    Collapses (summarizes) a VariantsList such that each Variant has 1
    VariantCall.

    Variants that have no VariantCall for the given sample ID are left out
    of the returned list.

    Args:
        variants_list   :   VariantsList object.
        sample_id       :   Sample ID to retain.
        strategy        :   Strategy (options: 'max_alternate_allele_read_count').
                            Defaults to 'max_alternate_allele_read_count'.

    Returns:
        VariantsList

    Raises:
        ValueError      :   If the strategy is not a known collapse strategy.
    """
    # Reject unsupported strategies up front, before building any output.
    if strategy != CollapseStrategies.MAX_ALTERNATE_ALLELE_READ_COUNT:
        raise ValueError('Unknown collapse strategy: %s' % strategy)

    variants_list_collapsed = VariantsList()
    for variant in variants_list.variants:
        # Only calls that belong to the requested sample are candidates, so
        # the representative call can never come from a different sample.
        sample_variant_calls = [
            variant_call for variant_call in variant.variant_calls
            if variant_call.sample_id == sample_id
        ]
        if not sample_variant_calls:
            continue

        # max() returns the first maximal element, so ties keep the earliest
        # call, matching the strict '>' comparison used previously.
        representative_call = max(
            sample_variant_calls,
            key=lambda variant_call: variant_call.alternate_allele_read_count
        )
        variant_ = Variant(id=variant.id)
        variant_.add_variant_call(variant_call=representative_call)
        variants_list_collapsed.add_variant(variant=variant_)

    logger.info("%i variants and %i variant calls in the collapsed VariantsList"
                % (variants_list_collapsed.size,
                   len(variants_list_collapsed.variant_call_ids)))
    return variants_list_collapsed


def diff(
target_variants_list: VariantsList,
query_variants_lists: List[VariantsList],
Expand Down
1 change: 0 additions & 1 deletion python/vstolib/variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,3 @@ def to_dict(self) -> Dict:
'variant_calls': [variant_call.to_dict() for variant_call in self.variant_calls]
}
return data

15 changes: 15 additions & 0 deletions test/test_collapse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from .conftest import *
from vstolib.constants import CollapseStrategies
from vstolib.main import collapse
from vstolib.variants_list import VariantsList


def test_collapse():
    """
    Collapse the merged HG002 variants and check the structural guarantees
    of the result: no new variants appear and every retained variant holds
    exactly one variant call.
    """
    tsv_file = get_data_path(name='hg002_merged_variants.tsv')
    variants_list = VariantsList.read_tsv_file(tsv_file=tsv_file)
    variants_list_collapsed = collapse(
        variants_list=variants_list,
        sample_id='HG002',
        strategy=CollapseStrategies.MAX_ALTERNATE_ALLELE_READ_COUNT
    )
    # Collapsing can only keep or drop variants, never create new ones.
    assert variants_list_collapsed.size <= variants_list.size
    # Each collapsed variant carries a single representative variant call.
    for variant in variants_list_collapsed.variants:
        assert len(variant.variant_calls) == 1

0 comments on commit b677025

Please sign in to comment.