Skip to content

Commit

Permalink
Merge pull request #99 from monarch-initiative/finalize-vrs-for-pp-v202
Browse files Browse the repository at this point in the history
Finished!
  • Loading branch information
ielis authored Apr 1, 2024
2 parents d81e09f + dcd87d8 commit 7a84cbf
Show file tree
Hide file tree
Showing 8 changed files with 2,216 additions and 96 deletions.
117 changes: 117 additions & 0 deletions src/pyphetools/pp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,122 @@
"""
A package with strongly typed Phenopacket Schema types and the code for I/O and validation.
Examples
^^^^^^^^
Create phenopacket programmatically
###################################
We recommend bringing all the classes into scope at once using a star import:
>>> from pyphetools.pp.v202 import *
Then, we can build a phenopacket from the individual building blocks.
Let's start with the subject:
>>> subject = Individual(
... id='proband A',
... time_at_last_encounter=TimeElement(
... element=Age(iso8601duration='P6M'),
... ),
... sex=Sex.FEMALE,
... )
>>> subject.id
'proband A'
>>> subject.sex.name
'FEMALE'
The created subject represents a female proband who was 6 months old at the time of the last encounter.
We can update the fields using a simple assignment:
>>> subject.karyotypic_sex = KaryotypicSex.XX
>>> subject.karyotypic_sex.name
'XX'
We assigned the enum constant `KaryotypicSex.XX` to the previously unset `karyotypic_sex` attribute.
The same can be done with object attributes:
>>> subject.vital_status = VitalStatus(
... status=VitalStatus.Status.DECEASED,
... time_of_death=TimeElement(
... element=Age(iso8601duration='P1Y')
... ),
... cause_of_death=OntologyClass(
... id='NCIT:C7541', label='Retinoblastoma',
... ),
... )
We set the vital status to indicate that the proband died at 1 year of age due to *Retinoblastoma*.
Now we can create a phenopacket. The phenopacket requires an identifier and `MetaData`; the subject is optional.
>>> pp = Phenopacket(
... id='example.retinoblastoma.phenopacket.id',
... meta_data=MetaData(
... created=Timestamp.from_str('2021-05-14T10:35:00Z'),
... created_by='anonymous biocurator',
... ),
... )
To create a phenopacket, we must provide the `id` and `meta_data` fields
since they are required by the Phenopacket Schema.
The same applies to `created` and `created_by` fields of `MetaData`.
`MetaData` contextualizes the used ontology classes, such as `NCIT:C7541` *Retinoblastoma*,
to a particular ontology, such as NCI Thesaurus. We can store the ontology resource in `MetaData.resources`
field:
>>> pp.meta_data.resources.append(
... Resource(
... id='ncit', name='NCI Thesaurus', url='http://purl.obolibrary.org/obo/ncit.owl',
... version='23.09d', namespace_prefix='NCIT', iri_prefix='http://purl.obolibrary.org/obo/NCIT_',
... ),
... )
All repeated elements, such as `MetaData.resources`, can be accessed via a `list`.
Read/write JSON and Protobuf
############################
We can read and write phenopackets in JSON format using the `JsonDeserializer` and `JsonSerializer` classes:
>>> from pyphetools.pp.parse.json import JsonSerializer, JsonDeserializer
>>> serializer = JsonSerializer()
The serializer can write a Phenopacket Schema building block, such as `OntologyClass` or `Phenopacket`, into
a file handle:
>>> from io import StringIO
>>> buf = StringIO()
>>> serializer.serialize(subject.vital_status, buf)
>>> buf.getvalue()
'{"status": "DECEASED", "timeOfDeath": {"age": {"iso8601duration": "P1Y"}}, "causeOfDeath": {"id": "NCIT:C7541", "label": "Retinoblastoma"}}'
and the JSON can be read back from a file handle:
>>> _ = buf.seek(0) # Rewind and ignore the result
>>> deserializer = JsonDeserializer()
>>> decoded = deserializer.deserialize(buf, VitalStatus)
>>> decoded == subject.vital_status
True
The building block can also be written into Protobuf wire format.
We can do a similar round-trip as above, but we will need a byte IO handle:
>>> from io import BytesIO
>>> byte_buf = BytesIO()
We can write the subject into the buffer and get the same data back:
>>> subject.dump_pb(byte_buf)
>>> _ = byte_buf.seek(0) # Rewind to start
>>> other = Individual.from_pb(byte_buf)
>>> subject == other
True
"""

from . import parse
Expand Down
2 changes: 1 addition & 1 deletion src/pyphetools/pp/parse/_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def _put_field_to_mapping(
elif isinstance(field, enum.Enum):
out[name] = field.name
elif hasattr(field, 'seconds') and hasattr(field, 'nanos') and hasattr(field, 'as_str') and callable(field.as_str):
# This quack *exactly* as a Timestamp!
# This quacks *exactly* as a Timestamp!
out[name] = field.as_str()
else:
raise ValueError(f'Unexpected field {field}')
Expand Down
9 changes: 9 additions & 0 deletions src/pyphetools/pp/v202/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
from ._disease import Disease
from ._meta_data import MetaData, Resource, Update
from ._phenopackets import Phenopacket
from ._vrs import Gene, Text, Number, IndefiniteRange, DefiniteRange, SimpleInterval, SequenceInterval
from ._vrs import SequenceLocation, SequenceState, LiteralSequenceExpression, DerivedSequenceExpression
from ._vrs import RepeatedSequenceExpression, CytobandInterval, ChromosomeLocation, Allele, Haplotype, CopyNumber
from ._vrs import VariationSet, Variation
from ._vrsatile import Expression, Extension, VcfRecord, MoleculeContext, VariationDescriptor

__all__ = [
Expand All @@ -32,4 +36,9 @@
'MetaData', 'Resource', 'Update',
'OntologyClass', 'ExternalReference', 'Evidence', 'Procedure', 'GestationalAge', 'Age', 'AgeRange', 'TimeInterval',
'TimeElement', 'Timestamp', 'File',
# and the VRS members
'Gene', 'Text', 'Number', 'IndefiniteRange', 'DefiniteRange', 'SimpleInterval', 'SequenceInterval',
'SequenceLocation', 'SequenceState', 'LiteralSequenceExpression', 'DerivedSequenceExpression',
'RepeatedSequenceExpression', 'CytobandInterval', 'ChromosomeLocation', 'Allele', 'Haplotype', 'CopyNumber',
'VariationSet', 'Variation',
]
129 changes: 41 additions & 88 deletions src/pyphetools/pp/v202/_interpretation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from ._gene_descriptor import GeneDescriptor
from .._api import MessageMixin
from ..parse import extract_message_scalar, extract_message_sequence, extract_pb_message_scalar, extract_pb_message_seq
from ..parse import extract_oneof_scalar, extract_pb_oneof_scalar


class AcmgPathogenicityClassification(enum.Enum):
Expand Down Expand Up @@ -124,6 +125,11 @@ def __repr__(self):


class GenomicInterpretation(MessageMixin):
_ONEOF_CALL = {
'gene_descriptor': GeneDescriptor,
'variant_interpretation': VariantInterpretation,
}

class InterpretationStatus(enum.Enum):
UNKNOWN_STATUS = 0
REJECTED = 1
Expand All @@ -135,25 +141,11 @@ def __init__(
self,
subject_or_biosample_id: str,
interpretation_status: InterpretationStatus,
gene_descriptor: typing.Optional[GeneDescriptor] = None,
variant_interpretation: typing.Optional[VariantInterpretation] = None,
call: typing.Union[GeneDescriptor, VariantInterpretation],
):
self._subject_or_biosample_id = subject_or_biosample_id
self._interpretation_status = interpretation_status
one_ofs = (gene_descriptor, variant_interpretation)
if sum(1 for arg in one_ofs if arg is not None) != 1:
cnt = sum(1 for arg in one_ofs if arg is not None)
raise ValueError(
f'GenomicInterpretation must be provided with exactly 1 argument but {cnt} arguments were provided!')

if gene_descriptor is not None:
self._discriminant = 0
self._call = gene_descriptor
elif variant_interpretation is not None:
self._discriminant = 1
self._call = variant_interpretation
else:
raise ValueError('Bug') # TODO: wording
self._call = call

@property
def subject_or_biosample_id(self) -> str:
Expand All @@ -171,22 +163,24 @@ def interpretation_status(self) -> InterpretationStatus:
def interpretation_status(self, value: InterpretationStatus):
self._interpretation_status = value

@property
def call(self) -> typing.Union[GeneDescriptor, VariantInterpretation]:
return self._call

@property
def gene_descriptor(self) -> typing.Optional[GeneDescriptor]:
return self._call if self._discriminant == 0 else None
return self._call if isinstance(self._call, GeneDescriptor) else None

@gene_descriptor.setter
def gene_descriptor(self, value: GeneDescriptor):
self._discriminant = 0
self._call = value

@property
def variant_interpretation(self) -> typing.Optional[VariantInterpretation]:
return self._call if self._discriminant == 1 else None
return self._call if isinstance(self._call, VariantInterpretation) else None

@variant_interpretation.setter
def variant_interpretation(self, value: VariantInterpretation):
self._discriminant = 1
self._call = value

@staticmethod
Expand All @@ -195,39 +189,25 @@ def field_names() -> typing.Iterable[str]:

@classmethod
def required_fields(cls) -> typing.Sequence[str]:
return 'subject_or_biosample_id', 'interpretation_status',
raise NotImplementedError('Should not be called!')

@classmethod
def from_dict(cls, values: typing.Mapping[str, typing.Any]):
if cls._all_required_fields_are_present(values):
if 'gene_descriptor' in values:
assert 'variant_interpretation' not in values, \
'Variant interpretation must be unset when Gene descriptor is set!'
return GenomicInterpretation(
subject_or_biosample_id=values['subject_or_biosample_id'],
interpretation_status=MessageMixin._extract_enum_field(
'interpretation_status', GenomicInterpretation.InterpretationStatus, values
),
gene_descriptor=extract_message_scalar('gene_descriptor', GeneDescriptor, values),
)

elif 'variant_interpretation' in values:
assert 'gene_descriptor' not in values, \
'Gene descriptor must be unset when Variant interpretation is set!'
return GenomicInterpretation(
subject_or_biosample_id=values['subject_or_biosample_id'],
interpretation_status=MessageMixin._extract_enum_field(
'interpretation_status', GenomicInterpretation.InterpretationStatus, values
),
variant_interpretation=extract_message_scalar(
'variant_interpretation', VariantInterpretation, values
),
)

else:
raise ValueError('Either `gene_descriptor` or `variant_interpretation` must be set!')
if 'subject_or_biosample_id' in values \
and 'interpretation_status' in values \
and any(field in values for field in cls._ONEOF_CALL):
return GenomicInterpretation(
subject_or_biosample_id=values['subject_or_biosample_id'],
interpretation_status=MessageMixin._extract_enum_field(
'interpretation_status', GenomicInterpretation.InterpretationStatus, values,
),
call=extract_oneof_scalar(cls._ONEOF_CALL, values),
)
else:
cls._complain_about_missing_field(values)
raise ValueError(
'Missing one of required fields: '
f'`subject_or_biosample_id, interpretation_status, gene_descriptor|variant_interpretation` in {values}'
)

def to_message(self) -> Message:
msg = pp202.GenomicInterpretation(
Expand All @@ -237,13 +217,12 @@ def to_message(self) -> Message:
),
)

val = self._call.to_message()
if self._discriminant == 0:
msg.gene_descriptor.CopyFrom(val)
elif self._discriminant == 1:
msg.variant_interpretation.CopyFrom(val)
if isinstance(self._call, GeneDescriptor):
msg.gene_descriptor.CopyFrom(self._call.to_message())
elif isinstance(self._call, VariantInterpretation):
msg.variant_interpretation.CopyFrom(self._call.to_message())
else:
raise ValueError(f'Invalid discriminant {self._discriminant}')
raise ValueError('Bug')

return msg

Expand All @@ -253,52 +232,26 @@ def message_type(cls) -> typing.Type[Message]:

@classmethod
def from_message(cls, msg: Message):
if isinstance(msg, pp202.GenomicInterpretation):
subject_or_biosample_id = msg.subject_or_biosample_id
interpretation_status = GenomicInterpretation.InterpretationStatus(msg.interpretation_status)

case = msg.WhichOneof('call')
if case == 'gene_descriptor':
return GenomicInterpretation(
subject_or_biosample_id=subject_or_biosample_id,
interpretation_status=interpretation_status,
gene_descriptor=extract_pb_message_scalar(
'gene_descriptor', GeneDescriptor, msg
),
)
elif case == 'variant_interpretation':
return GenomicInterpretation(
subject_or_biosample_id=subject_or_biosample_id,
interpretation_status=interpretation_status,
variant_interpretation=extract_pb_message_scalar(
'variant_interpretation', VariantInterpretation, msg
),
)
else:
raise ValueError(f'Unknown one of field set {case}')

if isinstance(msg, cls.message_type()):
return GenomicInterpretation(
subject_or_biosample_id=msg.subject_or_biosample_id,
interpretation_status=GenomicInterpretation.InterpretationStatus(msg.interpretation_status),
call=extract_pb_oneof_scalar('call', cls._ONEOF_CALL, msg),
)
else:
cls.complain_about_incompatible_msg_type(msg)

def __eq__(self, other):
return isinstance(other, GenomicInterpretation) \
and self._subject_or_biosample_id == other._subject_or_biosample_id \
and self._interpretation_status == other._interpretation_status \
and self._discriminant == other._discriminant \
and self._call == other._call

def __repr__(self):
if self._discriminant == 0:
val = f'gene_descriptor={self._call}'
elif self._discriminant == 1:
val = f'variant_interpretation={self._call}'
else:
raise ValueError(f'Invalid discriminant {self._discriminant}')

return f'GenomicInterpretation(' \
f'subject_or_biosample_id={self._subject_or_biosample_id}, ' \
f'interpretation_status={self._interpretation_status}, ' \
f'{val})'
f'call={self._call})'


class Diagnosis(MessageMixin):
Expand Down
Loading

0 comments on commit 7a84cbf

Please sign in to comment.