Merge pull request #99 from monarch-initiative/finalize-vrs-for-pp-v202

Finished!
monarch-initiative · Apr 1, 2024 · 7a84cbf · 7a84cbf
2 parents d81e09f + dcd87d8
commit 7a84cbf
Show file tree

Hide file tree

Showing 8 changed files with 2,216 additions and 96 deletions.
diff --git a/src/pyphetools/pp/__init__.py b/src/pyphetools/pp/__init__.py
@@ -1,5 +1,122 @@
 """
 A package with strongly typed Phenopacket Schema types and the code for I/O and validation.
+
+Examples
+^^^^^^^^
+
+Create phenopacket programmatically
+###################################
+
+We recommend bringing the classes into scope all at once using the import star:
+
+>>> from pyphetools.pp.v202 import *
+
+Then, we can build a phenopacket from the individual building blocks.
+
+Let's start with the subject:
+
+>>> subject = Individual(
+...   id='proband A',
+...   time_at_last_encounter=TimeElement(
+...     element=Age(iso8601duration='P6M'),
+...   ),
+...   sex=Sex.FEMALE,
+... )
+>>> subject.id
+'proband A'
+>>> subject.sex.name
+'FEMALE'
+
+The created subject represents a female proband who was 6 months old at the time of the last encounter.
+
+We can update the fields using a simple assignment:
+
+>>> subject.karyotypic_sex = KaryotypicSex.XX
+>>> subject.karyotypic_sex.name
+'XX'
+
+We assigned an enum constant `KaryotypicSex.XX` to the previously unset `karyotypic_sex` attribute.
+
+
+The same can be done with object attributes:
+
+>>> subject.vital_status = VitalStatus(
+...   status=VitalStatus.Status.DECEASED,
+...   time_of_death=TimeElement(
+...     element=Age(iso8601duration='P1Y')
+...   ),
+...   cause_of_death=OntologyClass(
+...     id='NCIT:C7541', label='Retinoblastoma',
+...   ),
+... )
+
+We set the vital status to indicate that the proband died at 1 year of age due to *Retinoblastoma*.
+
+Now we can create a phenopacket. The phenopacket requires an identifier and `MetaData`; the subject is optional.
+
+>>> pp = Phenopacket(
+...   id='example.retinoblastoma.phenopacket.id',
+...   meta_data=MetaData(
+...     created=Timestamp.from_str('2021-05-14T10:35:00Z'),
+...     created_by='anonymous biocurator',
+...   ),
+... )
+
+To create a phenopacket, we must provide the `id` and `meta_data` fields
+since they are required by the Phenopacket Schema.
+The same applies to `created` and `created_by` fields of `MetaData`.
+
+`MetaData` contextualizes the used ontology classes, such as `NCIT:C7541` *Retinoblastoma*,
+to a particular ontology, such as NCI Thesaurus. We can store the ontology resource in `MetaData.resources`
+field:
+
+>>> pp.meta_data.resources.append(
+...   Resource(
+...     id='ncit', name='NCI Thesaurus', url='http://purl.obolibrary.org/obo/ncit.owl',
+...     version='23.09d', namespace_prefix='NCIT', iri_prefix='http://purl.obolibrary.org/obo/NCIT_',
+...   ),
+... )
+
+All repeated elements, such as `MetaData.resources`, can be accessed via a `list`.
+
+Read/write JSON and Protobuf
+############################
+
+We can read and write phenopackets in JSON format using the `JsonDeserializer` and `JsonSerializer` classes:
+
+>>> from pyphetools.pp.parse.json import JsonSerializer, JsonDeserializer
+>>> serializer = JsonSerializer()
+
+The serializer can write a Phenopacket Schema building block, such as `OntologyClass` or `Phenopacket`, into
+a file handle:
+
+>>> from io import StringIO
+>>> buf = StringIO()
+>>> serializer.serialize(subject.vital_status, buf)
+>>> buf.getvalue()
+'{"status": "DECEASED", "timeOfDeath": {"age": {"iso8601duration": "P1Y"}}, "causeOfDeath": {"id": "NCIT:C7541", "label": "Retinoblastoma"}}'
+
+and the JSON can be read back from a file handle:
+
+>>> _ = buf.seek(0)  # Rewind and ignore the result
+>>> deserializer = JsonDeserializer()
+>>> decoded = deserializer.deserialize(buf, VitalStatus)
+>>> decoded == subject.vital_status
+True
+
+The building block can also be written into Protobuf wire format.
+We can do a similar round-trip as above, but we will need a byte IO handle:
+
+>>> from io import BytesIO
+>>> byte_buf = BytesIO()
+
+We can write the subject into the buffer and get the same data back:
+
+>>> subject.dump_pb(byte_buf)
+>>> _ = byte_buf.seek(0)  # Rewind to start
+>>> other = Individual.from_pb(byte_buf)
+>>> subject == other
+True
 """
 
 from . import parse

diff --git a/src/pyphetools/pp/parse/_io.py b/src/pyphetools/pp/parse/_io.py
@@ -64,7 +64,7 @@ def _put_field_to_mapping(
         elif isinstance(field, enum.Enum):
             out[name] = field.name
         elif hasattr(field, 'seconds') and hasattr(field, 'nanos') and hasattr(field, 'as_str') and callable(field.as_str):
-            # This quack *exactly* as a Timestamp!
+            # This quacks *exactly* as a Timestamp!
             out[name] = field.as_str()
         else:
             raise ValueError(f'Unexpected field {field}')

diff --git a/src/pyphetools/pp/v202/__init__.py b/src/pyphetools/pp/v202/__init__.py
@@ -18,6 +18,10 @@
 from ._disease import Disease
 from ._meta_data import MetaData, Resource, Update
 from ._phenopackets import Phenopacket
+from ._vrs import Gene, Text, Number, IndefiniteRange, DefiniteRange, SimpleInterval, SequenceInterval
+from ._vrs import SequenceLocation, SequenceState, LiteralSequenceExpression, DerivedSequenceExpression
+from ._vrs import RepeatedSequenceExpression, CytobandInterval, ChromosomeLocation, Allele, Haplotype, CopyNumber
+from ._vrs import VariationSet, Variation
 from ._vrsatile import Expression, Extension, VcfRecord, MoleculeContext, VariationDescriptor
 
 __all__ = [
@@ -32,4 +36,9 @@
     'MetaData', 'Resource', 'Update',
     'OntologyClass', 'ExternalReference', 'Evidence', 'Procedure', 'GestationalAge', 'Age', 'AgeRange', 'TimeInterval',
     'TimeElement', 'Timestamp', 'File',
+    # and the VRS members
+    'Gene', 'Text', 'Number', 'IndefiniteRange', 'DefiniteRange', 'SimpleInterval', 'SequenceInterval',
+    'SequenceLocation', 'SequenceState', 'LiteralSequenceExpression', 'DerivedSequenceExpression',
+    'RepeatedSequenceExpression', 'CytobandInterval', 'ChromosomeLocation', 'Allele', 'Haplotype', 'CopyNumber',
+    'VariationSet', 'Variation',
 ]
diff --git a/src/pyphetools/pp/v202/_interpretation.py b/src/pyphetools/pp/v202/_interpretation.py
@@ -10,6 +10,7 @@
 from ._gene_descriptor import GeneDescriptor
 from .._api import MessageMixin
 from ..parse import extract_message_scalar, extract_message_sequence, extract_pb_message_scalar, extract_pb_message_seq
+from ..parse import extract_oneof_scalar, extract_pb_oneof_scalar
 
 
 class AcmgPathogenicityClassification(enum.Enum):
@@ -124,6 +125,11 @@ def __repr__(self):
 
 
 class GenomicInterpretation(MessageMixin):
+    _ONEOF_CALL = {
+        'gene_descriptor': GeneDescriptor,
+        'variant_interpretation': VariantInterpretation,
+    }
+
     class InterpretationStatus(enum.Enum):
         UNKNOWN_STATUS = 0
         REJECTED = 1
@@ -135,25 +141,11 @@ def __init__(
             self,
             subject_or_biosample_id: str,
             interpretation_status: InterpretationStatus,
-            gene_descriptor: typing.Optional[GeneDescriptor] = None,
-            variant_interpretation: typing.Optional[VariantInterpretation] = None,
+            call: typing.Union[GeneDescriptor, VariantInterpretation],
     ):
         self._subject_or_biosample_id = subject_or_biosample_id
         self._interpretation_status = interpretation_status
-        one_ofs = (gene_descriptor, variant_interpretation)
-        if sum(1 for arg in one_ofs if arg is not None) != 1:
-            cnt = sum(1 for arg in one_ofs if arg is not None)
-            raise ValueError(
-                f'GenomicInterpretation must be provided with exactly 1 argument but {cnt} arguments were provided!')
-
-        if gene_descriptor is not None:
-            self._discriminant = 0
-            self._call = gene_descriptor
-        elif variant_interpretation is not None:
-            self._discriminant = 1
-            self._call = variant_interpretation
-        else:
-            raise ValueError('Bug')  # TODO: wording
+        self._call = call
 
     @property
     def subject_or_biosample_id(self) -> str:
@@ -171,22 +163,24 @@ def interpretation_status(self) -> InterpretationStatus:
     def interpretation_status(self, value: InterpretationStatus):
         self._interpretation_status = value
 
+    @property
+    def call(self) -> typing.Union[GeneDescriptor, VariantInterpretation]:
+        return self._call
+
     @property
     def gene_descriptor(self) -> typing.Optional[GeneDescriptor]:
-        return self._call if self._discriminant == 0 else None
+        return self._call if isinstance(self._call, GeneDescriptor) else None
 
     @gene_descriptor.setter
     def gene_descriptor(self, value: GeneDescriptor):
-        self._discriminant = 0
         self._call = value
 
     @property
     def variant_interpretation(self) -> typing.Optional[VariantInterpretation]:
-        return self._call if self._discriminant == 1 else None
+        return self._call if isinstance(self._call, VariantInterpretation) else None
 
     @variant_interpretation.setter
     def variant_interpretation(self, value: VariantInterpretation):
-        self._discriminant = 1
         self._call = value
 
     @staticmethod
@@ -195,39 +189,25 @@ def field_names() -> typing.Iterable[str]:
 
     @classmethod
     def required_fields(cls) -> typing.Sequence[str]:
-        return 'subject_or_biosample_id', 'interpretation_status',
+        raise NotImplementedError('Should not be called!')
 
     @classmethod
     def from_dict(cls, values: typing.Mapping[str, typing.Any]):
-        if cls._all_required_fields_are_present(values):
-            if 'gene_descriptor' in values:
-                assert 'variant_interpretation' not in values, \
-                    'Variant interpretation must be unset when Gene descriptor is set!'
-                return GenomicInterpretation(
-                    subject_or_biosample_id=values['subject_or_biosample_id'],
-                    interpretation_status=MessageMixin._extract_enum_field(
-                        'interpretation_status', GenomicInterpretation.InterpretationStatus, values
-                    ),
-                    gene_descriptor=extract_message_scalar('gene_descriptor', GeneDescriptor, values),
-                )
-
-            elif 'variant_interpretation' in values:
-                assert 'gene_descriptor' not in values, \
-                    'Gene descriptor must be unset when Variant interpretation is set!'
-                return GenomicInterpretation(
-                    subject_or_biosample_id=values['subject_or_biosample_id'],
-                    interpretation_status=MessageMixin._extract_enum_field(
-                        'interpretation_status', GenomicInterpretation.InterpretationStatus, values
-                    ),
-                    variant_interpretation=extract_message_scalar(
-                        'variant_interpretation', VariantInterpretation, values
-                    ),
-                )
-
-            else:
-                raise ValueError('Either `gene_descriptor` or `variant_interpretation` must be set!')
+        if 'subject_or_biosample_id' in values \
+                and 'interpretation_status' in values \
+                and any(field in values for field in cls._ONEOF_CALL):
+            return GenomicInterpretation(
+                subject_or_biosample_id=values['subject_or_biosample_id'],
+                interpretation_status=MessageMixin._extract_enum_field(
+                    'interpretation_status', GenomicInterpretation.InterpretationStatus, values,
+                ),
+                call=extract_oneof_scalar(cls._ONEOF_CALL, values),
+            )
         else:
-            cls._complain_about_missing_field(values)
+            raise ValueError(
+                'Missing one of required fields: '
+                f'`subject_or_biosample_id, interpretation_status, gene_descriptor|variant_interpretation` in {values}'
+            )
 
     def to_message(self) -> Message:
         msg = pp202.GenomicInterpretation(
@@ -237,13 +217,12 @@ def to_message(self) -> Message:
             ),
         )
 
-        val = self._call.to_message()
-        if self._discriminant == 0:
-            msg.gene_descriptor.CopyFrom(val)
-        elif self._discriminant == 1:
-            msg.variant_interpretation.CopyFrom(val)
+        if isinstance(self._call, GeneDescriptor):
+            msg.gene_descriptor.CopyFrom(self._call.to_message())
+        elif isinstance(self._call, VariantInterpretation):
+            msg.variant_interpretation.CopyFrom(self._call.to_message())
         else:
-            raise ValueError(f'Invalid discriminant {self._discriminant}')
+            raise ValueError('Bug')
 
         return msg
 
@@ -253,52 +232,26 @@ def message_type(cls) -> typing.Type[Message]:
 
     @classmethod
     def from_message(cls, msg: Message):
-        if isinstance(msg, pp202.GenomicInterpretation):
-            subject_or_biosample_id = msg.subject_or_biosample_id
-            interpretation_status = GenomicInterpretation.InterpretationStatus(msg.interpretation_status)
-
-            case = msg.WhichOneof('call')
-            if case == 'gene_descriptor':
-                return GenomicInterpretation(
-                    subject_or_biosample_id=subject_or_biosample_id,
-                    interpretation_status=interpretation_status,
-                    gene_descriptor=extract_pb_message_scalar(
-                        'gene_descriptor', GeneDescriptor, msg
-                    ),
-                )
-            elif case == 'variant_interpretation':
-                return GenomicInterpretation(
-                    subject_or_biosample_id=subject_or_biosample_id,
-                    interpretation_status=interpretation_status,
-                    variant_interpretation=extract_pb_message_scalar(
-                        'variant_interpretation', VariantInterpretation, msg
-                    ),
-                )
-            else:
-                raise ValueError(f'Unknown one of field set {case}')
-
+        if isinstance(msg, cls.message_type()):
+            return GenomicInterpretation(
+                subject_or_biosample_id=msg.subject_or_biosample_id,
+                interpretation_status=GenomicInterpretation.InterpretationStatus(msg.interpretation_status),
+                call=extract_pb_oneof_scalar('call', cls._ONEOF_CALL, msg),
+            )
         else:
             cls.complain_about_incompatible_msg_type(msg)
 
     def __eq__(self, other):
         return isinstance(other, GenomicInterpretation) \
             and self._subject_or_biosample_id == other._subject_or_biosample_id \
             and self._interpretation_status == other._interpretation_status \
-            and self._discriminant == other._discriminant \
             and self._call == other._call
 
     def __repr__(self):
-        if self._discriminant == 0:
-            val = f'gene_descriptor={self._call}'
-        elif self._discriminant == 1:
-            val = f'variant_interpretation={self._call}'
-        else:
-            raise ValueError(f'Invalid discriminant {self._discriminant}')
-
         return f'GenomicInterpretation(' \
                f'subject_or_biosample_id={self._subject_or_biosample_id}, ' \
                f'interpretation_status={self._interpretation_status}, ' \
-               f'{val})'
+               f'call={self._call})'
 
 
 class Diagnosis(MessageMixin):