Add SCOP2(B) to SIFTS, use OrderedDict and bump version

diegozea · Aug 5, 2020 · 60e1487 · 60e1487
1 parent 30564ce
commit 60e1487
Show file tree

Hide file tree

Showing 8 changed files with 137 additions and 63 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,14 +1,18 @@
 ## MIToS.jl Release Notes
 
-### Changes from v2.4.0 to v2.4.1
+### Changes from v2.4.0 to v2.5.0
 
-MIToS v2.4.1 requires Julia v1.0 or higher. This release drops *Julia 0.7* 
-support and adds support for **Julia 1.5**. 
-
-* Several bug fixes.
+MIToS v2.5.0 drops support for *Julia 0.7*, adds support for *Julia 1.5*, and 
+includes several bug fixes.
 
 * `Cookbook` section added to the docs using [Literate](https://github.com/fredrikekre/Literate.jl)
 
+* The `SIFTS` module now includes the `dbSCOP2` and `dbSCOP2B` databases.
+
+* `siftsmapping` now returns an `OrderedDict` instead of a `Dict`.
+
+* `msacolumn2pdbresidue` now returns an `OrderedDict` instead of a `Dict`.
+
 ### Changes from v2.3.0 to v2.4.0
 
 MIToS v2.4 uses `Project.toml` and includes several bug fixes.

diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "MIToS"
 uuid = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
-version = "2.4.1"
+version = "2.5.0"
 
 [deps]
 ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"

diff --git a/src/Pfam/PDB.jl b/src/Pfam/PDB.jl
@@ -38,7 +38,7 @@ end
 """
 `msacolumn2pdbresidue(msa, seqid, pdbid, chain, pfamid, siftsfile; strict=false, checkpdbname=false, missings=true)`
 
-This function returns a `Dict{Int,String}` with **MSA column numbers on the input file**
+This function returns an `OrderedDict{Int,String}` with **MSA column numbers on the input file**
 as keys and PDB residue numbers (`""` for missings) as values. The mapping is performed
 using SIFTS. This function needs correct *ColMap* and *SeqMap* annotations. This checks
 correspondence of the residues between the MSA sequence and SIFTS
@@ -65,7 +65,7 @@ function msacolumn2pdbresidue(msa::AnnotatedMultipleSequenceAlignment,
 
     siftsres = read(siftsfile, SIFTSXML, chain=chain, missings=missings)
 
-    up2res = Dict{String,Tuple{String,String,Char}}()
+    up2res = OrderedDict{String,Tuple{String,String,Char}}()
     for res in siftsres
         if !ismissing(res.Pfam) && res.Pfam.id == uppercase(pfamid)
             pfnum  = res.Pfam.number
@@ -90,7 +90,7 @@ function msacolumn2pdbresidue(msa::AnnotatedMultipleSequenceAlignment,
     colmap   = getcolumnmapping(msa)
     N        = ncolumns(msa)
 
-    m = Dict{Int,String}()
+    m = OrderedDict{Int,String}()
     sizehint!(m, N)
     for i in 1:N
         up_number = string(seqmap[i])
@@ -131,7 +131,7 @@ end
 
 "Returns a `BitVector` where there is a `true` for each column with PDB residue."
 function hasresidues(msa::AnnotatedMultipleSequenceAlignment,
-                    column2residues::Dict{Int,String})
+                    column2residues::AbstractDict{Int,String})
     colmap = getcolumnmapping(msa)
     ncol = length(colmap)
     mask = falses(ncol)
@@ -157,8 +157,8 @@ annotations and two dicts:
 to `PDBResidue`. Residues on inserts are not included.
 """
 function msaresidues(msa::AnnotatedMultipleSequenceAlignment,
-                     residues::OrderedDict{String,PDBResidue},
-                     column2residues::Dict{Int,String})
+                     residues::AbstractDict{String,PDBResidue},
+                     column2residues::AbstractDict{Int,String})
     colmap = getcolumnmapping(msa)
     msares = sizehint!(OrderedDict{Int,PDBResidue}(), length(colmap))
     for col in colmap
@@ -190,8 +190,8 @@ equal to `distance_limit` (default to `6.05`) angstroms between any heavy atom.
 indicates a missing value.
 """
 function msacontacts(msa::AnnotatedMultipleSequenceAlignment,
-                     residues::OrderedDict{String,PDBResidue},
-                     column2residues::Dict{Int,String},
+                     residues::AbstractDict{String,PDBResidue},
+                     column2residues::AbstractDict{Int,String},
                      distance_limit::Float64=6.05)
     colmap   = getcolumnmapping(msa)
     contacts = columnpairsmatrix(msa)

diff --git a/src/SIFTS/ResidueMapping.jl b/src/SIFTS/ResidueMapping.jl
@@ -56,7 +56,7 @@ end
 `dbNCBI` stores the residue `id`, `number` and `name` in NCBI as strings.
 """ dbNCBI
 
-for ref_type in [:dbPDB, :dbCATH, :dbSCOP]
+for ref_type in [:dbPDB, :dbCATH, :dbSCOP, :dbSCOP2, :dbSCOP2B]
     @eval begin
 
         @auto_hash_equals struct $(ref_type) <: DataBase
@@ -74,13 +74,27 @@ end
 """ dbPDB
 
 @doc """
-`dbCATH` stores the residue `id`, `number`, `name` and `chain` in CATH as strings.
+`dbCATH` stores the residue `id`, `number`, `name` and `chain` in CATH as 
+strings.
 """ dbCATH
 
 @doc """
-`dbSCOP` stores the residue `id`, `number`, `name` and `chain` in SCOP as strings.
+`dbSCOP` stores the residue `id`, `number`, `name` and `chain` in SCOP as 
+strings.
 """ dbSCOP
 
+@doc """
+`dbSCOP2` stores the residue `id`, `number`, `name` and `chain` in SCOP2 as 
+strings.
+""" dbSCOP2
+
+@doc """
+`dbSCOP2B` stores the residue `id`, `number`, `name` and `chain` in SCOP2B as 
+strings. *SCOP2B* is an expansion of *SCOP2* domain annotations at superfamily 
+level to every *PDB* with the same *UniProt* accession having at least 80% *SCOP2* 
+domain coverage.
+""" dbSCOP2B
+
 """
 Returns "" if the attribute is missing
 """
@@ -97,12 +111,12 @@ end
 Returns `missing` if the attribute is missing
 """
 function _get_nullable_attribute(elem::LightXML.XMLElement,
-                                 attr::String)::Union{String, Missing}
+                                 attr::String)::Union{String,Missing}
     text = attribute(elem, attr)
     (text === nothing || text == "None") ? missing : text
 end
 
-for ref_type in [:dbPDB, :dbCATH, :dbSCOP]
+for ref_type in [:dbPDB, :dbCATH, :dbSCOP, :dbSCOP2, :dbSCOP2B]
     @eval begin
 
         function $(ref_type)(map::LightXML.XMLElement)
@@ -152,7 +166,7 @@ function dbInterPro(map::LightXML.XMLElement)
 end
 
 function dbPDBe(map::LightXML.XMLElement)
-      dbPDBe(
+    dbPDBe(
         _get_attribute(map, "dbResNum"),
         _get_attribute(map, "dbResName")
       )
@@ -169,6 +183,8 @@ following fields that you can access at any moment for query purposes:
     - `InterPro` : An array of `dbInterPro` objects.
     - `PDB` : A `dbPDB` object or `missing`.
     - `SCOP` : A `dbSCOP` object or `missing`.
+    - `SCOP2` : An array of `dbSCOP2` objects.
+    - `SCOP2B` : A `dbSCOP2B` object or `missing`.
     - `CATH` : A `dbCATH` object or `missing`.
     - `Ensembl` : An array of `dbEnsembl` objects.
     - `missing` : It's `true` if the residue is missing, i.e. not observed, in the structure.
@@ -178,14 +194,16 @@ following fields that you can access at any moment for query purposes:
 @auto_hash_equals struct SIFTSResidue
     PDBe::dbPDBe
     # crossRefDb
-    UniProt::Union{dbUniProt, Missing}
-    Pfam::Union{dbPfam, Missing}
-    NCBI::Union{dbNCBI, Missing}
-    InterPro::Array{dbInterPro, 1}
-    PDB::Union{dbPDB, Missing}
-    SCOP::Union{dbSCOP, Missing}
-    CATH::Union{dbCATH, Missing}
-    Ensembl::Array{dbEnsembl, 1}
+    UniProt::Union{dbUniProt,Missing}
+    Pfam::Union{dbPfam,Missing}
+    NCBI::Union{dbNCBI,Missing}
+    InterPro::Array{dbInterPro,1}
+    PDB::Union{dbPDB,Missing}
+    SCOP::Union{dbSCOP,Missing}
+    SCOP2::Array{dbSCOP2,1}
+    SCOP2B::Union{dbSCOP2B,Missing}
+    CATH::Union{dbCATH,Missing}
+    Ensembl::Array{dbEnsembl,1}
     # residueDetail
     missing::Bool  # XML: <residueDetail dbSource="PDBe" property="Annotation" ...
     sscode::String # XML: <residueDetail dbSource="PDBe" property="codeSecondaryStructure"...
@@ -200,28 +218,30 @@ end
 @inline _name(::Type{dbUniProt}) = "UniProt"
 @inline _name(::Type{dbPfam})    = "Pfam"
 @inline _name(::Type{dbNCBI})    = "NCBI"
-@inline _name(::Type{dbInterPro})= "InterPro"
+@inline _name(::Type{dbInterPro}) = "InterPro"
 @inline _name(::Type{dbPDB})     = "PDB"
 @inline _name(::Type{dbSCOP})    = "SCOP"
+@inline _name(::Type{dbSCOP2})   = "SCOP2"
+@inline _name(::Type{dbSCOP2B})  = "SCOP2B"
 @inline _name(::Type{dbCATH})    = "CATH"
 @inline _name(::Type{dbEnsembl}) = "Ensembl"
 
 @inline Base.get(res::SIFTSResidue, db::Type{dbPDBe})    = res.PDBe
 @inline Base.get(res::SIFTSResidue, db::Type{dbUniProt}) = res.UniProt
 @inline Base.get(res::SIFTSResidue, db::Type{dbPfam})    = res.Pfam
 @inline Base.get(res::SIFTSResidue, db::Type{dbNCBI})    = res.NCBI
-@inline Base.get(res::SIFTSResidue, db::Type{dbInterPro})= res.InterPro
+@inline Base.get(res::SIFTSResidue, db::Type{dbInterPro}) = res.InterPro
 @inline Base.get(res::SIFTSResidue, db::Type{dbPDB})     = res.PDB
 @inline Base.get(res::SIFTSResidue, db::Type{dbSCOP})    = res.SCOP
+@inline Base.get(res::SIFTSResidue, db::Type{dbSCOP2})   = res.SCOP2
+@inline Base.get(res::SIFTSResidue, db::Type{dbSCOP2B})  = res.SCOP2B
 @inline Base.get(res::SIFTSResidue, db::Type{dbCATH})    = res.CATH
 @inline Base.get(res::SIFTSResidue, db::Type{dbEnsembl}) = res.Ensembl
 
 function Base.get(res::SIFTSResidue,
                   db::Type{T},
                   field::Symbol,
-                  default::Union{String, Missing}=missing) where T <: Union{
-                        dbUniProt, dbPfam, dbNCBI, dbPDB, dbSCOP, dbCATH
-                    }
+                  default::Union{String,Missing}=missing) where T <: Union{dbUniProt,dbPfam,dbNCBI,dbPDB,dbSCOP,dbSCOP2B,dbCATH}
     database = get(res, db)
     ismissing(database) ? default : getfield(database, field)
 end
@@ -239,7 +259,7 @@ function Base.show(io::IO, res::SIFTSResidue)
     println(io, "  PDBe:")
     println(io, "    number: ", res.PDBe.number)
     println(io, "    name: ", res.PDBe.name)
-    for dbname in [:UniProt, :Pfam, :NCBI, :PDB, :SCOP, :CATH]
+    for dbname in [:UniProt, :Pfam, :NCBI, :PDB, :SCOP, :SCOP2B, :CATH]
         dbfield = getfield(res, dbname)
         if !ismissing(dbfield)
             println(io, "  ", dbname, " :")
@@ -248,8 +268,9 @@ function Base.show(io::IO, res::SIFTSResidue)
             end
         end
     end
-    println(io, "  InterPro: ",  res.InterPro)
-    println(io, "  Ensembl: ",  res.Ensembl)
+    length(res.SCOP2) > 0 && println(io, "  SCOP2: ", res.SCOP2)
+    length(res.InterPro) > 0 && println(io, "  InterPro: ",  res.InterPro)
+    length(res.Ensembl) > 0 && println(io, "  Ensembl: ",  res.Ensembl)
 end
 
 # Creation
@@ -264,6 +285,8 @@ function SIFTSResidue(residue::LightXML.XMLElement, missing_residue::Bool,
     InterPro = dbInterPro[]
     PDB = missing
     SCOP = missing
+    SCOP2 = dbSCOP2[]
+    SCOP2B = missing
     CATH = missing
     Ensembl = dbEnsembl[]
     for crossref in get_elements_by_tagname(residue, "crossRefDb")
@@ -280,6 +303,10 @@ function SIFTSResidue(residue::LightXML.XMLElement, missing_residue::Bool,
             PDB = dbPDB(crossref)
         elseif db == "SCOP"
             SCOP = dbSCOP(crossref)
+        elseif db == "SCOP2"
+            push!(SCOP2, dbSCOP2(crossref))
+        elseif db == "SCOP2B"
+            SCOP2B = dbSCOP2B(crossref)
         elseif db == "CATH"
             CATH = dbCATH(crossref)
         elseif db == "Ensembl"
@@ -295,6 +322,8 @@ function SIFTSResidue(residue::LightXML.XMLElement, missing_residue::Bool,
                  InterPro,
                  PDB,
                  SCOP,
+                 SCOP2,
+                 SCOP2B,
                  CATH,
                  Ensembl,
                  missing_residue,
@@ -313,19 +342,19 @@ _is_All(::Any) = false
 _is_All(::Type{All}) = true
 
 """
-Parses a SIFTS XML file and returns a `Dict` between residue numbers of two `DataBase`s
-with the given identifiers. A `chain` could be specified (`All` by default). If `missings`
-is `true` (default) all the residues are used, even if they haven’t coordinates in the
-PDB file.
+Parses a SIFTS XML file and returns an `OrderedDict` between residue numbers of 
+two `DataBase`s with the given identifiers. A `chain` could be specified 
+(`All` by default). If `missings` is `true` (default) all the residues are 
+used, even if they do not have coordinates in the PDB file.
 """
 function siftsmapping(filename::String,
                       db_from::Type{F},
                       id_from::String,
                       db_to::Type{T},
                       id_to::String;
-                      chain::Union{Type{All},String} = All,
-                      missings::Bool = true) where {F, T}
-    mapping = Dict{String,String}()
+                      chain::Union{Type{All},String}=All,
+                      missings::Bool=true) where {F,T}
+    mapping = OrderedDict{String,String}()
     xdoc = parse_file(filename)
     try
         for entity in _get_entities(xdoc)
@@ -334,17 +363,17 @@ function siftsmapping(filename::String,
                 residues = _get_residues(segment)
                 for residue in residues
                     in_chain = _is_All(chain)
-                    key_data = _name(db_from) == "PDBe" ? attribute(residue,"dbResNum") : missing
-                    value_data = _name(db_to) == "PDBe" ? attribute(residue,"dbResNum") : missing
+                    key_data = _name(db_from) == "PDBe" ? attribute(residue, "dbResNum") : missing
+                    value_data = _name(db_to) == "PDBe" ? attribute(residue, "dbResNum") : missing
                     if missings || !_is_missing(residue)
                         crossref = get_elements_by_tagname(residue, "crossRefDb")
                         for ref in crossref
                             source = attribute(ref, "dbSource")
                             if source == _name(db_from) && attribute(ref, "dbAccessionId") == id_from
-                                key_data = _get_nullable_attribute(ref,"dbResNum")
+                                key_data = _get_nullable_attribute(ref, "dbResNum")
                             end
                             if source == _name(db_to) && attribute(ref, "dbAccessionId") == id_to
-                                value_data = _get_nullable_attribute(ref,"dbResNum")
+                                value_data = _get_nullable_attribute(ref, "dbResNum")
                             end
                             if !in_chain && source == "PDB" # XML: <crossRefDb dbSource="PDB" ... dbChainId="E"/>
                                 in_chain = attribute(ref, "dbChainId") == chain
@@ -376,7 +405,7 @@ Returns a `Vector{SIFTSResidue}` parsed from a `SIFTSXML` file.
 By default, parses all the `chain`s and includes missing residues.
 """
 function Base.parse(document::LightXML.XMLDocument, ::Type{SIFTSXML};
-                    chain::Union{Type{All},String}=All, missings::Bool = true)
+                    chain::Union{Type{All},String}=All, missings::Bool=true)
     vector = SIFTSResidue[]
     for entity in _get_entities(document)
         for segment in _get_segments(entity)
@@ -400,7 +429,7 @@ end
 
 for F in (:findall, :filter!, :filter)
     @eval begin
-        function Base.$(F)(f::Function,list::AbstractVector{SIFTSResidue},db::Type{T}) where T<:DataBase
+        function Base.$(F)(f::Function, list::AbstractVector{SIFTSResidue}, db::Type{T}) where T <: DataBase
             $(F)(list) do res
                 database = get(res, db)
                 if !ismissing(database)

diff --git a/src/SIFTS/SIFTS.jl b/src/SIFTS/SIFTS.jl
@@ -10,7 +10,7 @@ SIFTS offers  more reliable association between sequence and structure residue n
 
 - Download and parse SIFTS XML files
 - Store residue-level mapping in Julia
-- Easy generation of `Dict`s between residues numbers
+- Easy generation of `OrderedDict`s between residues numbers
 
 ```julia
 using MIToS.SIFTS
@@ -20,9 +20,10 @@ module SIFTS
 
 using LightXML
 using AutoHashEquals
+using DataStructures
 using MIToS.Utils
 
-export  DataBase,
+export DataBase,
         dbPDBe,
         dbInterPro,
         dbUniProt,
@@ -31,6 +32,8 @@ export  DataBase,
         dbPDB,
         dbCATH,
         dbSCOP,
+        dbSCOP2,
+        dbSCOP2B,
         dbEnsembl,
         SIFTSResidue,
         downloadsifts,