Skip to content

Commit

Permalink
Add SCOP2(B) to SIFTS, use OrderedDict and bump version
Browse files Browse the repository at this point in the history
  • Loading branch information
diegozea committed Aug 5, 2020
1 parent 30564ce commit 60e1487
Show file tree
Hide file tree
Showing 8 changed files with 137 additions and 63 deletions.
14 changes: 9 additions & 5 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
## MIToS.jl Release Notes

### Changes from v2.4.0 to v2.4.1
### Changes from v2.4.0 to v2.5.0

MIToS v2.4.1 requires Julia v1.0 or higher. This release drops *Julia 0.7*
support and adds support for **Julia 1.5**.

* Several bug fixes.
MIToS v2.5.0 drops support for *Julia 0.7*, adds support for *Julia 1.5*, and
includes several bug fixes.

* `Cookbook` section added to the docs using [Literate](https://github.com/fredrikekre/Literate.jl)

* The `SIFTS` module now includes the `dbSCOP2` and `dbSCOP2B` databases.

* `siftsmapping` now returns an `OrderedDict` instead of a `Dict`.

* `msacolumn2pdbresidue` now returns an `OrderedDict` instead of a `Dict`.

### Changes from v2.3.0 to v2.4.0

MIToS v2.4 uses `Project.toml` and includes several bug fixes.
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "MIToS"
uuid = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
version = "2.4.1"
version = "2.5.0"

[deps]
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Expand Down
16 changes: 8 additions & 8 deletions src/Pfam/PDB.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ end
"""
`msacolumn2pdbresidue(msa, seqid, pdbid, chain, pfamid, siftsfile; strict=false, checkpdbname=false, missings=true)`
This function returns a `Dict{Int,String}` with **MSA column numbers on the input file**
This function returns an `OrderedDict{Int,String}` with **MSA column numbers on the input file**
as keys and PDB residue numbers (`""` for missings) as values. The mapping is performed
using SIFTS. This function needs correct *ColMap* and *SeqMap* annotations. This checks
correspondence of the residues between the MSA sequence and SIFTS
Expand All @@ -65,7 +65,7 @@ function msacolumn2pdbresidue(msa::AnnotatedMultipleSequenceAlignment,

siftsres = read(siftsfile, SIFTSXML, chain=chain, missings=missings)

up2res = Dict{String,Tuple{String,String,Char}}()
up2res = OrderedDict{String,Tuple{String,String,Char}}()
for res in siftsres
if !ismissing(res.Pfam) && res.Pfam.id == uppercase(pfamid)
pfnum = res.Pfam.number
Expand All @@ -90,7 +90,7 @@ function msacolumn2pdbresidue(msa::AnnotatedMultipleSequenceAlignment,
colmap = getcolumnmapping(msa)
N = ncolumns(msa)

m = Dict{Int,String}()
m = OrderedDict{Int,String}()
sizehint!(m, N)
for i in 1:N
up_number = string(seqmap[i])
Expand Down Expand Up @@ -131,7 +131,7 @@ end

"Returns a `BitVector` where there is a `true` for each column with PDB residue."
function hasresidues(msa::AnnotatedMultipleSequenceAlignment,
column2residues::Dict{Int,String})
column2residues::AbstractDict{Int,String})
colmap = getcolumnmapping(msa)
ncol = length(colmap)
mask = falses(ncol)
Expand All @@ -157,8 +157,8 @@ annotations and two dicts:
to `PDBResidue`. Residues on inserts are not included.
"""
function msaresidues(msa::AnnotatedMultipleSequenceAlignment,
residues::OrderedDict{String,PDBResidue},
column2residues::Dict{Int,String})
residues::AbstractDict{String,PDBResidue},
column2residues::AbstractDict{Int,String})
colmap = getcolumnmapping(msa)
msares = sizehint!(OrderedDict{Int,PDBResidue}(), length(colmap))
for col in colmap
Expand Down Expand Up @@ -190,8 +190,8 @@ equal to `distance_limit` (default to `6.05`) angstroms between any heavy atom.
indicates a missing value.
"""
function msacontacts(msa::AnnotatedMultipleSequenceAlignment,
residues::OrderedDict{String,PDBResidue},
column2residues::Dict{Int,String},
residues::AbstractDict{String,PDBResidue},
column2residues::AbstractDict{Int,String},
distance_limit::Float64=6.05)
colmap = getcolumnmapping(msa)
contacts = columnpairsmatrix(msa)
Expand Down
99 changes: 64 additions & 35 deletions src/SIFTS/ResidueMapping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ end
`dbNCBI` stores the residue `id`, `number` and `name` in NCBI as strings.
""" dbNCBI

for ref_type in [:dbPDB, :dbCATH, :dbSCOP]
for ref_type in [:dbPDB, :dbCATH, :dbSCOP, :dbSCOP2, :dbSCOP2B]
@eval begin

@auto_hash_equals struct $(ref_type) <: DataBase
Expand All @@ -74,13 +74,27 @@ end
""" dbPDB

@doc """
`dbCATH` stores the residue `id`, `number`, `name` and `chain` in CATH as strings.
`dbCATH` stores the residue `id`, `number`, `name` and `chain` in CATH as
strings.
""" dbCATH

@doc """
`dbSCOP` stores the residue `id`, `number`, `name` and `chain` in SCOP as strings.
`dbSCOP` stores the residue `id`, `number`, `name` and `chain` in SCOP as
strings.
""" dbSCOP

@doc """
`dbSCOP2` stores the residue `id`, `number`, `name` and `chain` in SCOP2 as
strings.
""" dbSCOP2

@doc """
`dbSCOP2B` stores the residue `id`, `number`, `name` and `chain` in SCOP2B as
strings. *SCOP2B* is an expansion of *SCOP2* domain annotations at superfamily
level to every *PDB* with the same *UniProt* accession having at least 80% *SCOP2*
domain coverage.
""" dbSCOP2B

"""
Returns "" if the attribute is missing
"""
Expand All @@ -97,12 +111,12 @@ end
Returns `missing` if the attribute is missing
"""
function _get_nullable_attribute(elem::LightXML.XMLElement,
attr::String)::Union{String, Missing}
attr::String)::Union{String,Missing}
text = attribute(elem, attr)
(text === nothing || text == "None") ? missing : text
end

for ref_type in [:dbPDB, :dbCATH, :dbSCOP]
for ref_type in [:dbPDB, :dbCATH, :dbSCOP, :dbSCOP2, :dbSCOP2B]
@eval begin

function $(ref_type)(map::LightXML.XMLElement)
Expand Down Expand Up @@ -152,7 +166,7 @@ function dbInterPro(map::LightXML.XMLElement)
end

function dbPDBe(map::LightXML.XMLElement)
dbPDBe(
dbPDBe(
_get_attribute(map, "dbResNum"),
_get_attribute(map, "dbResName")
)
Expand All @@ -169,6 +183,8 @@ following fields that you can access at any moment for query purposes:
- `InterPro` : An array of `dbInterPro` objects.
- `PDB` : A `dbPDB` object or `missing`.
- `SCOP` : A `dbSCOP` object or `missing`.
- `SCOP2` : An array of `dbSCOP2` objects.
- `SCOP2B` : A `dbSCOP2B` object or `missing`.
- `CATH` : A `dbCATH` object or `missing`.
- `Ensembl` : An array of `dbEnsembl` objects.
- `missing` : It's `true` if the residue is missing, i.e. not observed, in the structure.
Expand All @@ -178,14 +194,16 @@ following fields that you can access at any moment for query purposes:
@auto_hash_equals struct SIFTSResidue
PDBe::dbPDBe
# crossRefDb
UniProt::Union{dbUniProt, Missing}
Pfam::Union{dbPfam, Missing}
NCBI::Union{dbNCBI, Missing}
InterPro::Array{dbInterPro, 1}
PDB::Union{dbPDB, Missing}
SCOP::Union{dbSCOP, Missing}
CATH::Union{dbCATH, Missing}
Ensembl::Array{dbEnsembl, 1}
UniProt::Union{dbUniProt,Missing}
Pfam::Union{dbPfam,Missing}
NCBI::Union{dbNCBI,Missing}
InterPro::Array{dbInterPro,1}
PDB::Union{dbPDB,Missing}
SCOP::Union{dbSCOP,Missing}
SCOP2::Array{dbSCOP2,1}
SCOP2B::Union{dbSCOP2B,Missing}
CATH::Union{dbCATH,Missing}
Ensembl::Array{dbEnsembl,1}
# residueDetail
missing::Bool # XML: <residueDetail dbSource="PDBe" property="Annotation" ...
sscode::String # XML: <residueDetail dbSource="PDBe" property="codeSecondaryStructure"...
Expand All @@ -200,28 +218,30 @@ end
@inline _name(::Type{dbUniProt}) = "UniProt"
@inline _name(::Type{dbPfam}) = "Pfam"
@inline _name(::Type{dbNCBI}) = "NCBI"
@inline _name(::Type{dbInterPro})= "InterPro"
@inline _name(::Type{dbInterPro}) = "InterPro"
@inline _name(::Type{dbPDB}) = "PDB"
@inline _name(::Type{dbSCOP}) = "SCOP"
@inline _name(::Type{dbSCOP2}) = "SCOP2"
@inline _name(::Type{dbSCOP2B}) = "SCOP2B"
@inline _name(::Type{dbCATH}) = "CATH"
@inline _name(::Type{dbEnsembl}) = "Ensembl"

@inline Base.get(res::SIFTSResidue, db::Type{dbPDBe}) = res.PDBe
@inline Base.get(res::SIFTSResidue, db::Type{dbUniProt}) = res.UniProt
@inline Base.get(res::SIFTSResidue, db::Type{dbPfam}) = res.Pfam
@inline Base.get(res::SIFTSResidue, db::Type{dbNCBI}) = res.NCBI
@inline Base.get(res::SIFTSResidue, db::Type{dbInterPro})= res.InterPro
@inline Base.get(res::SIFTSResidue, db::Type{dbInterPro}) = res.InterPro
@inline Base.get(res::SIFTSResidue, db::Type{dbPDB}) = res.PDB
@inline Base.get(res::SIFTSResidue, db::Type{dbSCOP}) = res.SCOP
@inline Base.get(res::SIFTSResidue, db::Type{dbSCOP2}) = res.SCOP2
@inline Base.get(res::SIFTSResidue, db::Type{dbSCOP2B}) = res.SCOP2B
@inline Base.get(res::SIFTSResidue, db::Type{dbCATH}) = res.CATH
@inline Base.get(res::SIFTSResidue, db::Type{dbEnsembl}) = res.Ensembl

function Base.get(res::SIFTSResidue,
db::Type{T},
field::Symbol,
default::Union{String, Missing}=missing) where T <: Union{
dbUniProt, dbPfam, dbNCBI, dbPDB, dbSCOP, dbCATH
}
default::Union{String,Missing}=missing) where T <: Union{dbUniProt,dbPfam,dbNCBI,dbPDB,dbSCOP,dbSCOP2B,dbCATH}
database = get(res, db)
ismissing(database) ? default : getfield(database, field)
end
Expand All @@ -239,7 +259,7 @@ function Base.show(io::IO, res::SIFTSResidue)
println(io, " PDBe:")
println(io, " number: ", res.PDBe.number)
println(io, " name: ", res.PDBe.name)
for dbname in [:UniProt, :Pfam, :NCBI, :PDB, :SCOP, :CATH]
for dbname in [:UniProt, :Pfam, :NCBI, :PDB, :SCOP, :SCOP2B, :CATH]
dbfield = getfield(res, dbname)
if !ismissing(dbfield)
println(io, " ", dbname, " :")
Expand All @@ -248,8 +268,9 @@ function Base.show(io::IO, res::SIFTSResidue)
end
end
end
println(io, " InterPro: ", res.InterPro)
println(io, " Ensembl: ", res.Ensembl)
length(res.SCOP2) > 0 && println(io, " SCOP2: ", res.SCOP2)
length(res.InterPro) > 0 && println(io, " InterPro: ", res.InterPro)
length(res.Ensembl) > 0 && println(io, " Ensembl: ", res.Ensembl)
end

# Creation
Expand All @@ -264,6 +285,8 @@ function SIFTSResidue(residue::LightXML.XMLElement, missing_residue::Bool,
InterPro = dbInterPro[]
PDB = missing
SCOP = missing
SCOP2 = dbSCOP2[]
SCOP2B = missing
CATH = missing
Ensembl = dbEnsembl[]
for crossref in get_elements_by_tagname(residue, "crossRefDb")
Expand All @@ -280,6 +303,10 @@ function SIFTSResidue(residue::LightXML.XMLElement, missing_residue::Bool,
PDB = dbPDB(crossref)
elseif db == "SCOP"
SCOP = dbSCOP(crossref)
elseif db == "SCOP2"
push!(SCOP2, dbSCOP2(crossref))
elseif db == "SCOP2B"
SCOP2B = dbSCOP2B(crossref)
elseif db == "CATH"
CATH = dbCATH(crossref)
elseif db == "Ensembl"
Expand All @@ -295,6 +322,8 @@ function SIFTSResidue(residue::LightXML.XMLElement, missing_residue::Bool,
InterPro,
PDB,
SCOP,
SCOP2,
SCOP2B,
CATH,
Ensembl,
missing_residue,
Expand All @@ -313,19 +342,19 @@ _is_All(::Any) = false
_is_All(::Type{All}) = true

"""
Parses a SIFTS XML file and returns a `Dict` between residue numbers of two `DataBase`s
with the given identifiers. A `chain` could be specified (`All` by default). If `missings`
is `true` (default) all the residues are used, even if they haven’t coordinates in the
PDB file.
Parses a SIFTS XML file and returns an `OrderedDict` between residue numbers of
two `DataBase`s with the given identifiers. A `chain` could be specified
(`All` by default). If `missings` is `true` (default) all the residues are
used, even if they have no coordinates in the PDB file.
"""
function siftsmapping(filename::String,
db_from::Type{F},
id_from::String,
db_to::Type{T},
id_to::String;
chain::Union{Type{All},String} = All,
missings::Bool = true) where {F, T}
mapping = Dict{String,String}()
chain::Union{Type{All},String}=All,
missings::Bool=true) where {F,T}
mapping = OrderedDict{String,String}()
xdoc = parse_file(filename)
try
for entity in _get_entities(xdoc)
Expand All @@ -334,17 +363,17 @@ function siftsmapping(filename::String,
residues = _get_residues(segment)
for residue in residues
in_chain = _is_All(chain)
key_data = _name(db_from) == "PDBe" ? attribute(residue,"dbResNum") : missing
value_data = _name(db_to) == "PDBe" ? attribute(residue,"dbResNum") : missing
key_data = _name(db_from) == "PDBe" ? attribute(residue, "dbResNum") : missing
value_data = _name(db_to) == "PDBe" ? attribute(residue, "dbResNum") : missing
if missings || !_is_missing(residue)
crossref = get_elements_by_tagname(residue, "crossRefDb")
for ref in crossref
source = attribute(ref, "dbSource")
if source == _name(db_from) && attribute(ref, "dbAccessionId") == id_from
key_data = _get_nullable_attribute(ref,"dbResNum")
key_data = _get_nullable_attribute(ref, "dbResNum")
end
if source == _name(db_to) && attribute(ref, "dbAccessionId") == id_to
value_data = _get_nullable_attribute(ref,"dbResNum")
value_data = _get_nullable_attribute(ref, "dbResNum")
end
if !in_chain && source == "PDB" # XML: <crossRefDb dbSource="PDB" ... dbChainId="E"/>
in_chain = attribute(ref, "dbChainId") == chain
Expand Down Expand Up @@ -376,7 +405,7 @@ Returns a `Vector{SIFTSResidue}` parsed from a `SIFTSXML` file.
By default, parses all the `chain`s and includes missing residues.
"""
function Base.parse(document::LightXML.XMLDocument, ::Type{SIFTSXML};
chain::Union{Type{All},String}=All, missings::Bool = true)
chain::Union{Type{All},String}=All, missings::Bool=true)
vector = SIFTSResidue[]
for entity in _get_entities(document)
for segment in _get_segments(entity)
Expand All @@ -400,7 +429,7 @@ end

for F in (:findall, :filter!, :filter)
@eval begin
function Base.$(F)(f::Function,list::AbstractVector{SIFTSResidue},db::Type{T}) where T<:DataBase
function Base.$(F)(f::Function, list::AbstractVector{SIFTSResidue}, db::Type{T}) where T <: DataBase
$(F)(list) do res
database = get(res, db)
if !ismissing(database)
Expand Down
7 changes: 5 additions & 2 deletions src/SIFTS/SIFTS.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ SIFTS offers more reliable association between sequence and structure residue n
- Download and parse SIFTS XML files
- Store residue-level mapping in Julia
- Easy generation of `Dict`s between residues numbers
- Easy generation of `OrderedDict`s between residue numbers
```julia
using MIToS.SIFTS
Expand All @@ -20,9 +20,10 @@ module SIFTS

using LightXML
using AutoHashEquals
using DataStructures
using MIToS.Utils

export DataBase,
export DataBase,
dbPDBe,
dbInterPro,
dbUniProt,
Expand All @@ -31,6 +32,8 @@ export DataBase,
dbPDB,
dbCATH,
dbSCOP,
dbSCOP2,
dbSCOP2B,
dbEnsembl,
SIFTSResidue,
downloadsifts,
Expand Down
Loading

0 comments on commit 60e1487

Please sign in to comment.