Skip to content

Commit

Permalink
shuffle -> shuffle_msa
Browse files Browse the repository at this point in the history
  • Loading branch information
diegozea committed Jun 24, 2024
1 parent cffb422 commit 553f6bb
Show file tree
Hide file tree
Showing 7 changed files with 215 additions and 63 deletions.
10 changes: 10 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
## MIToS.jl Release Notes

### Changes from v2.18.0 to v2.19.0

* *[Breaking change]* The `shuffle` and `shuffle!` functions are deprecated in favor of the
`shuffle_msa` and `shuffle_msa!` functions. The new functions take `dims` and
`fixedgaps` as keyword arguments instead of taking them as positional ones. The new
functions accept an optional last positional argument to select the specific sequences
or columns to shuffle. They also add the `fixed_reference` keyword argument to keep the
residues in the reference sequence fixed during the shuffling. As an example of migration,
`shuffle!(msa, 1, false)` should be replaced by `shuffle_msa!(msa, dims=1, fixedgaps=false)`.

### Changes from v2.17.0 to v2.18.0

* *[Breaking change]* The `read`, `parse`, `write`, and `print` functions for different
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "MIToS"
uuid = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
version = "2.18.0"
version = "2.19.0"

[deps]
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Expand Down
2 changes: 1 addition & 1 deletion docs/src/Information.md
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ This algorithm can be accessed through the `buslje09` function and includes:
3. Average Product Correction (APC) proposed by
[Dunn et. al. 2008![](./assets/external-link.png)](http://bioinformatics.oxfordjournals.org/content/24/3/333),
through the `APC!` function that takes a MI matrix.
4. Z score correction using the functions `shuffle!` from the MSA module and `zscore`
4. Z score correction using the functions `shuffle_msa!` from the MSA module and `zscore`
from the `PairwiseListMatrices` package.

```@docs
Expand Down
4 changes: 2 additions & 2 deletions src/Information/CorrectedMutualInformation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ function buslje09(aln::AbstractMatrix{Residue};
zmi = copy(mi)
residuematrix = getresidues(aln)
for ns in 1:samples
shuffle!(residuematrix,1,fixedgaps)
shuffle_msa!(residuematrix, dims=1, fixedgaps=fixedgaps)
rand_mi[ns] = getarray(_buslje09(aln, alphabet, clusters, lambda, apc))
end
PairwiseListMatrices.zscore!(rand_mi, getarray(zmi))
Expand Down Expand Up @@ -132,7 +132,7 @@ function BLMI(aln::AbstractMatrix{Residue};
zmi = copy(mi)
residuematrix = getresidues(aln)
for ns in 1:samples
shuffle!(residuematrix,1,fixedgaps)
shuffle_msa!(residuematrix, dims=1, fixedgaps=fixedgaps)
rand_mi[ns] = getarray(_BLMI(aln, clusters, numbercl, beta, apc, lambda))
end
PairwiseListMatrices.zscore(rand_mi, getarray(zmi))
Expand Down
5 changes: 4 additions & 1 deletion src/MSA/MSA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ using OrderedCollections # OrderedDicts for Annotations
using AutoHashEquals # Annotations, Clusters
using NamedArrays # Col and Seq names, basic sequence/MSA object
using FastaIO # FastaReader (fast)
using Random # GLOBAL_RNG, shuffle!, rand, Sampler, randstring
using Random # default_rng, shuffle!, rand, Sampler, randstring
using Dates # Dates.now()
using PairwiseListMatrices # Percent Identity Matrices
using Clustering # Used for sequence clustering: ClusteringResult
Expand Down Expand Up @@ -94,6 +94,9 @@ export # Residue
PIR,
# A3M
A3M, A2M,
# Shuffle
shuffle_msa!,
shuffle_msa,
# PLM
sequencepairsmatrix, columnpairsmatrix,
# Identity
Expand Down
193 changes: 144 additions & 49 deletions src/MSA/Shuffle.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,122 @@
# Resolve `subset` into a freshly allocated `Vector{Int}` of indices along `dims`
# for a plain residue matrix.
#
# - `subset === Colon()` selects every index: all rows when `dims == 1`, all columns
#   for any other value of `dims`.
# - Otherwise `subset` must have `Int` as its element type (a single `Int`, a range,
#   a vector, a tuple, a set, ...); anything else throws an `ArgumentError`.
#
# The result never aliases `subset`, so callers can modify it in place without
# changing the collection that the user passed in.
function _subset_indices(msa::Matrix{Residue}, dims::Int, subset)::Vector{Int}
    if subset isa Colon
        return collect(1:size(msa, dims == 1 ? 1 : 2))
    end
    if eltype(subset) !== Int
        throw(ArgumentError(
        "For a Matrix{Residue}, subset must be an iterator of Int values or Colon()"))
    end
    # `collect` copies (and accepts any iterable of Int); `vec` flattens the
    # zero-dimensional array that `collect` produces for a single `Int`.
    return vec(collect(Int, subset))
end

# Resolve `subset` for a named residue matrix by delegating to `NamedArrays.indices`
# with the name dictionary of the selected dimension: the first dictionary when
# `dims == 1`, the second one for any other value.
# NOTE(review): presumably this is what lets `subset` be given as sequence/column
# names — confirm against the NamedArrays documentation.
function _subset_indices(msa::NamedResidueMatrix, dims::Int, subset)::Vector{Int}
    axis = dims == 1 ? 1 : 2
    return NamedArrays.indices(msa.dicts[axis], subset)
end

# Resolve `subset` for an MSA object by forwarding to the method defined for the
# object's `matrix` field.
function _subset_indices(msa::AbstractMultipleSequenceAlignment, dims, subset)::Vector{Int}
    inner = msa.matrix
    return _subset_indices(inner, dims, subset)
end

# Core in-place shuffling routine.
#
# Arguments
# - `r`: random number generator handed to `shuffle!`.
# - `msa`: residue matrix; the shuffling is applied to `getresidues(msa)`.
#   NOTE(review): this relies on `getresidues` returning storage shared with `msa`
#   so that the changes are visible through `msa` — confirm.
# - `subset`: rows (`dims == 1`) or columns (`dims == 2`) to shuffle, resolved by
#   `_subset_indices`; `Colon()` (the default) selects all of them.
#
# Keywords
# - `dims`: 1 to permute residues within each selected sequence, 2 (default) to
#   permute them within each selected column. Any other value fails the assertion.
# - `fixedgaps`: when `true` (default), positions equal to `GAP` are excluded from
#   the permutation.
# - `fixed_reference`: when `true`, the first row is excluded from the permutation.
#
# Returns `msa`.
function shuffle_msa!(r::AbstractRNG, msa::AbstractMatrix{Residue}, subset=Colon();
        dims::Int=2, fixedgaps::Bool=true, fixed_reference::Bool=false)
    @assert dims == 1 || dims == 2 "dims must be 1 for shuffling along sequences or 2 for columns"
    subset_indices = _subset_indices(msa, dims, subset)
    msa_matrix = getresidues(msa)
    nseq, ncol = size(msa_matrix)
    # `true` marks the positions that are allowed to move.
    mask = fixedgaps ? msa_matrix .!= GAP : trues(nseq, ncol)
    if fixed_reference
        if dims == 1
            # Non-mutating `filter`: `subset_indices` may be the very vector the
            # caller passed as `subset`, and it must not be modified.
            subset_indices = filter(!=(1), subset_indices)
        end
        mask[1, :] .= 0
    end
    for i in subset_indices
        to_shuffle = dims == 1 ? view(msa_matrix, i, mask[i, :]) :
                                 view(msa_matrix, mask[:, i], i)
        shuffle!(r, to_shuffle)
    end
    return msa
end


# In-place shuffling of a `MultipleSequenceAlignment`; returns `msa`.
#
# The call is forwarded to the alignment's `matrix` field rather than to
# `getresidues(msa)`, so that `subset` is resolved by the `_subset_indices` method
# of that field's type.
# NOTE(review): this assumes `getresidues(msa)` yields a plain `Matrix{Residue}`,
# for which only `Int` subsets are accepted, and that `msa.matrix` is the
# `NamedResidueMatrix` backing the alignment — confirm both.
function shuffle_msa!(r::AbstractRNG, msa::MultipleSequenceAlignment, args...; kwargs...)
    shuffle_msa!(r, msa.matrix, args...; kwargs...)
    return msa
end

# In-place shuffling of an annotated MSA that also records what was done in the
# annotations. Returns `msa`.
#
# `subset` is resolved once, against the MSA itself, and the resulting integer
# indices drive both the shuffling and the annotations. When
# `fixed_reference && dims == 1` the first sequence is removed from that list,
# because it is left untouched and must not be counted or flagged as shuffled.
#
# Annotations written:
# - a modification message through `annotate_modification!`, stating how many
#   sequences/columns were shuffled and what was kept fixed;
# - for `dims == 1`, a `"Shuffled" => "true"` sequence annotation for each shuffled
#   sequence;
# - otherwise, a `"Shuffled"` column annotation: a string of 0/1 digits, one per
#   column, with 1 marking the shuffled columns.
function shuffle_msa!(r::AbstractRNG, msa::AnnotatedMultipleSequenceAlignment, subset=Colon();
        dims::Int=2, fixedgaps::Bool=true, fixed_reference::Bool=false)
    # Validate before resolving indices so an invalid `dims` is reported first.
    @assert dims == 1 || dims == 2 "dims must be 1 for shuffling along sequences or 2 for columns"
    subset_indices = _subset_indices(msa, dims, subset)
    if fixed_reference && dims == 1
        subset_indices = filter(!=(1), subset_indices)
    end
    shuffle_msa!(r, getresidues(msa), subset_indices; dims, fixedgaps, fixed_reference)

    # Annotate the modifications
    n = length(subset_indices)
    entities = dims == 1 ? "sequences" : "columns"
    message = "$n $entities shuffled."
    fixed = if fixedgaps && fixed_reference
        " Gaps and residues in the first sequence"
    elseif fixedgaps
        " Gaps"
    elseif fixed_reference
        " Residues in the first sequence"
    else
        ""
    end
    if !isempty(fixed)
        message *= fixed
        message *= " were kept in their positions."
    end
    annotate_modification!(msa.annotations, message)
    if dims == 1
        seqnames = sequencenames(msa)
        for i in subset_indices
            setannotsequence!(msa, seqnames[i], "Shuffled", "true")
        end
    else
        shuffled = zeros(Int, ncolumns(msa))
        shuffled[subset_indices] .= 1
        setannotcolumn!(msa, "Shuffled", join(shuffled))
    end
    return msa
end


shuffle_msa_doc = md"""
It randomly permutes the residues in the MSA `msa` along sequences (`dims=1`) or columns
(`dims=2`, the default). The optional positional argument `subset` allows shuffling only
a subset of the sequences or columns. The optional keyword argument `fixedgaps` indicates
if the gaps should remain in their positions (`true` by default). The optional keyword
argument `fixed_reference` indicates if the residues in the first sequence should remain
in their positions (`false` by default).
"""
It's like `Random.shuffle`. When a `Matrix{Residue}` is used, you can indicate if the gaps
should remain their positions using the last boolean argument. The previous argument should
be the dimension to shuffle, 1 for shuffling residues in a sequence (row) or 2 for shuffling
residues in a column.

"""
shuffle_msa!([rng=default_rng(),] msa::AbstractMatrix{Residue}, subset=Colon(); dims=2, fixedgaps=true, fixed_reference=false)
In-place version of [`shuffle_msa`](@ref). $shuffle_msa_doc
"""
function shuffle_msa!(msa::AbstractMatrix{Residue}, args...; kwargs...)
    # Convenience method: no generator was given, so use `Random.default_rng()`
    # and forward every other argument unchanged.
    rng = Random.default_rng()
    return shuffle_msa!(rng, msa, args...; kwargs...)
end

# Non-mutating variant: shuffle a deep copy of `msa` with the generator `r` and
# return whatever `shuffle_msa!` returns for that copy; `msa` itself is left as is.
function shuffle_msa(r::AbstractRNG, msa::AbstractMatrix{Residue}, args...; kwargs...)
    duplicate = deepcopy(msa)
    return shuffle_msa!(r, duplicate, args...; kwargs...)
end

"""
shuffle_msa([rng=default_rng(),] msa::AbstractMatrix{Residue}, subset=Colon(); dims=2, fixedgaps=true, fixed_reference=false)
$shuffle_msa_doc To shuffle in-place, see [`shuffle_msa!`](@ref).
```jldoctest
julia> using MIToS.MSA
Expand All @@ -17,75 +131,56 @@ julia> msa = hcat(res"RRE",res"DDK", res"G--")
julia> Random.seed!(42);
julia> shuffle(msa, 1, true)
julia> shuffle_msa(msa, dims=1, fixedgaps=true)
3×3 Matrix{Residue}:
G D R
R D -
E K -
julia> Random.seed!(42);
julia> shuffle(msa, 1, false)
julia> shuffle_msa(msa, dims=1, fixedgaps=false)
3×3 Matrix{Residue}:
G D R
R - D
E K -
```
"""
function Random.shuffle!(r::AbstractRNG, msa::Matrix{Residue},
function shuffle_msa(msa::AbstractMatrix{Residue}, args...; kwargs...)
    # Convenience method: no generator was given, so use `Random.default_rng()`
    # and forward every other argument unchanged.
    rng = Random.default_rng()
    return shuffle_msa(rng, msa, args...; kwargs...)
end

"""
It's like `Random.shuffle`. When a `Matrix{Residue}` is used, you can indicate if the gaps
should remain their positions using the last boolean argument. The previous argument should
be the dimension to shuffle, 1 for shuffling residues in a sequence (row) or 2 for shuffling
residues in a column.
**DEPRECATED:** This method is deprecated. Use [`shuffle_msa!`](@ref) instead.
"""
function Random.shuffle!(r::AbstractRNG, msa::AbstractMatrix{Residue},
dim::Int, fixedgaps::Bool=true)
nseq, ncol = size(msa)
@assert dim == 1 || dim == 2 "The dimension must be 1 (sequences) or 2 (columns)"
if fixedgaps
mask = msa .!= GAP
if dim == 2
@inbounds for i in 1:ncol
shuffle!(view(msa,mask[:,i],i))
end
elseif dim == 1
@inbounds for i in 1:nseq
shuffle!(view(msa,i,mask[i,:]))
end
end
else
if dim == 2
@inbounds for i in 1:ncol
shuffle!(view(msa,:,i))
end
elseif dim == 1
@inbounds for i in 1:nseq
shuffle!(view(msa,i,:))
end
end
end
msa
@warn "The function `shuffle!(r, msa, dim, fixedgaps)` is deprecated. Use `shuffle_msa!(r, msa; dims, fixedgaps)` instead."
shuffle_msa!(r, msa, Colon(); dims=dim, fixedgaps=fixedgaps) |> getresidues
end

function Random.shuffle!(msa::Matrix{Residue}, args...)
shuffle!(Random.GLOBAL_RNG, msa, args...)
function Random.shuffle!(msa::AbstractMatrix{Residue}, args...)
    # No generator given: re-dispatch to the `shuffle!` method that takes one,
    # supplying `Random.default_rng()`.
    rng = Random.default_rng()
    return shuffle!(rng, msa, args...)
end

"""
It's like `shuffle` but in-place. When a `Matrix{Residue}` or a `AbstractAlignedObject`
(sequence or MSA) is used, you can indicate if the gaps should remain their positions
using the last boolean argument.
"""
function Random.shuffle(r::AbstractRNG, msa::Matrix{Residue}, args...)
shuffle!(r, copy(msa), args...)
end

function Random.shuffle(msa::Matrix{Residue}, args...)
shuffle!(Random.GLOBAL_RNG, copy(msa), args...)
end
function Random.shuffle(r::AbstractRNG,
msa::Union{AbstractAlignedObject,
NamedResidueMatrix{Array{Residue,2}}}, args...)
shuffle(r, copy(getresidues(msa)), args...)
**DEPRECATED:** This method is deprecated. Use [`shuffle_msa`](@ref) instead.
"""
function Random.shuffle(r::AbstractRNG, msa::AbstractMatrix{Residue}, dim::Int, fixedgaps::Bool=true)
    # Deprecated positional form: emit a warning, map `dim`/`fixedgaps` onto the
    # keyword arguments of `shuffle_msa`, and return `getresidues` of its result.
    @warn "The function `shuffle(r, msa, dim, fixedgaps)` is deprecated. Use `shuffle_msa(r, msa; dims, fixedgaps)` instead."
    shuffled = shuffle_msa(r, msa, Colon(); dims=dim, fixedgaps=fixedgaps)
    return getresidues(shuffled)
end

function Random.shuffle(msa::Union{AbstractAlignedObject,
NamedResidueMatrix{Array{Residue,2}}}, args...)
shuffle(Random.GLOBAL_RNG, copy(getresidues(msa)), args...)
function Random.shuffle(msa::AbstractMatrix{Residue}, args...)
    # No generator given: re-dispatch to the `shuffle` method that takes one.
    # `Random.default_rng()` is used, matching the other generator-less wrappers
    # in this file, instead of the legacy `Random.GLOBAL_RNG`.
    return shuffle(Random.default_rng(), msa, args...)
end
Loading

2 comments on commit 553f6bb

@diegozea
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/109683

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v2.19.0 -m "<description of version>" 553f6bb18ca20e3b829a5db704701fdbe6e2b5d3
git push origin v2.19.0

Please sign in to comment.