From 553f6bb18ca20e3b829a5db704701fdbe6e2b5d3 Mon Sep 17 00:00:00 2001 From: Diego Javier Zea Date: Mon, 24 Jun 2024 17:54:44 +0200 Subject: [PATCH] shuffle -> shuffle_msa --- NEWS.md | 10 + Project.toml | 2 +- docs/src/Information.md | 2 +- src/Information/CorrectedMutualInformation.jl | 4 +- src/MSA/MSA.jl | 5 +- src/MSA/Shuffle.jl | 193 +++++++++++++----- test/MSA/Shuffle.jl | 62 +++++- 7 files changed, 215 insertions(+), 63 deletions(-) diff --git a/NEWS.md b/NEWS.md index e882f67b..309d1c17 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,15 @@ ## MIToS.jl Release Notes +### Changes from v2.18.0 to v2.19.0 + +* *[Breaking change]* The `shuffle` and `shuffle!` functions are deprecated in favor of the + `shuffle_msa` and `shuffle_msa!` functions. The new functions take `dims` and + `fixedgaps` as keyword arguments instead of taking them as positional ones. The new + functions add a last positional argument to allow the selection of specific sequences + or columns to shuffle. Also, it adds the `fixed_reference` keyword argument to keep the + residues in the reference sequence fixed during the shuffling. As an example of migration, + `shuffle!(msa, 1, false)` should be replaced by `shuffle_msa!(msa, dims=1, fixedgaps=false)`. + ### Changes from v2.17.0 to v2.18.0 * *[Breaking change]* The `read`, `parse`, `write`, and `print` functions for different diff --git a/Project.toml b/Project.toml index 1dff4743..0451b6a2 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "MIToS" uuid = "51bafb47-8a16-5ded-8b04-24ef4eede0b5" -version = "2.18.0" +version = "2.19.0" [deps] ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" diff --git a/docs/src/Information.md b/docs/src/Information.md index a5d37c80..5326a6b3 100644 --- a/docs/src/Information.md +++ b/docs/src/Information.md @@ -420,7 +420,7 @@ This algorithm can be accessed through the `buslje09` function and includes: 3. Average Product Correction (APC) proposed by [Dunn et. al. 2008![](./assets/external-link.png)](http://bioinformatics.oxfordjournals.org/content/24/3/333), through the `APC!` function that takes a MI matrix. -4. Z score correction using the functions `shuffle!` from the MSA module and `zscore` +4. Z score correction using the functions `shuffle_msa!` from the MSA module and `zscore` from the `PairwiseListMatrices` package. ```@docs diff --git a/src/Information/CorrectedMutualInformation.jl b/src/Information/CorrectedMutualInformation.jl index f6c2d5b7..568af1c1 100644 --- a/src/Information/CorrectedMutualInformation.jl +++ b/src/Information/CorrectedMutualInformation.jl @@ -57,7 +57,7 @@ function buslje09(aln::AbstractMatrix{Residue}; zmi = copy(mi) residuematrix = getresidues(aln) for ns in 1:samples - shuffle!(residuematrix,1,fixedgaps) + shuffle_msa!(residuematrix, dims=1, fixedgaps=fixedgaps) rand_mi[ns] = getarray(_buslje09(aln, alphabet, clusters, lambda, apc)) end PairwiseListMatrices.zscore!(rand_mi, getarray(zmi)) @@ -132,7 +132,7 @@ function BLMI(aln::AbstractMatrix{Residue}; zmi = copy(mi) residuematrix = getresidues(aln) for ns in 1:samples - shuffle!(residuematrix,1,fixedgaps) + shuffle_msa!(residuematrix, dims=1, fixedgaps=fixedgaps) rand_mi[ns] = getarray(_BLMI(aln, clusters, numbercl, beta, apc, lambda)) end PairwiseListMatrices.zscore(rand_mi, getarray(zmi)) diff --git a/src/MSA/MSA.jl b/src/MSA/MSA.jl index 2595db2f..8f8f2f36 100644 --- a/src/MSA/MSA.jl +++ b/src/MSA/MSA.jl @@ -19,7 +19,7 @@ using OrderedCollections # OrderedDicts for Annotations using AutoHashEquals # Annotations, Clusters using NamedArrays # Col and Seq names, basic sequence/MSA object using FastaIO # FastaReader (fast) -using Random # GLOBAL_RNG, shuffle!, rand, Sampler, randstring +using Random # default_rng, shuffle!, rand, Sampler, randstring using Dates # Dates.now() using PairwiseListMatrices # Percent Identity Matrices using Clustering # Used for sequence clustering: ClusteringResult @@ -94,6 +94,9 @@ export # Residue PIR, # A3M A3M, A2M, + # Shuffle + shuffle_msa!, + shuffle_msa, # PLM sequencepairsmatrix, columnpairsmatrix, # Identity diff --git a/src/MSA/Shuffle.jl b/src/MSA/Shuffle.jl index 3d84d54c..6c8eabfe 100644 --- a/src/MSA/Shuffle.jl +++ b/src/MSA/Shuffle.jl @@ -1,8 +1,122 @@ +function _subset_indices(msa::Matrix{Residue}, dims::Int, subset)::Vector{Int} + if subset === Colon() + nseq, ncol = size(msa) + if dims == 1 + 1:nseq + else + 1:ncol + end + else + if eltype(subset) !== Int + throw(ArgumentError( + "For a Matrix{Residue}, subset must be an iterator of Int values or Colon()")) + end + if isa(subset, AbstractRange) + collect(subset) + else + subset + end + end +end + +function _subset_indices(msa::NamedResidueMatrix, dims::Int, subset)::Vector{Int} + dict = dims == 1 ? msa.dicts[1] : msa.dicts[2] + NamedArrays.indices(dict, subset) +end + +function _subset_indices(msa::AbstractMultipleSequenceAlignment, dims, subset)::Vector{Int} + _subset_indices(msa.matrix, dims, subset) +end + +function shuffle_msa!(r::AbstractRNG, msa::AbstractMatrix{Residue}, subset=Colon(); + dims::Int=2, fixedgaps::Bool=true, fixed_reference::Bool=false) + @assert dims == 1 || dims == 2 "dims must be 1 for shuffling along sequences or 2 for columns" + subset_indices = _subset_indices(msa, dims, subset) + msa_matrix = getresidues(msa) + nseq, ncol = size(msa_matrix) + mask = fixedgaps ? msa_matrix .!= GAP : trues(nseq, ncol) + if fixed_reference + if dims == 1 + filter!(!=(1), subset_indices) + end + mask[1, :] .= 0 + end + for i in subset_indices + to_shuffle = dims == 1 ? view(msa_matrix, i, mask[i,:]) : view(msa_matrix, mask[:,i], i) + shuffle!(r, to_shuffle) + end + msa +end + + +function shuffle_msa!(r::AbstractRNG, msa::MultipleSequenceAlignment, args...; kwargs...) + shuffle_msa!(r, getresidues(msa), args...; kwargs...) + msa +end + +function shuffle_msa!(r::AbstractRNG, msa::AnnotatedMultipleSequenceAlignment, subset=Colon(); + dims::Int=2, fixedgaps::Bool=true, fixed_reference::Bool=false) + shuffle_msa!(r, getresidues(msa), subset; dims, fixedgaps, fixed_reference) + + # Annotate the modifications + subset_indices = _subset_indices(msa, dims, subset) + n = length(subset_indices) + entities = dims == 1 ? "sequences" : "columns" + message = "$n $entities shuffled." + fixed = if fixedgaps && fixed_reference + " Gaps and residues in the first sequence" + elseif fixedgaps + " Gaps" + elseif fixed_reference + " Residues in the first sequence" + else + "" + end + if !isempty(fixed) + message *= fixed + message *= " were kept in their positions." + end + annotate_modification!(msa.annotations, message) + if dims == 1 + seqnames = sequencenames(msa) + for i in subset_indices + setannotsequence!(msa, seqnames[i], "Shuffled", "true") + end + else + shuffled = zeros(Int, ncolumns(msa)) + shuffled[subset_indices] .= 1 + setannotcolumn!(msa, "Shuffled", join(shuffled)) + end + msa +end + + +shuffle_msa_doc = md""" +It randomly permute residues in the MSA `msa` along sequences (`dims=1`) or columns +(`dims=2`, the default). The optional positional argument `subset` allows to shuffle only +a subset of the sequences or columns. The optional keyword argument `fixedgaps` indicates +if the gaps should remain their positions (`true` by default). The optional keyword +argument `fixed_reference` indicates if the residues in the first sequence should remain +in their positions (`false` by default). """ -It's like `Random.shuffle`. When a `Matrix{Residue}` is used, you can indicate if the gaps -should remain their positions using the last boolean argument. The previous argument should -be the dimension to shuffle, 1 for shuffling residues in a sequence (row) or 2 for shuffling -residues in a column. + +""" + shuffle_msa!([rng=default_rng(),] msa::AbstractMatrix{Residue}, subset=Colon(); dims=2, fixedgaps=true, fixed_reference=false) + +In-place version of [`shuffle_msa`](@ref). $shuffle_msa_doc +""" +function shuffle_msa!(msa::AbstractMatrix{Residue}, args...; kwargs...) + shuffle_msa!(Random.default_rng(), msa, args...; kwargs...) +end + +function shuffle_msa(r::AbstractRNG, msa::AbstractMatrix{Residue}, args...; kwargs...) + shuffle_msa!(r, deepcopy(msa), args...; kwargs...) +end + +""" + shuffle_msa([rng=default_rng(),] msa::AbstractMatrix{Residue}, subset=Colon(); dims=2, fixedgaps=true, fixed_reference=false) + +$shuffle_msa_doc To shuffle in-place, see [`shuffle_msa!`](@ref). ```jldoctest julia> using MIToS.MSA @@ -17,7 +131,7 @@ julia> msa = hcat(res"RRE",res"DDK", res"G--") julia> Random.seed!(42); -julia> shuffle(msa, 1, true) +julia> shuffle_msa(msa, dims=1, fixedgaps=true) 3×3 Matrix{Residue}: G D R R D - @@ -25,7 +139,7 @@ julia> shuffle(msa, 1, true) julia> Random.seed!(42); -julia> shuffle(msa, 1, false) +julia> shuffle_msa(msa, dims=1, fixedgaps=false) 3×3 Matrix{Residue}: G D R R - D @@ -33,59 +147,40 @@ julia> shuffle(msa, 1, false) ``` """ -function Random.shuffle!(r::AbstractRNG, msa::Matrix{Residue}, +function shuffle_msa(msa::AbstractMatrix{Residue}, args...; kwargs...) + shuffle_msa(Random.default_rng(), msa, args...; kwargs...) +end + +""" +It's like `Random.shuffle`. When a `Matrix{Residue}` is used, you can indicate if the gaps +should remain their positions using the last boolean argument. The previous argument should +be the dimension to shuffle, 1 for shuffling residues in a sequence (row) or 2 for shuffling +residues in a column. + +**DEPRECATED:** This method is deprecated. Use [`shuffle_msa!`](@ref) instead. +""" +function Random.shuffle!(r::AbstractRNG, msa::AbstractMatrix{Residue}, dim::Int, fixedgaps::Bool=true) - nseq, ncol = size(msa) - @assert dim == 1 || dim == 2 "The dimension must be 1 (sequences) or 2 (columns)" - if fixedgaps - mask = msa .!= GAP - if dim == 2 - @inbounds for i in 1:ncol - shuffle!(view(msa,mask[:,i],i)) - end - elseif dim == 1 - @inbounds for i in 1:nseq - shuffle!(view(msa,i,mask[i,:])) - end - end - else - if dim == 2 - @inbounds for i in 1:ncol - shuffle!(view(msa,:,i)) - end - elseif dim == 1 - @inbounds for i in 1:nseq - shuffle!(view(msa,i,:)) - end - end - end - msa + @warn "The function `shuffle!(r, msa, dim, fixedgaps)` is deprecated. Use `shuffle_msa!(r, msa; dims, fixedgaps)` instead." + shuffle_msa!(r, msa, Colon(); dims=dim, fixedgaps=fixedgaps) |> getresidues end -function Random.shuffle!(msa::Matrix{Residue}, args...) - shuffle!(Random.GLOBAL_RNG, msa, args...) +function Random.shuffle!(msa::AbstractMatrix{Residue}, args...) + shuffle!(Random.default_rng(), msa, args...) end """ It's like `shuffle` but in-place. When a `Matrix{Residue}` or a `AbstractAlignedObject` (sequence or MSA) is used, you can indicate if the gaps should remain their positions using the last boolean argument. -""" -function Random.shuffle(r::AbstractRNG, msa::Matrix{Residue}, args...) - shuffle!(r, copy(msa), args...) -end - -function Random.shuffle(msa::Matrix{Residue}, args...) - shuffle!(Random.GLOBAL_RNG, copy(msa), args...) -end -function Random.shuffle(r::AbstractRNG, - msa::Union{AbstractAlignedObject, - NamedResidueMatrix{Array{Residue,2}}}, args...) - shuffle(r, copy(getresidues(msa)), args...) +**DEPRECATED:** This method is deprecated. Use [`shuffle_msa`](@ref) instead. +""" +function Random.shuffle(r::AbstractRNG, msa::AbstractMatrix{Residue}, dim::Int, fixedgaps::Bool=true) + @warn "The function `shuffle(r, msa, dim, fixedgaps)` is deprecated. Use `shuffle_msa(r, msa; dims, fixedgaps)` instead." + shuffle_msa(r, msa, Colon(); dims=dim, fixedgaps=fixedgaps) |> getresidues end -function Random.shuffle(msa::Union{AbstractAlignedObject, - NamedResidueMatrix{Array{Residue,2}}}, args...) - shuffle(Random.GLOBAL_RNG, copy(getresidues(msa)), args...) +function Random.shuffle(msa::AbstractMatrix{Residue}, args...) + shuffle(Random.GLOBAL_RNG, msa, args...) end diff --git a/test/MSA/Shuffle.jl b/test/MSA/Shuffle.jl index a24bd430..71871fea 100644 --- a/test/MSA/Shuffle.jl +++ b/test/MSA/Shuffle.jl @@ -25,23 +25,23 @@ for dim in [1,2] aln = copy(msa) - shuffle!(aln, dim) + shuffle_msa!(aln, dims=dim) @test aln != msa @test (aln .== GAP) == gaps[1] # default: fixed gaps aln = copy(msa) - shuffle!(aln, dim, true) + shuffle_msa!(aln, dims=dim, fixedgaps=true) @test aln != msa @test (aln .== GAP) == gaps[1] aln = copy(msa) - shuffle!(aln, dim, false) + shuffle_msa!(aln, dims=dim, fixedgaps=false) @test aln != msa @test (aln .== GAP) != gaps[1] end - @test_throws AssertionError shuffle(msa, 0, true) - @test_throws AssertionError shuffle(msa, 3, true) + @test_throws AssertionError shuffle_msa(msa, dims=0, fixedgaps=true) + @test_throws AssertionError shuffle_msa(msa, dims=3, fixedgaps=true) end @testset "Columns" begin @@ -49,13 +49,13 @@ for i in 1:N # Fixed gaps msa = msas[i] - aln = shuffle(msa, 2, true) + aln = shuffle_msa(msa, dims=2, fixedgaps=true) @test aln != getresidues(msa) @test (aln .== GAP) == gaps[i] @test lcol[i] == mean(aln .== Residue('L'), dims=1) @test lseq[i] != mean(aln .== Residue('L'), dims=2) # Change gap positions - aln = shuffle(msa, 2, false) + aln = shuffle_msa(msa, dims=2, fixedgaps=false) @test aln != getresidues(msa) @test (aln .== GAP) != gaps[i] @test lcol[i] == mean(aln .== Residue('L'), dims=1) @@ -67,16 +67,60 @@ for i in 1:N # Fixed gaps msa = msas[i] - aln = shuffle(msa, 1, true) + aln = shuffle_msa(msa, dims=1, fixedgaps=true) @test aln != getresidues(msa) @test (aln .== GAP) == gaps[i] @test lcol[i] != mean(aln .== Residue('L'), dims=1) @test lseq[i] == mean(aln .== Residue('L'), dims=2) # Change gap positions - aln = shuffle(msa, 1, false) + aln = shuffle_msa(msa, dims=1, fixedgaps=false) @test aln != getresidues(msa) @test (aln .== GAP) != gaps[i] @test lseq[i] == mean(aln .== Residue('L'), dims=2) end end + + @testset "Reference" begin + + for msa in msas + ref = getsequence(msa, 1) + for dims in [1, 2] + for gaps in [true, false] + aln = shuffle_msa(msa, dims=dims, fixedgaps=gaps, fixed_reference=true) + @test getsequence(aln, 1) == ref + end + end + end + end + + @testset "Subset" begin + for msa in msas + + seqs_to_move = [3, 4] + shuffled_seqs = shuffle_msa(msa, seqs_to_move, dims=1) + @test msa[1:2, :] == shuffled_seqs[1:2, :] + @test msa[seqs_to_move, :] != shuffled_seqs[seqs_to_move, :] + + cols_to_move = [9, 10, 11, 12] + shuffled_cols = shuffle_msa(MersenneTwister(0), msa, cols_to_move, dims=2) + @test msa[:, 1:8] == shuffled_cols[:, 1:8] + @test msa[:, cols_to_move] != shuffled_cols[:, cols_to_move] + + # Annotations + if isa(msa, AnnotatedMultipleSequenceAlignment) + @test isempty(getannotsequence(shuffled_seqs, "C3N734_SULIY/1-95", "Shuffled", "")) + @test isempty(getannotsequence(shuffled_seqs, "H2C869_9CREN/7-104", "Shuffled", "")) + @test getannotsequence(shuffled_seqs, "Y070_ATV/2-70", "Shuffled", "") == "true" + @test getannotsequence(shuffled_seqs, "F112_SSV1/3-112", "Shuffled", "") == "true" + + # 1111 + # 1234567890123 + @test startswith(getannotcolumn(shuffled_cols, "Shuffled", ""), "0000000011110") + + # MIToS modifications + any(startswith("2 sequences shuffled."), values(getannotfile(shuffled_seqs))) + any(startswith("4 columns shuffled."), values(getannotfile(shuffled_cols))) + end + end + end end