From 553f6bb18ca20e3b829a5db704701fdbe6e2b5d3 Mon Sep 17 00:00:00 2001
From: Diego Javier Zea <diegozea@gmail.com>
Date: Mon, 24 Jun 2024 17:54:44 +0200
Subject: [PATCH] shuffle -> shuffle_msa

---
 NEWS.md                                       |  10 +
 Project.toml                                  |   2 +-
 docs/src/Information.md                       |   2 +-
 src/Information/CorrectedMutualInformation.jl |   4 +-
 src/MSA/MSA.jl                                |   5 +-
 src/MSA/Shuffle.jl                            | 193 +++++++++++++-----
 test/MSA/Shuffle.jl                           |  62 +++++-
 7 files changed, 215 insertions(+), 63 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index e882f67b..309d1c17 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,15 @@
 ## MIToS.jl Release Notes
 
+### Changes from v2.18.0 to v2.19.0
+
+* *[Breaking change]* The `shuffle` and `shuffle!` functions are deprecated in favor of the 
+  `shuffle_msa` and `shuffle_msa!` functions. The new functions take `dims` and 
+  `fixedgaps` as keyword arguments instead of taking them as positional ones. The new
+  functions add a last positional argument to allow the selection of specific sequences 
+  or columns to shuffle. Also, it adds the `fixed_reference` keyword argument to keep the 
+  residues in the reference sequence fixed during the shuffling. As an example of migration,
+  `shuffle!(msa, 1, false)` should be replaced by `shuffle_msa!(msa, dims=1, fixedgaps=false)`.
+
 ### Changes from v2.17.0 to v2.18.0
 
 * *[Breaking change]* The `read`, `parse`, `write`, and `print` functions for different 
diff --git a/Project.toml b/Project.toml
index 1dff4743..0451b6a2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "MIToS"
 uuid = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
-version = "2.18.0"
+version = "2.19.0"
 
 [deps]
 ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
diff --git a/docs/src/Information.md b/docs/src/Information.md
index a5d37c80..5326a6b3 100644
--- a/docs/src/Information.md
+++ b/docs/src/Information.md
@@ -420,7 +420,7 @@ This algorithm can be accessed through the `buslje09` function and includes:
 3. Average Product Correction (APC) proposed by
 [Dunn et. al. 2008![](./assets/external-link.png)](http://bioinformatics.oxfordjournals.org/content/24/3/333),
 through the `APC!` function that takes a MI matrix.
-4. Z score correction using the functions `shuffle!` from the MSA module and `zscore`
+4. Z score correction using the functions `shuffle_msa!` from the MSA module and `zscore`
 from the `PairwiseListMatrices` package.  
 
 ```@docs
diff --git a/src/Information/CorrectedMutualInformation.jl b/src/Information/CorrectedMutualInformation.jl
index f6c2d5b7..568af1c1 100644
--- a/src/Information/CorrectedMutualInformation.jl
+++ b/src/Information/CorrectedMutualInformation.jl
@@ -57,7 +57,7 @@ function buslje09(aln::AbstractMatrix{Residue};
         zmi = copy(mi)
         residuematrix = getresidues(aln)
         for ns in 1:samples
-            shuffle!(residuematrix,1,fixedgaps)
+            shuffle_msa!(residuematrix, dims=1, fixedgaps=fixedgaps)
             rand_mi[ns] = getarray(_buslje09(aln, alphabet, clusters, lambda, apc))
         end
         PairwiseListMatrices.zscore!(rand_mi, getarray(zmi))
@@ -132,7 +132,7 @@ function BLMI(aln::AbstractMatrix{Residue};
         zmi = copy(mi)
         residuematrix = getresidues(aln)
         for ns in 1:samples
-            shuffle!(residuematrix,1,fixedgaps)
+            shuffle_msa!(residuematrix, dims=1, fixedgaps=fixedgaps)
             rand_mi[ns] = getarray(_BLMI(aln, clusters, numbercl, beta, apc, lambda))
         end
         PairwiseListMatrices.zscore(rand_mi, getarray(zmi))
diff --git a/src/MSA/MSA.jl b/src/MSA/MSA.jl
index 2595db2f..8f8f2f36 100644
--- a/src/MSA/MSA.jl
+++ b/src/MSA/MSA.jl
@@ -19,7 +19,7 @@ using OrderedCollections        # OrderedDicts for Annotations
 using AutoHashEquals        # Annotations, Clusters
 using NamedArrays           # Col and Seq names, basic sequence/MSA object
 using FastaIO               # FastaReader (fast)
-using Random                # GLOBAL_RNG, shuffle!, rand, Sampler, randstring
+using Random                # default_rng, shuffle!, rand, Sampler, randstring
 using Dates                 # Dates.now()
 using PairwiseListMatrices  # Percent Identity Matrices
 using Clustering            # Used for sequence clustering: ClusteringResult
@@ -94,6 +94,9 @@ export  # Residue
         PIR,
         # A3M
         A3M, A2M,
+        # Shuffle
+        shuffle_msa!,
+        shuffle_msa,
         # PLM
         sequencepairsmatrix, columnpairsmatrix,
         # Identity
diff --git a/src/MSA/Shuffle.jl b/src/MSA/Shuffle.jl
index 3d84d54c..6c8eabfe 100644
--- a/src/MSA/Shuffle.jl
+++ b/src/MSA/Shuffle.jl
@@ -1,8 +1,122 @@
+function _subset_indices(msa::Matrix{Residue}, dims::Int, subset)::Vector{Int}
+    if subset === Colon()
+        nseq, ncol = size(msa)
+        if dims == 1
+            1:nseq
+        else
+            1:ncol
+        end
+    else
+        if eltype(subset) !== Int
+            throw(ArgumentError(
+                "For a Matrix{Residue}, subset must be an iterator of Int values or Colon()"))
+        end
+        if isa(subset, AbstractRange)
+            collect(subset)
+        else
+            subset
+        end
+    end
+end
+
+function _subset_indices(msa::NamedResidueMatrix, dims::Int, subset)::Vector{Int}
+    dict = dims == 1 ? msa.dicts[1] : msa.dicts[2]
+    NamedArrays.indices(dict, subset)
+end
+
+function _subset_indices(msa::AbstractMultipleSequenceAlignment, dims, subset)::Vector{Int}
+    _subset_indices(msa.matrix, dims, subset)
+end
+
+function shuffle_msa!(r::AbstractRNG, msa::AbstractMatrix{Residue}, subset=Colon();
+    dims::Int=2, fixedgaps::Bool=true, fixed_reference::Bool=false)
+    @assert dims == 1 || dims == 2 "dims must be 1 for shuffling along sequences or 2 for columns"
+    subset_indices = _subset_indices(msa, dims, subset)
+    msa_matrix = getresidues(msa)
+    nseq, ncol = size(msa_matrix)
+    mask =  fixedgaps ? msa_matrix .!= GAP : trues(nseq, ncol)
+    if fixed_reference
+        if dims == 1
+            filter!(!=(1), subset_indices)
+        end
+        mask[1, :] .= 0
+    end
+    for i in subset_indices
+        to_shuffle = dims == 1 ? view(msa_matrix, i, mask[i,:]) : view(msa_matrix, mask[:,i], i)
+        shuffle!(r, to_shuffle)
+    end
+    msa
+end
+
+
+function shuffle_msa!(r::AbstractRNG, msa::MultipleSequenceAlignment, args...; kwargs...)
+    shuffle_msa!(r, getresidues(msa), args...; kwargs...)
+    msa
+end
+
+function shuffle_msa!(r::AbstractRNG, msa::AnnotatedMultipleSequenceAlignment, subset=Colon();
+    dims::Int=2, fixedgaps::Bool=true, fixed_reference::Bool=false)
+    shuffle_msa!(r, getresidues(msa), subset; dims, fixedgaps, fixed_reference)
+
+    # Annotate the modifications
+    subset_indices = _subset_indices(msa, dims, subset)
+    n = length(subset_indices)
+    entities = dims == 1 ? "sequences" : "columns"
+    message = "$n $entities shuffled."
+    fixed = if fixedgaps && fixed_reference
+        " Gaps and residues in the first sequence"
+    elseif fixedgaps
+        " Gaps"
+    elseif fixed_reference
+        " Residues in the first sequence"
+    else 
+        ""
+    end
+    if !isempty(fixed)
+        message *= fixed
+        message *= " were kept in their positions."
+    end
+    annotate_modification!(msa.annotations, message)
+    if dims == 1
+        seqnames = sequencenames(msa)
+        for i in subset_indices
+            setannotsequence!(msa, seqnames[i], "Shuffled", "true")
+        end
+    else
+        shuffled = zeros(Int, ncolumns(msa))
+        shuffled[subset_indices] .= 1
+        setannotcolumn!(msa, "Shuffled", join(shuffled))
+    end
+    msa
+end
+
+
+shuffle_msa_doc = md"""
+It randomly permute residues in the MSA `msa` along sequences (`dims=1`) or columns 
+(`dims=2`, the default). The optional positional argument `subset` allows to shuffle only 
+a subset of the sequences or columns. The optional keyword argument `fixedgaps` indicates 
+if the gaps should remain their positions (`true` by default). The optional keyword
+argument `fixed_reference` indicates if the residues in the first sequence should remain
+in their positions (`false` by default).
 """
-It's like `Random.shuffle`. When a `Matrix{Residue}` is used, you can indicate if the gaps
-should remain their positions using the last boolean argument. The previous argument should
-be the dimension to shuffle, 1 for shuffling residues in a sequence (row) or 2 for shuffling
-residues in a column.
+
+"""
+    shuffle_msa!([rng=default_rng(),] msa::AbstractMatrix{Residue}, subset=Colon(); dims=2, fixedgaps=true, fixed_reference=false)
+
+In-place version of [`shuffle_msa`](@ref). $shuffle_msa_doc
+"""
+function shuffle_msa!(msa::AbstractMatrix{Residue}, args...; kwargs...)
+    shuffle_msa!(Random.default_rng(), msa, args...; kwargs...)
+end
+
+function shuffle_msa(r::AbstractRNG, msa::AbstractMatrix{Residue}, args...; kwargs...)
+    shuffle_msa!(r, deepcopy(msa), args...; kwargs...)
+end
+
+"""
+    shuffle_msa([rng=default_rng(),] msa::AbstractMatrix{Residue}, subset=Colon(); dims=2, fixedgaps=true, fixed_reference=false)
+
+$shuffle_msa_doc To shuffle in-place, see [`shuffle_msa!`](@ref).
 
 ```jldoctest
 julia> using MIToS.MSA
@@ -17,7 +131,7 @@ julia> msa = hcat(res"RRE",res"DDK", res"G--")
 
 julia> Random.seed!(42);
 
-julia> shuffle(msa, 1, true)
+julia> shuffle_msa(msa, dims=1, fixedgaps=true)
 3×3 Matrix{Residue}:
  G  D  R
  R  D  -
@@ -25,7 +139,7 @@ julia> shuffle(msa, 1, true)
 
 julia> Random.seed!(42);
 
-julia> shuffle(msa, 1, false)
+julia> shuffle_msa(msa, dims=1, fixedgaps=false)
 3×3 Matrix{Residue}:
  G  D  R
  R  -  D
@@ -33,59 +147,40 @@ julia> shuffle(msa, 1, false)
 
 ```
 """
-function Random.shuffle!(r::AbstractRNG, msa::Matrix{Residue},
+function shuffle_msa(msa::AbstractMatrix{Residue}, args...; kwargs...)
+    shuffle_msa(Random.default_rng(), msa, args...; kwargs...)
+end
+
+"""
+It's like `Random.shuffle`. When a `Matrix{Residue}` is used, you can indicate if the gaps
+should remain their positions using the last boolean argument. The previous argument should
+be the dimension to shuffle, 1 for shuffling residues in a sequence (row) or 2 for shuffling
+residues in a column.
+
+**DEPRECATED:** This method is deprecated. Use [`shuffle_msa!`](@ref) instead.
+"""
+function Random.shuffle!(r::AbstractRNG, msa::AbstractMatrix{Residue},
                        dim::Int, fixedgaps::Bool=true)
-    nseq, ncol = size(msa)
-    @assert dim == 1 || dim == 2 "The dimension must be 1 (sequences) or 2 (columns)"
-    if fixedgaps
-        mask = msa .!= GAP
-        if dim == 2
-            @inbounds for i in 1:ncol
-                shuffle!(view(msa,mask[:,i],i))
-            end
-        elseif dim == 1
-            @inbounds for i in 1:nseq
-                shuffle!(view(msa,i,mask[i,:]))
-            end
-        end
-    else
-        if dim == 2
-            @inbounds for i in 1:ncol
-                shuffle!(view(msa,:,i))
-            end
-        elseif dim == 1
-            @inbounds for i in 1:nseq
-                shuffle!(view(msa,i,:))
-            end
-        end
-    end
-    msa
+    @warn "The function `shuffle!(r, msa, dim, fixedgaps)` is deprecated. Use `shuffle_msa!(r, msa; dims, fixedgaps)` instead."
+    shuffle_msa!(r, msa, Colon(); dims=dim, fixedgaps=fixedgaps) |> getresidues
 end
 
-function Random.shuffle!(msa::Matrix{Residue}, args...)
-    shuffle!(Random.GLOBAL_RNG, msa, args...)
+function Random.shuffle!(msa::AbstractMatrix{Residue}, args...)
+    shuffle!(Random.default_rng(), msa, args...)
 end
 
 """
 It's like `shuffle` but in-place. When a `Matrix{Residue}` or a `AbstractAlignedObject`
 (sequence or MSA) is used, you can indicate if the gaps should remain their positions
 using the last boolean argument.
-"""
-function Random.shuffle(r::AbstractRNG, msa::Matrix{Residue}, args...)
-    shuffle!(r, copy(msa), args...)
-end
-
-function Random.shuffle(msa::Matrix{Residue}, args...)
-    shuffle!(Random.GLOBAL_RNG, copy(msa), args...)
-end
 
-function Random.shuffle(r::AbstractRNG,
-                      msa::Union{AbstractAlignedObject,
-                                 NamedResidueMatrix{Array{Residue,2}}}, args...)
-    shuffle(r, copy(getresidues(msa)), args...)
+**DEPRECATED:** This method is deprecated. Use [`shuffle_msa`](@ref) instead.
+"""
+function Random.shuffle(r::AbstractRNG, msa::AbstractMatrix{Residue}, dim::Int, fixedgaps::Bool=true)
+    @warn "The function `shuffle(r, msa, dim, fixedgaps)` is deprecated. Use `shuffle_msa(r, msa; dims, fixedgaps)` instead."
+    shuffle_msa(r, msa, Colon(); dims=dim, fixedgaps=fixedgaps) |> getresidues
 end
 
-function Random.shuffle(msa::Union{AbstractAlignedObject,
-                                   NamedResidueMatrix{Array{Residue,2}}}, args...)
-    shuffle(Random.GLOBAL_RNG, copy(getresidues(msa)), args...)
+function Random.shuffle(msa::AbstractMatrix{Residue}, args...)
+    shuffle(Random.GLOBAL_RNG, msa, args...)
 end
diff --git a/test/MSA/Shuffle.jl b/test/MSA/Shuffle.jl
index a24bd430..71871fea 100644
--- a/test/MSA/Shuffle.jl
+++ b/test/MSA/Shuffle.jl
@@ -25,23 +25,23 @@
         for dim in [1,2]
 
             aln = copy(msa)
-            shuffle!(aln, dim)
+            shuffle_msa!(aln, dims=dim)
             @test aln != msa
             @test (aln .== GAP) == gaps[1] # default: fixed gaps
 
             aln = copy(msa)
-            shuffle!(aln, dim, true)
+            shuffle_msa!(aln, dims=dim, fixedgaps=true)
             @test aln != msa
             @test (aln .== GAP) == gaps[1]
 
             aln = copy(msa)
-            shuffle!(aln, dim, false)
+            shuffle_msa!(aln, dims=dim, fixedgaps=false)
             @test aln != msa
             @test (aln .== GAP) != gaps[1]
         end
 
-        @test_throws AssertionError shuffle(msa, 0, true)
-        @test_throws AssertionError shuffle(msa, 3, true)
+        @test_throws AssertionError shuffle_msa(msa, dims=0, fixedgaps=true)
+        @test_throws AssertionError shuffle_msa(msa, dims=3, fixedgaps=true)
     end
 
     @testset "Columns" begin
@@ -49,13 +49,13 @@
         for i in 1:N
             # Fixed gaps
             msa = msas[i]
-            aln = shuffle(msa, 2, true)
+            aln = shuffle_msa(msa, dims=2, fixedgaps=true)
             @test aln != getresidues(msa)
             @test (aln .== GAP) == gaps[i]
             @test lcol[i] == mean(aln .== Residue('L'), dims=1)
             @test lseq[i] != mean(aln .== Residue('L'), dims=2)
             # Change gap positions
-            aln = shuffle(msa, 2, false)
+            aln = shuffle_msa(msa, dims=2, fixedgaps=false)
             @test aln != getresidues(msa)
             @test (aln .== GAP) != gaps[i]
             @test lcol[i] == mean(aln .== Residue('L'), dims=1)
@@ -67,16 +67,60 @@
         for  i in 1:N
             # Fixed gaps
             msa = msas[i]
-            aln = shuffle(msa, 1, true)
+            aln = shuffle_msa(msa, dims=1, fixedgaps=true)
             @test aln != getresidues(msa)
             @test (aln .== GAP) == gaps[i]
             @test lcol[i] != mean(aln .== Residue('L'), dims=1)
             @test lseq[i] == mean(aln .== Residue('L'), dims=2)
             # Change gap positions
-            aln = shuffle(msa, 1, false)
+            aln = shuffle_msa(msa, dims=1, fixedgaps=false)
             @test aln != getresidues(msa)
             @test (aln .== GAP) != gaps[i]
             @test lseq[i] == mean(aln .== Residue('L'), dims=2)
         end
     end
+
+    @testset "Reference" begin
+        
+        for msa in msas
+            ref = getsequence(msa, 1)
+            for dims in [1, 2]
+                for gaps in [true, false]
+                    aln = shuffle_msa(msa, dims=dims, fixedgaps=gaps, fixed_reference=true)
+                    @test getsequence(aln, 1) == ref
+                end
+            end
+        end
+    end
+
+    @testset "Subset" begin
+        for msa in msas
+            
+            seqs_to_move = [3, 4]
+            shuffled_seqs = shuffle_msa(msa, seqs_to_move, dims=1)
+            @test msa[1:2, :] == shuffled_seqs[1:2, :]
+            @test msa[seqs_to_move, :] != shuffled_seqs[seqs_to_move, :]
+
+            cols_to_move = [9, 10, 11, 12]
+            shuffled_cols = shuffle_msa(MersenneTwister(0), msa, cols_to_move, dims=2)
+            @test msa[:, 1:8] == shuffled_cols[:, 1:8]
+            @test msa[:, cols_to_move] != shuffled_cols[:, cols_to_move]
+
+            # Annotations
+            if isa(msa, AnnotatedMultipleSequenceAlignment)
+                @test isempty(getannotsequence(shuffled_seqs, "C3N734_SULIY/1-95", "Shuffled", ""))
+                @test isempty(getannotsequence(shuffled_seqs, "H2C869_9CREN/7-104", "Shuffled", ""))
+                @test getannotsequence(shuffled_seqs, "Y070_ATV/2-70", "Shuffled", "") == "true"
+                @test getannotsequence(shuffled_seqs, "F112_SSV1/3-112", "Shuffled", "") == "true"
+                
+                #                                                                         1111
+                #                                                                1234567890123
+                @test startswith(getannotcolumn(shuffled_cols, "Shuffled", ""), "0000000011110")
+
+                # MIToS modifications
+                any(startswith("2 sequences shuffled."), values(getannotfile(shuffled_seqs)))
+                any(startswith("4 columns shuffled."), values(getannotfile(shuffled_cols)))
+            end
+        end
+    end
 end