Skip to content

Commit

Permalink
Added command : gotree extract mutations
Browse files Browse the repository at this point in the history
  • Loading branch information
fredericlemoine committed Sep 6, 2023
1 parent 46057ab commit 76f7e39
Show file tree
Hide file tree
Showing 5 changed files with 285 additions and 1 deletion.
109 changes: 109 additions & 0 deletions cmd/extractmutations.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
package cmd

import (
"bufio"
"fmt"
goio "io"
"os"

"github.com/evolbioinfo/goalign/align"
"github.com/evolbioinfo/goalign/io/fasta"
"github.com/evolbioinfo/goalign/io/phylip"
"github.com/evolbioinfo/gotree/io"
"github.com/evolbioinfo/gotree/io/utils"
"github.com/evolbioinfo/gotree/mutations"
"github.com/evolbioinfo/gotree/tree"
"github.com/spf13/cobra"
)

var mutationsalign string
var mutationsphylip bool
var mutationsinputstrict bool
var outfile string

// mutationsCmd represents the mutations command
var mutationsCmd = &cobra.Command{
Use: "mutations",
Short: "Extract the list of mutations along the branches of the phylogeny.",
Long: `Extract the list of mutations along the branches of the phylogeny, given
the full list of ancestral (and terminal) sequences.
The input tree must have internal node names specified and must be rooted.
The input alignment (fasta or phylip only) must specify one sequence per internal
node name and tip.
The output consists of the list of mutations that appear along the branches of the
tree, tab separated text file:
1. Tree index (useful if several trees in the input tree file)
2. Alignment site index
3. Branch index
4. Child node name
5. Parent character
6. Child character
7. Number of descendent tips
8. Number of descendent tips that have the child character
`,
RunE: func(cmd *cobra.Command, args []string) (err error) {
var align align.Alignment
var fi goio.Closer
var r *bufio.Reader
var treefile goio.Closer
var treechan <-chan tree.Trees
var f *os.File
var muts *mutations.MutationList

// Reading the alignment
if fi, r, err = utils.GetReader(mutationsalign); err != nil {
io.LogError(err)
return
}
if mutationsphylip {
if align, err = phylip.NewParser(r, mutationsinputstrict).Parse(); err != nil {
io.LogError(err)
return
}
} else {
if align, err = fasta.NewParser(r).Parse(); err != nil {
io.LogError(err)
return
}
}
fi.Close()

// Reading the trees
if treefile, treechan, err = readTrees(intreefile); err != nil {
io.LogError(err)
return
}
defer treefile.Close()

if f, err = openWriteFile(outfile); err != nil {
io.LogError(err)
return
}
defer closeWriteFile(f, outfile)

fmt.Fprintf(f, "Tree ID\tSite\tBranch ID\tNode Name\tParent Character\tChild Character\tTotal tips\tSame Character Tips\n")

for t := range treechan {
if muts, err = mutations.CountMutations(t.Tree, align); err != nil {
io.LogError(err)
return
}
for _, m := range muts.Mutations {
fmt.Fprintf(f, "%d\t%d\t%d\t%s\t%c\t%c\t%d\t%d\n", t.Id, m.AlignmentSite, m.BranchIndex, m.ChildNodeName, m.ParentCharacter, m.ChildCharacter, m.NumTips, m.NumTipsWithChildCharacter)
}
}
return
},
}

func init() {
computeCmd.AddCommand(mutationsCmd)
mutationsCmd.PersistentFlags().StringVarP(&mutationsalign, "align", "a", "stdin", "Alignment input file")
mutationsCmd.PersistentFlags().BoolVarP(&mutationsphylip, "phylip", "p", false, "Alignment is in phylip? default : false (Fasta)")
mutationsCmd.PersistentFlags().BoolVar(&mutationsinputstrict, "input-strict", false, "Strict phylip input format (only used with -p)")
mutationsCmd.PersistentFlags().StringVarP(&intreefile, "input", "i", "stdin", "Input tree")
mutationsCmd.PersistentFlags().StringVarP(&outfile, "output", "o", "stdout", "Output file")
}
4 changes: 3 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.19

require (
github.com/ajstarks/svgo v0.0.0-20210406150507-75cfd577ce75
github.com/evolbioinfo/goalign v0.3.4
github.com/evolbioinfo/goalign v0.3.7-0.20230906113011-fcecb09f9d43
github.com/fredericlemoine/bitset v1.2.0
github.com/fredericlemoine/cobrashell v0.0.0-20180921081141-49c72f93426c
github.com/fredericlemoine/gostats v0.1.1
Expand All @@ -25,5 +25,7 @@ require (
github.com/mattn/go-colorable v0.1.8 // indirect
github.com/mattn/go-isatty v0.0.12 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/ulikunitz/xz v0.5.10 // indirect
golang.org/x/sys v0.8.0 // indirect
gonum.org/v1/gonum v0.9.3 // indirect
)
5 changes: 5 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZm
github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no=
github.com/evolbioinfo/goalign v0.3.4 h1:+sNkWC5Mm+9BjyEVPRaVDFnBvWkUssCZhXUrxBdD71Y=
github.com/evolbioinfo/goalign v0.3.4/go.mod h1:CUR+czHQ8kX76kFchGiGrcVZfbfwVSFDJ4ia78sL5KM=
github.com/evolbioinfo/goalign v0.3.7-0.20230906113011-fcecb09f9d43 h1:FPlPwwFF9OzEj9CclD2fFQYLJ4aXxiLKw/AoFy3Pse4=
github.com/evolbioinfo/goalign v0.3.7-0.20230906113011-fcecb09f9d43/go.mod h1:2W3eQYCYJ+a1uTI9tCelGT4DlgiztrLB6mAUGRt0c4Y=
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=
github.com/fatih/color v1.10.0 h1:s36xzo75JdqLaaWoiEHk767eHiwo0598uUxyfiPkDsg=
github.com/fatih/color v1.10.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM=
Expand Down Expand Up @@ -237,6 +239,7 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw=
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
github.com/ulikunitz/xz v0.5.10 h1:t92gobL9l3HE202wg3rlk19F6X+JOxl9BBrCCMYEYd8=
github.com/ulikunitz/xz v0.5.10/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU=
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
Expand Down Expand Up @@ -394,6 +397,8 @@ golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8T
gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo=
gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0=
gonum.org/v1/gonum v0.9.1-0.20210325102323-76f2be9ab53e/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0=
gonum.org/v1/gonum v0.9.3 h1:DnoIG+QAMaF5NvxnGe/oKsgKcAc6PcUyl8q0VetfQ8s=
gonum.org/v1/gonum v0.9.3/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0=
gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=
gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc=
gonum.org/v1/plot v0.9.0/go.mod h1:3Pcqqmp6RHvJI72kgb8fThyUnav364FOsdDo2aGW5lY=
Expand Down
125 changes: 125 additions & 0 deletions mutations/countmutations.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package mutations

import (
"fmt"

"github.com/evolbioinfo/goalign/align"
"github.com/evolbioinfo/gotree/io"
"github.com/evolbioinfo/gotree/tree"
)

func CountMutations(t *tree.Tree, a align.Alignment) (mutations *MutationList, err error) {
var sitemutations *MutationList
mutations = NewMutationList()

// We set branch ids
nbranches := 0
for _, e := range t.Edges() {
e.SetId(nbranches)
nbranches++
}

// We set nodes ids identical to index in alignment
// and we check that sequences correspond to all tree nodes
// and all tree nodes have a name
for _, n := range t.Nodes() {
var i int

if n.Name() == "" {
err = fmt.Errorf("all nodes of the phylogeny must have a name")
io.LogError(err)
return
}
if i = a.GetSequenceIdByName(n.Name()); i < 0 {
err = fmt.Errorf("node %s of the phylogeny does not have an associated sequence in the alignment", n.Name())
io.LogError(err)
return
}
n.SetId(i)
}

// We iterate over alignment sites
for i := 0; i < a.Length(); i++ {
if sitemutations, err = countMutationsSite(t, a, i); err != nil {
io.LogError(err)
return
}
if err = mutations.Append(sitemutations); err != nil {
io.LogError(err)
return
}
}

return
}

func countMutationsSite(t *tree.Tree, a align.Alignment, site int) (mutations *MutationList, err error) {
mutations, _, _, err = countMutationSiteBranch(t, nil, t.Root(), nil, a, site)
return
}

func countMutationSiteBranch(t *tree.Tree, prevNode *tree.Node, currentNode *tree.Node, currentBranch *tree.Edge, a align.Alignment, site int) (mutations *MutationList, ntips int, characterDistribution map[uint8]int, err error) {
var tmpMutations *MutationList
var tmpntips, nidtips int
var tmpCharacterDistribution map[uint8]int
var nextNode *tree.Node
var nextIndex int
var prevChar, curChar uint8
var prevSeq, curSeq []uint8

mutations = NewMutationList()
characterDistribution = make(map[uint8]int)

curSeq, _ = a.GetSequenceCharById(currentNode.Id())
curChar = curSeq[site]

if currentNode.Tip() {
ntips = 1
characterDistribution[curChar] = 1
} else {
for nextIndex, nextNode = range currentNode.Neigh() {
if nextNode != prevNode {
if tmpMutations, tmpntips, tmpCharacterDistribution, err = countMutationSiteBranch(t, currentNode, nextNode, currentNode.Edges()[nextIndex], a, site); err != nil {
return
}
for char, nb := range tmpCharacterDistribution {
if _, exist := characterDistribution[char]; !exist {
characterDistribution[char] = nb
} else {
characterDistribution[char] += nb
}
}
ntips += tmpntips
if err = mutations.Append(tmpMutations); err != nil {
return
}
}
}
}

if prevNode != nil {
prevSeq, _ = a.GetSequenceCharById(prevNode.Id())
prevChar = prevSeq[site]

if n, exist := characterDistribution[curChar]; !exist {
nidtips = 0
} else {
nidtips = n
}

if prevChar != curChar {
k := fmt.Sprintf("%d-%d-%c-%c", site, currentBranch.Id(), rune(prevChar), rune(curChar))
m := Mutation{
AlignmentSite: site,
BranchIndex: currentBranch.Id(),
ChildNodeName: currentNode.Name(),
ParentCharacter: prevChar,
ChildCharacter: curChar,
NumTips: ntips,
NumTipsWithChildCharacter: nidtips,
}
mutations.Mutations[k] = m
}
}
return
}
43 changes: 43 additions & 0 deletions mutations/mutations.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// package mutations provides data structures & functions
// for counting mutations given an alignment of ancestral and tips sequences
package mutations

import (
"fmt"

"github.com/evolbioinfo/gotree/io"
)

type Mutation struct {
AlignmentSite int // Index of the site of the alignment
BranchIndex int // Index of the branch
ChildNodeName string // Name of the parent of the clade
ParentCharacter uint8 // Parent character
ChildCharacter uint8 // Child character
NumTips int // Total number of descendent tips
NumTipsWithChildCharacter int // Number of descendent tips that have the child character
}

type MutationList struct {
Mutations map[string]Mutation // Key: "AlignmentSite-BranchIndex-ParentCharacter-ChildCharacter"
}

func NewMutationList() (mutations *MutationList) {
mutations = &MutationList{
Mutations: make(map[string]Mutation),
}
return
}

func (m *MutationList) Append(mapp *MutationList) (err error) {
var exist bool
for k, v := range mapp.Mutations {
if _, exist = m.Mutations[k]; exist {
err = fmt.Errorf("mutation %s already exist in the list", k)
io.LogError(err)
return
}
m.Mutations[k] = v
}
return
}

0 comments on commit 76f7e39

Please sign in to comment.