From 76f7e3981d033877e71a4ee2683b00951d703bfe Mon Sep 17 00:00:00 2001 From: Frederic Lemoine Date: Wed, 6 Sep 2023 16:21:03 +0200 Subject: [PATCH] Added command : gotree extract mutations --- cmd/extractmutations.go | 109 +++++++++++++++++++++++++++++++ go.mod | 4 +- go.sum | 5 ++ mutations/countmutations.go | 125 ++++++++++++++++++++++++++++++++++++ mutations/mutations.go | 43 +++++++++++++ 5 files changed, 285 insertions(+), 1 deletion(-) create mode 100644 cmd/extractmutations.go create mode 100644 mutations/countmutations.go create mode 100644 mutations/mutations.go diff --git a/cmd/extractmutations.go b/cmd/extractmutations.go new file mode 100644 index 0000000..6eb2c41 --- /dev/null +++ b/cmd/extractmutations.go @@ -0,0 +1,109 @@ +package cmd + +import ( + "bufio" + "fmt" + goio "io" + "os" + + "github.com/evolbioinfo/goalign/align" + "github.com/evolbioinfo/goalign/io/fasta" + "github.com/evolbioinfo/goalign/io/phylip" + "github.com/evolbioinfo/gotree/io" + "github.com/evolbioinfo/gotree/io/utils" + "github.com/evolbioinfo/gotree/mutations" + "github.com/evolbioinfo/gotree/tree" + "github.com/spf13/cobra" +) + +var mutationsalign string +var mutationsphylip bool +var mutationsinputstrict bool +var outfile string + +// mutationsCmd represents the mutations command +var mutationsCmd = &cobra.Command{ + Use: "mutations", + Short: "Extract the list of mutations along the branches of the phylogeny.", + Long: `Extract the list of mutations along the branches of the phylogeny, given + the full list of ancestral (and terminal) sequences. + + The input tree must have internal node names specified and must be rooted. + The input alignment (fasta or phylip only) must specify one sequence per internal + node name and tip. + + The output consists of the list of mutations that appear along the branches of the + tree, tab separated text file: + + 1. Tree index (useful if several trees in the input tree file) + 2. Alignment site index + 3. Branch index + 4. Child node name + 5. Parent character + 6. Child character + 7. Number of descendent tips + 8. Number of descendent tips that have the child character +`, + RunE: func(cmd *cobra.Command, args []string) (err error) { + var align align.Alignment + var fi goio.Closer + var r *bufio.Reader + var treefile goio.Closer + var treechan <-chan tree.Trees + var f *os.File + var muts *mutations.MutationList + + // Reading the alignment + if fi, r, err = utils.GetReader(mutationsalign); err != nil { + io.LogError(err) + return + } + if mutationsphylip { + if align, err = phylip.NewParser(r, mutationsinputstrict).Parse(); err != nil { + io.LogError(err) + return + } + } else { + if align, err = fasta.NewParser(r).Parse(); err != nil { + io.LogError(err) + return + } + } + fi.Close() + + // Reading the trees + if treefile, treechan, err = readTrees(intreefile); err != nil { + io.LogError(err) + return + } + defer treefile.Close() + + if f, err = openWriteFile(outfile); err != nil { + io.LogError(err) + return + } + defer closeWriteFile(f, outfile) + + fmt.Fprintf(f, "Tree ID\tSite\tBranch ID\tNode Name\tParent Character\tChild Character\tTotal tips\tSame Character Tips\n") + + for t := range treechan { + if muts, err = mutations.CountMutations(t.Tree, align); err != nil { + io.LogError(err) + return + } + for _, m := range muts.Mutations { + fmt.Fprintf(f, "%d\t%d\t%d\t%s\t%c\t%c\t%d\t%d\n", t.Id, m.AlignmentSite, m.BranchIndex, m.ChildNodeName, m.ParentCharacter, m.ChildCharacter, m.NumTips, m.NumTipsWithChildCharacter) + } + } + return + }, +} + +func init() { + computeCmd.AddCommand(mutationsCmd) + mutationsCmd.PersistentFlags().StringVarP(&mutationsalign, "align", "a", "stdin", "Alignment input file") + mutationsCmd.PersistentFlags().BoolVarP(&mutationsphylip, "phylip", "p", false, "Alignment is in phylip? default : false (Fasta)") + mutationsCmd.PersistentFlags().BoolVar(&mutationsinputstrict, "input-strict", false, "Strict phylip input format (only used with -p)") + mutationsCmd.PersistentFlags().StringVarP(&intreefile, "input", "i", "stdin", "Input tree") + mutationsCmd.PersistentFlags().StringVarP(&outfile, "output", "o", "stdout", "Output file") +} diff --git a/go.mod b/go.mod index b72f96a..1f7b7ed 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.19 require ( github.com/ajstarks/svgo v0.0.0-20210406150507-75cfd577ce75 - github.com/evolbioinfo/goalign v0.3.4 + github.com/evolbioinfo/goalign v0.3.7-0.20230906113011-fcecb09f9d43 github.com/fredericlemoine/bitset v1.2.0 github.com/fredericlemoine/cobrashell v0.0.0-20180921081141-49c72f93426c github.com/fredericlemoine/gostats v0.1.1 @@ -25,5 +25,7 @@ require ( github.com/mattn/go-colorable v0.1.8 // indirect github.com/mattn/go-isatty v0.0.12 // indirect github.com/spf13/pflag v1.0.5 // indirect + github.com/ulikunitz/xz v0.5.10 // indirect golang.org/x/sys v0.8.0 // indirect + gonum.org/v1/gonum v0.9.3 // indirect ) diff --git a/go.sum b/go.sum index 466d3fb..d5b74ff 100644 --- a/go.sum +++ b/go.sum @@ -60,6 +60,8 @@ github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZm github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= github.com/evolbioinfo/goalign v0.3.4 h1:+sNkWC5Mm+9BjyEVPRaVDFnBvWkUssCZhXUrxBdD71Y= github.com/evolbioinfo/goalign v0.3.4/go.mod h1:CUR+czHQ8kX76kFchGiGrcVZfbfwVSFDJ4ia78sL5KM= +github.com/evolbioinfo/goalign v0.3.7-0.20230906113011-fcecb09f9d43 h1:FPlPwwFF9OzEj9CclD2fFQYLJ4aXxiLKw/AoFy3Pse4= +github.com/evolbioinfo/goalign v0.3.7-0.20230906113011-fcecb09f9d43/go.mod h1:2W3eQYCYJ+a1uTI9tCelGT4DlgiztrLB6mAUGRt0c4Y= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/fatih/color v1.10.0 h1:s36xzo75JdqLaaWoiEHk767eHiwo0598uUxyfiPkDsg= github.com/fatih/color v1.10.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM= @@ -237,6 +239,7 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= +github.com/ulikunitz/xz v0.5.10 h1:t92gobL9l3HE202wg3rlk19F6X+JOxl9BBrCCMYEYd8= github.com/ulikunitz/xz v0.5.10/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= @@ -394,6 +397,8 @@ golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8T gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= gonum.org/v1/gonum v0.9.1-0.20210325102323-76f2be9ab53e/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0= +gonum.org/v1/gonum v0.9.3 h1:DnoIG+QAMaF5NvxnGe/oKsgKcAc6PcUyl8q0VetfQ8s= +gonum.org/v1/gonum v0.9.3/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0= gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= gonum.org/v1/plot v0.9.0/go.mod h1:3Pcqqmp6RHvJI72kgb8fThyUnav364FOsdDo2aGW5lY= diff --git a/mutations/countmutations.go b/mutations/countmutations.go new file mode 100644 index 0000000..7b4c551 --- /dev/null +++ b/mutations/countmutations.go @@ -0,0 +1,125 @@ +package mutations + +import ( + "fmt" + + "github.com/evolbioinfo/goalign/align" + "github.com/evolbioinfo/gotree/io" + "github.com/evolbioinfo/gotree/tree" +) + +func CountMutations(t *tree.Tree, a align.Alignment) (mutations *MutationList, err error) { + var sitemutations *MutationList + mutations = NewMutationList() + + // We set branch ids + nbranches := 0 + for _, e := range t.Edges() { + e.SetId(nbranches) + nbranches++ + } + + // We set nodes ids identical to index in alignment + // and we check that sequences correspond to all tree nodes + // and all tree nodes have a name + for _, n := range t.Nodes() { + var i int + + if n.Name() == "" { + err = fmt.Errorf("all nodes of the phylogeny must have a name") + io.LogError(err) + return + } + if i = a.GetSequenceIdByName(n.Name()); i < 0 { + err = fmt.Errorf("node %s of the phylogeny does not have an associated sequence in the alignment", n.Name()) + io.LogError(err) + return + } + n.SetId(i) + } + + // We iterate over alignment sites + for i := 0; i < a.Length(); i++ { + if sitemutations, err = countMutationsSite(t, a, i); err != nil { + io.LogError(err) + return + } + if err = mutations.Append(sitemutations); err != nil { + io.LogError(err) + return + } + } + + return +} + +func countMutationsSite(t *tree.Tree, a align.Alignment, site int) (mutations *MutationList, err error) { + mutations, _, _, err = countMutationSiteBranch(t, nil, t.Root(), nil, a, site) + return +} + +func countMutationSiteBranch(t *tree.Tree, prevNode *tree.Node, currentNode *tree.Node, currentBranch *tree.Edge, a align.Alignment, site int) (mutations *MutationList, ntips int, characterDistribution map[uint8]int, err error) { + var tmpMutations *MutationList + var tmpntips, nidtips int + var tmpCharacterDistribution map[uint8]int + var nextNode *tree.Node + var nextIndex int + var prevChar, curChar uint8 + var prevSeq, curSeq []uint8 + + mutations = NewMutationList() + characterDistribution = make(map[uint8]int) + + curSeq, _ = a.GetSequenceCharById(currentNode.Id()) + curChar = curSeq[site] + + if currentNode.Tip() { + ntips = 1 + characterDistribution[curChar] = 1 + } else { + for nextIndex, nextNode = range currentNode.Neigh() { + if nextNode != prevNode { + if tmpMutations, tmpntips, tmpCharacterDistribution, err = countMutationSiteBranch(t, currentNode, nextNode, currentNode.Edges()[nextIndex], a, site); err != nil { + return + } + for char, nb := range tmpCharacterDistribution { + if _, exist := characterDistribution[char]; !exist { + characterDistribution[char] = nb + } else { + characterDistribution[char] += nb + } + } + ntips += tmpntips + if err = mutations.Append(tmpMutations); err != nil { + return + } + } + } + } + + if prevNode != nil { + prevSeq, _ = a.GetSequenceCharById(prevNode.Id()) + prevChar = prevSeq[site] + + if n, exist := characterDistribution[curChar]; !exist { + nidtips = 0 + } else { + nidtips = n + } + + if prevChar != curChar { + k := fmt.Sprintf("%d-%d-%c-%c", site, currentBranch.Id(), rune(prevChar), rune(curChar)) + m := Mutation{ + AlignmentSite: site, + BranchIndex: currentBranch.Id(), + ChildNodeName: currentNode.Name(), + ParentCharacter: prevChar, + ChildCharacter: curChar, + NumTips: ntips, + NumTipsWithChildCharacter: nidtips, + } + mutations.Mutations[k] = m + } + } + return +} diff --git a/mutations/mutations.go b/mutations/mutations.go new file mode 100644 index 0000000..2796474 --- /dev/null +++ b/mutations/mutations.go @@ -0,0 +1,43 @@ +// package mutations provides data structures & functions +// for counting mutations given an alignment of ancestral and tips sequences +package mutations + +import ( + "fmt" + + "github.com/evolbioinfo/gotree/io" +) + +type Mutation struct { + AlignmentSite int // Index of the site of the alignment + BranchIndex int // Index of the branch + ChildNodeName string // Name of the parent of the clade + ParentCharacter uint8 // Parent character + ChildCharacter uint8 // Child character + NumTips int // Total number of descendent tips + NumTipsWithChildCharacter int // Number of descendent tips that have the child character +} + +type MutationList struct { + Mutations map[string]Mutation // Key: "AlignmentSite-BranchIndex-ParentCharacter-ChildCharacter" +} + +func NewMutationList() (mutations *MutationList) { + mutations = &MutationList{ + Mutations: make(map[string]Mutation), + } + return +} + +func (m *MutationList) Append(mapp *MutationList) (err error) { + var exist bool + for k, v := range mapp.Mutations { + if _, exist = m.Mutations[k]; exist { + err = fmt.Errorf("mutation %s already exist in the list", k) + io.LogError(err) + return + } + m.Mutations[k] = v + } + return +}