-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcons.cpp
134 lines (120 loc) · 2.62 KB
/
cons.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#include "piler2.h"
static const char Letter[5] = { 'A', 'C', 'G', 'T', '-'};
static void GetCounts(const char *Seqs, int ColCount, int SeqCount,
int ColIndex, int Counts[5])
{
memset(Counts, 0, 5*sizeof(unsigned));
for (int SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex)
{
char c = Seqs[SeqIndex*ColCount + ColIndex];
c = toupper(c);
switch (c)
{
case 'a':
case 'A':
++(Counts[0]);
break;
case 'c':
case 'C':
++(Counts[1]);
break;
case 'g':
case 'G':
++(Counts[2]);
break;
case 't':
case 'T':
++(Counts[3]);
break;
case '-':
++(Counts[4]);
break;
}
}
}
static bool Conserved(const char *Seqs, int ColCount, int SeqCount, int Col)
{
int Counts[5];
GetCounts(Seqs, ColCount, SeqCount, Col, Counts);
if (Counts[4] > 0)
return false;
for (int i = 0; i < 4; ++i)
if (Counts[i] != 0 && Counts[i] != SeqCount)
return false;
return true;
}
static void FindStartEndExact(const char *Seqs, int ColCount, int SeqCount,
int *ptrStart, int *ptrEnd)
{
int BestLength = 0;
int BestStart = 0;
int BestEnd = 0;
int Length = 0;
int Start = 0;
for (int Col = 0; Col <= ColCount; ++Col)
{
if (Col != ColCount && Conserved(Seqs, ColCount, SeqCount, Col))
{
if (Start == -1)
{
Start = Col;
Length = 1;
}
else
++Length;
}
else
{
if (Length > BestLength)
{
BestStart = Start;
BestEnd = Col - 1;
if (BestEnd - BestStart + 1 != Length)
Quit("BestEnd");
}
Length = -1;
Start = -1;
}
}
*ptrStart = BestStart;
*ptrEnd = BestEnd;
}
void Cons()
{
const char *InputFileName = RequiredValueOpt("cons");
const char *OutputFileName = RequiredValueOpt("out");
const char *Label = RequiredValueOpt("label");
bool Exact = FlagOpt("exact");
ProgressStart("Reading alignment");
int ColCount;
int SeqCount;
const char *Seqs = ReadAFA(InputFileName, &ColCount, &SeqCount);
ProgressDone();
Progress("%d seqs, length %d", SeqCount, ColCount);
char *ConsSeq = all(char, ColCount+1);
int ConsSeqLength = 0;
int StartCol = 0;
int EndCol = ColCount - 1;
if (Exact)
FindStartEndExact(Seqs, ColCount, SeqCount, &StartCol, &EndCol);
for (int Col = StartCol; Col <= EndCol; ++Col)
{
int Counts[5];
GetCounts(Seqs, ColCount, SeqCount, Col, Counts);
int MaxCount = 0;
char MaxLetter = 'A';
for (int i = 0; i < 4; ++i)
{
if (Counts[i] > MaxCount)
{
MaxLetter = Letter[i];
MaxCount = Counts[i];
}
}
if (MaxLetter == '-')
continue;
ConsSeq[ConsSeqLength++] = MaxLetter;
}
FILE *f = OpenStdioFile(OutputFileName, FILEIO_MODE_WriteOnly);
WriteFasta(f, ConsSeq, ConsSeqLength, Label);
}