-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtanmotif2fasta.cpp
114 lines (101 loc) · 2.98 KB
/
tanmotif2fasta.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#include "piler2.h"
static void LogRecords(const MotifData *Motifs, int MotifCount)
{
Log(" Contig From To Length Family\n");
Log("-------------------- ---------- ---------- ------ ------\n");
for (int MotifIndex = 0; MotifIndex < MotifCount; ++MotifIndex)
{
const MotifData &Motif = Motifs[MotifIndex];
Log("%20.20s %10d %10d %6d %d\n",
Motif.ContigLabel,
Motif.ContigFrom,
Motif.ContigTo,
Motif.ContigTo - Motif.ContigFrom + 1,
Motif.FamIndex);
}
}
static int CmpMotif(const void *s1, const void *s2)
{
const MotifData *Motif1 = (const MotifData *) s1;
const MotifData *Motif2 = (const MotifData *) s2;
return Motif1->FamIndex - Motif2->FamIndex;
}
static char *FamFileName(const char *Path, int Fam)
{
static char FileName[256];
char s[128];
sprintf(s, "/%d", Fam);
if (strlen(Path) + strlen(s) + 3 >= sizeof(FileName))
Quit("Path name too long");
strcpy(FileName, Path);
strcat(FileName, s);
fprintf(stderr, "\n\n*** FILE NAME *** '%s'\n", FileName);
return FileName;
}
static char *MotifLabel(const char *Prefix, const MotifData &Motif)
{
int n = (int) strlen(Motif.ContigLabel) + 128;
char *s = all(char, n);
if (0 != Prefix)
sprintf(s, "%s.%d %s:%d",
Prefix,
Motif.FamIndex,
Motif.ContigLabel,
Motif.ContigFrom+1);
else
sprintf(s, "%d %s:%d",
Motif.FamIndex,
Motif.ContigLabel,
Motif.ContigFrom+1);
return s;
}
void Tanmotif2Fasta()
{
const char *MotifFileName = RequiredValueOpt("tanmotif2fasta");
const char *SeqFileName = RequiredValueOpt("seq");
const char *Path = ValueOpt("path");
const char *strMaxFam = ValueOpt("maxfam");
const char *Prefix = ValueOpt("prefix");
int MaxFam = DEFAULT_MAX_FAM;
if (strMaxFam != 0)
MaxFam = atoi(strMaxFam);
if (0 == Path)
Path = ".";
ProgressStart("Reading seq file");
int SeqLength;
const char *Seq = ReadMFA(SeqFileName, &SeqLength);
ProgressDone();
Progress("Seq length %d bases, %.3g Mb", SeqLength, SeqLength/1e6);
ProgressStart("Read Motif file");
int MotifCount;
MotifData *Motifs = ReadMotif(MotifFileName, &MotifCount);
ProgressDone();
Progress("%d records", MotifCount);
ProgressStart("Sorting by family");
qsort((void *) Motifs, MotifCount, sizeof(MotifData), CmpMotif);
ProgressDone();
FILE *f = 0;
int CurrentFamily = -1;
int MemberCount = 0;
for (int MotifIndex = 0; MotifIndex < MotifCount; ++MotifIndex)
{
const MotifData &Motif = Motifs[MotifIndex];
if (Motif.FamIndex != CurrentFamily)
{
if (f != 0)
fclose(f);
char *FastaFileName = FamFileName(Path, Motif.FamIndex);
f = OpenStdioFile(FastaFileName, FILEIO_MODE_WriteOnly);
CurrentFamily = Motif.FamIndex;
MemberCount = 0;
}
++MemberCount;
if (MemberCount > MaxFam)
continue;
const int From = ContigToGlobal(Motif.ContigFrom, Motif.ContigLabel);
const int Length = Motif.ContigTo - Motif.ContigFrom + 1;
char *Label = MotifLabel(Prefix, Motif);
WriteFasta(f, Seq + From, Length, Label, false);
freemem(Label);
}
}