forked from yhlink/kssdtree
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalign.c
371 lines (307 loc) · 11.6 KB
/
align.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
/**********************************************************************
* File: align.c
* Author: Kevin Howe
* Copyright (C) Genome Research Limited, 2002-
*-------------------------------------------------------------------
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*-------------------------------------------------------------------
* NOTES:
* Functions for the manipulation of multiple sequence
* alignments
**********************************************************************/
#include "njheaders/align.h"
static char comment = '#';
static char *whitespace = " \t\r\n";
static char *terminator = "//";
/**********************************************************************
FUNCTION: free_Alignment
DESCRIPTION:
Frees the memory used by the given alignment
ARGS:
An Alignment
RETURNS: A null pointer
NOTES:
**********************************************************************/
void *free_Alignment( struct Alignment *al ) {
unsigned int i;
if (al != NULL) {
if (al->seqs != NULL) {
for( i=0; i < al->numseqs; i++) {
al->seqs[i] = free_Sequence( al->seqs[i] );
}
al->seqs = free_util( al->seqs );
}
al = free_util( al );
}
return al;
}
/**********************************************************************
FUNCTION: read_MUL_Alignment
DESCRIPTION:
Reads in a muliple alignment from the given file handle and returns
it
ARGS:
A file handle
RETURNS: A number denoting the status:
A pointer to the struct Alignment object created, or NULL of there
was an error parsing the file
NOTES: It is assumed that the aligment file is in Pfam (MUL) format.
Garbage results should be expected if the input file is not in this
format
The function allocates all the memory necessary for the
alignment. The caller should call free_Alignment (align.h) to
free this memory when the alignment is no longer needed
This method is deprecated, because MUL format alignments are
parsed by the more general read_Stockholm_Alignment
**********************************************************************/
struct Alignment *read_MUL_Alignment( FILE *stream ) {
struct Alignment *aln;
unsigned int index = 0;
char tempname[MAX_NAME_LENGTH];
int c;
aln = (struct Alignment *) malloc_util( sizeof( struct Alignment ));
aln->numseqs = 0;
aln->seqs = NULL;
aln->length = 0;
while (( c = fgetc(stream)) != EOF) {
if (aln->numseqs % SEQ_BLOCKSIZE == 0) {
/* we need to allocate some more memory. But is it a malloc or realloc ? */
if (aln->numseqs == 0) {
aln->seqs = (struct Sequence **) malloc_util( SEQ_BLOCKSIZE * sizeof( struct Sequence *));
}
else {
aln->seqs = (struct Sequence **)
realloc_util( aln->seqs, (aln->numseqs + SEQ_BLOCKSIZE) * sizeof( struct Sequence *));
}
}
index = 0;
while (! isspace(c) ) {
tempname[index++] = c;
c = fgetc(stream);
}
tempname[index] = '\0';
while ( isspace(c = fgetc(stream)) );
aln->seqs[aln->numseqs] = empty_Sequence();
aln->seqs[aln->numseqs]->name = (char *) malloc_util( MAX_NAME_LENGTH * sizeof(char));
strcpy(aln->seqs[aln->numseqs]->name, tempname);
index = 0;
do {
if (index % RES_BLOCKSIZE == 0) {
/* we need to allocate some more memory. But is it a malloc or realloc ? */
if (index == 0) {
aln->seqs[aln->numseqs]->seq = (char *) malloc_util( RES_BLOCKSIZE * sizeof(char));
}
else {
aln->seqs[aln->numseqs]->seq = (char *)
realloc_util( aln->seqs[aln->numseqs]->seq, (index + RES_BLOCKSIZE) * sizeof(char));
}
}
if (! isspace(c) )
aln->seqs[aln->numseqs]->seq[index++] = c;
c = fgetc(stream);
} while ( c != '\n' && c != EOF);
/* Since we now know the length of the sequence, we can do a bit of resizing */
if (aln->numseqs == 0 || index < aln->length) {
/* The first sequence is the trend setter */
aln->length = index;
}
/* First, resize the current sequence */
aln->seqs[aln->numseqs]->seq = (char *)
realloc_util( aln->seqs[aln->numseqs]->seq, aln->length * sizeof(char));
/* if we reduced the aligment length, The earlier sequences are too big;
Since we will only read up to the length of the smallest sequence, this
will only prove a problem if we run out of memory; however, if the user
given non-flused alignments, then they are asking for everything they get...
*/
aln->numseqs++;
}
/* now we can resize the seq array, and all the actual sequences themselves; this resizing
will only save significant memory if a non-flush alignment has been given. Is it worth
it? Well, we have to set the length of each sequence, so may as well do it while we are
here*/
aln->seqs = (struct Sequence **) realloc_util( aln->seqs, aln->numseqs * sizeof( struct Sequence *));
for (index=0; index < aln->numseqs; index++) {
aln->seqs[index]->length = aln->length;
aln->seqs[index]->seq = (char *)
realloc_util( aln->seqs[index]->seq, aln->length * sizeof(char));
}
return aln;
}
/**********************************************************************
FUNCTION: write_MUL_Alignment
DESCRIPTION:
Prints a rep. of the alignment to the given handle (in MUL format)
ARGS:
FILE *
struct Alignment (align.h)
RETURNS: struct Alignment (align.h)
NOTES:
**********************************************************************/
void write_MUL_Alignment( FILE *handle, struct Alignment *al ) {
unsigned int i,j;
for( i=0; i < al->numseqs; i++ ) {
fprintf( handle, "%-24s ", al->seqs[i]->name);
for( j=0; j < al->length; j++) {
fprintf( handle, "%c", al->seqs[i]->seq[j]);
}
fprintf( handle, "\n");
}
fflush( handle );
}
/**********************************************************************
FUNCTION: read_Stockholm_Alignment
DESCRIPTION:
This function fills a simple alignment structure from an
file assumed to be in Stockholm format. At this stage, I am ignoring
all mark-up information (all lines beginning with '#' are ignored).
The function also allows for wrapped alignments. Note that Pfam
alignments in MUL format will be handled correctly by this function
ARGS:
FILE *
RETURNS: struct Alignment (align.h)
NOTES:
**********************************************************************/
struct Alignment *read_Stockholm_Alignment( FILE *handle ) {
struct Alignment *aln;
char *line = (char *) malloc_util( MAX_LINE_LEN * sizeof(char) );
unsigned int lineAllocSize = MAX_LINE_LEN;
char *name_ptr = NULL;
char *seq_ptr = NULL;
unsigned int i, j;
unsigned int numblocks = 0;
unsigned int thisseq = 0;
unsigned int last_idx = 0;
unsigned int got_line = 0;
unsigned int saw_blank_line = 0;
aln = (struct Alignment *) malloc_util( sizeof( struct Alignment ));
aln->numseqs = 0;
aln->seqs = NULL;
aln->length = 0;
numblocks = 0;
thisseq = 0;
/* skip to the start of the first block */
got_line = 0;
do {
if (fgets(line, lineAllocSize, handle) == NULL)
break;
else {
if (strchr(line, '\n') == NULL) { /* did not read whole line! */
do {
lineAllocSize += MAX_LINE_LEN;
line = (char *) realloc_util( line, lineAllocSize * sizeof(char) );
} while (fgets(line + lineAllocSize - MAX_LINE_LEN - 1, MAX_LINE_LEN+1,
handle) != NULL &&
strchr(line, '\n') == NULL);
}
if (line[0] == comment)
continue;
if (line[0] == '\n')
continue;
else if (strncmp(line, terminator, 2) == 0) {
break;
}
else if ( (name_ptr = strtok(line, whitespace)) != NULL &&
(name_ptr == line) &&
(seq_ptr = strtok( NULL, whitespace )) != NULL) {
got_line = 1;
} else {
fatal_util( "The alignment file is not valid Stockholm format\n");
}
}
} while (! got_line);
if (! got_line)
fatal_util( "The alignment file appears to have no valid lines in Stockholm format\n");
while (got_line) {
if (numblocks == 0) {
/* if this is the first block, set up the memory for the sequences themselves */
if (thisseq == 0) {
aln->seqs = (struct Sequence **) malloc_util ( sizeof( struct Sequence *));
aln->numseqs = 1;
}
else {
aln->numseqs++;
aln->seqs = (struct Sequence **) realloc_util( aln->seqs, aln->numseqs * sizeof( struct Sequence *));
}
aln->seqs[thisseq] = empty_Sequence();
aln->seqs[thisseq]->length = 0;
aln->seqs[thisseq]->name = (char *) malloc_util( (strlen( name_ptr ) + 1) * sizeof(char));
strcpy( aln->seqs[thisseq]->name, name_ptr );
/* get rid of whitespace at end of line */
for ( last_idx = strlen(seq_ptr) - 1; strchr(whitespace, seq_ptr[last_idx]) != NULL; last_idx--);
aln->seqs[thisseq]->seq = (char *) malloc_util( (last_idx + 1) * sizeof (char ) );
}
else {
if (strcmp(aln->seqs[thisseq]->name, name_ptr) != 0) {
warning_util("Your seq names are inconsistent across blocks. Using the names in the first block");
}
/* The following accounts for the fact that there may be whitepasce at the end of the line */
for ( last_idx = strlen(seq_ptr) - 1; strchr(whitespace, seq_ptr[last_idx]) != NULL; last_idx--);
aln->seqs[thisseq]->seq = (char *) realloc_util(aln->seqs[thisseq]->seq,
(aln->seqs[thisseq]->length + last_idx + 1) * sizeof(char));
}
/* and finally, copy the string across */
for( j=0; j <= last_idx; j++, aln->seqs[thisseq]->length++)
aln->seqs[thisseq]->seq[aln->seqs[thisseq]->length] = seq_ptr[j];
/* read the next line. If we come across one or more blank lines while
we do so, then assume that we are starting a new block */
got_line = saw_blank_line = 0;
do {
if (fgets(line, lineAllocSize, handle) != NULL) {
if (strchr(line, '\n') == NULL) { /* did not read whole line! */
do {
lineAllocSize += MAX_LINE_LEN;
line = (char *) realloc_util( line, lineAllocSize * sizeof(char) );
} while (fgets(line + lineAllocSize - MAX_LINE_LEN-1, MAX_LINE_LEN+1,
handle) != NULL &&
strchr(line, '\n') == NULL);
}
if (line[0] == comment)
continue;
else if (strncmp(line, terminator, 2) == 0) {
break;
}
else {
if ( (name_ptr = strtok(line, whitespace)) == NULL) {
saw_blank_line = 1;
}
else if (name_ptr == line &&
(seq_ptr = strtok(NULL, whitespace)) != NULL) {
got_line = 1;
}
else {
fatal_util( "The alignment file is not valid Stockholm format\n");
}
}
}
else
break;
} while (! got_line );
if (got_line) {
if (saw_blank_line) {
numblocks++;
thisseq = 0;
}
else {
thisseq++;
}
}
}
free_util(line);
/* just before we return, check the alignment */
for(i=0; i < aln->numseqs; i++) {
if (i==0)
aln->length = aln->seqs[i]->length;
else if (aln->length != aln->seqs[i]->length) {
fatal_util( "Your alignment segments were of different sizes!\n");
}
}
return aln;
}