9
|
1 /*
|
|
2 *
|
|
3 * Sintax: ./frags2text fragsFILE.frags fastaX fastaY fragsFILE.txt
|
|
4
|
|
5 */
|
|
6
|
|
7 #include <stdio.h>
|
|
8 #include <stdlib.h>
|
|
9 #include <string.h>
|
|
10 #include <inttypes.h>
|
|
11 #include "structs.h"
|
|
12 #include "commonFunctions.h"
|
|
13 #include "comparisonFunctions.h"
|
|
14
|
|
15 #define TAB_INSERT 70
|
|
16 #define READING_FRAG_BUFFER 10000
|
|
17
|
|
18 void csv_frag_to_struct_frag(char * l, struct FragFile * f){
|
|
19
|
|
20 //0 1 2 3 4 5 6 7 8 9 10 11 12 13
|
|
21 //Type xStart yStart xEnd yEnd strand(f/r) block length score ident similarity %ident SeqX SeqY
|
|
22 //Frag 3147493 3006054 3154663 2998884 f 0 7171 20868 6194 72.75 0.86 0 0
|
|
23
|
|
24
|
|
25 float bin;
|
|
26
|
|
27 sscanf(l, "%*s %"PRIu64" %"PRIu64" %"PRIu64" %"PRIu64" %c %"PRId64" %"PRIu64" %"PRIu64" %"PRIu64" %f %f %"PRIu64" %"PRIu64, &f->xStart, &f->yStart, &f->xEnd, &f->yEnd, &f->strand, &f->block, &f->length, &f->score, &f->ident, &f->similarity, &bin, &f->seqX, &f->seqY);
|
|
28
|
|
29 //printf("Read %d items\n", items);
|
|
30
|
|
31 }
|
|
32
|
|
33 struct rIndex2 * loadReadsIndex(char * filename, uint64_t * nReads){
|
|
34 struct rIndex2 * RR;
|
|
35 uint64_t nR=0,i;
|
|
36 FILE *f;
|
|
37 uint64_t fsize;
|
|
38
|
|
39 if ((f=fopen(filename,"rb"))==NULL) terror("Could not open index input file");
|
|
40
|
|
41 fseeko(f, 0, SEEK_END);
|
|
42 fsize = ftello(f);
|
|
43 rewind(f);
|
|
44 nR = fsize/sizeof(struct rIndex2);
|
|
45
|
|
46 if ((RR =(struct rIndex2*) calloc(nR,sizeof(struct rIndex2)))==NULL) terror("Could not allocate index");
|
|
47
|
|
48 for (i=0; i<nR; i++){
|
|
49 if(0 == fread(&RR[i],sizeof(struct rIndex2),1,f)) break;
|
|
50 }
|
|
51 fclose(f);
|
|
52 (*nReads) = nR;
|
|
53 return RR;
|
|
54 }
|
|
55
|
|
56 void write_headers(FILE * f1, FILE * f2, FILE * fO, uint64_t pos1, uint64_t pos2, struct FragFile * f){
|
|
57 fseek(f1, pos1, SEEK_SET);
|
|
58 char c = fgetc(f1);
|
|
59 while(c != '\n'){ fprintf(fO, "%c", c); c = fgetc(f1); }
|
|
60 fprintf(fO, " ALIGNED WITH ");
|
|
61 fseek(f2, pos2, SEEK_SET);
|
|
62 c = fgetc(f2);
|
|
63 while(c != '\n'){ fprintf(fO, "%c", c); c = fgetc(f2); }
|
|
64 fprintf(fO, " LENGTH: %"PRIu64" IDENT: %"PRIu64" STRAND: %c @(%"PRIu64", %"PRIu64")\n", f->length, f->ident, f->strand, f->xStart, f->yStart);
|
|
65 }
|
|
66
|
|
67 void get_both_seqs(char * fastaX, char * fastaY, uint64_t iniX, uint64_t finX, uint64_t iniY, uint64_t finY, uint64_t posX, uint64_t posY, uint64_t LacX, uint64_t LacY, uint64_t lX, uint64_t lY, FILE * fO){
|
|
68 char copyX[TAB_INSERT+1];
|
|
69 char copyY[TAB_INSERT+1];
|
|
70 memset(copyX, 0x0, TAB_INSERT+1);
|
|
71 memset(copyY, 0x0, TAB_INSERT+1);
|
|
72 uint64_t atcopyX = 0;
|
|
73 uint64_t atcopyY = 0;
|
|
74 uint64_t i;
|
|
75
|
|
76 uint64_t pos_used_x, pos_used_y;
|
|
77
|
|
78 // The X one
|
|
79 //fseek(fastaX, posX, SEEK_SET);
|
|
80 pos_used_x = posX;
|
|
81 char cX = fastaX[pos_used_x++];
|
|
82 while(cX != '\n'){ cX = fastaX[pos_used_x++]; }
|
|
83 uint64_t gpX = iniX - LacX;
|
|
84 uint64_t currX = 0, tab_counter;
|
|
85 while(currX < gpX){
|
|
86 cX = fastaX[pos_used_x++];
|
|
87 if(cX != '\n') ++currX;
|
|
88 }
|
|
89
|
|
90 // the other Y
|
|
91 //fseek(fastaY, posY, SEEK_SET);
|
|
92 pos_used_y = posY;
|
|
93 char cY = fastaY[pos_used_y++];
|
|
94 while(cY != '\n'){ cY = fastaY[pos_used_y++]; }
|
|
95 uint64_t gpY = iniY - LacY;
|
|
96 uint64_t currY = 0;
|
|
97 while(currY < gpY){
|
|
98 cY = fastaY[pos_used_y++];
|
|
99 if(cY != '\n') ++currY;
|
|
100 }
|
|
101
|
|
102
|
|
103
|
|
104
|
|
105 // Reached the region to show
|
|
106 currX = 0;
|
|
107 currY = 0;
|
|
108 cX = fastaX[pos_used_x++];
|
|
109 cY = fastaY[pos_used_y++];
|
|
110 //fprintf(fO, "\t");
|
|
111 tab_counter = 0;
|
|
112 while(currX < lX && currY < lY){
|
|
113
|
|
114 if(cX == 'A' || cX == 'C' || cX == 'G' || cX == 'T' || cX == 'N'){ copyX[atcopyX++] = cX; ++currX; }
|
|
115 cX = fastaX[pos_used_x++];
|
|
116 while(cX != 'A' && cX != 'C' && cX != 'G' && cX != 'T' && cX != 'N') cX = fastaX[pos_used_x++];
|
|
117
|
|
118
|
|
119 if(cY == 'A' || cY == 'C' || cY == 'G' || cY == 'T' || cY == 'N'){ copyY[atcopyY++] = cY; ++currY; }
|
|
120 cY = fastaY[pos_used_y++];
|
|
121 while(cY != 'A' && cY != 'C' && cY != 'G' && cY != 'T' && cY != 'N') cY = fastaY[pos_used_y++];
|
|
122
|
|
123 while(currX > currY){
|
|
124 if(cY == 'A' || cY == 'C' || cY == 'G' || cY == 'T' || cY == 'N'){ copyY[atcopyY++] = cY; ++currY; }
|
|
125 cY = fastaY[pos_used_y++];
|
|
126 while(cY != 'A' && cY != 'C' && cY != 'G' && cY != 'T' && cY != 'N') cY = fastaY[pos_used_y++];
|
|
127 }
|
|
128 while(currX < currY){
|
|
129 if(cX == 'A' || cX == 'C' || cX == 'G' || cX == 'T' || cX == 'N'){ copyX[atcopyX++] = cX; ++currX; }
|
|
130 cX = fastaX[pos_used_x++];
|
|
131 while(cX != 'A' && cX != 'C' && cX != 'G' && cX != 'T' && cX != 'N') cX = fastaX[pos_used_x++];
|
|
132 }
|
|
133
|
|
134 ++tab_counter;
|
|
135
|
|
136
|
|
137
|
|
138 if(tab_counter >= TAB_INSERT && currX == currY){
|
|
139
|
|
140 copyX[TAB_INSERT] = '\0';
|
|
141 copyY[TAB_INSERT] = '\0';
|
|
142 fprintf(fO, "X:\t%.*s\n\t", TAB_INSERT, copyX);
|
|
143 for(i=0; i<TAB_INSERT; i++){
|
|
144 if(copyX[i] == copyY[i]) fprintf(fO, "|"); else fprintf(fO, " ");
|
|
145 }
|
|
146 fprintf(fO, "\nY:\t%.*s\n\n", TAB_INSERT, copyY);
|
|
147 tab_counter = 0;
|
|
148 atcopyX = 0; atcopyY = 0;
|
|
149 }
|
|
150 }
|
|
151 if(atcopyX > 0){
|
|
152 copyX[atcopyX] = '\0';
|
|
153 copyY[atcopyY] = '\0';
|
|
154 fprintf(fO, "X:\t%.*s\n\t", (int)atcopyX, copyX);
|
|
155 for(i=0; i<atcopyX; i++){
|
|
156 if(copyX[i] == copyY[i]) fprintf(fO, "|"); else fprintf(fO, " ");
|
|
157 }
|
|
158 fprintf(fO, "\nY:\t%.*s\n\n", (int)atcopyY, copyY);
|
|
159 atcopyX = 0; atcopyY = 0;
|
|
160 }
|
|
161 fprintf(fO, "\n");
|
|
162 }
|
|
163
|
|
164
|
|
165 void get_seq_from_to(FILE * fasta, FILE * output, uint64_t ini, uint64_t fin, uint64_t pos, uint64_t Lac, uint64_t seqNum, uint64_t l, FILE * fO){
|
|
166 fseek(fasta, pos, SEEK_SET);
|
|
167 char c = fgetc(fasta);
|
|
168 while(c != '\n'){ c = fgetc(fasta); }
|
|
169 uint64_t gp = ini - Lac;
|
|
170 uint64_t curr = 0, tab_counter;
|
|
171 while(curr < gp){
|
|
172 c = fgetc(fasta);
|
|
173 if(c != '\n') ++curr;
|
|
174 }
|
|
175 // Reached the region to show
|
|
176 curr = 0;
|
|
177 c = fgetc(fasta);
|
|
178 fprintf(fO, "\t");
|
|
179 tab_counter = 0;
|
|
180 while(curr < l){
|
|
181 if(c != '\n') fprintf(fO, "%c", c);
|
|
182 c = fgetc(fasta);
|
|
183 if(c != '\n' || feof(fasta)){ ++curr; ++tab_counter; }
|
|
184 if(tab_counter == TAB_INSERT){ fprintf(fO, "\n\t"); tab_counter = 0;}
|
|
185 }
|
|
186 fprintf(fO, "\n");
|
|
187 }
|
|
188
|
|
189
|
|
190 void get_seq_from_to_rev(FILE * fasta, FILE * output, uint64_t ini, uint64_t fin, uint64_t pos, uint64_t Lac, uint64_t seqNum, uint64_t l, FILE * fO){
|
|
191 fseek(fasta, pos, SEEK_SET);
|
|
192 char c = fgetc(fasta);
|
|
193 while(c != '\n'){ c = fgetc(fasta); }
|
|
194 uint64_t gp = ini - Lac;
|
|
195 uint64_t curr = 0, tab_counter;
|
|
196 while(curr < gp){
|
|
197 c = fgetc(fasta);
|
|
198 if(c != '\n') ++curr;
|
|
199 }
|
|
200 // Reached the region to show
|
|
201 curr = 0;
|
|
202 c = fgetc(fasta);
|
|
203 fprintf(fO, "\t");
|
|
204 tab_counter = 0;
|
|
205 while(curr < l){
|
|
206 if(c != '\n') fprintf(fO, "%c", c);
|
|
207 c = fgetc(fasta);
|
|
208 if(c != '\n' || feof(fasta)){ ++curr; ++tab_counter; }
|
|
209 if(tab_counter == TAB_INSERT){ fprintf(fO, "\n\t"); tab_counter = 0;}
|
|
210 }
|
|
211 fprintf(fO, "\n");
|
|
212 }
|
|
213
|
|
214
|
|
215
|
|
216 int main(int ac, char** av) {
|
|
217 FILE* fFrags;
|
|
218 struct FragFile frag;
|
|
219
|
|
220 fprintf(stdout, "[WARNING] - Remember that if using a CSV make sure that the reverse y coordinates are transformed\n");
|
|
221
|
|
222 if (ac != 9)
|
|
223 terror("USE: ./frags2text fragsFILE.csv fastaX fastaY fastaYrev indexX indexY indexYrev fragsFILE.txt");
|
|
224
|
|
225 // prepared for multiple files
|
|
226 if ((fFrags = fopen(av[1], "rt")) == NULL)
|
|
227 terror("Opening Frags csv file");
|
|
228
|
|
229 // Open fastas
|
|
230
|
|
231 FILE * fX = NULL, * fY = NULL, * fYrev = NULL, * fO = NULL;
|
|
232 fX = fopen(av[2], "rt");
|
|
233 if(fX == NULL) terror("Could not open fasta X file");
|
|
234 fY = fopen(av[3], "rt");
|
|
235 if(fY == NULL) terror("Could not open fasta Y file");
|
|
236 fYrev = fopen(av[4], "rt");
|
|
237 if(fYrev == NULL) terror("Could not open fasta Y-rev file");
|
|
238
|
|
239
|
|
240 // Get file lengths
|
|
241 fseek(fX, 0, SEEK_END);
|
|
242 uint64_t aprox_lenX = ftell(fX);
|
|
243 rewind(fX);
|
|
244 char * strfastaX = (char *) malloc(aprox_lenX*sizeof(char));
|
|
245 fseek(fY, 0, SEEK_END);
|
|
246 uint64_t aprox_lenY = ftell(fY);
|
|
247 rewind(fY);
|
|
248 char * strfastaY = (char *) malloc(aprox_lenY*sizeof(char));
|
|
249 fseek(fYrev, 0, SEEK_END);
|
|
250 uint64_t aprox_lenYrev = ftell(fYrev);
|
|
251 rewind(fYrev);
|
|
252 char * strfastaYrev = (char *) malloc(aprox_lenYrev*sizeof(char));
|
|
253
|
|
254 if(strfastaX == NULL || strfastaY == NULL || strfastaYrev == NULL) terror("Could not allocate string sequences");
|
|
255
|
|
256 if(aprox_lenX != fread(strfastaX, sizeof(char), aprox_lenX, fX)) terror("Read wrong number of chars at X sequence");
|
|
257 if(aprox_lenY != fread(strfastaY, sizeof(char), aprox_lenY, fY)) terror("Read wrong number of chars at Y sequence");
|
|
258 if(aprox_lenYrev != fread(strfastaYrev, sizeof(char), aprox_lenYrev, fYrev)) terror("Read wrong number of chars at Y reversed sequence");
|
|
259
|
|
260
|
|
261 struct rIndex2 * RI_X, * RI_Y, * RI_Yrev;
|
|
262
|
|
263
|
|
264 uint64_t nReads_X, nReads_Y;
|
|
265 RI_X = loadReadsIndex(av[5], &nReads_X);
|
|
266 RI_Y = loadReadsIndex(av[6], &nReads_Y);
|
|
267 RI_Yrev = loadReadsIndex(av[7], &nReads_Y);
|
|
268
|
|
269 fO = fopen(av[8], "wt");
|
|
270 if(fO == NULL) terror("Could not open output alignments file");
|
|
271
|
|
272
|
|
273 //readSequenceLength(&n1, fFrags);
|
|
274 //readSequenceLength(&n2, fFrags);
|
|
275 // Skip first lines
|
|
276 char buffer[READING_FRAG_BUFFER];
|
12
|
277 int skipper = 0; while(skipper < 18){ if(NULL==fgets(buffer, READING_FRAG_BUFFER, fFrags)) terror("Couldnt read buffer"); skipper++; }
|
9
|
278
|
|
279
|
|
280
|
|
281
|
|
282 //readFragment(&frag, fFrags);
|
|
283
|
|
284
|
|
285 csv_frag_to_struct_frag(buffer, &frag);
|
|
286
|
|
287 // RI_X is the forward index for fasta file X
|
|
288 // RI_Y is the forward index for fasta file Y
|
|
289 // RI_Yrev is the reverse index for fasta file Y
|
|
290
|
|
291 int exit = 0;
|
|
292
|
|
293 while (!feof(fFrags) && exit == 0) {
|
|
294
|
|
295 //RI[id].pos is position in file of >
|
|
296 //RI[id].Lac is the sum of the reads length prior
|
|
297
|
|
298 if(frag.strand == 'f'){
|
|
299 write_headers(fX, fY, fO, RI_X[frag.seqX].pos, RI_Y[frag.seqY].pos, &frag);
|
|
300 }else{
|
|
301 write_headers(fX, fYrev, fO, RI_X[frag.seqX].pos, RI_Yrev[(nReads_Y - frag.seqY) - 1].pos, &frag);
|
|
302 }
|
|
303
|
|
304
|
|
305 //get_seq_from_to(fX, fO, frag.xStart, frag.xEnd, RI_X[frag.seqX].pos, RI_X[frag.seqX].Lac, frag.seqX, frag.length, fO);
|
|
306
|
|
307
|
|
308
|
|
309 if(frag.strand == 'f'){
|
|
310 get_both_seqs(strfastaX, strfastaY, frag.xStart, frag.xEnd, frag.yStart, frag.yEnd, RI_X[frag.seqX].pos, RI_Y[frag.seqY].pos, RI_X[frag.seqX].Lac, RI_Y[frag.seqY].Lac, frag.length, frag.length, fO);
|
|
311 //get_seq_from_to(fY, fO, frag.yStart, frag.yEnd, RI_Y[frag.seqY].pos, RI_Y[frag.seqY].Lac, frag.seqY, frag.length, fO);
|
|
312 }else{
|
|
313 uint64_t seqYnew;
|
|
314 seqYnew = (nReads_Y - frag.seqY) - 1;
|
|
315 get_both_seqs(strfastaX, strfastaYrev, frag.xStart, frag.xEnd, frag.yStart, frag.yEnd, RI_X[frag.seqX].pos, RI_Yrev[seqYnew].pos, RI_X[frag.seqX].Lac, RI_Yrev[seqYnew].Lac, frag.length, frag.length, fO);
|
|
316 //get_seq_from_to_rev(fYrev, fO, frag.yStart, frag.yEnd, RI_Yrev[seqYnew].pos, RI_Yrev[seqYnew].Lac, seqYnew, frag.length, fO);
|
|
317 }
|
|
318
|
|
319 //readFragment(&frag, fFrags);
|
|
320 if(!feof(fFrags)){
|
12
|
321 if(NULL==fgets(buffer, READING_FRAG_BUFFER, fFrags)) terror("Couldnt read buffer");
|
9
|
322 csv_frag_to_struct_frag(buffer, &frag);
|
|
323
|
|
324
|
|
325 }else{
|
|
326 exit = 1;
|
|
327 }
|
|
328 //printf("line read: %s\n", buffer);
|
|
329 //printf("HI im a frag: %"PRIu64", %"PRIu64" - %"PRIu64", %"PRIu64"\n", frag.xStart, frag.xEnd, frag.yStart, frag.yEnd);
|
|
330 //getchar();
|
|
331 }
|
|
332
|
|
333 fclose(fFrags);
|
|
334 fclose(fX);
|
|
335 fclose(fY);
|
|
336 fclose(fO);
|
|
337
|
|
338 free(RI_X);
|
|
339 free(RI_Y);
|
|
340 free(RI_Yrev);
|
|
341
|
|
342 free(strfastaX);
|
|
343 free(strfastaY);
|
|
344 free(strfastaYrev);
|
|
345
|
|
346 return 0;
|
|
347 }
|