Mercurial > repos > bitlab > gecko
comparison gecko/src/csvFrags2text.c @ 9:aec70bb1ae27 draft
Uploaded
author | bitlab |
---|---|
date | Wed, 18 Nov 2020 08:08:25 +0000 |
parents | |
children | cf4c0c822ca9 |
comparison
equal
deleted
inserted
replaced
8:dbdb10e4c8b5 | 9:aec70bb1ae27 |
---|---|
1 /* | |
2 * | |
3 * Sintax: ./frags2text fragsFILE.frags fastaX fastaY fragsFILE.txt | |
4 | |
5 */ | |
6 | |
7 #include <stdio.h> | |
8 #include <stdlib.h> | |
9 #include <string.h> | |
10 #include <inttypes.h> | |
11 #include "structs.h" | |
12 #include "commonFunctions.h" | |
13 #include "comparisonFunctions.h" | |
14 | |
15 #define TAB_INSERT 70 | |
16 #define READING_FRAG_BUFFER 10000 | |
17 | |
18 void csv_frag_to_struct_frag(char * l, struct FragFile * f){ | |
19 | |
20 //0 1 2 3 4 5 6 7 8 9 10 11 12 13 | |
21 //Type xStart yStart xEnd yEnd strand(f/r) block length score ident similarity %ident SeqX SeqY | |
22 //Frag 3147493 3006054 3154663 2998884 f 0 7171 20868 6194 72.75 0.86 0 0 | |
23 | |
24 | |
25 float bin; | |
26 | |
27 sscanf(l, "%*s %"PRIu64" %"PRIu64" %"PRIu64" %"PRIu64" %c %"PRId64" %"PRIu64" %"PRIu64" %"PRIu64" %f %f %"PRIu64" %"PRIu64, &f->xStart, &f->yStart, &f->xEnd, &f->yEnd, &f->strand, &f->block, &f->length, &f->score, &f->ident, &f->similarity, &bin, &f->seqX, &f->seqY); | |
28 | |
29 //printf("Read %d items\n", items); | |
30 | |
31 } | |
32 | |
33 struct rIndex2 * loadReadsIndex(char * filename, uint64_t * nReads){ | |
34 struct rIndex2 * RR; | |
35 uint64_t nR=0,i; | |
36 FILE *f; | |
37 uint64_t fsize; | |
38 | |
39 if ((f=fopen(filename,"rb"))==NULL) terror("Could not open index input file"); | |
40 | |
41 fseeko(f, 0, SEEK_END); | |
42 fsize = ftello(f); | |
43 rewind(f); | |
44 nR = fsize/sizeof(struct rIndex2); | |
45 | |
46 if ((RR =(struct rIndex2*) calloc(nR,sizeof(struct rIndex2)))==NULL) terror("Could not allocate index"); | |
47 | |
48 for (i=0; i<nR; i++){ | |
49 if(0 == fread(&RR[i],sizeof(struct rIndex2),1,f)) break; | |
50 } | |
51 fclose(f); | |
52 (*nReads) = nR; | |
53 return RR; | |
54 } | |
55 | |
56 void write_headers(FILE * f1, FILE * f2, FILE * fO, uint64_t pos1, uint64_t pos2, struct FragFile * f){ | |
57 fseek(f1, pos1, SEEK_SET); | |
58 char c = fgetc(f1); | |
59 while(c != '\n'){ fprintf(fO, "%c", c); c = fgetc(f1); } | |
60 fprintf(fO, " ALIGNED WITH "); | |
61 fseek(f2, pos2, SEEK_SET); | |
62 c = fgetc(f2); | |
63 while(c != '\n'){ fprintf(fO, "%c", c); c = fgetc(f2); } | |
64 fprintf(fO, " LENGTH: %"PRIu64" IDENT: %"PRIu64" STRAND: %c @(%"PRIu64", %"PRIu64")\n", f->length, f->ident, f->strand, f->xStart, f->yStart); | |
65 } | |
66 | |
67 void get_both_seqs(char * fastaX, char * fastaY, uint64_t iniX, uint64_t finX, uint64_t iniY, uint64_t finY, uint64_t posX, uint64_t posY, uint64_t LacX, uint64_t LacY, uint64_t lX, uint64_t lY, FILE * fO){ | |
68 char copyX[TAB_INSERT+1]; | |
69 char copyY[TAB_INSERT+1]; | |
70 memset(copyX, 0x0, TAB_INSERT+1); | |
71 memset(copyY, 0x0, TAB_INSERT+1); | |
72 uint64_t atcopyX = 0; | |
73 uint64_t atcopyY = 0; | |
74 uint64_t i; | |
75 | |
76 uint64_t pos_used_x, pos_used_y; | |
77 | |
78 // The X one | |
79 //fseek(fastaX, posX, SEEK_SET); | |
80 pos_used_x = posX; | |
81 char cX = fastaX[pos_used_x++]; | |
82 while(cX != '\n'){ cX = fastaX[pos_used_x++]; } | |
83 uint64_t gpX = iniX - LacX; | |
84 uint64_t currX = 0, tab_counter; | |
85 while(currX < gpX){ | |
86 cX = fastaX[pos_used_x++]; | |
87 if(cX != '\n') ++currX; | |
88 } | |
89 | |
90 // the other Y | |
91 //fseek(fastaY, posY, SEEK_SET); | |
92 pos_used_y = posY; | |
93 char cY = fastaY[pos_used_y++]; | |
94 while(cY != '\n'){ cY = fastaY[pos_used_y++]; } | |
95 uint64_t gpY = iniY - LacY; | |
96 uint64_t currY = 0; | |
97 while(currY < gpY){ | |
98 cY = fastaY[pos_used_y++]; | |
99 if(cY != '\n') ++currY; | |
100 } | |
101 | |
102 | |
103 | |
104 | |
105 // Reached the region to show | |
106 currX = 0; | |
107 currY = 0; | |
108 cX = fastaX[pos_used_x++]; | |
109 cY = fastaY[pos_used_y++]; | |
110 //fprintf(fO, "\t"); | |
111 tab_counter = 0; | |
112 while(currX < lX && currY < lY){ | |
113 | |
114 if(cX == 'A' || cX == 'C' || cX == 'G' || cX == 'T' || cX == 'N'){ copyX[atcopyX++] = cX; ++currX; } | |
115 cX = fastaX[pos_used_x++]; | |
116 while(cX != 'A' && cX != 'C' && cX != 'G' && cX != 'T' && cX != 'N') cX = fastaX[pos_used_x++]; | |
117 | |
118 | |
119 if(cY == 'A' || cY == 'C' || cY == 'G' || cY == 'T' || cY == 'N'){ copyY[atcopyY++] = cY; ++currY; } | |
120 cY = fastaY[pos_used_y++]; | |
121 while(cY != 'A' && cY != 'C' && cY != 'G' && cY != 'T' && cY != 'N') cY = fastaY[pos_used_y++]; | |
122 | |
123 while(currX > currY){ | |
124 if(cY == 'A' || cY == 'C' || cY == 'G' || cY == 'T' || cY == 'N'){ copyY[atcopyY++] = cY; ++currY; } | |
125 cY = fastaY[pos_used_y++]; | |
126 while(cY != 'A' && cY != 'C' && cY != 'G' && cY != 'T' && cY != 'N') cY = fastaY[pos_used_y++]; | |
127 } | |
128 while(currX < currY){ | |
129 if(cX == 'A' || cX == 'C' || cX == 'G' || cX == 'T' || cX == 'N'){ copyX[atcopyX++] = cX; ++currX; } | |
130 cX = fastaX[pos_used_x++]; | |
131 while(cX != 'A' && cX != 'C' && cX != 'G' && cX != 'T' && cX != 'N') cX = fastaX[pos_used_x++]; | |
132 } | |
133 | |
134 ++tab_counter; | |
135 | |
136 | |
137 | |
138 if(tab_counter >= TAB_INSERT && currX == currY){ | |
139 | |
140 copyX[TAB_INSERT] = '\0'; | |
141 copyY[TAB_INSERT] = '\0'; | |
142 fprintf(fO, "X:\t%.*s\n\t", TAB_INSERT, copyX); | |
143 for(i=0; i<TAB_INSERT; i++){ | |
144 if(copyX[i] == copyY[i]) fprintf(fO, "|"); else fprintf(fO, " "); | |
145 } | |
146 fprintf(fO, "\nY:\t%.*s\n\n", TAB_INSERT, copyY); | |
147 tab_counter = 0; | |
148 atcopyX = 0; atcopyY = 0; | |
149 } | |
150 } | |
151 if(atcopyX > 0){ | |
152 copyX[atcopyX] = '\0'; | |
153 copyY[atcopyY] = '\0'; | |
154 fprintf(fO, "X:\t%.*s\n\t", (int)atcopyX, copyX); | |
155 for(i=0; i<atcopyX; i++){ | |
156 if(copyX[i] == copyY[i]) fprintf(fO, "|"); else fprintf(fO, " "); | |
157 } | |
158 fprintf(fO, "\nY:\t%.*s\n\n", (int)atcopyY, copyY); | |
159 atcopyX = 0; atcopyY = 0; | |
160 } | |
161 fprintf(fO, "\n"); | |
162 } | |
163 | |
164 | |
165 void get_seq_from_to(FILE * fasta, FILE * output, uint64_t ini, uint64_t fin, uint64_t pos, uint64_t Lac, uint64_t seqNum, uint64_t l, FILE * fO){ | |
166 fseek(fasta, pos, SEEK_SET); | |
167 char c = fgetc(fasta); | |
168 while(c != '\n'){ c = fgetc(fasta); } | |
169 uint64_t gp = ini - Lac; | |
170 uint64_t curr = 0, tab_counter; | |
171 while(curr < gp){ | |
172 c = fgetc(fasta); | |
173 if(c != '\n') ++curr; | |
174 } | |
175 // Reached the region to show | |
176 curr = 0; | |
177 c = fgetc(fasta); | |
178 fprintf(fO, "\t"); | |
179 tab_counter = 0; | |
180 while(curr < l){ | |
181 if(c != '\n') fprintf(fO, "%c", c); | |
182 c = fgetc(fasta); | |
183 if(c != '\n' || feof(fasta)){ ++curr; ++tab_counter; } | |
184 if(tab_counter == TAB_INSERT){ fprintf(fO, "\n\t"); tab_counter = 0;} | |
185 } | |
186 fprintf(fO, "\n"); | |
187 } | |
188 | |
189 | |
190 void get_seq_from_to_rev(FILE * fasta, FILE * output, uint64_t ini, uint64_t fin, uint64_t pos, uint64_t Lac, uint64_t seqNum, uint64_t l, FILE * fO){ | |
191 fseek(fasta, pos, SEEK_SET); | |
192 char c = fgetc(fasta); | |
193 while(c != '\n'){ c = fgetc(fasta); } | |
194 uint64_t gp = ini - Lac; | |
195 uint64_t curr = 0, tab_counter; | |
196 while(curr < gp){ | |
197 c = fgetc(fasta); | |
198 if(c != '\n') ++curr; | |
199 } | |
200 // Reached the region to show | |
201 curr = 0; | |
202 c = fgetc(fasta); | |
203 fprintf(fO, "\t"); | |
204 tab_counter = 0; | |
205 while(curr < l){ | |
206 if(c != '\n') fprintf(fO, "%c", c); | |
207 c = fgetc(fasta); | |
208 if(c != '\n' || feof(fasta)){ ++curr; ++tab_counter; } | |
209 if(tab_counter == TAB_INSERT){ fprintf(fO, "\n\t"); tab_counter = 0;} | |
210 } | |
211 fprintf(fO, "\n"); | |
212 } | |
213 | |
214 | |
215 | |
216 int main(int ac, char** av) { | |
217 FILE* fFrags; | |
218 struct FragFile frag; | |
219 | |
220 fprintf(stdout, "[WARNING] - Remember that if using a CSV make sure that the reverse y coordinates are transformed\n"); | |
221 | |
222 if (ac != 9) | |
223 terror("USE: ./frags2text fragsFILE.csv fastaX fastaY fastaYrev indexX indexY indexYrev fragsFILE.txt"); | |
224 | |
225 // prepared for multiple files | |
226 if ((fFrags = fopen(av[1], "rt")) == NULL) | |
227 terror("Opening Frags csv file"); | |
228 | |
229 // Open fastas | |
230 | |
231 FILE * fX = NULL, * fY = NULL, * fYrev = NULL, * fO = NULL; | |
232 fX = fopen(av[2], "rt"); | |
233 if(fX == NULL) terror("Could not open fasta X file"); | |
234 fY = fopen(av[3], "rt"); | |
235 if(fY == NULL) terror("Could not open fasta Y file"); | |
236 fYrev = fopen(av[4], "rt"); | |
237 if(fYrev == NULL) terror("Could not open fasta Y-rev file"); | |
238 | |
239 | |
240 // Get file lengths | |
241 fseek(fX, 0, SEEK_END); | |
242 uint64_t aprox_lenX = ftell(fX); | |
243 rewind(fX); | |
244 char * strfastaX = (char *) malloc(aprox_lenX*sizeof(char)); | |
245 fseek(fY, 0, SEEK_END); | |
246 uint64_t aprox_lenY = ftell(fY); | |
247 rewind(fY); | |
248 char * strfastaY = (char *) malloc(aprox_lenY*sizeof(char)); | |
249 fseek(fYrev, 0, SEEK_END); | |
250 uint64_t aprox_lenYrev = ftell(fYrev); | |
251 rewind(fYrev); | |
252 char * strfastaYrev = (char *) malloc(aprox_lenYrev*sizeof(char)); | |
253 | |
254 if(strfastaX == NULL || strfastaY == NULL || strfastaYrev == NULL) terror("Could not allocate string sequences"); | |
255 | |
256 if(aprox_lenX != fread(strfastaX, sizeof(char), aprox_lenX, fX)) terror("Read wrong number of chars at X sequence"); | |
257 if(aprox_lenY != fread(strfastaY, sizeof(char), aprox_lenY, fY)) terror("Read wrong number of chars at Y sequence"); | |
258 if(aprox_lenYrev != fread(strfastaYrev, sizeof(char), aprox_lenYrev, fYrev)) terror("Read wrong number of chars at Y reversed sequence"); | |
259 | |
260 | |
261 struct rIndex2 * RI_X, * RI_Y, * RI_Yrev; | |
262 | |
263 | |
264 uint64_t nReads_X, nReads_Y; | |
265 RI_X = loadReadsIndex(av[5], &nReads_X); | |
266 RI_Y = loadReadsIndex(av[6], &nReads_Y); | |
267 RI_Yrev = loadReadsIndex(av[7], &nReads_Y); | |
268 | |
269 fO = fopen(av[8], "wt"); | |
270 if(fO == NULL) terror("Could not open output alignments file"); | |
271 | |
272 | |
273 //readSequenceLength(&n1, fFrags); | |
274 //readSequenceLength(&n2, fFrags); | |
275 // Skip first lines | |
276 char buffer[READING_FRAG_BUFFER]; | |
277 int skipper = 0; while(skipper < 18){ fgets(buffer, READING_FRAG_BUFFER, fFrags); skipper++; } | |
278 | |
279 | |
280 | |
281 | |
282 //readFragment(&frag, fFrags); | |
283 | |
284 | |
285 csv_frag_to_struct_frag(buffer, &frag); | |
286 | |
287 // RI_X is the forward index for fasta file X | |
288 // RI_Y is the forward index for fasta file Y | |
289 // RI_Yrev is the reverse index for fasta file Y | |
290 | |
291 int exit = 0; | |
292 | |
293 while (!feof(fFrags) && exit == 0) { | |
294 | |
295 //RI[id].pos is position in file of > | |
296 //RI[id].Lac is the sum of the reads length prior | |
297 | |
298 if(frag.strand == 'f'){ | |
299 write_headers(fX, fY, fO, RI_X[frag.seqX].pos, RI_Y[frag.seqY].pos, &frag); | |
300 }else{ | |
301 write_headers(fX, fYrev, fO, RI_X[frag.seqX].pos, RI_Yrev[(nReads_Y - frag.seqY) - 1].pos, &frag); | |
302 } | |
303 | |
304 | |
305 //get_seq_from_to(fX, fO, frag.xStart, frag.xEnd, RI_X[frag.seqX].pos, RI_X[frag.seqX].Lac, frag.seqX, frag.length, fO); | |
306 | |
307 | |
308 | |
309 if(frag.strand == 'f'){ | |
310 get_both_seqs(strfastaX, strfastaY, frag.xStart, frag.xEnd, frag.yStart, frag.yEnd, RI_X[frag.seqX].pos, RI_Y[frag.seqY].pos, RI_X[frag.seqX].Lac, RI_Y[frag.seqY].Lac, frag.length, frag.length, fO); | |
311 //get_seq_from_to(fY, fO, frag.yStart, frag.yEnd, RI_Y[frag.seqY].pos, RI_Y[frag.seqY].Lac, frag.seqY, frag.length, fO); | |
312 }else{ | |
313 uint64_t seqYnew; | |
314 seqYnew = (nReads_Y - frag.seqY) - 1; | |
315 get_both_seqs(strfastaX, strfastaYrev, frag.xStart, frag.xEnd, frag.yStart, frag.yEnd, RI_X[frag.seqX].pos, RI_Yrev[seqYnew].pos, RI_X[frag.seqX].Lac, RI_Yrev[seqYnew].Lac, frag.length, frag.length, fO); | |
316 //get_seq_from_to_rev(fYrev, fO, frag.yStart, frag.yEnd, RI_Yrev[seqYnew].pos, RI_Yrev[seqYnew].Lac, seqYnew, frag.length, fO); | |
317 } | |
318 | |
319 //readFragment(&frag, fFrags); | |
320 if(!feof(fFrags)){ | |
321 fgets(buffer, READING_FRAG_BUFFER, fFrags); | |
322 csv_frag_to_struct_frag(buffer, &frag); | |
323 | |
324 | |
325 }else{ | |
326 exit = 1; | |
327 } | |
328 //printf("line read: %s\n", buffer); | |
329 //printf("HI im a frag: %"PRIu64", %"PRIu64" - %"PRIu64", %"PRIu64"\n", frag.xStart, frag.xEnd, frag.yStart, frag.yEnd); | |
330 //getchar(); | |
331 } | |
332 | |
333 fclose(fFrags); | |
334 fclose(fX); | |
335 fclose(fY); | |
336 fclose(fO); | |
337 | |
338 free(RI_X); | |
339 free(RI_Y); | |
340 free(RI_Yrev); | |
341 | |
342 free(strfastaX); | |
343 free(strfastaY); | |
344 free(strfastaYrev); | |
345 | |
346 return 0; | |
347 } |