Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/AddGeneName.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include <stdio.h> | |
2 #include <string.h> | |
3 #include <stdlib.h> | |
4 | |
5 #include <string> | |
6 #include <map> | |
7 #include <vector> | |
8 #include <algorithm> | |
9 | |
10 char usage[] = "Usage: ./add-genename annotation.gtf gtflist [OPTIONS]\n" | |
11 "\t-o: the directory for output gtf files (default: ./)\n" ; | |
12 | |
13 struct _interval | |
14 { | |
15 char chrom[97] ; | |
16 char geneName[97] ; | |
17 | |
18 int start, end ; | |
19 } ; | |
20 | |
21 char line[10000] ; | |
22 char buffer[10000], buffer2[10000], buffer3[10000] ; | |
23 | |
24 int CompInterval( const struct _interval &a, const struct _interval &b ) | |
25 { | |
26 int chromCmp = strcmp( a.chrom, b.chrom ) ; | |
27 if ( chromCmp ) | |
28 return chromCmp ; | |
29 else if ( a.start != b.start ) | |
30 return a.start - b.start ; | |
31 else | |
32 return a.end - b.end ; | |
33 } | |
34 | |
35 bool Comp( const struct _interval &a, const struct _interval &b ) | |
36 { | |
37 int cmp = CompInterval( a, b ) ; | |
38 if ( cmp < 0 ) | |
39 return true ; | |
40 else | |
41 return false ; | |
42 | |
43 return false ; | |
44 } | |
45 | |
46 void SortAndCleanIntervals( std::vector<struct _interval> &it ) | |
47 { | |
48 std::sort( it.begin(), it.end(), Comp ) ; | |
49 | |
50 int i, k ; | |
51 int size = it.size() ; | |
52 if ( size == 0 ) | |
53 return ; | |
54 | |
55 k = 1 ; | |
56 for ( i = 1 ; i < size ; ++i ) | |
57 { | |
58 if ( CompInterval( it[k - 1], it[i] ) ) | |
59 { | |
60 it[k] = it[i] ; | |
61 ++k ; | |
62 } | |
63 } | |
64 it.resize( k ) ; | |
65 } | |
66 | |
67 int GetGTFField( char *ret, char *line, const char *fid ) | |
68 { | |
69 char *p = strstr( line, fid ) ; | |
70 if ( p == NULL ) | |
71 return 0 ; | |
72 //printf( "%s %s\n", line, fid ) ; | |
73 for ( ; *p != ' ' ; ++p ) | |
74 ; | |
75 p += 2 ; | |
76 | |
77 sscanf( p, "%s", ret ) ; | |
78 p = ret + strlen( ret ) ; | |
79 while ( p != ret && *p != '\"' ) | |
80 --p ; | |
81 *p = '\0' ; | |
82 return 1 ; | |
83 } | |
84 | |
85 void UpdateIdToAnnoId( std::vector<struct _interval> &transcript, char *tid, int exonTag, int intronTag, std::vector<struct _interval> &exons, std::vector<struct _interval> &introns, | |
86 std::map<std::string, std::string> &txptIdToAnnoId, std::map<std::string, std::string> &geneIdToAnnoId ) | |
87 { | |
88 int i, k ; | |
89 bool flag = false ; | |
90 // First, try whether intron works. | |
91 int ecnt = transcript.size() ; | |
92 int intronSize = introns.size() ; | |
93 int exonSize = exons.size() ; | |
94 | |
95 struct _interval itron ; | |
96 strcpy( itron.chrom, transcript[0].chrom ) ; | |
97 k = intronTag ; | |
98 for ( i = 0 ; i < ecnt - 1 && !flag ; ++i ) | |
99 { | |
100 itron.start = transcript[i].end ; | |
101 itron.end = transcript[i + 1].start ; | |
102 for ( ; k < intronSize ; ++k ) | |
103 { | |
104 //printf( "hi %d: %s %d %d; %d %s %d %d\n", 0, itron.chrom, itron.start, itron.end, k, introns[k].chrom, introns[k].start, introns[k].end ) ; | |
105 if ( strcmp( introns[k].chrom, itron.chrom ) ) | |
106 break ; | |
107 | |
108 int cmp = CompInterval( introns[k], itron ) ; | |
109 if ( cmp == 0 ) | |
110 { | |
111 txptIdToAnnoId[ std::string( tid ) ] = introns[k].geneName ; | |
112 geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = introns[k].geneName ; | |
113 flag = true ; | |
114 break ; | |
115 } | |
116 else if ( cmp > 0 ) | |
117 break ; | |
118 } | |
119 } | |
120 | |
121 // Next, try whether exon works | |
122 k = exonTag ; | |
123 for ( i = 0 ; i < ecnt && !flag ; ++i ) | |
124 { | |
125 for ( ; k < exonSize ; ++k ) | |
126 { | |
127 if ( strcmp( exons[k].chrom, transcript[i].chrom ) ) | |
128 break ; | |
129 | |
130 if ( exons[k].end < transcript[i].start ) | |
131 continue ; | |
132 else if ( exons[k].start > transcript[i].end ) | |
133 break ; | |
134 else | |
135 { | |
136 txptIdToAnnoId[ std::string( tid ) ] = exons[k].geneName ; | |
137 geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = exons[k].geneName ; | |
138 flag = true ; | |
139 break ; | |
140 } | |
141 } | |
142 } | |
143 } | |
144 | |
145 int main( int argc, char *argv[] ) | |
146 { | |
147 int i, j, k ; | |
148 FILE *fp ; | |
149 FILE *fpList ; | |
150 char outputPath[1024] = "./" ; | |
151 char prevTid[97] = "" ; | |
152 | |
153 std::vector<struct _interval> introns, exons ; | |
154 std::map<std::string, std::string> txptIdToAnnoId, geneIdToAnnoId ; | |
155 std::map<std::string, int> chromIdMapIntrons, chromIdMapExons ; // the index for the starting of a chrom. | |
156 | |
157 if ( argc < 3 ) | |
158 { | |
159 fprintf( stderr, "%s", usage ) ; | |
160 exit( 1 ) ; | |
161 } | |
162 | |
163 for ( i = 3 ; i < argc ; ++i ) | |
164 { | |
165 if ( !strcmp( argv[i], "-o" ) ) | |
166 { | |
167 strcpy( outputPath, argv[i + 1 ] ) ; | |
168 ++i ; | |
169 } | |
170 else | |
171 { | |
172 fprintf( stderr, "Unknown argument: %s\n", argv[i] ) ; | |
173 exit( 1 ) ; | |
174 } | |
175 } | |
176 | |
177 // Get the exons, introns from the annotation. | |
178 fp = fopen( argv[1], "r" ) ; | |
179 while ( fgets( line, sizeof( line ), fp ) != NULL ) | |
180 { | |
181 if ( line[0] == '#' ) | |
182 continue ; | |
183 char tid[97] ; | |
184 char gname[97] ; | |
185 char chrom[97] ; | |
186 char type[50] ; | |
187 int start, end ; | |
188 | |
189 sscanf( line, "%s %s %s %d %d", chrom, buffer, type, &start, &end ) ; | |
190 if ( strcmp( type, "exon" ) ) | |
191 continue ; | |
192 if ( GetGTFField( gname, line, "gene_name" ) ) | |
193 { | |
194 struct _interval ne ; | |
195 strcpy( ne.chrom, chrom ) ; | |
196 strcpy( ne.geneName, gname ) ; | |
197 ne.start = start ; ne.end = end ; | |
198 | |
199 exons.push_back( ne ) ; | |
200 } | |
201 else | |
202 { | |
203 fprintf( stderr, "%s has no field of gene_name.\n", line ) ; | |
204 exit( 1 ) ; | |
205 } | |
206 | |
207 if ( GetGTFField( tid, line, "transcript_id" ) ) | |
208 { | |
209 if ( !strcmp( tid, prevTid ) ) | |
210 { | |
211 struct _interval ni ; | |
212 | |
213 strcpy( ni.chrom, chrom ) ; | |
214 strcpy( ni.geneName, gname ) ; | |
215 | |
216 int size = exons.size() ; | |
217 if ( size < 2 ) | |
218 continue ; | |
219 | |
220 struct _interval &e1 = exons[ size - 2 ] ; | |
221 struct _interval &e2 = exons[ size - 1 ] ; | |
222 if ( e1.start < e2.start ) | |
223 { | |
224 ni.start = e1.end ; | |
225 ni.end = e2.start ; | |
226 } | |
227 else | |
228 { | |
229 ni.start = e2.end ; | |
230 ni.end = e1.start ; | |
231 } | |
232 | |
233 introns.push_back( ni ) ; | |
234 } | |
235 else | |
236 strcpy( prevTid, tid ) ; | |
237 } | |
238 else | |
239 { | |
240 fprintf( stderr, "%s has no field of transcript_id.\n", line ) ; | |
241 exit( 1 ) ; | |
242 } | |
243 } | |
244 fclose( fp ) ; | |
245 SortAndCleanIntervals( exons ) ; | |
246 SortAndCleanIntervals( introns ) ; | |
247 int exonSize = exons.size() ; | |
248 int intronSize = introns.size() ; | |
249 // Determine the offset for each chrom on the list of features. | |
250 if ( exonSize ) | |
251 { | |
252 chromIdMapExons[ std::string( exons[0].chrom ) ] = 0 ; | |
253 for ( i = 1 ; i < exonSize ; ++i ) | |
254 { | |
255 if ( strcmp( exons[i].chrom, exons[i - 1].chrom ) ) | |
256 chromIdMapExons[ std::string( exons[i].chrom ) ] = i ; | |
257 } | |
258 } | |
259 | |
260 if ( intronSize ) | |
261 { | |
262 chromIdMapIntrons[ std::string( introns[0].chrom ) ] = 0 ; | |
263 for ( i = 1 ; i < intronSize ; ++i ) | |
264 { | |
265 if ( strcmp( introns[i].chrom, introns[i - 1].chrom ) ) | |
266 { | |
267 //printf( "%s %d\n", introns[i].chrom, i ) ; | |
268 chromIdMapIntrons[ std::string( introns[i].chrom ) ] = i ; | |
269 } | |
270 } | |
271 | |
272 //for ( i = 0 ; i < intronSize ; ++i ) | |
273 // printf( "%s\t%d\t%d\n", introns[i].chrom, introns[i].start, introns[i].end ) ; | |
274 } | |
275 //printf( "%d %d\n", exonSize, intronSize ) ; | |
276 | |
277 // Go through all the GTF files to find the map between PsiCLASS's gene_id to annotation's gene name | |
278 fpList = fopen( argv[2], "r ") ; | |
279 std::vector<struct _interval> transcript ; | |
280 int exonTag = 0 ; | |
281 int intronTag = 0 ; | |
282 | |
283 while ( fscanf( fpList, "%s", line ) != EOF ) | |
284 { | |
285 // Set the output file | |
286 //printf( "hi: %s\n", line ) ; | |
287 char *p ; | |
288 p = line + strlen( line ) ; | |
289 while ( p != line && *p != '/' ) | |
290 { | |
291 if ( *p == '\n' ) | |
292 *p = '\0' ; | |
293 --p ; | |
294 } | |
295 sprintf( buffer, "%s/%s", outputPath, p ) ; | |
296 | |
297 // Test whether this will overwrite the input gtf file. | |
298 if ( realpath( buffer, buffer2 ) != NULL ) | |
299 { | |
300 if ( realpath( line, buffer3 ) && !strcmp( buffer2, buffer3 ) ) | |
301 { | |
302 fprintf( stderr, "Output will overwrite the input files. Please use -o to specify a different output directory.\n" ) ; | |
303 exit( 1 ) ; | |
304 } | |
305 } | |
306 | |
307 fp = fopen( line, "r" ) ; | |
308 transcript.resize( 0 ) ; // hold the exons in the transcript. | |
309 prevTid[0] = '\0' ; | |
310 exonTag = 0 ; | |
311 intronTag = 0 ; | |
312 int farthest = 0 ; | |
313 | |
314 while ( fgets( line, sizeof( line ), fp ) ) | |
315 { | |
316 char tid[97] ; | |
317 //char gname[97] ; | |
318 char chrom[97] ; | |
319 char type[50] ; | |
320 int start, end ; | |
321 | |
322 if ( line[0] == '#' ) | |
323 continue ; | |
324 | |
325 sscanf( line, "%s %s %s %d %d", chrom, buffer, type, &start, &end ) ; | |
326 if ( strcmp( type, "exon" ) ) | |
327 continue ; | |
328 | |
329 if ( GetGTFField( tid, line, "transcript_id" ) ) | |
330 { | |
331 struct _interval ne ; | |
332 strcpy( ne.chrom, chrom ) ; | |
333 ne.start = start ; | |
334 ne.end = end ; | |
335 GetGTFField( ne.geneName, line, "gene_id" ) ; | |
336 | |
337 if ( !strcmp( tid, prevTid ) ) | |
338 { | |
339 transcript.push_back( ne ) ; | |
340 if ( end > farthest ) | |
341 farthest = end ; | |
342 } | |
343 else if ( transcript.size() > 0 ) | |
344 { | |
345 // the non-existed chrom will be put to offset 0, and that's fine | |
346 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) | |
347 intronTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; | |
348 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) | |
349 exonTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; | |
350 | |
351 UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ; | |
352 | |
353 // Adjust the offset if we are done with a gene cluster | |
354 // We don't need to worry about the case of changing chrom here. | |
355 if ( !strcmp( ne.geneName, transcript[0].geneName ) && | |
356 start > farthest ) // Use farthest to avoid inteleaved gene. | |
357 { | |
358 while ( intronTag < intronSize && !strcmp( introns[intronTag ].chrom, ne.chrom ) | |
359 && introns[intronTag].end < ne.start ) | |
360 ++intronTag ; | |
361 | |
362 while ( exonTag < exonSize && !strcmp( exons[ exonTag ].chrom, ne.chrom ) | |
363 && exons[ exonTag ].end < ne.start ) | |
364 ++exonTag ; | |
365 | |
366 farthest = end ; | |
367 } | |
368 | |
369 transcript.resize( 0 ) ; | |
370 transcript.push_back( ne ) ; | |
371 | |
372 // Find the overlaps. | |
373 strcpy( prevTid, tid ) ; | |
374 } | |
375 else | |
376 strcpy( prevTid, tid ) ; | |
377 } | |
378 else | |
379 { | |
380 fprintf( stderr, "Could not find transcript_id field in GTF file: %s", line ) ; | |
381 exit( 1 ) ; | |
382 } | |
383 } | |
384 | |
385 if ( transcript.size() > 0 ) | |
386 { | |
387 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) | |
388 intronTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; | |
389 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) | |
390 exonTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; | |
391 | |
392 UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ; | |
393 } | |
394 fclose( fp ) ; | |
395 } | |
396 fclose( fpList ) ; | |
397 | |
398 // Add the gene_name field. | |
399 fpList = fopen( argv[2], "r" ) ; | |
400 int novelCnt = 0 ; | |
401 while ( fscanf( fpList, "%s", line ) != EOF ) | |
402 { | |
403 FILE *fpOut ; | |
404 // Set the output file | |
405 char *p ; | |
406 p = line + strlen( line ) ; | |
407 while ( p != line && *p != '/' ) | |
408 { | |
409 if ( *p == '\n' ) | |
410 *p = '\0' ; | |
411 --p ; | |
412 } | |
413 sprintf( buffer, "%s/%s", outputPath, p ) ; | |
414 fpOut = fopen( buffer, "w" ) ; | |
415 | |
416 // Process the input file | |
417 fp = fopen( line, "r" ) ; | |
418 | |
419 transcript.resize( 0 ) ; | |
420 prevTid[0] = '\0' ; | |
421 while ( fgets( line, sizeof( line ), fp ) ) | |
422 { | |
423 char gname[97] ; | |
424 if ( line[0] == '#' ) | |
425 { | |
426 fprintf( fpOut, "%s", line ) ; | |
427 continue ; | |
428 } | |
429 | |
430 if ( GetGTFField( buffer, line, "transcript_id" ) ) | |
431 { | |
432 if ( txptIdToAnnoId.find( std::string( buffer ) ) != txptIdToAnnoId.end() ) | |
433 { | |
434 int len = strlen( line ) ; | |
435 if ( line[len - 1] == '\n' ) | |
436 { | |
437 line[len - 1] = '\0' ; | |
438 --len ; | |
439 } | |
440 fprintf( fpOut, "%s gene_name \"%s\";\n", line, txptIdToAnnoId[std::string( buffer )].c_str() ) ; | |
441 continue ; | |
442 } | |
443 } | |
444 | |
445 if ( GetGTFField( gname, line, "gene_id" ) ) | |
446 { | |
447 int len = strlen( line ) ; | |
448 if ( line[len - 1] == '\n' ) | |
449 { | |
450 line[len - 1] = '\0' ; | |
451 --len ; | |
452 } | |
453 if ( geneIdToAnnoId.find( std::string( gname ) ) != geneIdToAnnoId.end() ) | |
454 { | |
455 fprintf( fpOut, "%s gene_name \"%s\";\n", line, geneIdToAnnoId[std::string(gname)].c_str() ) ; | |
456 } | |
457 else | |
458 { | |
459 sprintf( buffer, "novel_%d", novelCnt ) ; | |
460 geneIdToAnnoId[ std::string( gname ) ] = std::string( buffer ) ; | |
461 fprintf( fpOut, "%s gene_name \"%s\";\n", line, buffer ) ; | |
462 ++novelCnt ; | |
463 } | |
464 | |
465 } | |
466 else | |
467 fprintf( fpOut, "%s", line ) ; | |
468 | |
469 } | |
470 | |
471 fclose( fp ) ; | |
472 fclose( fpOut ) ; | |
473 } | |
474 | |
475 return 0 ; | |
476 } |