Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/AddGeneName.cpp @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 |
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:903fc43d6227 |
|---|---|
| 1 #include <stdio.h> | |
| 2 #include <string.h> | |
| 3 #include <stdlib.h> | |
| 4 | |
| 5 #include <string> | |
| 6 #include <map> | |
| 7 #include <vector> | |
| 8 #include <algorithm> | |
| 9 | |
| 10 char usage[] = "Usage: ./add-genename annotation.gtf gtflist [OPTIONS]\n" | |
| 11 "\t-o: the directory for output gtf files (default: ./)\n" ; | |
| 12 | |
| 13 struct _interval | |
| 14 { | |
| 15 char chrom[97] ; | |
| 16 char geneName[97] ; | |
| 17 | |
| 18 int start, end ; | |
| 19 } ; | |
| 20 | |
| 21 char line[10000] ; | |
| 22 char buffer[10000], buffer2[10000], buffer3[10000] ; | |
| 23 | |
| 24 int CompInterval( const struct _interval &a, const struct _interval &b ) | |
| 25 { | |
| 26 int chromCmp = strcmp( a.chrom, b.chrom ) ; | |
| 27 if ( chromCmp ) | |
| 28 return chromCmp ; | |
| 29 else if ( a.start != b.start ) | |
| 30 return a.start - b.start ; | |
| 31 else | |
| 32 return a.end - b.end ; | |
| 33 } | |
| 34 | |
| 35 bool Comp( const struct _interval &a, const struct _interval &b ) | |
| 36 { | |
| 37 int cmp = CompInterval( a, b ) ; | |
| 38 if ( cmp < 0 ) | |
| 39 return true ; | |
| 40 else | |
| 41 return false ; | |
| 42 | |
| 43 return false ; | |
| 44 } | |
| 45 | |
| 46 void SortAndCleanIntervals( std::vector<struct _interval> &it ) | |
| 47 { | |
| 48 std::sort( it.begin(), it.end(), Comp ) ; | |
| 49 | |
| 50 int i, k ; | |
| 51 int size = it.size() ; | |
| 52 if ( size == 0 ) | |
| 53 return ; | |
| 54 | |
| 55 k = 1 ; | |
| 56 for ( i = 1 ; i < size ; ++i ) | |
| 57 { | |
| 58 if ( CompInterval( it[k - 1], it[i] ) ) | |
| 59 { | |
| 60 it[k] = it[i] ; | |
| 61 ++k ; | |
| 62 } | |
| 63 } | |
| 64 it.resize( k ) ; | |
| 65 } | |
| 66 | |
| 67 int GetGTFField( char *ret, char *line, const char *fid ) | |
| 68 { | |
| 69 char *p = strstr( line, fid ) ; | |
| 70 if ( p == NULL ) | |
| 71 return 0 ; | |
| 72 //printf( "%s %s\n", line, fid ) ; | |
| 73 for ( ; *p != ' ' ; ++p ) | |
| 74 ; | |
| 75 p += 2 ; | |
| 76 | |
| 77 sscanf( p, "%s", ret ) ; | |
| 78 p = ret + strlen( ret ) ; | |
| 79 while ( p != ret && *p != '\"' ) | |
| 80 --p ; | |
| 81 *p = '\0' ; | |
| 82 return 1 ; | |
| 83 } | |
| 84 | |
| 85 void UpdateIdToAnnoId( std::vector<struct _interval> &transcript, char *tid, int exonTag, int intronTag, std::vector<struct _interval> &exons, std::vector<struct _interval> &introns, | |
| 86 std::map<std::string, std::string> &txptIdToAnnoId, std::map<std::string, std::string> &geneIdToAnnoId ) | |
| 87 { | |
| 88 int i, k ; | |
| 89 bool flag = false ; | |
| 90 // First, try whether intron works. | |
| 91 int ecnt = transcript.size() ; | |
| 92 int intronSize = introns.size() ; | |
| 93 int exonSize = exons.size() ; | |
| 94 | |
| 95 struct _interval itron ; | |
| 96 strcpy( itron.chrom, transcript[0].chrom ) ; | |
| 97 k = intronTag ; | |
| 98 for ( i = 0 ; i < ecnt - 1 && !flag ; ++i ) | |
| 99 { | |
| 100 itron.start = transcript[i].end ; | |
| 101 itron.end = transcript[i + 1].start ; | |
| 102 for ( ; k < intronSize ; ++k ) | |
| 103 { | |
| 104 //printf( "hi %d: %s %d %d; %d %s %d %d\n", 0, itron.chrom, itron.start, itron.end, k, introns[k].chrom, introns[k].start, introns[k].end ) ; | |
| 105 if ( strcmp( introns[k].chrom, itron.chrom ) ) | |
| 106 break ; | |
| 107 | |
| 108 int cmp = CompInterval( introns[k], itron ) ; | |
| 109 if ( cmp == 0 ) | |
| 110 { | |
| 111 txptIdToAnnoId[ std::string( tid ) ] = introns[k].geneName ; | |
| 112 geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = introns[k].geneName ; | |
| 113 flag = true ; | |
| 114 break ; | |
| 115 } | |
| 116 else if ( cmp > 0 ) | |
| 117 break ; | |
| 118 } | |
| 119 } | |
| 120 | |
| 121 // Next, try whether exon works | |
| 122 k = exonTag ; | |
| 123 for ( i = 0 ; i < ecnt && !flag ; ++i ) | |
| 124 { | |
| 125 for ( ; k < exonSize ; ++k ) | |
| 126 { | |
| 127 if ( strcmp( exons[k].chrom, transcript[i].chrom ) ) | |
| 128 break ; | |
| 129 | |
| 130 if ( exons[k].end < transcript[i].start ) | |
| 131 continue ; | |
| 132 else if ( exons[k].start > transcript[i].end ) | |
| 133 break ; | |
| 134 else | |
| 135 { | |
| 136 txptIdToAnnoId[ std::string( tid ) ] = exons[k].geneName ; | |
| 137 geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = exons[k].geneName ; | |
| 138 flag = true ; | |
| 139 break ; | |
| 140 } | |
| 141 } | |
| 142 } | |
| 143 } | |
| 144 | |
| 145 int main( int argc, char *argv[] ) | |
| 146 { | |
| 147 int i, j, k ; | |
| 148 FILE *fp ; | |
| 149 FILE *fpList ; | |
| 150 char outputPath[1024] = "./" ; | |
| 151 char prevTid[97] = "" ; | |
| 152 | |
| 153 std::vector<struct _interval> introns, exons ; | |
| 154 std::map<std::string, std::string> txptIdToAnnoId, geneIdToAnnoId ; | |
| 155 std::map<std::string, int> chromIdMapIntrons, chromIdMapExons ; // the index for the starting of a chrom. | |
| 156 | |
| 157 if ( argc < 3 ) | |
| 158 { | |
| 159 fprintf( stderr, "%s", usage ) ; | |
| 160 exit( 1 ) ; | |
| 161 } | |
| 162 | |
| 163 for ( i = 3 ; i < argc ; ++i ) | |
| 164 { | |
| 165 if ( !strcmp( argv[i], "-o" ) ) | |
| 166 { | |
| 167 strcpy( outputPath, argv[i + 1 ] ) ; | |
| 168 ++i ; | |
| 169 } | |
| 170 else | |
| 171 { | |
| 172 fprintf( stderr, "Unknown argument: %s\n", argv[i] ) ; | |
| 173 exit( 1 ) ; | |
| 174 } | |
| 175 } | |
| 176 | |
| 177 // Get the exons, introns from the annotation. | |
| 178 fp = fopen( argv[1], "r" ) ; | |
| 179 while ( fgets( line, sizeof( line ), fp ) != NULL ) | |
| 180 { | |
| 181 if ( line[0] == '#' ) | |
| 182 continue ; | |
| 183 char tid[97] ; | |
| 184 char gname[97] ; | |
| 185 char chrom[97] ; | |
| 186 char type[50] ; | |
| 187 int start, end ; | |
| 188 | |
| 189 sscanf( line, "%s %s %s %d %d", chrom, buffer, type, &start, &end ) ; | |
| 190 if ( strcmp( type, "exon" ) ) | |
| 191 continue ; | |
| 192 if ( GetGTFField( gname, line, "gene_name" ) ) | |
| 193 { | |
| 194 struct _interval ne ; | |
| 195 strcpy( ne.chrom, chrom ) ; | |
| 196 strcpy( ne.geneName, gname ) ; | |
| 197 ne.start = start ; ne.end = end ; | |
| 198 | |
| 199 exons.push_back( ne ) ; | |
| 200 } | |
| 201 else | |
| 202 { | |
| 203 fprintf( stderr, "%s has no field of gene_name.\n", line ) ; | |
| 204 exit( 1 ) ; | |
| 205 } | |
| 206 | |
| 207 if ( GetGTFField( tid, line, "transcript_id" ) ) | |
| 208 { | |
| 209 if ( !strcmp( tid, prevTid ) ) | |
| 210 { | |
| 211 struct _interval ni ; | |
| 212 | |
| 213 strcpy( ni.chrom, chrom ) ; | |
| 214 strcpy( ni.geneName, gname ) ; | |
| 215 | |
| 216 int size = exons.size() ; | |
| 217 if ( size < 2 ) | |
| 218 continue ; | |
| 219 | |
| 220 struct _interval &e1 = exons[ size - 2 ] ; | |
| 221 struct _interval &e2 = exons[ size - 1 ] ; | |
| 222 if ( e1.start < e2.start ) | |
| 223 { | |
| 224 ni.start = e1.end ; | |
| 225 ni.end = e2.start ; | |
| 226 } | |
| 227 else | |
| 228 { | |
| 229 ni.start = e2.end ; | |
| 230 ni.end = e1.start ; | |
| 231 } | |
| 232 | |
| 233 introns.push_back( ni ) ; | |
| 234 } | |
| 235 else | |
| 236 strcpy( prevTid, tid ) ; | |
| 237 } | |
| 238 else | |
| 239 { | |
| 240 fprintf( stderr, "%s has no field of transcript_id.\n", line ) ; | |
| 241 exit( 1 ) ; | |
| 242 } | |
| 243 } | |
| 244 fclose( fp ) ; | |
| 245 SortAndCleanIntervals( exons ) ; | |
| 246 SortAndCleanIntervals( introns ) ; | |
| 247 int exonSize = exons.size() ; | |
| 248 int intronSize = introns.size() ; | |
| 249 // Determine the offset for each chrom on the list of features. | |
| 250 if ( exonSize ) | |
| 251 { | |
| 252 chromIdMapExons[ std::string( exons[0].chrom ) ] = 0 ; | |
| 253 for ( i = 1 ; i < exonSize ; ++i ) | |
| 254 { | |
| 255 if ( strcmp( exons[i].chrom, exons[i - 1].chrom ) ) | |
| 256 chromIdMapExons[ std::string( exons[i].chrom ) ] = i ; | |
| 257 } | |
| 258 } | |
| 259 | |
| 260 if ( intronSize ) | |
| 261 { | |
| 262 chromIdMapIntrons[ std::string( introns[0].chrom ) ] = 0 ; | |
| 263 for ( i = 1 ; i < intronSize ; ++i ) | |
| 264 { | |
| 265 if ( strcmp( introns[i].chrom, introns[i - 1].chrom ) ) | |
| 266 { | |
| 267 //printf( "%s %d\n", introns[i].chrom, i ) ; | |
| 268 chromIdMapIntrons[ std::string( introns[i].chrom ) ] = i ; | |
| 269 } | |
| 270 } | |
| 271 | |
| 272 //for ( i = 0 ; i < intronSize ; ++i ) | |
| 273 // printf( "%s\t%d\t%d\n", introns[i].chrom, introns[i].start, introns[i].end ) ; | |
| 274 } | |
| 275 //printf( "%d %d\n", exonSize, intronSize ) ; | |
| 276 | |
| 277 // Go through all the GTF files to find the map between PsiCLASS's gene_id to annotation's gene name | |
| 278 fpList = fopen( argv[2], "r ") ; | |
| 279 std::vector<struct _interval> transcript ; | |
| 280 int exonTag = 0 ; | |
| 281 int intronTag = 0 ; | |
| 282 | |
| 283 while ( fscanf( fpList, "%s", line ) != EOF ) | |
| 284 { | |
| 285 // Set the output file | |
| 286 //printf( "hi: %s\n", line ) ; | |
| 287 char *p ; | |
| 288 p = line + strlen( line ) ; | |
| 289 while ( p != line && *p != '/' ) | |
| 290 { | |
| 291 if ( *p == '\n' ) | |
| 292 *p = '\0' ; | |
| 293 --p ; | |
| 294 } | |
| 295 sprintf( buffer, "%s/%s", outputPath, p ) ; | |
| 296 | |
| 297 // Test whether this will overwrite the input gtf file. | |
| 298 if ( realpath( buffer, buffer2 ) != NULL ) | |
| 299 { | |
| 300 if ( realpath( line, buffer3 ) && !strcmp( buffer2, buffer3 ) ) | |
| 301 { | |
| 302 fprintf( stderr, "Output will overwrite the input files. Please use -o to specify a different output directory.\n" ) ; | |
| 303 exit( 1 ) ; | |
| 304 } | |
| 305 } | |
| 306 | |
| 307 fp = fopen( line, "r" ) ; | |
| 308 transcript.resize( 0 ) ; // hold the exons in the transcript. | |
| 309 prevTid[0] = '\0' ; | |
| 310 exonTag = 0 ; | |
| 311 intronTag = 0 ; | |
| 312 int farthest = 0 ; | |
| 313 | |
| 314 while ( fgets( line, sizeof( line ), fp ) ) | |
| 315 { | |
| 316 char tid[97] ; | |
| 317 //char gname[97] ; | |
| 318 char chrom[97] ; | |
| 319 char type[50] ; | |
| 320 int start, end ; | |
| 321 | |
| 322 if ( line[0] == '#' ) | |
| 323 continue ; | |
| 324 | |
| 325 sscanf( line, "%s %s %s %d %d", chrom, buffer, type, &start, &end ) ; | |
| 326 if ( strcmp( type, "exon" ) ) | |
| 327 continue ; | |
| 328 | |
| 329 if ( GetGTFField( tid, line, "transcript_id" ) ) | |
| 330 { | |
| 331 struct _interval ne ; | |
| 332 strcpy( ne.chrom, chrom ) ; | |
| 333 ne.start = start ; | |
| 334 ne.end = end ; | |
| 335 GetGTFField( ne.geneName, line, "gene_id" ) ; | |
| 336 | |
| 337 if ( !strcmp( tid, prevTid ) ) | |
| 338 { | |
| 339 transcript.push_back( ne ) ; | |
| 340 if ( end > farthest ) | |
| 341 farthest = end ; | |
| 342 } | |
| 343 else if ( transcript.size() > 0 ) | |
| 344 { | |
| 345 // the non-existed chrom will be put to offset 0, and that's fine | |
| 346 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) | |
| 347 intronTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; | |
| 348 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) | |
| 349 exonTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; | |
| 350 | |
| 351 UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ; | |
| 352 | |
| 353 // Adjust the offset if we are done with a gene cluster | |
| 354 // We don't need to worry about the case of changing chrom here. | |
| 355 if ( !strcmp( ne.geneName, transcript[0].geneName ) && | |
| 356 start > farthest ) // Use farthest to avoid inteleaved gene. | |
| 357 { | |
| 358 while ( intronTag < intronSize && !strcmp( introns[intronTag ].chrom, ne.chrom ) | |
| 359 && introns[intronTag].end < ne.start ) | |
| 360 ++intronTag ; | |
| 361 | |
| 362 while ( exonTag < exonSize && !strcmp( exons[ exonTag ].chrom, ne.chrom ) | |
| 363 && exons[ exonTag ].end < ne.start ) | |
| 364 ++exonTag ; | |
| 365 | |
| 366 farthest = end ; | |
| 367 } | |
| 368 | |
| 369 transcript.resize( 0 ) ; | |
| 370 transcript.push_back( ne ) ; | |
| 371 | |
| 372 // Find the overlaps. | |
| 373 strcpy( prevTid, tid ) ; | |
| 374 } | |
| 375 else | |
| 376 strcpy( prevTid, tid ) ; | |
| 377 } | |
| 378 else | |
| 379 { | |
| 380 fprintf( stderr, "Could not find transcript_id field in GTF file: %s", line ) ; | |
| 381 exit( 1 ) ; | |
| 382 } | |
| 383 } | |
| 384 | |
| 385 if ( transcript.size() > 0 ) | |
| 386 { | |
| 387 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) | |
| 388 intronTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; | |
| 389 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) | |
| 390 exonTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; | |
| 391 | |
| 392 UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ; | |
| 393 } | |
| 394 fclose( fp ) ; | |
| 395 } | |
| 396 fclose( fpList ) ; | |
| 397 | |
| 398 // Add the gene_name field. | |
| 399 fpList = fopen( argv[2], "r" ) ; | |
| 400 int novelCnt = 0 ; | |
| 401 while ( fscanf( fpList, "%s", line ) != EOF ) | |
| 402 { | |
| 403 FILE *fpOut ; | |
| 404 // Set the output file | |
| 405 char *p ; | |
| 406 p = line + strlen( line ) ; | |
| 407 while ( p != line && *p != '/' ) | |
| 408 { | |
| 409 if ( *p == '\n' ) | |
| 410 *p = '\0' ; | |
| 411 --p ; | |
| 412 } | |
| 413 sprintf( buffer, "%s/%s", outputPath, p ) ; | |
| 414 fpOut = fopen( buffer, "w" ) ; | |
| 415 | |
| 416 // Process the input file | |
| 417 fp = fopen( line, "r" ) ; | |
| 418 | |
| 419 transcript.resize( 0 ) ; | |
| 420 prevTid[0] = '\0' ; | |
| 421 while ( fgets( line, sizeof( line ), fp ) ) | |
| 422 { | |
| 423 char gname[97] ; | |
| 424 if ( line[0] == '#' ) | |
| 425 { | |
| 426 fprintf( fpOut, "%s", line ) ; | |
| 427 continue ; | |
| 428 } | |
| 429 | |
| 430 if ( GetGTFField( buffer, line, "transcript_id" ) ) | |
| 431 { | |
| 432 if ( txptIdToAnnoId.find( std::string( buffer ) ) != txptIdToAnnoId.end() ) | |
| 433 { | |
| 434 int len = strlen( line ) ; | |
| 435 if ( line[len - 1] == '\n' ) | |
| 436 { | |
| 437 line[len - 1] = '\0' ; | |
| 438 --len ; | |
| 439 } | |
| 440 fprintf( fpOut, "%s gene_name \"%s\";\n", line, txptIdToAnnoId[std::string( buffer )].c_str() ) ; | |
| 441 continue ; | |
| 442 } | |
| 443 } | |
| 444 | |
| 445 if ( GetGTFField( gname, line, "gene_id" ) ) | |
| 446 { | |
| 447 int len = strlen( line ) ; | |
| 448 if ( line[len - 1] == '\n' ) | |
| 449 { | |
| 450 line[len - 1] = '\0' ; | |
| 451 --len ; | |
| 452 } | |
| 453 if ( geneIdToAnnoId.find( std::string( gname ) ) != geneIdToAnnoId.end() ) | |
| 454 { | |
| 455 fprintf( fpOut, "%s gene_name \"%s\";\n", line, geneIdToAnnoId[std::string(gname)].c_str() ) ; | |
| 456 } | |
| 457 else | |
| 458 { | |
| 459 sprintf( buffer, "novel_%d", novelCnt ) ; | |
| 460 geneIdToAnnoId[ std::string( gname ) ] = std::string( buffer ) ; | |
| 461 fprintf( fpOut, "%s gene_name \"%s\";\n", line, buffer ) ; | |
| 462 ++novelCnt ; | |
| 463 } | |
| 464 | |
| 465 } | |
| 466 else | |
| 467 fprintf( fpOut, "%s", line ) ; | |
| 468 | |
| 469 } | |
| 470 | |
| 471 fclose( fp ) ; | |
| 472 fclose( fpOut ) ; | |
| 473 } | |
| 474 | |
| 475 return 0 ; | |
| 476 } |
