Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/AddGeneName.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/AddGeneName.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,476 @@ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +#include <string> +#include <map> +#include <vector> +#include <algorithm> + +char usage[] = "Usage: ./add-genename annotation.gtf gtflist [OPTIONS]\n" + "\t-o: the directory for output gtf files (default: ./)\n" ; + +struct _interval +{ + char chrom[97] ; + char geneName[97] ; + + int start, end ; +} ; + +char line[10000] ; +char buffer[10000], buffer2[10000], buffer3[10000] ; + +int CompInterval( const struct _interval &a, const struct _interval &b ) +{ + int chromCmp = strcmp( a.chrom, b.chrom ) ; + if ( chromCmp ) + return chromCmp ; + else if ( a.start != b.start ) + return a.start - b.start ; + else + return a.end - b.end ; +} + +bool Comp( const struct _interval &a, const struct _interval &b ) +{ + int cmp = CompInterval( a, b ) ; + if ( cmp < 0 ) + return true ; + else + return false ; + + return false ; +} + +void SortAndCleanIntervals( std::vector<struct _interval> &it ) +{ + std::sort( it.begin(), it.end(), Comp ) ; + + int i, k ; + int size = it.size() ; + if ( size == 0 ) + return ; + + k = 1 ; + for ( i = 1 ; i < size ; ++i ) + { + if ( CompInterval( it[k - 1], it[i] ) ) + { + it[k] = it[i] ; + ++k ; + } + } + it.resize( k ) ; +} + +int GetGTFField( char *ret, char *line, const char *fid ) +{ + char *p = strstr( line, fid ) ; + if ( p == NULL ) + return 0 ; + //printf( "%s %s\n", line, fid ) ; + for ( ; *p != ' ' ; ++p ) + ; + p += 2 ; + + sscanf( p, "%s", ret ) ; + p = ret + strlen( ret ) ; + while ( p != ret && *p != '\"' ) + --p ; + *p = '\0' ; + return 1 ; +} + +void UpdateIdToAnnoId( std::vector<struct _interval> &transcript, char *tid, int exonTag, int intronTag, std::vector<struct _interval> &exons, std::vector<struct _interval> &introns, + std::map<std::string, std::string> &txptIdToAnnoId, std::map<std::string, std::string> &geneIdToAnnoId ) +{ + int i, k ; + bool flag = false ; + // First, try whether intron works. + int ecnt = transcript.size() ; + int intronSize = introns.size() ; + int exonSize = exons.size() ; + + struct _interval itron ; + strcpy( itron.chrom, transcript[0].chrom ) ; + k = intronTag ; + for ( i = 0 ; i < ecnt - 1 && !flag ; ++i ) + { + itron.start = transcript[i].end ; + itron.end = transcript[i + 1].start ; + for ( ; k < intronSize ; ++k ) + { + //printf( "hi %d: %s %d %d; %d %s %d %d\n", 0, itron.chrom, itron.start, itron.end, k, introns[k].chrom, introns[k].start, introns[k].end ) ; + if ( strcmp( introns[k].chrom, itron.chrom ) ) + break ; + + int cmp = CompInterval( introns[k], itron ) ; + if ( cmp == 0 ) + { + txptIdToAnnoId[ std::string( tid ) ] = introns[k].geneName ; + geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = introns[k].geneName ; + flag = true ; + break ; + } + else if ( cmp > 0 ) + break ; + } + } + + // Next, try whether exon works + k = exonTag ; + for ( i = 0 ; i < ecnt && !flag ; ++i ) + { + for ( ; k < exonSize ; ++k ) + { + if ( strcmp( exons[k].chrom, transcript[i].chrom ) ) + break ; + + if ( exons[k].end < transcript[i].start ) + continue ; + else if ( exons[k].start > transcript[i].end ) + break ; + else + { + txptIdToAnnoId[ std::string( tid ) ] = exons[k].geneName ; + geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = exons[k].geneName ; + flag = true ; + break ; + } + } + } +} + +int main( int argc, char *argv[] ) +{ + int i, j, k ; + FILE *fp ; + FILE *fpList ; + char outputPath[1024] = "./" ; + char prevTid[97] = "" ; + + std::vector<struct _interval> introns, exons ; + std::map<std::string, std::string> txptIdToAnnoId, geneIdToAnnoId ; + std::map<std::string, int> chromIdMapIntrons, chromIdMapExons ; // the index for the starting of a chrom. + + if ( argc < 3 ) + { + fprintf( stderr, "%s", usage ) ; + exit( 1 ) ; + } + + for ( i = 3 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "-o" ) ) + { + strcpy( outputPath, argv[i + 1 ] ) ; + ++i ; + } + else + { + fprintf( stderr, "Unknown argument: %s\n", argv[i] ) ; + exit( 1 ) ; + } + } + + // Get the exons, introns from the annotation. + fp = fopen( argv[1], "r" ) ; + while ( fgets( line, sizeof( line ), fp ) != NULL ) + { + if ( line[0] == '#' ) + continue ; + char tid[97] ; + char gname[97] ; + char chrom[97] ; + char type[50] ; + int start, end ; + + sscanf( line, "%s %s %s %d %d", chrom, buffer, type, &start, &end ) ; + if ( strcmp( type, "exon" ) ) + continue ; + if ( GetGTFField( gname, line, "gene_name" ) ) + { + struct _interval ne ; + strcpy( ne.chrom, chrom ) ; + strcpy( ne.geneName, gname ) ; + ne.start = start ; ne.end = end ; + + exons.push_back( ne ) ; + } + else + { + fprintf( stderr, "%s has no field of gene_name.\n", line ) ; + exit( 1 ) ; + } + + if ( GetGTFField( tid, line, "transcript_id" ) ) + { + if ( !strcmp( tid, prevTid ) ) + { + struct _interval ni ; + + strcpy( ni.chrom, chrom ) ; + strcpy( ni.geneName, gname ) ; + + int size = exons.size() ; + if ( size < 2 ) + continue ; + + struct _interval &e1 = exons[ size - 2 ] ; + struct _interval &e2 = exons[ size - 1 ] ; + if ( e1.start < e2.start ) + { + ni.start = e1.end ; + ni.end = e2.start ; + } + else + { + ni.start = e2.end ; + ni.end = e1.start ; + } + + introns.push_back( ni ) ; + } + else + strcpy( prevTid, tid ) ; + } + else + { + fprintf( stderr, "%s has no field of transcript_id.\n", line ) ; + exit( 1 ) ; + } + } + fclose( fp ) ; + SortAndCleanIntervals( exons ) ; + SortAndCleanIntervals( introns ) ; + int exonSize = exons.size() ; + int intronSize = introns.size() ; + // Determine the offset for each chrom on the list of features. + if ( exonSize ) + { + chromIdMapExons[ std::string( exons[0].chrom ) ] = 0 ; + for ( i = 1 ; i < exonSize ; ++i ) + { + if ( strcmp( exons[i].chrom, exons[i - 1].chrom ) ) + chromIdMapExons[ std::string( exons[i].chrom ) ] = i ; + } + } + + if ( intronSize ) + { + chromIdMapIntrons[ std::string( introns[0].chrom ) ] = 0 ; + for ( i = 1 ; i < intronSize ; ++i ) + { + if ( strcmp( introns[i].chrom, introns[i - 1].chrom ) ) + { + //printf( "%s %d\n", introns[i].chrom, i ) ; + chromIdMapIntrons[ std::string( introns[i].chrom ) ] = i ; + } + } + + //for ( i = 0 ; i < intronSize ; ++i ) + // printf( "%s\t%d\t%d\n", introns[i].chrom, introns[i].start, introns[i].end ) ; + } + //printf( "%d %d\n", exonSize, intronSize ) ; + + // Go through all the GTF files to find the map between PsiCLASS's gene_id to annotation's gene name + fpList = fopen( argv[2], "r ") ; + std::vector<struct _interval> transcript ; + int exonTag = 0 ; + int intronTag = 0 ; + + while ( fscanf( fpList, "%s", line ) != EOF ) + { + // Set the output file + //printf( "hi: %s\n", line ) ; + char *p ; + p = line + strlen( line ) ; + while ( p != line && *p != '/' ) + { + if ( *p == '\n' ) + *p = '\0' ; + --p ; + } + sprintf( buffer, "%s/%s", outputPath, p ) ; + + // Test whether this will overwrite the input gtf file. + if ( realpath( buffer, buffer2 ) != NULL ) + { + if ( realpath( line, buffer3 ) && !strcmp( buffer2, buffer3 ) ) + { + fprintf( stderr, "Output will overwrite the input files. Please use -o to specify a different output directory.\n" ) ; + exit( 1 ) ; + } + } + + fp = fopen( line, "r" ) ; + transcript.resize( 0 ) ; // hold the exons in the transcript. + prevTid[0] = '\0' ; + exonTag = 0 ; + intronTag = 0 ; + int farthest = 0 ; + + while ( fgets( line, sizeof( line ), fp ) ) + { + char tid[97] ; + //char gname[97] ; + char chrom[97] ; + char type[50] ; + int start, end ; + + if ( line[0] == '#' ) + continue ; + + sscanf( line, "%s %s %s %d %d", chrom, buffer, type, &start, &end ) ; + if ( strcmp( type, "exon" ) ) + continue ; + + if ( GetGTFField( tid, line, "transcript_id" ) ) + { + struct _interval ne ; + strcpy( ne.chrom, chrom ) ; + ne.start = start ; + ne.end = end ; + GetGTFField( ne.geneName, line, "gene_id" ) ; + + if ( !strcmp( tid, prevTid ) ) + { + transcript.push_back( ne ) ; + if ( end > farthest ) + farthest = end ; + } + else if ( transcript.size() > 0 ) + { + // the non-existed chrom will be put to offset 0, and that's fine + if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) + intronTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; + if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) + exonTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; + + UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ; + + // Adjust the offset if we are done with a gene cluster + // We don't need to worry about the case of changing chrom here. + if ( !strcmp( ne.geneName, transcript[0].geneName ) && + start > farthest ) // Use farthest to avoid inteleaved gene. + { + while ( intronTag < intronSize && !strcmp( introns[intronTag ].chrom, ne.chrom ) + && introns[intronTag].end < ne.start ) + ++intronTag ; + + while ( exonTag < exonSize && !strcmp( exons[ exonTag ].chrom, ne.chrom ) + && exons[ exonTag ].end < ne.start ) + ++exonTag ; + + farthest = end ; + } + + transcript.resize( 0 ) ; + transcript.push_back( ne ) ; + + // Find the overlaps. + strcpy( prevTid, tid ) ; + } + else + strcpy( prevTid, tid ) ; + } + else + { + fprintf( stderr, "Could not find transcript_id field in GTF file: %s", line ) ; + exit( 1 ) ; + } + } + + if ( transcript.size() > 0 ) + { + if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) + intronTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; + if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) + exonTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; + + UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ; + } + fclose( fp ) ; + } + fclose( fpList ) ; + + // Add the gene_name field. + fpList = fopen( argv[2], "r" ) ; + int novelCnt = 0 ; + while ( fscanf( fpList, "%s", line ) != EOF ) + { + FILE *fpOut ; + // Set the output file + char *p ; + p = line + strlen( line ) ; + while ( p != line && *p != '/' ) + { + if ( *p == '\n' ) + *p = '\0' ; + --p ; + } + sprintf( buffer, "%s/%s", outputPath, p ) ; + fpOut = fopen( buffer, "w" ) ; + + // Process the input file + fp = fopen( line, "r" ) ; + + transcript.resize( 0 ) ; + prevTid[0] = '\0' ; + while ( fgets( line, sizeof( line ), fp ) ) + { + char gname[97] ; + if ( line[0] == '#' ) + { + fprintf( fpOut, "%s", line ) ; + continue ; + } + + if ( GetGTFField( buffer, line, "transcript_id" ) ) + { + if ( txptIdToAnnoId.find( std::string( buffer ) ) != txptIdToAnnoId.end() ) + { + int len = strlen( line ) ; + if ( line[len - 1] == '\n' ) + { + line[len - 1] = '\0' ; + --len ; + } + fprintf( fpOut, "%s gene_name \"%s\";\n", line, txptIdToAnnoId[std::string( buffer )].c_str() ) ; + continue ; + } + } + + if ( GetGTFField( gname, line, "gene_id" ) ) + { + int len = strlen( line ) ; + if ( line[len - 1] == '\n' ) + { + line[len - 1] = '\0' ; + --len ; + } + if ( geneIdToAnnoId.find( std::string( gname ) ) != geneIdToAnnoId.end() ) + { + fprintf( fpOut, "%s gene_name \"%s\";\n", line, geneIdToAnnoId[std::string(gname)].c_str() ) ; + } + else + { + sprintf( buffer, "novel_%d", novelCnt ) ; + geneIdToAnnoId[ std::string( gname ) ] = std::string( buffer ) ; + fprintf( fpOut, "%s gene_name \"%s\";\n", line, buffer ) ; + ++novelCnt ; + } + + } + else + fprintf( fpOut, "%s", line ) ; + + } + + fclose( fp ) ; + fclose( fpOut ) ; + } + + return 0 ; +}