sharplabtool: tools/ngs_rna/filter_transcripts_via

annotate tools/ngs_rna/filter_transcripts_via_tracking.py @ 0:9071e359b9a3

Uploaded

author	xuebing
date	Fri, 09 Mar 2012 19:37:19 -0500
parents
children

rev	line source
0 9071e359b9a3 Uploaded xuebing parents: diff changeset	1 #!/usr/bin/env python
9071e359b9a3 Uploaded xuebing parents: diff changeset	2 import os, sys, tempfile
9071e359b9a3 Uploaded xuebing parents: diff changeset	3
9071e359b9a3 Uploaded xuebing parents: diff changeset	4 assert sys.version_info[:2] >= ( 2, 4 )
9071e359b9a3 Uploaded xuebing parents: diff changeset	5
9071e359b9a3 Uploaded xuebing parents: diff changeset	6 def __main__():
9071e359b9a3 Uploaded xuebing parents: diff changeset	7 """
9071e359b9a3 Uploaded xuebing parents: diff changeset	8 Utility script for analyzing Cufflinks data: uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts
9071e359b9a3 Uploaded xuebing parents: diff changeset	9 produced by cufflinks). Filtering is done by extracting transcript IDs from tracking file and then filtering the GTF so that the output GTF contains only
9071e359b9a3 Uploaded xuebing parents: diff changeset	10 transcript found in the tracking file. Because a tracking file has multiple samples, a sample number is used to filter transcripts for
9071e359b9a3 Uploaded xuebing parents: diff changeset	11 a particular sample.
9071e359b9a3 Uploaded xuebing parents: diff changeset	12 """
9071e359b9a3 Uploaded xuebing parents: diff changeset	13 # Read parms.
9071e359b9a3 Uploaded xuebing parents: diff changeset	14 tracking_file_name = sys.argv[1]
9071e359b9a3 Uploaded xuebing parents: diff changeset	15 transcripts_file_name = sys.argv[2]
9071e359b9a3 Uploaded xuebing parents: diff changeset	16 output_file_name = sys.argv[3]
9071e359b9a3 Uploaded xuebing parents: diff changeset	17 sample_number = int ( sys.argv[4] )
9071e359b9a3 Uploaded xuebing parents: diff changeset	18
9071e359b9a3 Uploaded xuebing parents: diff changeset	19 # Open files.
9071e359b9a3 Uploaded xuebing parents: diff changeset	20 transcripts_file = open( transcripts_file_name, 'r' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	21 output_file = open( output_file_name, 'w' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	22
9071e359b9a3 Uploaded xuebing parents: diff changeset	23 # Read transcript IDs from tracking file.
9071e359b9a3 Uploaded xuebing parents: diff changeset	24 transcript_ids = {}
9071e359b9a3 Uploaded xuebing parents: diff changeset	25 for i, line in enumerate( file( tracking_file_name ) ) :
9071e359b9a3 Uploaded xuebing parents: diff changeset	26 # Split line into elements. Line format is
9071e359b9a3 Uploaded xuebing parents: diff changeset	27 # [Transfrag ID] [Locus ID] [Ref Gene ID] [Ref Transcript ID] [Class code] [qJ:<gene_id>\|<transcript_id>\|<FMI>\|<FPKM>\|<conf_lo>\|<conf_hi>]
9071e359b9a3 Uploaded xuebing parents: diff changeset	28 line = line.rstrip( '\r\n' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	29 elems = line.split( '\t' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	30
9071e359b9a3 Uploaded xuebing parents: diff changeset	31 # Get transcript info.
9071e359b9a3 Uploaded xuebing parents: diff changeset	32 if sample_number == 1:
9071e359b9a3 Uploaded xuebing parents: diff changeset	33 transcript_info = elems[4]
9071e359b9a3 Uploaded xuebing parents: diff changeset	34 elif sample_number == 2:
9071e359b9a3 Uploaded xuebing parents: diff changeset	35 transcript_info = elems[5]
9071e359b9a3 Uploaded xuebing parents: diff changeset	36 if not transcript_info.startswith('q'):
9071e359b9a3 Uploaded xuebing parents: diff changeset	37 # No transcript for this sample.
9071e359b9a3 Uploaded xuebing parents: diff changeset	38 continue
9071e359b9a3 Uploaded xuebing parents: diff changeset	39
9071e359b9a3 Uploaded xuebing parents: diff changeset	40 # Get and store transcript id.
9071e359b9a3 Uploaded xuebing parents: diff changeset	41 transcript_id = transcript_info.split('\|')[1]
9071e359b9a3 Uploaded xuebing parents: diff changeset	42 transcript_id = transcript_id.strip('"')
9071e359b9a3 Uploaded xuebing parents: diff changeset	43 transcript_ids[transcript_id] = ""
9071e359b9a3 Uploaded xuebing parents: diff changeset	44
9071e359b9a3 Uploaded xuebing parents: diff changeset	45 # Filter transcripts file using transcript_ids
9071e359b9a3 Uploaded xuebing parents: diff changeset	46 for i, line in enumerate( file( transcripts_file_name ) ):
9071e359b9a3 Uploaded xuebing parents: diff changeset	47 # GTF format: chrom source, name, chromStart, chromEnd, score, strand, frame, attributes.
9071e359b9a3 Uploaded xuebing parents: diff changeset	48 elems = line.split( '\t' )
9071e359b9a3 Uploaded xuebing parents: diff changeset	49
9071e359b9a3 Uploaded xuebing parents: diff changeset	50 # Get attributes.
9071e359b9a3 Uploaded xuebing parents: diff changeset	51 attributes_list = elems[8].split(";")
9071e359b9a3 Uploaded xuebing parents: diff changeset	52 attributes = {}
9071e359b9a3 Uploaded xuebing parents: diff changeset	53 for name_value_pair in attributes_list:
9071e359b9a3 Uploaded xuebing parents: diff changeset	54 pair = name_value_pair.strip().split(" ")
9071e359b9a3 Uploaded xuebing parents: diff changeset	55 name = pair[0].strip()
9071e359b9a3 Uploaded xuebing parents: diff changeset	56 if name == '':
9071e359b9a3 Uploaded xuebing parents: diff changeset	57 continue
9071e359b9a3 Uploaded xuebing parents: diff changeset	58 # Need to strip double quote from values
9071e359b9a3 Uploaded xuebing parents: diff changeset	59 value = pair[1].strip(" \"")
9071e359b9a3 Uploaded xuebing parents: diff changeset	60 attributes[name] = value
9071e359b9a3 Uploaded xuebing parents: diff changeset	61
9071e359b9a3 Uploaded xuebing parents: diff changeset	62 # Get element's transcript id.
9071e359b9a3 Uploaded xuebing parents: diff changeset	63 transcript_id = attributes['transcript_id']
9071e359b9a3 Uploaded xuebing parents: diff changeset	64 if transcript_id in transcript_ids:
9071e359b9a3 Uploaded xuebing parents: diff changeset	65 output_file.write(line)
9071e359b9a3 Uploaded xuebing parents: diff changeset	66
9071e359b9a3 Uploaded xuebing parents: diff changeset	67 # Clean up.
9071e359b9a3 Uploaded xuebing parents: diff changeset	68 output_file.close()
9071e359b9a3 Uploaded xuebing parents: diff changeset	69
9071e359b9a3 Uploaded xuebing parents: diff changeset	70 if __name__ == "__main__": __main__()

Mercurial > repos > xuebing > sharplabtool

annotate tools/ngs_rna/filter_transcripts_via_tracking.py @ 0:9071e359b9a3