annotate tools/ngs_rna/filter_transcripts_via_tracking.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 import os, sys, tempfile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 assert sys.version_info[:2] >= ( 2, 4 )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 def __main__():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 Utility script for analyzing Cufflinks data: uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 produced by cufflinks). Filtering is done by extracting transcript IDs from tracking file and then filtering the GTF so that the output GTF contains only
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 transcript found in the tracking file. Because a tracking file has multiple samples, a sample number is used to filter transcripts for
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 a particular sample.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 # Read parms.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 tracking_file_name = sys.argv[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 transcripts_file_name = sys.argv[2]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 output_file_name = sys.argv[3]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 sample_number = int ( sys.argv[4] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 # Open files.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 transcripts_file = open( transcripts_file_name, 'r' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 output_file = open( output_file_name, 'w' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 # Read transcript IDs from tracking file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 transcript_ids = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 for i, line in enumerate( file( tracking_file_name ) ) :
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 # Split line into elements. Line format is
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 # [Transfrag ID] [Locus ID] [Ref Gene ID] [Ref Transcript ID] [Class code] [qJ:<gene_id>|<transcript_id>|<FMI>|<FPKM>|<conf_lo>|<conf_hi>]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 line = line.rstrip( '\r\n' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 elems = line.split( '\t' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 # Get transcript info.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 if sample_number == 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 transcript_info = elems[4]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 elif sample_number == 2:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 transcript_info = elems[5]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 if not transcript_info.startswith('q'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 # No transcript for this sample.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 # Get and store transcript id.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 transcript_id = transcript_info.split('|')[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 transcript_id = transcript_id.strip('"')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 transcript_ids[transcript_id] = ""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 # Filter transcripts file using transcript_ids
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 for i, line in enumerate( file( transcripts_file_name ) ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 # GTF format: chrom source, name, chromStart, chromEnd, score, strand, frame, attributes.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 elems = line.split( '\t' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 # Get attributes.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 attributes_list = elems[8].split(";")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 attributes = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 for name_value_pair in attributes_list:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 pair = name_value_pair.strip().split(" ")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 name = pair[0].strip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 if name == '':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 # Need to strip double quote from values
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 value = pair[1].strip(" \"")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 attributes[name] = value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 # Get element's transcript id.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 transcript_id = attributes['transcript_id']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 if transcript_id in transcript_ids:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 output_file.write(line)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 # Clean up.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 output_file.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 if __name__ == "__main__": __main__()