Mercurial > repos > xuebing > sharplabtool
comparison tools/ngs_rna/filter_transcripts_via_tracking.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 import os, sys, tempfile | |
3 | |
4 assert sys.version_info[:2] >= ( 2, 4 ) | |
5 | |
6 def __main__(): | |
7 """ | |
8 Utility script for analyzing Cufflinks data: uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts | |
9 produced by cufflinks). Filtering is done by extracting transcript IDs from tracking file and then filtering the GTF so that the output GTF contains only | |
10 transcript found in the tracking file. Because a tracking file has multiple samples, a sample number is used to filter transcripts for | |
11 a particular sample. | |
12 """ | |
13 # Read parms. | |
14 tracking_file_name = sys.argv[1] | |
15 transcripts_file_name = sys.argv[2] | |
16 output_file_name = sys.argv[3] | |
17 sample_number = int ( sys.argv[4] ) | |
18 | |
19 # Open files. | |
20 transcripts_file = open( transcripts_file_name, 'r' ) | |
21 output_file = open( output_file_name, 'w' ) | |
22 | |
23 # Read transcript IDs from tracking file. | |
24 transcript_ids = {} | |
25 for i, line in enumerate( file( tracking_file_name ) ) : | |
26 # Split line into elements. Line format is | |
27 # [Transfrag ID] [Locus ID] [Ref Gene ID] [Ref Transcript ID] [Class code] [qJ:<gene_id>|<transcript_id>|<FMI>|<FPKM>|<conf_lo>|<conf_hi>] | |
28 line = line.rstrip( '\r\n' ) | |
29 elems = line.split( '\t' ) | |
30 | |
31 # Get transcript info. | |
32 if sample_number == 1: | |
33 transcript_info = elems[4] | |
34 elif sample_number == 2: | |
35 transcript_info = elems[5] | |
36 if not transcript_info.startswith('q'): | |
37 # No transcript for this sample. | |
38 continue | |
39 | |
40 # Get and store transcript id. | |
41 transcript_id = transcript_info.split('|')[1] | |
42 transcript_id = transcript_id.strip('"') | |
43 transcript_ids[transcript_id] = "" | |
44 | |
45 # Filter transcripts file using transcript_ids | |
46 for i, line in enumerate( file( transcripts_file_name ) ): | |
47 # GTF format: chrom source, name, chromStart, chromEnd, score, strand, frame, attributes. | |
48 elems = line.split( '\t' ) | |
49 | |
50 # Get attributes. | |
51 attributes_list = elems[8].split(";") | |
52 attributes = {} | |
53 for name_value_pair in attributes_list: | |
54 pair = name_value_pair.strip().split(" ") | |
55 name = pair[0].strip() | |
56 if name == '': | |
57 continue | |
58 # Need to strip double quote from values | |
59 value = pair[1].strip(" \"") | |
60 attributes[name] = value | |
61 | |
62 # Get element's transcript id. | |
63 transcript_id = attributes['transcript_id'] | |
64 if transcript_id in transcript_ids: | |
65 output_file.write(line) | |
66 | |
67 # Clean up. | |
68 output_file.close() | |
69 | |
70 if __name__ == "__main__": __main__() |