annotate dexseq-hts_1.0/src/dexseq_prepare_annotation.py @ 11:cec4b4fb30be draft default tip

DEXSeq version 1.6 added
author vipints <vipin@cbio.mskcc.org>
date Tue, 08 Oct 2013 08:22:45 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
1 import sys, collections, itertools, os.path
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
2
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
3 if len( sys.argv ) != 3:
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
4 sys.stderr.write( "Script to prepare annotation for DEXSeq.\n\n" )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
5 sys.stderr.write( "Usage: python %s <in.gtf> <out.gff>\n\n" % os.path.basename(sys.argv[0]) )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
6 sys.stderr.write( "This script takes an annotation file in Ensembl GTF format\n" )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
7 sys.stderr.write( "and outputs a 'flattened' annotation file suitable for use\n" )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
8 sys.stderr.write( "with the count_in_exons.py script.\n" )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
9 sys.exit(1)
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
10
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
11 try:
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
12 import HTSeq
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
13 except ImportError:
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
14 sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
15 sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
16 sys.exit(1)
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
17
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
18 gtf_file = sys.argv[1]
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
19 out_file = sys.argv[2]
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
20
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
21 ## make sure that it can handle GFF files.
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
22 parent_child_map = dict()
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
23 for feature in HTSeq.GFF_Reader( gtf_file ):
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
24 if feature.type in ['mRNA',
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
25 'transcript',
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
26 'ncRNA',
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
27 'miRNA',
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
28 'pseudogenic_transcript',
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
29 'rRNA',
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
30 'snoRNA',
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
31 'snRNA',
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
32 'tRNA',
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
33 'scRNA']:
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
34 parent_child_map[feature.attr['ID']] = feature.attr['Parent']
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
35
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
36 # Step 1: Store all exons with their gene and transcript ID
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
37 # in a GenomicArrayOfSets
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
38
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
39 exons = HTSeq.GenomicArrayOfSets( "auto", stranded=True )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
40 for f in HTSeq.GFF_Reader( gtf_file ):
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
41 if not f.type in ["exon", "pseudogenic_exon"]:
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
42 continue
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
43 if not f.attr.get('gene_id'):
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
44 f.attr['gene_id'] = parent_child_map[f.attr['Parent']]
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
45 f.attr['transcript_id'] = f.attr['Parent']
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
46 f.attr['gene_id'] = f.attr['gene_id'].replace( ":", "_" )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
47 exons[f.iv] += ( f.attr['gene_id'], f.attr['transcript_id'] )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
48
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
49 # Step 2: Form sets of overlapping genes
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
50
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
51 # We produce the dict 'gene_sets', whose values are sets of gene IDs. Each set
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
52 # contains IDs of genes that overlap, i.e., share bases (on the same strand).
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
53 # The keys of 'gene_sets' are the IDs of all genes, and each key refers to
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
54 # the set that contains the gene.
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
55 # Each gene set forms an 'aggregate gene'.
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
56
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
57 gene_sets = collections.defaultdict( lambda: set() )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
58 for iv, s in exons.steps():
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
59 # For each step, make a set, 'full_set' of all the gene IDs occuring
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
60 # in the present step, and also add all those gene IDs, whch have been
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
61 # seen earlier to co-occur with each of the currently present gene IDs.
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
62 full_set = set()
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
63 for gene_id, transcript_id in s:
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
64 full_set.add( gene_id )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
65 full_set |= gene_sets[ gene_id ]
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
66 # Make sure that all genes that are now in full_set get associated
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
67 # with full_set, i.e., get to know about their new partners
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
68 for gene_id in full_set:
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
69 assert gene_sets[ gene_id ] <= full_set
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
70 gene_sets[ gene_id ] = full_set
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
71
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
72
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
73 # Step 3: Go through the steps again to get the exonic sections. Each step
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
74 # becomes an 'exonic part'. The exonic part is associated with an
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
75 # aggregate gene, i.e., a gene set as determined in the previous step,
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
76 # and a transcript set, containing all transcripts that occur in the step.
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
77 # The results are stored in the dict 'aggregates', which contains, for each
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
78 # aggregate ID, a list of all its exonic_part features.
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
79
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
80 aggregates = collections.defaultdict( lambda: list() )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
81 for iv, s in exons.steps( ):
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
82 # Skip empty steps
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
83 if len(s) == 0:
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
84 continue
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
85 # Take one of the gene IDs, find the others via gene sets, and
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
86 # form the aggregate ID from all of them
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
87 gene_id = list(s)[0][0]
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
88 assert set( gene_id for gene_id, transcript_id in s ) <= gene_sets[ gene_id ]
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
89 aggregate_id = '+'.join( gene_sets[ gene_id ] )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
90 # Make the feature and store it in 'aggregates'
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
91 f = HTSeq.GenomicFeature( aggregate_id, "exonic_part", iv )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
92 f.source = os.path.basename( sys.argv[1] )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
93 f.attr = {}
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
94 f.attr[ 'gene_id' ] = aggregate_id
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
95 transcript_set = set( ( transcript_id for gene_id, transcript_id in s ) )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
96 f.attr[ 'transcripts' ] = '+'.join( transcript_set )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
97 aggregates[ aggregate_id ].append( f )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
98
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
99
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
100 # Step 4: For each aggregate, number the exonic parts
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
101
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
102 aggregate_features = []
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
103 for l in aggregates.values():
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
104 for i in xrange( len(l)-1 ):
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
105 assert l[i].name == l[i+1].name, str(l[i+1]) + " has wrong name"
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
106 assert l[i].iv.end <= l[i+1].iv.start, str(l[i+1]) + " starts too early"
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
107 if l[i].iv.chrom != l[i+1].iv.chrom:
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
108 raise ValueError, "Same name found on two chromosomes: %s, %s" % ( str(l[i]), str(l[i+1]) )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
109 if l[i].iv.strand != l[i+1].iv.strand:
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
110 raise ValueError, "Same name found on two strands: %s, %s" % ( str(l[i]), str(l[i+1]) )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
111 aggr_feat = HTSeq.GenomicFeature( l[0].name, "aggregate_gene",
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
112 HTSeq.GenomicInterval( l[0].iv.chrom, l[0].iv.start,
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
113 l[-1].iv.end, l[0].iv.strand ) )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
114 aggr_feat.source = os.path.basename( sys.argv[1] )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
115 aggr_feat.attr = { 'gene_id': aggr_feat.name }
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
116 for i in xrange( len(l) ):
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
117 l[i].attr['exonic_part_number'] = "%03d" % ( i+1 )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
118 aggregate_features.append( aggr_feat )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
119
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
120
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
121 # Step 5: Sort the aggregates, then write everything out
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
122
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
123 aggregate_features.sort( key = lambda f: ( f.iv.chrom, f.iv.start ) )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
124
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
125 fout = open( out_file, "w" )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
126 for aggr_feat in aggregate_features:
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
127 fout.write( aggr_feat.get_gff_line() )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
128 for f in aggregates[ aggr_feat.name ]:
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
129 fout.write( f.get_gff_line() )
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
130
cec4b4fb30be DEXSeq version 1.6 added
vipints <vipin@cbio.mskcc.org>
parents:
diff changeset
131 fout.close()