deseq_hts: dexseq-hts_1.0/src/dexseq_prepare

comparison dexseq-hts_1.0/src/dexseq_prepare_annotation.py @ 11:cec4b4fb30be draft default tip

DEXSeq version 1.6 added

author	vipints <vipin@cbio.mskcc.org>
date	Tue, 08 Oct 2013 08:22:45 -0400
parents
children

comparison

equal deleted inserted replaced

-:2fe512c7bfdf
+:cec4b4fb30be
+import sys, collections, itertools, os.path
+if len( sys.argv ) != 3:
+sys.stderr.write( "Script to prepare annotation for DEXSeq.\n\n" )
+sys.stderr.write( "Usage: python %s <in.gtf> <out.gff>\n\n" % os.path.basename(sys.argv[0]) )
+sys.stderr.write( "This script takes an annotation file in Ensembl GTF format\n" )
+sys.stderr.write( "and outputs a 'flattened' annotation file suitable for use\n" )
+sys.stderr.write( "with the count_in_exons.py script.\n" )
+sys.exit(1)
+try:
+import HTSeq
+except ImportError:
+sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" )
+sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" )
+sys.exit(1)
+gtf_file = sys.argv[1]
+out_file = sys.argv[2]
+## make sure that it can handle GFF files.
+parent_child_map = dict()
+for feature in HTSeq.GFF_Reader( gtf_file ):
+if feature.type in ['mRNA',
+'transcript',
+'ncRNA',
+'miRNA',
+'pseudogenic_transcript',
+'rRNA',
+'snoRNA',
+'snRNA',
+'tRNA',
+'scRNA']:
+parent_child_map[feature.attr['ID']] = feature.attr['Parent']
+# Step 1: Store all exons with their gene and transcript ID
+# in a GenomicArrayOfSets
+exons = HTSeq.GenomicArrayOfSets( "auto", stranded=True )
+for f in HTSeq.GFF_Reader( gtf_file ):
+if not f.type in ["exon", "pseudogenic_exon"]:
+continue
+if not f.attr.get('gene_id'):
+f.attr['gene_id'] = parent_child_map[f.attr['Parent']]
+f.attr['transcript_id'] = f.attr['Parent']
+f.attr['gene_id'] = f.attr['gene_id'].replace( ":", "_" )
+exons[f.iv] += ( f.attr['gene_id'], f.attr['transcript_id'] )
+# Step 2: Form sets of overlapping genes
+# We produce the dict 'gene_sets', whose values are sets of gene IDs. Each set
+# contains IDs of genes that overlap, i.e., share bases (on the same strand).
+# The keys of 'gene_sets' are the IDs of all genes, and each key refers to
+# the set that contains the gene.
+# Each gene set forms an 'aggregate gene'.
+gene_sets = collections.defaultdict( lambda: set() )
+for iv, s in exons.steps():
+# For each step, make a set, 'full_set' of all the gene IDs occuring
+# in the present step, and also add all those gene IDs, whch have been
+# seen earlier to co-occur with each of the currently present gene IDs.
+full_set = set()
+for gene_id, transcript_id in s:
+full_set.add( gene_id )
+full_set |= gene_sets[ gene_id ]
+# Make sure that all genes that are now in full_set get associated
+# with full_set, i.e., get to know about their new partners
+for gene_id in full_set:
+assert gene_sets[ gene_id ] <= full_set
+gene_sets[ gene_id ] = full_set
+# Step 3: Go through the steps again to get the exonic sections. Each step
+# becomes an 'exonic part'. The exonic part is associated with an
+# aggregate gene, i.e., a gene set as determined in the previous step,
+# and a transcript set, containing all transcripts that occur in the step.
+# The results are stored in the dict 'aggregates', which contains, for each
+# aggregate ID, a list of all its exonic_part features.
+aggregates = collections.defaultdict( lambda: list() )
+for iv, s in exons.steps( ):
+# Skip empty steps
+if len(s) == 0:
+continue
+# Take one of the gene IDs, find the others via gene sets, and
+# form the aggregate ID from all of them
+gene_id = list(s)[0][0]
+assert set( gene_id for gene_id, transcript_id in s ) <= gene_sets[ gene_id ]
+aggregate_id = '+'.join( gene_sets[ gene_id ] )
+# Make the feature and store it in 'aggregates'
+f = HTSeq.GenomicFeature( aggregate_id, "exonic_part", iv )
+f.source = os.path.basename( sys.argv[1] )
+f.attr = {}
+f.attr[ 'gene_id' ] = aggregate_id
+transcript_set = set( ( transcript_id for gene_id, transcript_id in s ) )
+f.attr[ 'transcripts' ] = '+'.join( transcript_set )
+aggregates[ aggregate_id ].append( f )
+# Step 4: For each aggregate, number the exonic parts
+aggregate_features = []
+for l in aggregates.values():
+for i in xrange( len(l)-1 ):
+assert l[i].name == l[i+1].name, str(l[i+1]) + " has wrong name"
+assert l[i].iv.end <= l[i+1].iv.start, str(l[i+1]) + " starts too early"
+if l[i].iv.chrom != l[i+1].iv.chrom:
+raise ValueError, "Same name found on two chromosomes: %s, %s" % ( str(l[i]), str(l[i+1]) )
+if l[i].iv.strand != l[i+1].iv.strand:
+raise ValueError, "Same name found on two strands: %s, %s" % ( str(l[i]), str(l[i+1]) )
+aggr_feat = HTSeq.GenomicFeature( l[0].name, "aggregate_gene",
+HTSeq.GenomicInterval( l[0].iv.chrom, l[0].iv.start,
+l[-1].iv.end, l[0].iv.strand ) )
+aggr_feat.source = os.path.basename( sys.argv[1] )
+aggr_feat.attr = { 'gene_id': aggr_feat.name }
+for i in xrange( len(l) ):
+l[i].attr['exonic_part_number'] = "%03d" % ( i+1 )
+aggregate_features.append( aggr_feat )
+# Step 5: Sort the aggregates, then write everything out
+aggregate_features.sort( key = lambda f: ( f.iv.chrom, f.iv.start ) )
+fout = open( out_file, "w" )
+for aggr_feat in aggregate_features:
+fout.write( aggr_feat.get_gff_line() )
+for f in aggregates[ aggr_feat.name ]:
+fout.write( f.get_gff_line() )
+fout.close()

Mercurial > repos > vipints > deseq_hts

comparison dexseq-hts_1.0/src/dexseq_prepare_annotation.py @ 11:cec4b4fb30be draft default tip