comparison ensembl_longest_cds_per_gene.py @ 2:6cf9f7f6509c draft default tip

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 651fae48371f845578753052c6fe173e3bb35670
author earlhaminst
date Wed, 15 Mar 2017 20:23:13 -0400
parents 4dba69135845
children
comparison
equal deleted inserted replaced
1:a07680f3033a 2:6cf9f7f6509c
1 """ 1 """
2 This script reads a CDS FASTA file from Ensembl and outputs a FASTA file with 2 This script reads a CDS FASTA file from Ensembl and outputs a FASTA file with
3 only the longest CDS sequence for each gene. The header of the sequences in the 3 only the longest CDS sequence for each gene.
4 output file will be the transcript id without version.
5 """ 4 """
6 from __future__ import print_function 5 from __future__ import print_function
7 6
8 import collections 7 import collections
9 import optparse 8 import optparse
31 30
32 def remove_id_version(s): 31 def remove_id_version(s):
33 """ 32 """
34 Remove the optional '.VERSION' from an Ensembl id. 33 Remove the optional '.VERSION' from an Ensembl id.
35 """ 34 """
36 return s.split('.')[0] 35 if s.startswith('ENS'):
36 return s.split('.')[0]
37 else:
38 return s
37 39
38 40
39 parser = optparse.OptionParser() 41 parser = optparse.OptionParser()
40 parser.add_option('-f', '--fasta', dest="input_fasta_filename", 42 parser.add_option('-f', '--fasta', dest="input_fasta_filename",
41 help='CDS file in FASTA format from Ensembl') 43 help='CDS file in FASTA format from Ensembl')
50 52
51 gene_transcripts_dict = dict() 53 gene_transcripts_dict = dict()
52 54
53 for entry in FASTAReader_gen(options.input_fasta_filename): 55 for entry in FASTAReader_gen(options.input_fasta_filename):
54 transcript_id, rest = entry.header[1:].split(' ', 1) 56 transcript_id, rest = entry.header[1:].split(' ', 1)
55 transcript_id = remove_id_version(transcript_id)
56 gene_id = None 57 gene_id = None
57 for s in rest.split(' '): 58 for s in rest.split(' '):
58 if s.startswith('gene:'): 59 if s.startswith('gene:'):
59 gene_id = remove_id_version(s[5:]) 60 gene_id = remove_id_version(s[5:])
60 break 61 break
71 # first one to appear in the FASTA file is selected 72 # first one to appear in the FASTA file is selected
72 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] 73 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()]
73 74
74 with open(options.output_fasta_filename, 'w') as output_fasta_file: 75 with open(options.output_fasta_filename, 'w') as output_fasta_file:
75 for entry in FASTAReader_gen(options.input_fasta_filename): 76 for entry in FASTAReader_gen(options.input_fasta_filename):
76 transcript_id = remove_id_version(entry.header[1:].split(' ')[0]) 77 transcript_id = entry.header[1:].split(' ')[0]
77 if transcript_id in selected_transcript_ids: 78 if transcript_id in selected_transcript_ids:
78 output_fasta_file.write(">%s\n%s\n" % (transcript_id, entry.sequence)) 79 output_fasta_file.write("%s\n%s\n" % (entry.header, entry.sequence))