Mercurial > repos > earlhaminst > ensembl_longest_cds_per_gene
comparison ensembl_longest_cds_per_gene.py @ 2:6cf9f7f6509c draft default tip
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 651fae48371f845578753052c6fe173e3bb35670
author | earlhaminst |
---|---|
date | Wed, 15 Mar 2017 20:23:13 -0400 |
parents | 4dba69135845 |
children |
comparison
equal
deleted
inserted
replaced
1:a07680f3033a | 2:6cf9f7f6509c |
---|---|
1 """ | 1 """ |
2 This script reads a CDS FASTA file from Ensembl and outputs a FASTA file with | 2 This script reads a CDS FASTA file from Ensembl and outputs a FASTA file with |
3 only the longest CDS sequence for each gene. The header of the sequences in the | 3 only the longest CDS sequence for each gene. |
4 output file will be the transcript id without version. | |
5 """ | 4 """ |
6 from __future__ import print_function | 5 from __future__ import print_function |
7 | 6 |
8 import collections | 7 import collections |
9 import optparse | 8 import optparse |
31 | 30 |
32 def remove_id_version(s): | 31 def remove_id_version(s): |
33 """ | 32 """ |
34 Remove the optional '.VERSION' from an Ensembl id. | 33 Remove the optional '.VERSION' from an Ensembl id. |
35 """ | 34 """ |
36 return s.split('.')[0] | 35 if s.startswith('ENS'): |
36 return s.split('.')[0] | |
37 else: | |
38 return s | |
37 | 39 |
38 | 40 |
39 parser = optparse.OptionParser() | 41 parser = optparse.OptionParser() |
40 parser.add_option('-f', '--fasta', dest="input_fasta_filename", | 42 parser.add_option('-f', '--fasta', dest="input_fasta_filename", |
41 help='CDS file in FASTA format from Ensembl') | 43 help='CDS file in FASTA format from Ensembl') |
50 | 52 |
51 gene_transcripts_dict = dict() | 53 gene_transcripts_dict = dict() |
52 | 54 |
53 for entry in FASTAReader_gen(options.input_fasta_filename): | 55 for entry in FASTAReader_gen(options.input_fasta_filename): |
54 transcript_id, rest = entry.header[1:].split(' ', 1) | 56 transcript_id, rest = entry.header[1:].split(' ', 1) |
55 transcript_id = remove_id_version(transcript_id) | |
56 gene_id = None | 57 gene_id = None |
57 for s in rest.split(' '): | 58 for s in rest.split(' '): |
58 if s.startswith('gene:'): | 59 if s.startswith('gene:'): |
59 gene_id = remove_id_version(s[5:]) | 60 gene_id = remove_id_version(s[5:]) |
60 break | 61 break |
71 # first one to appear in the FASTA file is selected | 72 # first one to appear in the FASTA file is selected |
72 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] | 73 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] |
73 | 74 |
74 with open(options.output_fasta_filename, 'w') as output_fasta_file: | 75 with open(options.output_fasta_filename, 'w') as output_fasta_file: |
75 for entry in FASTAReader_gen(options.input_fasta_filename): | 76 for entry in FASTAReader_gen(options.input_fasta_filename): |
76 transcript_id = remove_id_version(entry.header[1:].split(' ')[0]) | 77 transcript_id = entry.header[1:].split(' ')[0] |
77 if transcript_id in selected_transcript_ids: | 78 if transcript_id in selected_transcript_ids: |
78 output_fasta_file.write(">%s\n%s\n" % (transcript_id, entry.sequence)) | 79 output_fasta_file.write("%s\n%s\n" % (entry.header, entry.sequence)) |