Mercurial > repos > earlhaminst > ensembl_longest_cds_per_gene
comparison ensembl_longest_cds_per_gene.py @ 0:4dba69135845 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
author | earlhaminst |
---|---|
date | Tue, 07 Mar 2017 05:54:30 -0500 |
parents | |
children | 6cf9f7f6509c |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4dba69135845 |
---|---|
1 """ | |
2 This script reads a CDS FASTA file from Ensembl and outputs a FASTA file with | |
3 only the longest CDS sequence for each gene. The header of the sequences in the | |
4 output file will be the transcript id without version. | |
5 """ | |
6 from __future__ import print_function | |
7 | |
8 import collections | |
9 import optparse | |
10 import sys | |
11 | |
12 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) | |
13 | |
14 | |
15 def FASTAReader_gen(fasta_filename): | |
16 with open(fasta_filename) as fasta_file: | |
17 line = fasta_file.readline() | |
18 while True: | |
19 if not line: | |
20 return | |
21 assert line.startswith('>'), "FASTA headers must start with >" | |
22 header = line.rstrip() | |
23 sequence_parts = [] | |
24 line = fasta_file.readline() | |
25 while line and line[0] != '>': | |
26 sequence_parts.append(line.rstrip()) | |
27 line = fasta_file.readline() | |
28 sequence = "\n".join(sequence_parts) | |
29 yield Sequence(header, sequence) | |
30 | |
31 | |
32 def remove_id_version(s): | |
33 """ | |
34 Remove the optional '.VERSION' from an Ensembl id. | |
35 """ | |
36 return s.split('.')[0] | |
37 | |
38 | |
39 parser = optparse.OptionParser() | |
40 parser.add_option('-f', '--fasta', dest="input_fasta_filename", | |
41 help='CDS file in FASTA format from Ensembl') | |
42 parser.add_option('-o', '--output', dest="output_fasta_filename", | |
43 help='Output FASTA file name') | |
44 options, args = parser.parse_args() | |
45 | |
46 if options.input_fasta_filename is None: | |
47 raise Exception('-f option must be specified') | |
48 if options.output_fasta_filename is None: | |
49 raise Exception('-o option must be specified') | |
50 | |
51 gene_transcripts_dict = dict() | |
52 | |
53 for entry in FASTAReader_gen(options.input_fasta_filename): | |
54 transcript_id, rest = entry.header[1:].split(' ', 1) | |
55 transcript_id = remove_id_version(transcript_id) | |
56 gene_id = None | |
57 for s in rest.split(' '): | |
58 if s.startswith('gene:'): | |
59 gene_id = remove_id_version(s[5:]) | |
60 break | |
61 else: | |
62 print("Gene id not found in header '%s'" % entry.header, file=sys.stderr) | |
63 continue | |
64 if gene_id in gene_transcripts_dict: | |
65 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) | |
66 else: | |
67 gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))] | |
68 | |
69 # For each gene, select the transcript with the longest sequence | |
70 # If more than one transcripts have the same longest sequence for a gene, the | |
71 # first one to appear in the FASTA file is selected | |
72 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] | |
73 | |
74 with open(options.output_fasta_filename, 'w') as output_fasta_file: | |
75 for entry in FASTAReader_gen(options.input_fasta_filename): | |
76 transcript_id = remove_id_version(entry.header[1:].split(' ')[0]) | |
77 if transcript_id in selected_transcript_ids: | |
78 output_fasta_file.write(">%s\n%s\n" % (transcript_id, entry.sequence)) |