Mercurial > repos > earlhaminst > ensembl_longest_cds_per_gene
annotate ensembl_longest_cds_per_gene.py @ 2:6cf9f7f6509c draft default tip
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 651fae48371f845578753052c6fe173e3bb35670
author | earlhaminst |
---|---|
date | Wed, 15 Mar 2017 20:23:13 -0400 |
parents | 4dba69135845 |
children |
rev | line source |
---|---|
0
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
1 """ |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
2 This script reads a CDS FASTA file from Ensembl and outputs a FASTA file with |
2
6cf9f7f6509c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
0
diff
changeset
|
3 only the longest CDS sequence for each gene. |
0
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
4 """ |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
5 from __future__ import print_function |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
6 |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
7 import collections |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
8 import optparse |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
9 import sys |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
10 |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
11 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
12 |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
13 |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
14 def FASTAReader_gen(fasta_filename): |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
15 with open(fasta_filename) as fasta_file: |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
16 line = fasta_file.readline() |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
17 while True: |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
18 if not line: |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
19 return |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
20 assert line.startswith('>'), "FASTA headers must start with >" |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
21 header = line.rstrip() |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
22 sequence_parts = [] |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
23 line = fasta_file.readline() |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
24 while line and line[0] != '>': |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
25 sequence_parts.append(line.rstrip()) |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
26 line = fasta_file.readline() |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
27 sequence = "\n".join(sequence_parts) |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
28 yield Sequence(header, sequence) |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
29 |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
30 |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
31 def remove_id_version(s): |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
32 """ |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
33 Remove the optional '.VERSION' from an Ensembl id. |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
34 """ |
2
6cf9f7f6509c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
0
diff
changeset
|
35 if s.startswith('ENS'): |
6cf9f7f6509c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
0
diff
changeset
|
36 return s.split('.')[0] |
6cf9f7f6509c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
0
diff
changeset
|
37 else: |
6cf9f7f6509c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
0
diff
changeset
|
38 return s |
0
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
39 |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
40 |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
41 parser = optparse.OptionParser() |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
42 parser.add_option('-f', '--fasta', dest="input_fasta_filename", |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
43 help='CDS file in FASTA format from Ensembl') |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
44 parser.add_option('-o', '--output', dest="output_fasta_filename", |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
45 help='Output FASTA file name') |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
46 options, args = parser.parse_args() |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
47 |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
48 if options.input_fasta_filename is None: |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
49 raise Exception('-f option must be specified') |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
50 if options.output_fasta_filename is None: |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
51 raise Exception('-o option must be specified') |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
52 |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
53 gene_transcripts_dict = dict() |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
54 |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
55 for entry in FASTAReader_gen(options.input_fasta_filename): |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
56 transcript_id, rest = entry.header[1:].split(' ', 1) |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
57 gene_id = None |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
58 for s in rest.split(' '): |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
59 if s.startswith('gene:'): |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
60 gene_id = remove_id_version(s[5:]) |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
61 break |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
62 else: |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
63 print("Gene id not found in header '%s'" % entry.header, file=sys.stderr) |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
64 continue |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
65 if gene_id in gene_transcripts_dict: |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
66 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
67 else: |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
68 gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))] |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
69 |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
70 # For each gene, select the transcript with the longest sequence |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
71 # If more than one transcripts have the same longest sequence for a gene, the |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
72 # first one to appear in the FASTA file is selected |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
73 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
74 |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
75 with open(options.output_fasta_filename, 'w') as output_fasta_file: |
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
76 for entry in FASTAReader_gen(options.input_fasta_filename): |
2
6cf9f7f6509c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
0
diff
changeset
|
77 transcript_id = entry.header[1:].split(' ')[0] |
0
4dba69135845
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 26c70aecb56c19099455bb5a432615b09ad322d1
earlhaminst
parents:
diff
changeset
|
78 if transcript_id in selected_transcript_ids: |
2
6cf9f7f6509c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ensembl_longest_cds_per_gene commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
0
diff
changeset
|
79 output_fasta_file.write("%s\n%s\n" % (entry.header, entry.sequence)) |