annotate fasta_header_converter.py @ 3:dd268de3a107 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
author earlhaminst
date Fri, 03 Mar 2017 07:22:53 -0500
parents 4f9e5110914b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
4f9e5110914b planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff changeset
1 from __future__ import print_function
4f9e5110914b planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff changeset
2
3
dd268de3a107 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
earlhaminst
parents: 0
diff changeset
3 import collections
0
4f9e5110914b planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff changeset
4 import json
4f9e5110914b planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff changeset
5 import optparse
3
dd268de3a107 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
earlhaminst
parents: 0
diff changeset
6 import sys
dd268de3a107 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
earlhaminst
parents: 0
diff changeset
7
dd268de3a107 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
earlhaminst
parents: 0
diff changeset
# One FASTA record: ``header`` is the full header line (leading '>' kept,
# trailing whitespace removed); ``sequence`` is the record's sequence lines
# joined with '\n', so the original line wrapping is preserved.
Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])


def FASTAReader_gen(fasta_filename):
    """Lazily yield a Sequence for every record in a FASTA file.

    Args:
        fasta_filename: path of the FASTA file to read.

    Yields:
        Sequence(header, sequence) tuples, in file order. A header that is
        immediately followed by another header (or by end of file) yields an
        empty ``sequence`` string. An empty file yields nothing.

    Raises:
        AssertionError: if a line in header position does not start with '>'
            (for example when the file begins with sequence data or a blank
            line).
    """
    with open(fasta_filename) as fasta_file:
        line = fasta_file.readline()
        # readline() returns '' only at end of file, so the loop stops there.
        while line:
            # Raised explicitly rather than with an ``assert`` statement so
            # that the check is not stripped when Python runs with -O. The
            # exception type and message are kept for backward compatibility.
            if not line.startswith('>'):
                raise AssertionError("FASTA headers must start with >")
            header = line.rstrip()
            sequence_parts = []
            line = fasta_file.readline()
            # Collect everything up to the next header line or end of file.
            while line and line[0] != '>':
                sequence_parts.append(line.rstrip())
                line = fasta_file.readline()
            yield Sequence(header, "\n".join(sequence_parts))
0
4f9e5110914b planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff changeset
26
4f9e5110914b planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff changeset
27
4f9e5110914b planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff changeset
def read_gene_info(gene_info):
    """Build a transcript-id -> species lookup from gene feature information.

    Args:
        gene_info: mapping whose values are gene dicts. Each gene dict must
            have a 'Transcript' entry that iterates over transcript dicts,
            and each transcript dict must have 'id' and 'species' entries.

    Returns:
        dict mapping each transcript 'id' to its 'species' string with every
        underscore removed. If the same id occurs more than once, the last
        occurrence wins.

    Raises:
        KeyError: if a gene lacks 'Transcript' or a transcript lacks 'id' or
            'species'.
    """
    transcript_species_dict = {}
    for gene_dict in gene_info.values():
        for transcript in gene_dict['Transcript']:
            # Underscores are removed from the species name; the main script
            # later writes headers as "<transcript id>_<species>".
            species = transcript['species'].replace("_", "")
            transcript_species_dict[transcript['id']] = species
    return transcript_species_dict
4f9e5110914b planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff changeset
34
4f9e5110914b planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff changeset
35
4f9e5110914b planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff changeset
# Command-line interface: all three options are required.
parser = optparse.OptionParser()
parser.add_option('-j', '--json', dest="input_gene_filename",
                  help='Gene feature information in JSON format')
parser.add_option('-f', '--fasta', dest="input_fasta_filename",
                  help='Sequences in FASTA format')
parser.add_option('-o', '--output', dest="output_fasta_filename",
                  help='Output FASTA file name')
options, args = parser.parse_args()

# optparse has no notion of a mandatory option, so each one is checked by
# hand and a missing one aborts the script with an exception.
if options.input_gene_filename is None:
    raise Exception('-j option must be specified')
if options.input_fasta_filename is None:
    raise Exception('-f option must be specified')
if options.output_fasta_filename is None:
    raise Exception('-o option must be specified')

# Load the gene feature JSON and reduce it to a transcript id -> species map.
with open(options.input_gene_filename) as json_fh:
    gene_info = json.load(json_fh)
transcript_species_dict = read_gene_info(gene_info)

# Rewrite every FASTA header as "><transcript id>_<species>"; the sequence
# text is copied through unchanged.
with open(options.output_fasta_filename, 'w') as output_fasta_file:
    for entry in FASTAReader_gen(options.input_fasta_filename):
        # Drop the leading '>' and any whitespace that follows it.
        # NOTE(review): the whole remaining header text is used as the lookup
        # key, so a header with extra text after the transcript id will not
        # match a key equal to the bare id - confirm inputs carry ids only.
        name = entry.header[1:].lstrip()
        if name not in transcript_species_dict:
            # Unknown transcripts are reported on stderr and left out of the
            # output instead of aborting the whole conversion.
            print("Transcript '%s' not found in the gene feature information" % name, file=sys.stderr)
            continue
        output_fasta_file.write(">%s_%s\n%s\n" % (name, transcript_species_dict[name], entry.sequence))