comparison fasta_header_converter.py @ 3:dd268de3a107 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
author earlhaminst
date Fri, 03 Mar 2017 07:22:53 -0500
parents 4f9e5110914b
children
comparison
equal deleted inserted replaced
2:7ea4df039a53 3:dd268de3a107
1 from __future__ import print_function 1 from __future__ import print_function
2 2
3 import collections
3 import json 4 import json
4 import optparse 5 import optparse
6 import sys
7
8 Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])
9
10
11 def FASTAReader_gen(fasta_filename):
12 with open(fasta_filename) as fasta_file:
13 line = fasta_file.readline()
14 while True:
15 if not line:
16 return
17 assert line.startswith('>'), "FASTA headers must start with >"
18 header = line.rstrip()
19 sequence_parts = []
20 line = fasta_file.readline()
21 while line and line[0] != '>':
22 sequence_parts.append(line.rstrip())
23 line = fasta_file.readline()
24 sequence = "\n".join(sequence_parts)
25 yield Sequence(header, sequence)
5 26
6 27
7 def read_gene_info(gene_info): 28 def read_gene_info(gene_info):
8 transcript_species_dict = dict() 29 transcript_species_dict = dict()
9 for gene_dict in gene_info.values(): 30 for gene_dict in gene_info.values():
15 parser = optparse.OptionParser() 36 parser = optparse.OptionParser()
16 parser.add_option('-j', '--json', dest="input_gene_filename", 37 parser.add_option('-j', '--json', dest="input_gene_filename",
17 help='Gene feature information in JSON format') 38 help='Gene feature information in JSON format')
18 parser.add_option('-f', '--fasta', dest="input_fasta_filename", 39 parser.add_option('-f', '--fasta', dest="input_fasta_filename",
19 help='Sequences in FASTA format') 40 help='Sequences in FASTA format')
41 parser.add_option('-o', '--output', dest="output_fasta_filename",
42 help='Output FASTA file name')
20 options, args = parser.parse_args() 43 options, args = parser.parse_args()
21 44
22 if options.input_gene_filename is None: 45 if options.input_gene_filename is None:
23 raise Exception('-j option must be specified') 46 raise Exception('-j option must be specified')
24
25 if options.input_fasta_filename is None: 47 if options.input_fasta_filename is None:
26 raise Exception('-f option must be specified') 48 raise Exception('-f option must be specified')
49 if options.output_fasta_filename is None:
50 raise Exception('-o option must be specified')
27 51
28 with open(options.input_gene_filename) as json_fh: 52 with open(options.input_gene_filename) as json_fh:
29 gene_info = json.load(json_fh) 53 gene_info = json.load(json_fh)
30 transcript_species_dict = read_gene_info(gene_info) 54 transcript_species_dict = read_gene_info(gene_info)
31 55
32 with open(options.input_fasta_filename) as fasta_fh: 56 with open(options.output_fasta_filename, 'w') as output_fasta_file:
33 for line in fasta_fh: 57 for entry in FASTAReader_gen(options.input_fasta_filename):
34 line = line.rstrip() 58 name = entry.header[1:].lstrip()
35 if line.startswith(">"): 59 if name not in transcript_species_dict:
36 name = line[1:].lstrip() 60 print("Transcript '%s' not found in the gene feature information" % name, file=sys.stderr)
37 print(">" + name + "_" + transcript_species_dict[name]) 61 continue
38 else: 62 output_fasta_file.write(">%s_%s\n%s\n" % (name, transcript_species_dict[name], entry.sequence))
39 print(line)