Mercurial > repos > earlhaminst > treebest_best
comparison fasta_header_converter.py @ 3:dd268de3a107 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
| author | earlhaminst | 
|---|---|
| date | Fri, 03 Mar 2017 07:22:53 -0500 | 
| parents | 4f9e5110914b | 
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 2:7ea4df039a53 | 3:dd268de3a107 | 
|---|---|
| 1 from __future__ import print_function | 1 from __future__ import print_function | 
| 2 | 2 | 
| 3 import collections | |
| 3 import json | 4 import json | 
| 4 import optparse | 5 import optparse | 
| 6 import sys | |
| 7 | |
| 8 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) | |
| 9 | |
| 10 | |
| 11 def FASTAReader_gen(fasta_filename): | |
| 12 with open(fasta_filename) as fasta_file: | |
| 13 line = fasta_file.readline() | |
| 14 while True: | |
| 15 if not line: | |
| 16 return | |
| 17 assert line.startswith('>'), "FASTA headers must start with >" | |
| 18 header = line.rstrip() | |
| 19 sequence_parts = [] | |
| 20 line = fasta_file.readline() | |
| 21 while line and line[0] != '>': | |
| 22 sequence_parts.append(line.rstrip()) | |
| 23 line = fasta_file.readline() | |
| 24 sequence = "\n".join(sequence_parts) | |
| 25 yield Sequence(header, sequence) | |
| 5 | 26 | 
| 6 | 27 | 
| 7 def read_gene_info(gene_info): | 28 def read_gene_info(gene_info): | 
| 8 transcript_species_dict = dict() | 29 transcript_species_dict = dict() | 
| 9 for gene_dict in gene_info.values(): | 30 for gene_dict in gene_info.values(): | 
| 15 parser = optparse.OptionParser() | 36 parser = optparse.OptionParser() | 
| 16 parser.add_option('-j', '--json', dest="input_gene_filename", | 37 parser.add_option('-j', '--json', dest="input_gene_filename", | 
| 17 help='Gene feature information in JSON format') | 38 help='Gene feature information in JSON format') | 
| 18 parser.add_option('-f', '--fasta', dest="input_fasta_filename", | 39 parser.add_option('-f', '--fasta', dest="input_fasta_filename", | 
| 19 help='Sequences in FASTA format') | 40 help='Sequences in FASTA format') | 
| 41 parser.add_option('-o', '--output', dest="output_fasta_filename", | |
| 42 help='Output FASTA file name') | |
| 20 options, args = parser.parse_args() | 43 options, args = parser.parse_args() | 
| 21 | 44 | 
| 22 if options.input_gene_filename is None: | 45 if options.input_gene_filename is None: | 
| 23 raise Exception('-j option must be specified') | 46 raise Exception('-j option must be specified') | 
| 24 | |
| 25 if options.input_fasta_filename is None: | 47 if options.input_fasta_filename is None: | 
| 26 raise Exception('-f option must be specified') | 48 raise Exception('-f option must be specified') | 
| 49 if options.output_fasta_filename is None: | |
| 50 raise Exception('-o option must be specified') | |
| 27 | 51 | 
| 28 with open(options.input_gene_filename) as json_fh: | 52 with open(options.input_gene_filename) as json_fh: | 
| 29 gene_info = json.load(json_fh) | 53 gene_info = json.load(json_fh) | 
| 30 transcript_species_dict = read_gene_info(gene_info) | 54 transcript_species_dict = read_gene_info(gene_info) | 
| 31 | 55 | 
| 32 with open(options.input_fasta_filename) as fasta_fh: | 56 with open(options.output_fasta_filename, 'w') as output_fasta_file: | 
| 33 for line in fasta_fh: | 57 for entry in FASTAReader_gen(options.input_fasta_filename): | 
| 34 line = line.rstrip() | 58 name = entry.header[1:].lstrip() | 
| 35 if line.startswith(">"): | 59 if name not in transcript_species_dict: | 
| 36 name = line[1:].lstrip() | 60 print("Transcript '%s' not found in the gene feature information" % name, file=sys.stderr) | 
| 37 print(">" + name + "_" + transcript_species_dict[name]) | 61 continue | 
| 38 else: | 62 output_fasta_file.write(">%s_%s\n%s\n" % (name, transcript_species_dict[name], entry.sequence)) | 
| 39 print(line) | 
