view fasta_header_converter.py @ 3:dd268de3a107 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
author earlhaminst
date Fri, 03 Mar 2017 07:22:53 -0500
parents 4f9e5110914b
children
line wrap: on
line source

from __future__ import print_function

import collections
import json
import optparse
import sys

Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])


def FASTAReader_gen(fasta_filename):
    with open(fasta_filename) as fasta_file:
        line = fasta_file.readline()
        while True:
            if not line:
                return
            assert line.startswith('>'), "FASTA headers must start with >"
            header = line.rstrip()
            sequence_parts = []
            line = fasta_file.readline()
            while line and line[0] != '>':
                sequence_parts.append(line.rstrip())
                line = fasta_file.readline()
            sequence = "\n".join(sequence_parts)
            yield Sequence(header, sequence)


def read_gene_info(gene_info):
    transcript_species_dict = dict()
    for gene_dict in gene_info.values():
        for transcript in gene_dict['Transcript']:
            transcript_species_dict[transcript['id']] = transcript['species'].replace("_", "")
    return transcript_species_dict


parser = optparse.OptionParser()
parser.add_option('-j', '--json', dest="input_gene_filename",
                  help='Gene feature information in JSON format')
parser.add_option('-f', '--fasta', dest="input_fasta_filename",
                  help='Sequences in FASTA format')
parser.add_option('-o', '--output', dest="output_fasta_filename",
                  help='Output FASTA file name')
options, args = parser.parse_args()

if options.input_gene_filename is None:
    raise Exception('-j option must be specified')
if options.input_fasta_filename is None:
    raise Exception('-f option must be specified')
if options.output_fasta_filename is None:
    raise Exception('-o option must be specified')

with open(options.input_gene_filename) as json_fh:
    gene_info = json.load(json_fh)
transcript_species_dict = read_gene_info(gene_info)

with open(options.output_fasta_filename, 'w') as output_fasta_file:
    for entry in FASTAReader_gen(options.input_fasta_filename):
        name = entry.header[1:].lstrip()
        if name not in transcript_species_dict:
            print("Transcript '%s' not found in the gene feature information" % name, file=sys.stderr)
            continue
        output_fasta_file.write(">%s_%s\n%s\n" % (name, transcript_species_dict[name], entry.sequence))