view fasta_header_converter.py @ 2:7ea4df039a53 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 978a8efb9969646b97873d69348971860f2793f5
author earlhaminst
date Wed, 22 Feb 2017 05:48:02 -0500
parents 4f9e5110914b
children dd268de3a107
line wrap: on
line source

from __future__ import print_function

import json
import optparse


def read_gene_info(gene_info):
    """Build a transcript id -> species lookup from the gene information.

    Every value of ``gene_info`` is expected to carry a 'Transcript' list
    whose entries each provide an 'id' and a 'species'. Underscores are
    removed from the species name before it is stored. If the same
    transcript id occurs more than once, the last occurrence wins.
    """
    species_by_transcript = {}
    for gene in gene_info.values():
        species_by_transcript.update(
            (entry['id'], entry['species'].replace("_", ""))
            for entry in gene['Transcript']
        )
    return species_by_transcript


# Command-line entry point.
#
# Reads the gene feature JSON (-j) and a FASTA file (-f), then writes the
# FASTA to stdout with "_<species>" appended to every header line. The species
# comes from read_gene_info(), i.e. with underscores already removed.
# Sequence lines are passed through with trailing whitespace stripped.
parser = optparse.OptionParser()
parser.add_option('-j', '--json', dest="input_gene_filename",
                  help='Gene feature information in JSON format')
parser.add_option('-f', '--fasta', dest="input_fasta_filename",
                  help='Sequences in FASTA format')
options, args = parser.parse_args()

# parser.error() prints the usage text plus the message to stderr and exits
# with status 2, which is more helpful than a traceback for a usage mistake.
if options.input_gene_filename is None:
    parser.error('-j option must be specified')

if options.input_fasta_filename is None:
    parser.error('-f option must be specified')

with open(options.input_gene_filename) as json_fh:
    gene_info = json.load(json_fh)
transcript_species_dict = read_gene_info(gene_info)

with open(options.input_fasta_filename) as fasta_fh:
    for line_number, line in enumerate(fasta_fh, 1):
        line = line.rstrip()
        if line.startswith(">"):
            # The whole header text (minus '>' and leading blanks) is used as
            # the transcript id for the lookup.
            name = line[1:].lstrip()
            try:
                species = transcript_species_dict[name]
            except KeyError:
                # Still a KeyError, but now it says which header failed and
                # where, instead of only echoing the missing key.
                raise KeyError(
                    "FASTA header '%s' (line %d of %s) has no matching "
                    "transcript id in %s"
                    % (name, line_number, options.input_fasta_filename,
                       options.input_gene_filename))
            print(">" + name + "_" + species)
        else:
            print(line)