view codon_usage.py @ 31:3cb2af2435d3 draft

Uploaded
author gianmarco_piccinno
date Wed, 12 Dec 2018 08:48:22 -0500
parents e85a1feaaf38
children 6e6b1fd6eda1
line wrap: on
line source

#!/usr/bin/env python

import Bio as Bio
from Bio import SeqIO
from Bio.Data import CodonTable
import re
from pprint import pprint
import argparse as ap
import sys
import os
import pandas as pd


def read_input(data = "example.fna", type_ = "fasta"):

    """
    Accepted formats:
        - fasta (multifasta)
        - gff
        - gbk

    """


    seqs = ""

    if type_ == "fasta":
        with open(data, "rU") as handle:
            for record in SeqIO.parse(handle, type_):
                seqs = seqs + str(record.seq)


    #elif type_ == "gff":
    #    with open(data, "rU") as handle:
    #        for record in GFF.parse(handle):
    #            seqs = seqs + str(record.seq)

    elif type_ == "gbk":
        with open(data, "rU") as input_handle:
                for record in SeqIO.parse(input_handle, "genbank"):
                    seqs = seqs + str(record.seq)


    return seqs

def codon_usage(seqs, codonTable):

    codon_usage = {}
    tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]

    b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table


    for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
        b_cod_table[cod] = "_Stop"

    for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
            b_cod_table[cod + " Start"] = b_cod_table[cod]
            b_cod_table.pop(cod)

    aas = set(b_cod_table.values())


    for aa in aas:
        codon_usage[aa] = {}
        for codon in b_cod_table.keys():
            if b_cod_table[codon] == aa:
                codon_usage[aa][codon] = tmp.count(codon.split(" ")[0])


    tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}

    codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
    codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
    codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))

    return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}

if __name__ == '__main__':

    parser = ap.ArgumentParser(description=
    'This script takes as input gff, gbk and single or multifasta files and \n'
    'compute the codon usage for a specified codon table.\n'
    'Usage:\n'
    'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n'
    'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n'
    'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n',
    formatter_class=ap.RawTextHelpFormatter)

    parser.add_argument('-i','--input', help='The path to the input file',required=True)
    parser.add_argument('-t','--type', help=
    'The format of the file [genebank, fasta, gff ...]', required=True)
    parser.add_argument('-c','--codonTable', help=
    'The codon table to be used [Standard, Bacterial, Archaeal ...]\n'
    'Alternative Flatworm Mitochondrial,\\n'
    'Alternative Yeast Nuclear,\n'
    'Archaeal,\n'
    'Ascidian Mitochondrial,\n'
    'Bacterial,\n'
    'Blastocrithidia Nuclear,\n'
    'Blepharisma Macronuclear,\n'
    'Candidate Division SR1,\n'
    'Chlorophycean Mitochondrial,\n'
    'Ciliate Nuclear,\n'
    'Coelenterate Mitochondrial,\n'
    'Condylostoma Nuclear,\n'
    'Dasycladacean Nuclear,\n'
    'Echinoderm Mitochondrial,\n'
    'Euplotid Nuclear,\n'
    'Flatworm Mitochondrial,\n'
    'Gracilibacteria,\n'
    'Hexamita Nuclear,\n'
    'Invertebrate Mitochondrial,\n'
    'Karyorelict Nuclear,\n'
    'Mesodinium Nuclear,\n'
    'Mold Mitochondrial,\n'
    'Mycoplasma,\n'
    'Pachysolen tannophilus Nuclear,\n'
    'Peritrich Nuclear,\n'
    'Plant Plastid,\n'
    'Protozoan Mitochondrial,\n'
    'Pterobranchia Mitochondrial,\n'
    'SGC0,\n'
    'SGC1,\n'
    'SGC2,\n'
    'SGC3,\n'
    'SGC4,\n'
    'SGC5,\n'
    'SGC8,\n'
    'SGC9,\n'
    'Scenedesmus obliquus Mitochondrial,\n'
    'Spiroplasma,\n'
    'Standard,\n'
    'Thraustochytrium Mitochondrial,\n'
    'Trematode Mitochondrial,\n'
    'Vertebrate Mitochondrial,\n'
    'Yeast Mitochondrial\n', required=True)

    parser.add_argument('-o','--output', help='Description for bar argument', required=True)
    args = vars(parser.parse_args())

    seqs = read_input(data=args['input'], type_=args['type'])
    out = codon_usage(seqs, args['codonTable'])

    with open(args['output']+".csv", "w") as outf:
        out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"])