Mercurial > repos > gianmarco_piccinno > project_rm
view codon_usage.py @ 31:3cb2af2435d3 draft
Uploaded
author | gianmarco_piccinno |
---|---|
date | Wed, 12 Dec 2018 08:48:22 -0500 |
parents | e85a1feaaf38 |
children | 6e6b1fd6eda1 |
line wrap: on
line source
#!/usr/bin/env python import Bio as Bio from Bio import SeqIO from Bio.Data import CodonTable import re from pprint import pprint import argparse as ap import sys import os import pandas as pd def read_input(data = "example.fna", type_ = "fasta"): """ Accepted formats: - fasta (multifasta) - gff - gbk """ seqs = "" if type_ == "fasta": with open(data, "rU") as handle: for record in SeqIO.parse(handle, type_): seqs = seqs + str(record.seq) #elif type_ == "gff": # with open(data, "rU") as handle: # for record in GFF.parse(handle): # seqs = seqs + str(record.seq) elif type_ == "gbk": with open(data, "rU") as input_handle: for record in SeqIO.parse(input_handle, "genbank"): seqs = seqs + str(record.seq) return seqs def codon_usage(seqs, codonTable): codon_usage = {} tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: b_cod_table[cod] = "_Stop" for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: b_cod_table[cod + " Start"] = b_cod_table[cod] b_cod_table.pop(cod) aas = set(b_cod_table.values()) for aa in aas: codon_usage[aa] = {} for codon in b_cod_table.keys(): if b_cod_table[codon] == aa: codon_usage[aa][codon] = tmp.count(codon.split(" ")[0]) tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} if __name__ == '__main__': parser = ap.ArgumentParser(description= 'This script takes as input gff, gbk and single or multifasta files and \n' 'compute the codon usage for a specified codon table.\n' 'Usage:\n' 'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n' 'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n' 'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n', formatter_class=ap.RawTextHelpFormatter) parser.add_argument('-i','--input', help='The path to the input file',required=True) parser.add_argument('-t','--type', help= 'The format of the file [genebank, fasta, gff ...]', required=True) parser.add_argument('-c','--codonTable', help= 'The codon table to be used [Standard, Bacterial, Archaeal ...]\n' 'Alternative Flatworm Mitochondrial,\\n' 'Alternative Yeast Nuclear,\n' 'Archaeal,\n' 'Ascidian Mitochondrial,\n' 'Bacterial,\n' 'Blastocrithidia Nuclear,\n' 'Blepharisma Macronuclear,\n' 'Candidate Division SR1,\n' 'Chlorophycean Mitochondrial,\n' 'Ciliate Nuclear,\n' 'Coelenterate Mitochondrial,\n' 'Condylostoma Nuclear,\n' 'Dasycladacean Nuclear,\n' 'Echinoderm Mitochondrial,\n' 'Euplotid Nuclear,\n' 'Flatworm Mitochondrial,\n' 'Gracilibacteria,\n' 'Hexamita Nuclear,\n' 'Invertebrate Mitochondrial,\n' 'Karyorelict Nuclear,\n' 'Mesodinium Nuclear,\n' 'Mold Mitochondrial,\n' 'Mycoplasma,\n' 'Pachysolen tannophilus Nuclear,\n' 'Peritrich Nuclear,\n' 'Plant Plastid,\n' 'Protozoan Mitochondrial,\n' 'Pterobranchia Mitochondrial,\n' 'SGC0,\n' 'SGC1,\n' 'SGC2,\n' 'SGC3,\n' 'SGC4,\n' 'SGC5,\n' 'SGC8,\n' 'SGC9,\n' 'Scenedesmus obliquus Mitochondrial,\n' 'Spiroplasma,\n' 'Standard,\n' 'Thraustochytrium Mitochondrial,\n' 'Trematode Mitochondrial,\n' 'Vertebrate Mitochondrial,\n' 'Yeast Mitochondrial\n', required=True) parser.add_argument('-o','--output', help='Description for bar argument', required=True) args = vars(parser.parse_args()) seqs = read_input(data=args['input'], type_=args['type']) out = codon_usage(seqs, args['codonTable']) with open(args['output']+".csv", "w") as outf: out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"])