comparison codon_usage.py @ 28:e85a1feaaf38 draft

Uploaded
author fabio
date Wed, 12 Dec 2018 08:12:25 -0500
parents
children 3cb2af2435d3
comparison
equal deleted inserted replaced
27:1eb19659ff16 28:e85a1feaaf38
1 #!/usr/bin/env python
2
3 import Bio as Bio
4 from Bio import SeqIO
5 from Bio.Data import CodonTable
6 import re
7 from pprint import pprint
8 import argparse as ap
9 import sys
10 import os
11 import pandas as pd
12 #from BCBio import GFF
13
14
15 def read_input(data = "example.fna", type_ = "fasta"):
16
17 """
18 Accepted formats:
19 - fasta (multifasta)
20 - gff
21 - gbk
22
23 """
24
25
26 seqs = ""
27
28 if type_ == "fasta":
29 with open(data, "rU") as handle:
30 for record in SeqIO.parse(handle, type_):
31 seqs = seqs + str(record.seq)
32
33
34 #elif type_ == "gff":
35 # with open(data, "rU") as handle:
36 # for record in GFF.parse(handle):
37 # seqs = seqs + str(record.seq)
38
39 elif type_ == "gbk":
40 with open(data, "rU") as input_handle:
41 for record in SeqIO.parse(input_handle, "genbank"):
42 seqs = seqs + str(record.seq)
43
44
45 return seqs
46
47 def codon_usage(seqs, codonTable):
48
49 codon_usage = {}
50 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
51
52 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
53
54
55 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
56 b_cod_table[cod] = "_Stop"
57
58 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
59 b_cod_table[cod + " Start"] = b_cod_table[cod]
60 b_cod_table.pop(cod)
61
62 aas = set(b_cod_table.values())
63
64
65 for aa in aas:
66 codon_usage[aa] = {}
67 for codon in b_cod_table.keys():
68 if b_cod_table[codon] == aa:
69 codon_usage[aa][codon] = tmp.count(codon.split(" ")[0])
70
71
72 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
73
74 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
75 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
76 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
77
78 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
79
80 if __name__ == '__main__':
81
82 parser = ap.ArgumentParser(description=
83 'This script takes as input gff, gbk and single or multifasta files and \n'
84 'compute the codon usage for a specified codon table.\n'
85 'Usage:\n'
86 'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n'
87 'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n'
88 'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n',
89 formatter_class=ap.RawTextHelpFormatter)
90
91 parser.add_argument('-i','--input', help='The path to the input file',required=True)
92 parser.add_argument('-t','--type', help=
93 'The format of the file [genebank, fasta, gff ...]', required=True)
94 parser.add_argument('-c','--codonTable', help=
95 'The codon table to be used [Standard, Bacterial, Archaeal ...]\n'
96 'Alternative Flatworm Mitochondrial,\\n'
97 'Alternative Yeast Nuclear,\n'
98 'Archaeal,\n'
99 'Ascidian Mitochondrial,\n'
100 'Bacterial,\n'
101 'Blastocrithidia Nuclear,\n'
102 'Blepharisma Macronuclear,\n'
103 'Candidate Division SR1,\n'
104 'Chlorophycean Mitochondrial,\n'
105 'Ciliate Nuclear,\n'
106 'Coelenterate Mitochondrial,\n'
107 'Condylostoma Nuclear,\n'
108 'Dasycladacean Nuclear,\n'
109 'Echinoderm Mitochondrial,\n'
110 'Euplotid Nuclear,\n'
111 'Flatworm Mitochondrial,\n'
112 'Gracilibacteria,\n'
113 'Hexamita Nuclear,\n'
114 'Invertebrate Mitochondrial,\n'
115 'Karyorelict Nuclear,\n'
116 'Mesodinium Nuclear,\n'
117 'Mold Mitochondrial,\n'
118 'Mycoplasma,\n'
119 'Pachysolen tannophilus Nuclear,\n'
120 'Peritrich Nuclear,\n'
121 'Plant Plastid,\n'
122 'Protozoan Mitochondrial,\n'
123 'Pterobranchia Mitochondrial,\n'
124 'SGC0,\n'
125 'SGC1,\n'
126 'SGC2,\n'
127 'SGC3,\n'
128 'SGC4,\n'
129 'SGC5,\n'
130 'SGC8,\n'
131 'SGC9,\n'
132 'Scenedesmus obliquus Mitochondrial,\n'
133 'Spiroplasma,\n'
134 'Standard,\n'
135 'Thraustochytrium Mitochondrial,\n'
136 'Trematode Mitochondrial,\n'
137 'Vertebrate Mitochondrial,\n'
138 'Yeast Mitochondrial\n', required=True)
139
140 parser.add_argument('-o','--output', help='Description for bar argument', required=True)
141 args = vars(parser.parse_args())
142
143 seqs = read_input(data=args['input'], type_=args['type'])
144 out = codon_usage(seqs, args['codonTable'])
145
146 with open(args['output'], "w") as outf:
147 out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"])
148
149