28
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import Bio as Bio
|
|
4 from Bio import SeqIO
|
|
5 from Bio.Data import CodonTable
|
|
6 import re
|
|
7 from pprint import pprint
|
|
8 import argparse as ap
|
|
9 import sys
|
|
10 import os
|
|
11 import pandas as pd
|
|
12
|
|
13
|
|
14 def read_input(data = "example.fna", type_ = "fasta"):
|
|
15
|
|
16 """
|
|
17 Accepted formats:
|
|
18 - fasta (multifasta)
|
|
19 - gff
|
|
20 - gbk
|
|
21
|
|
22 """
|
|
23
|
|
24
|
|
25 seqs = ""
|
|
26
|
|
27 if type_ == "fasta":
|
|
28 with open(data, "rU") as handle:
|
|
29 for record in SeqIO.parse(handle, type_):
|
|
30 seqs = seqs + str(record.seq)
|
|
31
|
|
32
|
|
33 #elif type_ == "gff":
|
|
34 # with open(data, "rU") as handle:
|
|
35 # for record in GFF.parse(handle):
|
|
36 # seqs = seqs + str(record.seq)
|
|
37
|
|
38 elif type_ == "gbk":
|
|
39 with open(data, "rU") as input_handle:
|
|
40 for record in SeqIO.parse(input_handle, "genbank"):
|
|
41 seqs = seqs + str(record.seq)
|
|
42
|
|
43
|
|
44 return seqs
|
|
45
|
|
46 def codon_usage(seqs, codonTable):
|
|
47
|
|
48 codon_usage = {}
|
|
49 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
|
|
50
|
|
51 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
|
|
52
|
|
53
|
|
54 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
|
|
55 b_cod_table[cod] = "_Stop"
|
|
56
|
|
57 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
|
|
58 b_cod_table[cod + " Start"] = b_cod_table[cod]
|
|
59 b_cod_table.pop(cod)
|
|
60
|
|
61 aas = set(b_cod_table.values())
|
|
62
|
|
63
|
|
64 for aa in aas:
|
|
65 codon_usage[aa] = {}
|
|
66 for codon in b_cod_table.keys():
|
|
67 if b_cod_table[codon] == aa:
|
|
68 codon_usage[aa][codon] = tmp.count(codon.split(" ")[0])
|
|
69
|
|
70
|
|
71 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
|
|
72
|
|
73 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
|
|
74 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
|
|
75 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
|
|
76
|
|
77 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
|
|
78
|
|
79 if __name__ == '__main__':
|
|
80
|
|
81 parser = ap.ArgumentParser(description=
|
|
82 'This script takes as input gff, gbk and single or multifasta files and \n'
|
|
83 'compute the codon usage for a specified codon table.\n'
|
|
84 'Usage:\n'
|
|
85 'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n'
|
|
86 'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n'
|
|
87 'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n',
|
|
88 formatter_class=ap.RawTextHelpFormatter)
|
|
89
|
|
90 parser.add_argument('-i','--input', help='The path to the input file',required=True)
|
|
91 parser.add_argument('-t','--type', help=
|
|
92 'The format of the file [genebank, fasta, gff ...]', required=True)
|
|
93 parser.add_argument('-c','--codonTable', help=
|
|
94 'The codon table to be used [Standard, Bacterial, Archaeal ...]\n'
|
|
95 'Alternative Flatworm Mitochondrial,\\n'
|
|
96 'Alternative Yeast Nuclear,\n'
|
|
97 'Archaeal,\n'
|
|
98 'Ascidian Mitochondrial,\n'
|
|
99 'Bacterial,\n'
|
|
100 'Blastocrithidia Nuclear,\n'
|
|
101 'Blepharisma Macronuclear,\n'
|
|
102 'Candidate Division SR1,\n'
|
|
103 'Chlorophycean Mitochondrial,\n'
|
|
104 'Ciliate Nuclear,\n'
|
|
105 'Coelenterate Mitochondrial,\n'
|
|
106 'Condylostoma Nuclear,\n'
|
|
107 'Dasycladacean Nuclear,\n'
|
|
108 'Echinoderm Mitochondrial,\n'
|
|
109 'Euplotid Nuclear,\n'
|
|
110 'Flatworm Mitochondrial,\n'
|
|
111 'Gracilibacteria,\n'
|
|
112 'Hexamita Nuclear,\n'
|
|
113 'Invertebrate Mitochondrial,\n'
|
|
114 'Karyorelict Nuclear,\n'
|
|
115 'Mesodinium Nuclear,\n'
|
|
116 'Mold Mitochondrial,\n'
|
|
117 'Mycoplasma,\n'
|
|
118 'Pachysolen tannophilus Nuclear,\n'
|
|
119 'Peritrich Nuclear,\n'
|
|
120 'Plant Plastid,\n'
|
|
121 'Protozoan Mitochondrial,\n'
|
|
122 'Pterobranchia Mitochondrial,\n'
|
|
123 'SGC0,\n'
|
|
124 'SGC1,\n'
|
|
125 'SGC2,\n'
|
|
126 'SGC3,\n'
|
|
127 'SGC4,\n'
|
|
128 'SGC5,\n'
|
|
129 'SGC8,\n'
|
|
130 'SGC9,\n'
|
|
131 'Scenedesmus obliquus Mitochondrial,\n'
|
|
132 'Spiroplasma,\n'
|
|
133 'Standard,\n'
|
|
134 'Thraustochytrium Mitochondrial,\n'
|
|
135 'Trematode Mitochondrial,\n'
|
|
136 'Vertebrate Mitochondrial,\n'
|
|
137 'Yeast Mitochondrial\n', required=True)
|
|
138
|
|
139 parser.add_argument('-o','--output', help='Description for bar argument', required=True)
|
|
140 args = vars(parser.parse_args())
|
|
141
|
|
142 seqs = read_input(data=args['input'], type_=args['type'])
|
|
143 out = codon_usage(seqs, args['codonTable'])
|
|
144
|
31
|
145 with open(args['output']+".csv", "w") as outf:
|
28
|
146 out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"])
|
|
147
|
|
148 |