28
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import Bio as Bio
|
|
4 from Bio import SeqIO
|
|
5 from Bio.Data import CodonTable
|
|
6 import re
|
|
7 from pprint import pprint
|
|
8 import argparse as ap
|
|
9 import sys
|
|
10 import os
|
|
11 import pandas as pd
|
|
12 #from BCBio import GFF
|
|
13
|
|
14
|
|
15 def read_input(data = "example.fna", type_ = "fasta"):
|
|
16
|
|
17 """
|
|
18 Accepted formats:
|
|
19 - fasta (multifasta)
|
|
20 - gff
|
|
21 - gbk
|
|
22
|
|
23 """
|
|
24
|
|
25
|
|
26 seqs = ""
|
|
27
|
|
28 if type_ == "fasta":
|
|
29 with open(data, "rU") as handle:
|
|
30 for record in SeqIO.parse(handle, type_):
|
|
31 seqs = seqs + str(record.seq)
|
|
32
|
|
33
|
|
34 #elif type_ == "gff":
|
|
35 # with open(data, "rU") as handle:
|
|
36 # for record in GFF.parse(handle):
|
|
37 # seqs = seqs + str(record.seq)
|
|
38
|
|
39 elif type_ == "gbk":
|
|
40 with open(data, "rU") as input_handle:
|
|
41 for record in SeqIO.parse(input_handle, "genbank"):
|
|
42 seqs = seqs + str(record.seq)
|
|
43
|
|
44
|
|
45 return seqs
|
|
46
|
|
47 def codon_usage(seqs, codonTable):
|
|
48
|
|
49 codon_usage = {}
|
|
50 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
|
|
51
|
|
52 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
|
|
53
|
|
54
|
|
55 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
|
|
56 b_cod_table[cod] = "_Stop"
|
|
57
|
|
58 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
|
|
59 b_cod_table[cod + " Start"] = b_cod_table[cod]
|
|
60 b_cod_table.pop(cod)
|
|
61
|
|
62 aas = set(b_cod_table.values())
|
|
63
|
|
64
|
|
65 for aa in aas:
|
|
66 codon_usage[aa] = {}
|
|
67 for codon in b_cod_table.keys():
|
|
68 if b_cod_table[codon] == aa:
|
|
69 codon_usage[aa][codon] = tmp.count(codon.split(" ")[0])
|
|
70
|
|
71
|
|
72 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
|
|
73
|
|
74 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
|
|
75 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
|
|
76 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
|
|
77
|
|
78 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
|
|
79
|
|
80 if __name__ == '__main__':
|
|
81
|
|
82 parser = ap.ArgumentParser(description=
|
|
83 'This script takes as input gff, gbk and single or multifasta files and \n'
|
|
84 'compute the codon usage for a specified codon table.\n'
|
|
85 'Usage:\n'
|
|
86 'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n'
|
|
87 'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n'
|
|
88 'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n',
|
|
89 formatter_class=ap.RawTextHelpFormatter)
|
|
90
|
|
91 parser.add_argument('-i','--input', help='The path to the input file',required=True)
|
|
92 parser.add_argument('-t','--type', help=
|
|
93 'The format of the file [genebank, fasta, gff ...]', required=True)
|
|
94 parser.add_argument('-c','--codonTable', help=
|
|
95 'The codon table to be used [Standard, Bacterial, Archaeal ...]\n'
|
|
96 'Alternative Flatworm Mitochondrial,\\n'
|
|
97 'Alternative Yeast Nuclear,\n'
|
|
98 'Archaeal,\n'
|
|
99 'Ascidian Mitochondrial,\n'
|
|
100 'Bacterial,\n'
|
|
101 'Blastocrithidia Nuclear,\n'
|
|
102 'Blepharisma Macronuclear,\n'
|
|
103 'Candidate Division SR1,\n'
|
|
104 'Chlorophycean Mitochondrial,\n'
|
|
105 'Ciliate Nuclear,\n'
|
|
106 'Coelenterate Mitochondrial,\n'
|
|
107 'Condylostoma Nuclear,\n'
|
|
108 'Dasycladacean Nuclear,\n'
|
|
109 'Echinoderm Mitochondrial,\n'
|
|
110 'Euplotid Nuclear,\n'
|
|
111 'Flatworm Mitochondrial,\n'
|
|
112 'Gracilibacteria,\n'
|
|
113 'Hexamita Nuclear,\n'
|
|
114 'Invertebrate Mitochondrial,\n'
|
|
115 'Karyorelict Nuclear,\n'
|
|
116 'Mesodinium Nuclear,\n'
|
|
117 'Mold Mitochondrial,\n'
|
|
118 'Mycoplasma,\n'
|
|
119 'Pachysolen tannophilus Nuclear,\n'
|
|
120 'Peritrich Nuclear,\n'
|
|
121 'Plant Plastid,\n'
|
|
122 'Protozoan Mitochondrial,\n'
|
|
123 'Pterobranchia Mitochondrial,\n'
|
|
124 'SGC0,\n'
|
|
125 'SGC1,\n'
|
|
126 'SGC2,\n'
|
|
127 'SGC3,\n'
|
|
128 'SGC4,\n'
|
|
129 'SGC5,\n'
|
|
130 'SGC8,\n'
|
|
131 'SGC9,\n'
|
|
132 'Scenedesmus obliquus Mitochondrial,\n'
|
|
133 'Spiroplasma,\n'
|
|
134 'Standard,\n'
|
|
135 'Thraustochytrium Mitochondrial,\n'
|
|
136 'Trematode Mitochondrial,\n'
|
|
137 'Vertebrate Mitochondrial,\n'
|
|
138 'Yeast Mitochondrial\n', required=True)
|
|
139
|
|
140 parser.add_argument('-o','--output', help='Description for bar argument', required=True)
|
|
141 args = vars(parser.parse_args())
|
|
142
|
|
143 seqs = read_input(data=args['input'], type_=args['type'])
|
|
144 out = codon_usage(seqs, args['codonTable'])
|
|
145
|
|
146 with open(args['output'], "w") as outf:
|
|
147 out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"])
|
|
148
|
|
149 |