Mercurial > repos > gianmarco_piccinno > project_rm
comparison codon_usage.py @ 28:e85a1feaaf38 draft
Uploaded
author | fabio |
---|---|
date | Wed, 12 Dec 2018 08:12:25 -0500 |
parents | |
children | 3cb2af2435d3 |
comparison
equal
deleted
inserted
replaced
27:1eb19659ff16 | 28:e85a1feaaf38 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import Bio as Bio | |
4 from Bio import SeqIO | |
5 from Bio.Data import CodonTable | |
6 import re | |
7 from pprint import pprint | |
8 import argparse as ap | |
9 import sys | |
10 import os | |
11 import pandas as pd | |
12 #from BCBio import GFF | |
13 | |
14 | |
15 def read_input(data = "example.fna", type_ = "fasta"): | |
16 | |
17 """ | |
18 Accepted formats: | |
19 - fasta (multifasta) | |
20 - gff | |
21 - gbk | |
22 | |
23 """ | |
24 | |
25 | |
26 seqs = "" | |
27 | |
28 if type_ == "fasta": | |
29 with open(data, "rU") as handle: | |
30 for record in SeqIO.parse(handle, type_): | |
31 seqs = seqs + str(record.seq) | |
32 | |
33 | |
34 #elif type_ == "gff": | |
35 # with open(data, "rU") as handle: | |
36 # for record in GFF.parse(handle): | |
37 # seqs = seqs + str(record.seq) | |
38 | |
39 elif type_ == "gbk": | |
40 with open(data, "rU") as input_handle: | |
41 for record in SeqIO.parse(input_handle, "genbank"): | |
42 seqs = seqs + str(record.seq) | |
43 | |
44 | |
45 return seqs | |
46 | |
47 def codon_usage(seqs, codonTable): | |
48 | |
49 codon_usage = {} | |
50 tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] | |
51 | |
52 b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table | |
53 | |
54 | |
55 for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: | |
56 b_cod_table[cod] = "_Stop" | |
57 | |
58 for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: | |
59 b_cod_table[cod + " Start"] = b_cod_table[cod] | |
60 b_cod_table.pop(cod) | |
61 | |
62 aas = set(b_cod_table.values()) | |
63 | |
64 | |
65 for aa in aas: | |
66 codon_usage[aa] = {} | |
67 for codon in b_cod_table.keys(): | |
68 if b_cod_table[codon] == aa: | |
69 codon_usage[aa][codon] = tmp.count(codon.split(" ")[0]) | |
70 | |
71 | |
72 tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} | |
73 | |
74 codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) | |
75 codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) | |
76 codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) | |
77 | |
78 return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} | |
79 | |
80 if __name__ == '__main__': | |
81 | |
82 parser = ap.ArgumentParser(description= | |
83 'This script takes as input gff, gbk and single or multifasta files and \n' | |
84 'compute the codon usage for a specified codon table.\n' | |
85 'Usage:\n' | |
86 'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n' | |
87 'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n' | |
88 'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n', | |
89 formatter_class=ap.RawTextHelpFormatter) | |
90 | |
91 parser.add_argument('-i','--input', help='The path to the input file',required=True) | |
92 parser.add_argument('-t','--type', help= | |
93 'The format of the file [genebank, fasta, gff ...]', required=True) | |
94 parser.add_argument('-c','--codonTable', help= | |
95 'The codon table to be used [Standard, Bacterial, Archaeal ...]\n' | |
96 'Alternative Flatworm Mitochondrial,\\n' | |
97 'Alternative Yeast Nuclear,\n' | |
98 'Archaeal,\n' | |
99 'Ascidian Mitochondrial,\n' | |
100 'Bacterial,\n' | |
101 'Blastocrithidia Nuclear,\n' | |
102 'Blepharisma Macronuclear,\n' | |
103 'Candidate Division SR1,\n' | |
104 'Chlorophycean Mitochondrial,\n' | |
105 'Ciliate Nuclear,\n' | |
106 'Coelenterate Mitochondrial,\n' | |
107 'Condylostoma Nuclear,\n' | |
108 'Dasycladacean Nuclear,\n' | |
109 'Echinoderm Mitochondrial,\n' | |
110 'Euplotid Nuclear,\n' | |
111 'Flatworm Mitochondrial,\n' | |
112 'Gracilibacteria,\n' | |
113 'Hexamita Nuclear,\n' | |
114 'Invertebrate Mitochondrial,\n' | |
115 'Karyorelict Nuclear,\n' | |
116 'Mesodinium Nuclear,\n' | |
117 'Mold Mitochondrial,\n' | |
118 'Mycoplasma,\n' | |
119 'Pachysolen tannophilus Nuclear,\n' | |
120 'Peritrich Nuclear,\n' | |
121 'Plant Plastid,\n' | |
122 'Protozoan Mitochondrial,\n' | |
123 'Pterobranchia Mitochondrial,\n' | |
124 'SGC0,\n' | |
125 'SGC1,\n' | |
126 'SGC2,\n' | |
127 'SGC3,\n' | |
128 'SGC4,\n' | |
129 'SGC5,\n' | |
130 'SGC8,\n' | |
131 'SGC9,\n' | |
132 'Scenedesmus obliquus Mitochondrial,\n' | |
133 'Spiroplasma,\n' | |
134 'Standard,\n' | |
135 'Thraustochytrium Mitochondrial,\n' | |
136 'Trematode Mitochondrial,\n' | |
137 'Vertebrate Mitochondrial,\n' | |
138 'Yeast Mitochondrial\n', required=True) | |
139 | |
140 parser.add_argument('-o','--output', help='Description for bar argument', required=True) | |
141 args = vars(parser.parse_args()) | |
142 | |
143 seqs = read_input(data=args['input'], type_=args['type']) | |
144 out = codon_usage(seqs, args['codonTable']) | |
145 | |
146 with open(args['output'], "w") as outf: | |
147 out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"]) | |
148 | |
149 |