Mercurial > repos > abims-sbr > mutcount
comparison scripts/S01a_codons_counting.py @ 0:acc3674e515b draft default tip
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:28:50 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc3674e515b |
---|---|
1 #!/usr/bin/env python | |
2 # coding: utf-8 | |
3 # Author : Victor Mataigne | |
4 | |
5 import string, os, sys, re, random, itertools, argparse, copy, math | |
6 import pandas as pd | |
7 import numpy as np | |
8 | |
9 def buildDicts(list_codons, content, dict_genetic_code, dict_aa_classif): | |
10 """ Build dictionaries with values to 0. These dictionaries are used as starting point for each sequence counting | |
11 | |
12 Args : | |
13 list_codons (list of str) : all codons except codons-stop | |
14 content (int or list) : an integer (for coutings and transitions), or an empty list (for resampling) | |
15 dict_genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...} | |
16 dict_aa_classif (dict) : the types of the amino-acids ( {type: [aa1, aa2, ...], ...} | |
17 | |
18 Returns : | |
19 dico_codons, dico_aa, dico_aatypes (dicts) : keys : codons/amico-acids/amico-acids types, values : 0 or [] | |
20 dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions (dicts of dicts) : | |
21 actually, the three first dictionaries nested as values of keys codons/amico-acids/amico-acids | |
22 """ | |
23 | |
24 # I could have make sub-routines here. | |
25 # the copy commands are mandatory, otherwise all dictionaries will reference the same variable(s) | |
26 | |
27 dico_codons = {} | |
28 for codon in list_codons: | |
29 dico_codons[codon] = copy.deepcopy(content) | |
30 | |
31 dico_aa = {} | |
32 for aa in dict_genetic_code.keys(): | |
33 dico_aa[aa] = copy.deepcopy(content) | |
34 | |
35 dico_aatypes = {} | |
36 for aatype in dict_aa_classif.keys(): | |
37 dico_aatypes[aatype] = copy.deepcopy(content) | |
38 | |
39 dico_codons_transitions=copy.deepcopy(dico_codons) | |
40 for key in dico_codons_transitions.keys(): | |
41 dico_codons_transitions[key]=copy.deepcopy(dico_codons) | |
42 | |
43 dico_aa_transitions = copy.deepcopy(dico_aa) | |
44 for key in dico_aa_transitions.keys(): | |
45 dico_aa_transitions[key]=copy.deepcopy(dico_aa) | |
46 | |
47 dico_aatypes_transitions = copy.deepcopy(dico_aatypes) | |
48 for key in dico_aatypes_transitions.keys(): | |
49 dico_aatypes_transitions[key]=copy.deepcopy(dico_aatypes) | |
50 | |
51 return dico_codons, dico_aa, dico_aatypes, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions | |
52 | |
53 def viable(seqs, pos, method): | |
54 """ Compute if, among a set of sequences, either at least one of the codons at the specified position has not a "-", | |
55 or not any codon has a "-" | |
56 | |
57 Args : | |
58 seqs : a list (the sequences, which must all have the same size) | |
59 pos : an integer (the positions, <= len(seqs) -3) | |
60 | |
61 Returns: | |
62 bool | |
63 """ | |
64 codons = [seq[pos:pos+3] for seq in seqs] | |
65 if method == "all": | |
66 return not all("-" in codon for codon in codons) | |
67 elif method == "any": | |
68 return not any("-" in codon for codon in codons) | |
69 | |
70 # # # Function for codons, aa, aatypes Countings ------------------------------------------------------------------------------- | |
71 | |
72 def computeAllCountingsAndFreqs(seq, list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif): | |
73 """ Call all functions dedicated to the computation of occurences and frequencies of codons, amino-acids, amino-acids types | |
74 | |
75 Args : see sub-routines | |
76 | |
77 Returns : 6 dictionaries (occurences and frequencies for codons, amino-acids, amino-acids types). See sub-routines for details. | |
78 """ | |
79 | |
80 # ------ Sub-routines ------ # | |
81 | |
82 def codonsCountings(seq, list_codons, init_dict_codons): | |
83 """ Count occurences of each codon in a sequence | |
84 First reading frame only : input sequence is supposed to be an ORF | |
85 | |
86 Args : | |
87 seq (str) : the sequence | |
88 list_codons (list of str) : all codons except codons-stop | |
89 init_dict_codons (dict) : {codon1 : 0, codon2: 0, ...} | |
90 | |
91 Return : | |
92 codon (dict) : codons (keys) and their occurences (values) in the sequence | |
93 """ | |
94 | |
95 codons = copy.deepcopy(init_dict_codons) | |
96 | |
97 l = len(seq) | |
98 | |
99 if l%3 == 0: max_indice = l-3 | |
100 if l%3 == 1: max_indice = l-4 | |
101 if l%3 == 2: max_indice = l-5 | |
102 | |
103 for codon in range(0,max_indice+1,3): | |
104 if "-" not in seq[codon:codon+3] : | |
105 codons[seq[codon:codon+3]] += 1 | |
106 | |
107 return codons | |
108 | |
109 def codonsFreqs(dict_codons_counts): | |
110 """ Computes frequencies of each codon in a sequence | |
111 | |
112 Args : | |
113 dict_codons (dict) : the output of codonsCounting() | |
114 | |
115 Return : | |
116 codons_freqs (dict) : codons (keys) and their frequencies (values) | |
117 """ | |
118 | |
119 codons_freqs = {} | |
120 | |
121 for key in dict_codons_counts.keys(): | |
122 freq = float(dict_codons_counts[key])/sum(dict_codons_counts.values()) | |
123 codons_freqs[key] = freq | |
124 | |
125 return codons_freqs | |
126 | |
127 def aaCountings(dict_codons, dict_genetic_code, init_dict_aa): | |
128 """ Count occurences of each amino-acid in a sequence, based on the countings of codons (1st ORF) | |
129 | |
130 Args : | |
131 dict_codons (dict) : the output of codonsCounting() | |
132 dict_genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...} | |
133 init_dict_aa (dict) : {aa1 : 0, aa2: 0, ...} | |
134 | |
135 Return : | |
136 dict_aa (dict) : amino-acids (keys) and their occurences (values) | |
137 """ | |
138 dict_aa = copy.deepcopy(init_dict_aa) | |
139 | |
140 for key in dict_codons.keys(): | |
141 for value in dict_genetic_code.values(): | |
142 if key in value: | |
143 dict_aa[dict_genetic_code.keys()[dict_genetic_code.values().index(value)]] += dict_codons[key] | |
144 | |
145 return dict_aa | |
146 | |
147 def aaFreqs(dict_aa_counts): | |
148 """ Computes frequencies of each amino-acid in a sequence, based on the countings of codons (1st ORF) | |
149 | |
150 Args : | |
151 dict_aa_counts (dict) : the output of aaCountings() | |
152 | |
153 Return : | |
154 dict_aa_freqs (dict) : amino-acids (keys) and their frequencies (values) | |
155 """ | |
156 | |
157 dict_aa_freqs = {} | |
158 | |
159 for key in dict_aa_counts.keys(): | |
160 freq = float(dict_aa_counts[key])/sum(dict_aa_counts.values()) | |
161 dict_aa_freqs[key] = freq | |
162 | |
163 return dict_aa_freqs | |
164 | |
165 def aatypesCountings(dict_aa, dict_aa_classif, init_dict_classif): | |
166 """ Computes frequencies of each amino-acid type in a sequence, based on the countings of amino-acids (1st ORF) | |
167 | |
168 Args : | |
169 - dict_aa (dict) : the output of aaCountings() | |
170 - dict_aa_classif (dict) : the types of the amino-acids ( {type: [aa1, aa2, ...], ...} ) | |
171 - init_dict_classif (dict) : {'polar': 0, 'apolar': 0, ...} | |
172 | |
173 Return : | |
174 dict_aatypes (dict) : amino-acids types (keys) and their occurences (values) | |
175 """ | |
176 dict_aatypes = copy.deepcopy(init_dict_classif) | |
177 | |
178 for key_classif in dict_aa_classif.keys(): | |
179 for key_aa in dict_aa.keys(): | |
180 if key_aa in dict_aa_classif[key_classif]: | |
181 dict_aatypes[key_classif] += dict_aa[key_aa] | |
182 | |
183 return dict_aatypes | |
184 | |
185 def aatypesFreqs(dict_aatypes): | |
186 """ Computes frequencies of each amino-acid type in a sequence, based on the countings of amino-acids (1st ORF) | |
187 | |
188 Args : | |
189 dict_aatypes (dict) : the output of aatypesCountings() | |
190 | |
191 Return : | |
192 dict_aatypes_freqs : amino-acids types (keys) and their frequencies (values) | |
193 """ | |
194 dict_aatypes_freqs = {} | |
195 | |
196 for key in dict_aatypes.keys(): | |
197 freq = float(dict_aatypes[key])/sum(dict_aatypes.values()) | |
198 dict_aatypes_freqs[key] = freq | |
199 | |
200 return dict_aatypes_freqs | |
201 | |
202 # ------ The function ------ # | |
203 | |
204 codons_c = codonsCountings(seq, list_codons, init_dict_codons) | |
205 codons_f = codonsFreqs(codons_c) | |
206 aa_c = aaCountings(codons_c, dict_genetic_code, init_dict_aa) | |
207 aa_f = aaFreqs(aa_c) | |
208 aat_c = aatypesCountings(aa_c, dict_aa_classif, init_dict_classif) | |
209 aat_f = aatypesFreqs(aat_c) | |
210 | |
211 return codons_c, codons_f, aa_c, aa_f, aat_c, aat_f | |
212 | |
213 # # # Functions for various measures (ivywrel, ekqh...) ------------------------------------------------------------------------- | |
214 | |
215 def computeVarious(seq, dict_aa_counts, dict_aa_types): | |
216 """ Call al the functions for nucleic and amino-acids sequences description | |
217 | |
218 Args : See sub-routines for details. | |
219 | |
220 Returns : 6 integer or floats. See sub-routines for details. | |
221 """ | |
222 | |
223 # ------ Sub-routines ------ # | |
224 | |
225 def nbCodons(seq): | |
226 """ Compute the number of full codons in a sequence | |
227 Arg : seq (str): the sequence | |
228 Return: nb_codons (int) | |
229 """ | |
230 l = len(seq) | |
231 if l%3 == 0: nb_codons = l/3 | |
232 if l%3 == 1: nb_codons = (l-1)/3 | |
233 if l%3 == 2: nb_codons = (l-2)/3 | |
234 return nb_codons | |
235 | |
236 def maxIndice(seq): | |
237 """ Compute the highest indice for parsing a sequence | |
238 Arg : seq (str): the sequence | |
239 Return : max_indice (int) | |
240 """ | |
241 l = len(seq) | |
242 if l%3 == 0: max_indice = l-3 | |
243 if l%3 == 1: max_indice = l-4 | |
244 if l%3 == 2: max_indice = l-5 | |
245 return max_indice | |
246 | |
247 def gc12Andgc3Count(seq, nb_codons, max_indice): | |
248 """ Compute the frequency of gc12 in a sequence | |
249 Arg : seq (str) : the sequence | |
250 Return : (float) | |
251 """ | |
252 | |
253 # TO IMPROVE ? : make this computation in the codonCountigns() function to avoid parsing twice the sequence ? | |
254 # But : involves a less readable code | |
255 | |
256 gc12 = 0 | |
257 gc3 = 0 | |
258 | |
259 for i in range(0, max_indice+1,3): | |
260 if seq[i] in ["c","g"]: gc12 += 1 | |
261 if seq[i+1] in ["c","g"]: gc12 += 1 | |
262 if seq[i+2] in ["c","g"] or seq[i+2] in ["c","g"]: gc3 += 1 | |
263 | |
264 return float(gc3)/nb_codons, float(gc12)/(2*nb_codons) | |
265 | |
266 def ivywrelCount(nb_codons, dict_aa_counts): | |
267 """ Compute the sum of occurences of amino-acids IVYWREL divided by the number of codons | |
268 | |
269 Args : | |
270 nb_codons (int) : the number of codons in the sequence | |
271 dict_aa_counts (dict) : the output of aaCountings() | |
272 | |
273 return : (float) | |
274 """ | |
275 | |
276 IVYWREL = 0 | |
277 | |
278 for aa in ["I","V","Y","W","R","E","L"]: # Impossible to make a simple sum, in case one the aa is not in the dict keys | |
279 if aa in dict_aa_counts.keys(): | |
280 IVYWREL += dict_aa_counts[aa] | |
281 | |
282 return float(IVYWREL)/nb_codons | |
283 | |
284 def ekqhCount(dict_aa_counts): | |
285 """ Compute the ratio of amino-acids EK/QH | |
286 Arg : dict_aa_counts (dict) : the output of aaCountings() | |
287 Return : (float) | |
288 """ | |
289 ek = 0 | |
290 qh = 0 | |
291 | |
292 ek = dict_aa_counts["E"] + dict_aa_counts["K"] | |
293 qh = dict_aa_counts["Q"] + dict_aa_counts["H"] | |
294 | |
295 if qh != 0: | |
296 return float(ek)/qh | |
297 else : return ek | |
298 | |
299 def payresdgmCount(dict_aa_counts): | |
300 """ Compute the ratio of amino-acids PASRE/SDGM | |
301 Arg : dict_aa_counts (dict) : the output of aaCountings() | |
302 Return : (float) | |
303 """ | |
304 payre = 0 | |
305 sdgm = 0 | |
306 | |
307 for aa in ["P","A","Y","R","E"]: | |
308 payre += dict_aa_counts[aa] | |
309 for aa in ["S","D","G","M"]: | |
310 sdgm += dict_aa_counts[aa] | |
311 | |
312 if sdgm != 0: | |
313 return float(payre)/sdgm | |
314 else : return payre | |
315 | |
316 def purineLoad(seq, nb_codons): | |
317 """ Compute the purine load indice of a sequence | |
318 Args : | |
319 seq (str) : the sequence | |
320 nb_codons (int) : the number of codons in the sequence | |
321 | |
322 Return (float) | |
323 """ | |
324 | |
325 # TO IMPROVE ? : make this computation in the codonCountigns() function to avoid parsing twice the sequence ? | |
326 # But : involves a less readable code | |
327 | |
328 g12, g3, A, c12, c3, T = 0.0,0.0,seq.count("a"),0.0,0.0,seq.count("t") | |
329 | |
330 # g3 and c3 : g and c in 3rd position of a codon | |
331 s = "" | |
332 for i in range(2, len(seq), 3): | |
333 s += seq[i] | |
334 g3 = s.count("g") | |
335 c3 = s.count("c") | |
336 | |
337 # g12 and c12 : g and c in 1st and 2d positions of a codon | |
338 s = "" | |
339 for i in range(0, len(seq), 3): | |
340 s += seq[i] | |
341 g12 = s.count("g") | |
342 c12 = s.count("c") | |
343 s = "" | |
344 for i in range(1, len(seq), 3): | |
345 s += seq[i] | |
346 g12 += s.count("g") | |
347 c12 += s.count("c") | |
348 | |
349 return float(1000*(g12+g3+A-c12-c3-T))/(3*nb_codons) | |
350 | |
351 def cvp(dict_aatypes): | |
352 """ Compute the difference nb_charged_aamino_acids - nb_polar_amino_acids | |
353 Return: (int) | |
354 """ | |
355 return dict_aatypes["charged"] - dict_aatypes["polar"] | |
356 | |
357 # ------ The function ------ # | |
358 | |
359 nb_codons = nbCodons(seq) | |
360 max_indice = maxIndice(seq) | |
361 GC3, GC12 = gc12Andgc3Count(seq, nb_codons, max_indice) | |
362 IVYWREL, EKQH, PAYRESDGM = ivywrelCount(nb_codons, dict_aa_counts), ekqhCount(dict_aa_counts), payresdgmCount(dict_aa_counts) | |
363 purineload, CvP = purineLoad(seq, nb_codons), cvp(dict_aa_types) | |
364 return GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP | |
365 | |
366 # # # Function for codons, aa, aatypes Transitions ----------------------------------------------------------------------------- | |
367 | |
368 def computeAllBiases(seq1, seq2, dico_codons_transi, dico_aa_transi, dico_aatypes_transi, reversecode, reverseclassif) : | |
369 """ Compute all biases (transisitions codon->codon, aa->-aa, aa_type->aa_type) between two sequences | |
370 | |
371 Args : See sub-routines for details | |
372 | |
373 Returns 3 dictionaries of dictionaries. See sub-routines for details | |
374 """ | |
375 | |
376 # ------ Sub-routines ------ # | |
377 | |
378 def codons_transitions(seq1, seq2, dico_codons_transi): | |
379 """ Compute the number of transitions from a codon of a sequence to the codon of a second sequence | |
380 | |
381 Args : | |
382 seq1 (str) : the first sequence | |
383 seq2 (str) : the second sequence | |
384 dico_codons_transi (dict of dicts) : { codon1 : {codon1: 0, codon2 : 0, ...}, ..} | |
385 | |
386 Return : | |
387 codons_transi (dict of dicts) : the occurences of each codon to codon transition | |
388 """ | |
389 | |
390 codons_transi = copy.deepcopy(dico_codons_transi) | |
391 | |
392 for i in range(0, len(seq1), 3): | |
393 # check if no indel and if len seq[i:i+3] is really 3 (check for the last codon) | |
394 if viable([seq1, seq2], i, "any") and len(seq1[i:i+3]) == 3 and len(seq2[i:i+3]) == 3 : | |
395 codons_transi[seq1[i:i+3]][seq2[i:i+3]] += 1 | |
396 | |
397 return codons_transi | |
398 | |
399 def codons_transitions_freqs(codons_transitions_counts): | |
400 """ Computes frequencies of codon transitions between two sequences | |
401 | |
402 Arg : | |
403 codons_transitions_counts (dict) : the output of codons_transitions() | |
404 | |
405 Return : | |
406 codons_transi_freqs (dict of dicts) : the frequencies of each codon to codon transition | |
407 """ | |
408 | |
409 codons_transi_freqs = {} | |
410 | |
411 for key in codons_transitions_counts.keys(): | |
412 codons_transi_freqs[key] = {} | |
413 for key2 in codons_transitions_counts[key].keys(): | |
414 if sum(codons_transitions_counts[key].values()) != 0: | |
415 freq = float(codons_transitions_counts[key][key2])/sum(codons_transitions_counts[key].values()) | |
416 codons_transi_freqs[key][key2] = freq | |
417 else : | |
418 codons_transi_freqs[key][key2] = 0 | |
419 return codons_transi_freqs | |
420 | |
421 def aa_transitions(dico_codons_transi, dico_aa_transi, reversecode): | |
422 """ Compute the number of transitions from an amino-acid of a sequence to the amino-acid of a second sequence | |
423 | |
424 Args : | |
425 dico_codons_transi (dict of dicts) : the codons transitions computed by codons_transitions() | |
426 dico_aa_transi (dict of dicts) : { aa1 : {aa1: 0, aa2 : 0, ...}, ..} | |
427 reversecode (dict) : the reversed genetic code {aa1 : [codons],...} -> {codon1: aa1, codon2: aa2, ...} | |
428 | |
429 Return : | |
430 aa_transi (dict of dicts) : the occurences of each aa to aa transition | |
431 """ | |
432 | |
433 aa_transi = copy.deepcopy(dico_aa_transi) | |
434 | |
435 for k in dico_codons_transi.keys(): | |
436 newk = reversecode[k] | |
437 for k2 in dico_codons_transi[k].keys(): | |
438 newk2 = reversecode[k2] | |
439 aa_transi[newk][newk2] += dico_codons_transi[k][k2] | |
440 | |
441 return aa_transi | |
442 | |
443 def aa_transitions_freqs(aa_transitions_counts): | |
444 """ Computes frequencies of amico-acids transitions between two sequences | |
445 Arg : aa_transitions_counts (dict of dicts): the output of aa_transitions() | |
446 Return : aa_transi_freqs (dict of dicts) : the frequencies of each aa to aa transition | |
447 """ | |
448 | |
449 aa_transi_freqs = {} | |
450 | |
451 for key in aa_transitions_counts.keys(): | |
452 aa_transi_freqs[key] = {} | |
453 for key2 in aa_transitions_counts[key].keys(): | |
454 if sum(aa_transitions_counts[key].values()) != 0: | |
455 freq = float(aa_transitions_counts[key][key2])/sum(aa_transitions_counts[key].values()) | |
456 aa_transi_freqs[key][key2] = freq | |
457 else : | |
458 aa_transi_freqs[key][key2] = 0 | |
459 return aa_transi_freqs | |
460 | |
461 def aatypes_transitions(dico_aa_transi, dico_aatypes_transi, reverseclassif): | |
462 """ Compute the number of transitions from an amino-acid type of a sequence to the amino-acid type of a second sequence | |
463 | |
464 Args : | |
465 dico_aa_transi (dict of dicts) : the output of aa_transitions() | |
466 dico_aatypes_transi (dict of dicts) : { type1 : {type1: 0, type2 : 0, ...}, ..} | |
467 reverseclassif (dict) : the reversed amino-acid clasification {aa1: type, aa2: type, ...} | |
468 | |
469 Return : | |
470 aatypes_transi (dict of dicts) : the occurences of each aatype to aatype transition | |
471 """ | |
472 | |
473 aatypes_transi = copy.deepcopy(dico_aatypes_transi) | |
474 for k in dico_aa_transi.keys(): | |
475 newk = reverseclassif[k] | |
476 for k2 in dico_aa_transi[k].keys(): | |
477 newk2 = reverseclassif[k2] | |
478 aatypes_transi[newk][newk2] += dico_aa_transi[k][k2] | |
479 | |
480 return aatypes_transi | |
481 | |
482 def aatypes_transitions_freqs(aatypes_transitions_counts): | |
483 """ Computes frequencies of amico-acids types transitions between two sequences | |
484 Args : aatypes_transitions_counts (dict of dicts) : the output of aatypes_transitions() | |
485 Return : aatypes_transi_freqs (dict of dicts) : the frequencies of each aatype to aatype transition | |
486 """ | |
487 | |
488 aatypes_transi_freqs = {} | |
489 | |
490 for key in aatypes_transitions_counts.keys(): | |
491 aatypes_transi_freqs[key] = {} | |
492 for key2 in aatypes_transitions_counts[key].keys(): | |
493 if sum(aatypes_transitions_counts[key].values()) != 0: | |
494 freq = float(aatypes_transitions_counts[key][key2])/sum(aatypes_transitions_counts[key].values()) | |
495 aatypes_transi_freqs[key][key2] = freq | |
496 else : | |
497 aatypes_transi_freqs[key][key2] = 0 | |
498 return aatypes_transi_freqs | |
499 | |
500 | |
501 | |
502 | |
503 # ------ The function ------ # | |
504 | |
505 codons_transitions = codons_transitions(seq1, seq2, dico_codons_transi) | |
506 codons_transitions_freqs = codons_transitions_freqs(codons_transitions) | |
507 aa_transitions = aa_transitions(codons_transitions, dico_aa_transi, reversecode) | |
508 aa_transitions_freqs = aa_transitions_freqs(aa_transitions) | |
509 aatypes_transitions = aatypes_transitions(aa_transitions, dico_aatypes_transi, reverseclassif) | |
510 aatypes_transitions_freqs = aatypes_transitions_freqs(aatypes_transitions) | |
511 | |
512 return codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs | |
513 | |
514 def all_sed(codons_c, aa_c, aat_c, codons_transitions, aa_transitions, aatypes_transitions, dico_codons_transi, dico_aa_transi, dico_aatypes_transi): | |
515 | |
516 def compute_sed(transi, counts, dico): | |
517 """ Compute the substitution exchangeability disequilibrium (SED) from one species A to another B between codons/aa//aatypes couples | |
518 | |
519 Args: | |
520 transi ; dict - dictionaries of all counts of transition from codon/aa/aatype X to Y from sp A to sp B | |
521 counts : dict - dictionaries of codons/aa/aatypes counts in species A | |
522 dico : dict - a dictionary (nested) with all values to 0 | |
523 | |
524 """ | |
525 dict_sed = copy.deepcopy(dico) | |
526 | |
527 for key in transi.keys(): | |
528 for key2 in transi.keys(): | |
529 if counts[key] != 0 and float(transi[key2][key])/counts[key] != 0.0: | |
530 x = (float(transi[key][key2])/counts[key]) / (float(transi[key2][key])/counts[key]) | |
531 dict_sed[key][key2] = - pow(2,1-x)+1 | |
532 else : | |
533 dict_sed[key][key2] = 'NA' | |
534 | |
535 return dict_sed | |
536 | |
537 codons_sed = compute_sed(codons_transitions, codons_c, dico_codons_transi) | |
538 aa_sed = compute_sed(aa_transitions, aa_c, dico_aa_transi) | |
539 aatypes_sed = compute_sed(aatypes_transitions, aat_c, dico_aatypes_transi) | |
540 | |
541 return codons_sed, aa_sed, aatypes_sed | |
542 | |
543 # # # Function for random resampling -------------------------------------------------------------------------------------------- | |
544 | |
545 def sampling (dict_seq, nb_iter, len_sample, list_codons, genetic_code, aa_classif, reversecode, reverseclassif): | |
546 """ Resample randomly codons from sequences (sort of bootsrap) | |
547 | |
548 Args : | |
549 dict_seq (dict) : contains the species name and sequences used ( { 'sp1': seq1, 'sp2': seq2, ...}) without '-' removal | |
550 nb_iter (int) : number of resampling iterations (better >= 1000) | |
551 len_sample (int) : length (in codons) of a resampled sequence (better >= 1000) | |
552 list_codons (list of str): all codons except codons-stop | |
553 genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...} | |
554 aa_classif (dict) : the types of the amino-acids : {type: [aa1, aa2, ...], ...} | |
555 reversecode (dict) : the reversed genetic code : {codon1: aa1, codon2: aa2, ...} | |
556 reverseclassif (dict) : the reversed amino-acid : clasification {aa1: type, aa2: type, ...} | |
557 | |
558 Returns : | |
559 codons_lst, aa_lst, classif_lst (dicts) : keys : codons/aa/aatypes, values : [] | |
560 codons_transitions_lst, aa_transitions_lst, classif_transitions_lst (dict of dicts) : keys : codons/aa/aatypes, values : the 3 previous dicts | |
561 """ | |
562 | |
563 # Initialize empty dictionaries for countings and transitions. It's also possible to isntanciate these ones in the main() but it would make a function with ~14 parameters | |
564 codons_0, aa_0, classif_0, codons_transitions_0, aa_transitions_0, classif_transitions_0 = buildDicts(list_codons, 0, genetic_code, aa_classif) | |
565 codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst = buildDicts(list_codons, [], genetic_code, aa_classif) | |
566 | |
567 # determine the max position where sampling is possible | |
568 l = len(dict_seq.values()[1]) | |
569 if l%3 == 0: max_indice = l-3 | |
570 if l%3 == 1: max_indice = l-4 | |
571 if l%3 == 2: max_indice = l-5 | |
572 | |
573 # List of positions to resample | |
574 viable_positions = [pos for pos in range(0,max_indice,3) if viable(dict_seq.values(), pos, "all")] | |
575 sample_positions = np.random.choice(viable_positions, len_sample) | |
576 | |
577 # nb_iter resampled sequences | |
578 for i in range(nb_iter): | |
579 if (i+1)%(nb_iter/10) == 0: | |
580 print " "+str( (i+1)*100/nb_iter)+"%" | |
581 | |
582 seqa, seqb = "", "" | |
583 for pos in sample_positions: | |
584 codona, codonb = "---", "---" | |
585 # The sequence to be resampled in this position is randomly chosen ; no "-" resampled | |
586 while "-" in codona : | |
587 codona = dict_seq.values()[random.randrange(0, len(dict_seq.keys())-1)][pos:pos+3] | |
588 while "-" in codonb : | |
589 codonb = dict_seq.values()[random.randrange(0, len(dict_seq.keys())-1)][pos:pos+3] | |
590 seqa += codona | |
591 seqb += codonb | |
592 | |
593 # dictionaries : frequences of codons, aa, aatypes (seq1) | |
594 codons_occ_tmp, codons_freq_tmp, aa_occ_tmp, aa_freq_tmp, aatypes_occ_tmp, aatypes_freq_tmp = computeAllCountingsAndFreqs(seqa, list_codons, codons_0, aa_0, classif_0, genetic_code, aa_classif) | |
595 # dictionaries frequences of transitions (seqa->seqb) | |
596 codons_transitions_tmp, codons_transitions_freq_tmp, aa_transition_tmp, aa_transitions_freq_tmp, aatypes_transitions_tmp, aatypes_transitions_freq_tmp = computeAllBiases(seqa, seqb, codons_transitions_0, aa_transitions_0, classif_transitions_0, reversecode, reverseclassif) | |
597 | |
598 # Adding occurrences in final dicts | |
599 for key in codons_freq_tmp.keys(): | |
600 codons_lst[key].append(codons_freq_tmp[key]) | |
601 for key in aa_freq_tmp.keys(): | |
602 aa_lst[key].append(aa_freq_tmp[key]) | |
603 for key in aatypes_freq_tmp.keys(): | |
604 classif_lst[key].append(aatypes_freq_tmp[key]) | |
605 | |
606 # Adding occurrences in final dicts (transitions) | |
607 for key in codons_transitions_freq_tmp.keys(): | |
608 for key2 in codons_transitions_freq_tmp[key].keys(): | |
609 codons_transitions_lst[key][key2].append(codons_transitions_freq_tmp[key][key2]) | |
610 for key in aa_transitions_freq_tmp.keys(): | |
611 for key2 in aa_transitions_freq_tmp[key].keys(): | |
612 aa_transitions_lst[key][key2].append(aa_transitions_freq_tmp[key][key2]) | |
613 for key in aatypes_transitions_freq_tmp.keys(): | |
614 for key2 in aatypes_transitions_freq_tmp[key].keys(): | |
615 classif_transitions_lst[key][key2].append(aatypes_transitions_freq_tmp[key][key2]) | |
616 | |
617 return codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst | |
618 | |
619 def testPvalues(dict_counts, dict_resampling, nb_iter, method): | |
620 """ Computes where the observed value is located in the expected counting distribution | |
621 | |
622 Args : | |
623 dict_counts (dict) : observed frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() | |
624 dict_resampling (dict) : expected frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() within the sampling() function | |
625 | |
626 Return : | |
627 pvalue (dict, dict of dicts) : the pvalues of all observed countings (dict) and transitions (dict of dicts) | |
628 | |
629 | |
630 pnorm computes the pvalue to have a value inferior to the observed value under a normal distribution | |
631 One sided to left tail : | |
632 p < 0.05 indicates significantly lower counts | |
633 p > 0.95 indicates significantly higher counts | |
634 """ | |
635 | |
636 | |
637 def p_resampling(obs, values, nb_iter): | |
638 """ The pvalue is the proportion of bootsrapped values smaller than the observed value | |
639 If p = 0.025 : 2.5% of the bootstrapped values are smaller than the observed value | |
640 p < 0.025 : the obs value is most likely significantly lower. | |
641 If p = 0.975 : 97.5% of the bootstrapped values are smaller than the observed value | |
642 p > 0.975 the obs value is most likely significantly higher. | |
643 | |
644 Args : | |
645 obs : int or float - the observed value | |
646 values : list - values of resampling (int or floats) | |
647 nb_iter : int - the number of resampled values (=len(values)) | |
648 | |
649 Return : | |
650 pvalue (float) | |
651 """ | |
652 | |
653 num = len([x for x in values if x < obs]) | |
654 return float(num + 1) / (nb_iter+1) | |
655 | |
656 def testPvalue(obs, exp, nb_iter): | |
657 """ Compute a pvalue | |
658 | |
659 Args : | |
660 exp (list of floatsà : a list of length nb_iter, containing expected frequencies of a codon/aa/aatype at each iteration | |
661 obs (float) : the observed value | |
662 nb_iter (int) : the number of iterations for resampling | |
663 | |
664 Returns : | |
665 pvalue (float) | |
666 """ | |
667 | |
668 max_val = nb_iter-1 | |
669 min_val = 0 | |
670 test_val = (max_val+min_val)/2 | |
671 | |
672 while max_val-min_val > 1: | |
673 if obs > exp[test_val]: | |
674 min_val = test_val | |
675 test_val = (max_val+min_val)/2 | |
676 elif obs < exp[test_val]: | |
677 max_val = test_val | |
678 test_val = (max_val+min_val)/2 | |
679 else: | |
680 break | |
681 | |
682 pvalue = float(test_val+1)/(nb_iter+1) | |
683 | |
684 return pvalue | |
685 | |
686 # ------ The function ------ # | |
687 | |
688 pvalues = {} | |
689 | |
690 for key in dict_resampling.keys(): | |
691 if type(dict_resampling.values()[1]) is not dict : | |
692 if method == 'origin': | |
693 pvalues[key] = testPvalue(dict_counts[key], dict_resampling[key], nb_iter) | |
694 elif method == 'pnorm': | |
695 pvalues[key] = scipy.stats.norm.cdf(dict_counts[key], np.mean(dict_resampling[key]), np.std(dict_resampling[key])) | |
696 elif method == 'p_resampling': | |
697 pvalues[key] = p_resampling(dict_counts[key], dict_resampling[key], nb_iter) | |
698 else : | |
699 pvalues[key] = {} | |
700 for key2 in dict_resampling[key].keys(): | |
701 if method == 'origin': | |
702 pvalues[key][key2] = testPvalue(dict_counts[key][key2], dict_resampling[key][key2], nb_iter) | |
703 elif method == 'pnorm': | |
704 pvalues[key][key2] = scipy.stats.norm.cdf(dict_counts[key][key2], np.mean(dict_resampling[key][key2]), np.std(dict_resampling[key][key2])) | |
705 elif method == 'p_resampling': | |
706 pvalues[key][key2] = p_resampling(dict_counts[key][key2], dict_resampling[key][key2], nb_iter) | |
707 | |
708 return pvalues | |
709 | |
710 def main(): | |
711 | |
712 # arguments | |
713 parser = argparse.ArgumentParser() | |
714 parser.add_argument("sequences_file", help="File containing sequences (the output of the tool 'ConcatPhyl'") | |
715 parser.add_argument("considered_species", help="The species name, separated by commas (must be the same than in the sequences_file). It is possible to consider only a subset of species.") | |
716 parser.add_argument("species_for_bootstrap", help="The species which will be used for bootstrapping, separated by commas. It is possible to consider only a subset of species.") | |
717 parser.add_argument("iteration", help="The number of iterations for bootstrapping (better if => 1000)", type=int) | |
718 parser.add_argument("sample_length", help="The lenght of a bootstrapped sequences (better if >= 1000", type=int) | |
719 args = parser.parse_args() | |
720 | |
721 print "\n ------ Occurences and frequencies of codons, amino-acids, amino-acids types -------\n" | |
722 | |
723 print "The script counts the number of codons, amino acids, and types of amino acids in sequences," | |
724 print "as well as the mutation bias from one item to another between 2 sequences.\n" | |
725 | |
726 print "Counting are then compared to empirical p-values, obtained from bootstrapped sequences obtained from a subset of sequences." | |
727 print "In the output files, the pvalues indicate the position of the observed data in a distribution of empirical countings obtained from" | |
728 print "a resample of the data. Values above 0.95 indicate a significantly higher counting, values under 0.05 a significantly lower counting." | |
729 | |
730 print " Sequences file : {}".format(args.sequences_file) | |
731 print " Species retained for countings : {}\n".format(args.considered_species) | |
732 | |
733 print "Processing : reading input file, opening output files, building dictionaries." | |
734 | |
735 # make pairs | |
736 list_species = str.split(args.considered_species, ",") | |
737 list_species_boot = str.split(args.species_for_bootstrap, ",") | |
738 pairs_list=list(itertools.combinations(list_species,2)) | |
739 | |
740 # read sequences | |
741 sequences_for_counts = {} | |
742 sequences_for_resampling = {} | |
743 with open(args.sequences_file, "r") as file: | |
744 for line1,line2 in itertools.izip_longest(*[file]*2): | |
745 species = line1.strip(">\r\n") | |
746 sequence = line2.strip("\r\n") | |
747 if species in list_species: | |
748 sequences_for_counts[species] = sequence | |
749 if species in list_species_boot: | |
750 sequences_for_resampling[species] = sequence | |
751 | |
752 print " Warning : countings might be biased and show high differences between species because of high variations of the indels proportions among sequences." | |
753 print " Frequences are more representative." | |
754 | |
755 print "\n Indels percent :" | |
756 | |
757 for k,v in sequences_for_counts.items(): | |
758 print " {} : {} %".format(k, float(v.count("-"))/len(v)*100) | |
759 | |
760 # useful dictionaries | |
761 dict_genetic_code={"F":["ttt","ttc"], | |
762 "L":["tta","ttg","ctt","ctc","cta","ctg"], | |
763 "I":["att","atc","ata"], | |
764 "M":["atg"], | |
765 "V":["gtt","gtc","gta","gtg"], | |
766 "S":["tct","tcc","tca","tcg","agt","agc"], | |
767 "P":["cct","cca","ccg","ccc"], | |
768 "T":["act","acc","aca","acg"], | |
769 "A":["gct","gcc","gca","gcg"], | |
770 "Y":["tat","tac"], | |
771 "H":["cat","cac"], | |
772 "Q":["caa","cag"], | |
773 "N":["aat","aac"], | |
774 "K":["aaa","aag"], | |
775 "D":["gat","gac"], | |
776 "E":["gaa","gag"], | |
777 "C":["tgt","tgc"], | |
778 "W":["tgg"], | |
779 "R":["cgt","cgc","cga","cgg","aga","agg"], | |
780 "G":["ggt","ggc","gga","ggg"]} | |
781 | |
782 dict_aa_classif={"unpolar":["G","A","V","L","M","I"], | |
783 "polar":["S","T","C","P","N","Q"], | |
784 "charged":["K","R","H","D","E"], | |
785 "aromatics":["F","Y","W"]} | |
786 | |
787 reversecode={v:k for k in dict_genetic_code for v in dict_genetic_code[k]} | |
788 reverseclassif={v:k for k in dict_aa_classif for v in dict_aa_classif[k]} | |
789 | |
790 # codons list (without stop codons) | |
791 nucleotides = ['a', 'c', 'g', 't'] | |
792 list_codons = [''.join(comb) for comb in itertools.product(nucleotides, repeat=3)] | |
793 list_codons.remove("taa") | |
794 list_codons.remove("tag") | |
795 list_codons.remove("tga") | |
796 | |
797 # Store already computed species + row.names in output files | |
798 index = [] | |
799 index_transi = [] | |
800 | |
801 # Final dictionaries writed to csv files | |
802 all_codons = {} | |
803 all_aa = {} | |
804 all_aatypes = {} | |
805 all_various = {} | |
806 all_codons_transitions = {} # Not used because too much : 61*61 columns | |
807 all_aa_transitions = {} | |
808 all_aatypes_transitions = {} | |
809 | |
810 # RUN | |
811 | |
812 print "\nProcessing : resampling ..." | |
813 print " Parameters : {niter} iterations, {lensample} codon per resampled sequence, species used : {species}\n".format(niter=args.iteration, lensample=args.sample_length, species=args.species_for_bootstrap) | |
814 | |
815 codons_boot, aa_boot, aatypes_boot, codons_transi_boot, aa_transi_boot, aatypes_transi_boot = sampling(sequences_for_resampling, args.iteration, args.sample_length, list_codons, dict_genetic_code, dict_aa_classif, reversecode, reverseclassif) | |
816 print " Done.\n" | |
817 | |
818 print "Processing : countings....\n" | |
819 | |
820 # Initialize empty dictionaries for countings and transitions | |
821 init_dict_codons, init_dict_aa, init_dict_classif, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions = buildDicts(list_codons, 0, dict_genetic_code, dict_aa_classif) | |
822 | |
823 for pair in pairs_list: | |
824 p1, p2 = pair[0], pair[1] | |
825 if p1 not in index: | |
826 print "Countings on {}".format(p1) | |
827 | |
828 p1_codons_counts, p1_codons_freqs, p1_aa_counts, p1_aa_freqs, p1_aatypes_counts, p1_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p1], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif) | |
829 p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP = computeVarious(sequences_for_counts[p1], p1_aa_counts, p1_aatypes_freqs) | |
830 | |
831 | |
832 p1_codons_pvalues = testPvalues(p1_codons_freqs, codons_boot, args.iteration, 'p_resampling') | |
833 p1_aa_pvalues = testPvalues(p1_aa_freqs, aa_boot, args.iteration, 'p_resampling') | |
834 p1_aatypes_pvalues = testPvalues(p1_aatypes_freqs, aatypes_boot, args.iteration, 'p_resampling') | |
835 | |
836 all_codons[p1+"_obs_counts"] = p1_codons_counts | |
837 all_codons[p1+"_obs_freqs"] = p1_codons_freqs | |
838 all_codons[p1+"_pvalues"] = p1_codons_pvalues | |
839 all_aa[p1+"_obs_counts"] = p1_aa_counts | |
840 all_aa[p1+"_obs_freqs"] = p1_aa_freqs | |
841 all_aa[p1+"_pvalues"] = p1_aa_pvalues | |
842 all_aatypes[p1+"_obs_counts"] = p1_aatypes_counts | |
843 all_aatypes[p1+"_obs_freqs"] = p1_aatypes_freqs | |
844 all_aatypes[p1+"_pvalues"] = p1_aatypes_pvalues | |
845 all_various[p1] = [p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP] | |
846 | |
847 index.append(p1) | |
848 | |
849 if p2 not in index: | |
850 print "Countings on {}".format(p2) | |
851 | |
852 p2_codons_counts, p2_codons_freqs, p2_aa_counts, p2_aa_freqs, p2_aatypes_counts, p2_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p2], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif) | |
853 p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP = computeVarious(sequences_for_counts[p2], p2_aa_counts, p2_aatypes_freqs) | |
854 | |
855 p2_codons_pvalues = testPvalues(p2_codons_freqs, codons_boot, args.iteration, 'p_resampling') | |
856 p2_aa_pvalues = testPvalues(p2_aa_freqs, aa_boot, args.iteration, 'p_resampling') | |
857 p2_aatypes_pvalues = testPvalues(p2_aatypes_freqs, aatypes_boot, args.iteration, 'p_resampling') | |
858 | |
859 all_codons[p2+"_obs_counts"] = p2_codons_counts | |
860 all_codons[p2+"_obs_freqs"] = p2_codons_freqs | |
861 all_codons[p2+"_pvalues"] = p2_codons_pvalues | |
862 all_aa[p2+"_obs_counts"] = p2_aa_counts | |
863 all_aa[p2+"_obs_freqs"] = p2_aa_freqs | |
864 all_aa[p2+"_pvalues"] = p2_aa_pvalues | |
865 all_aatypes[p2+"_obs_counts"] = p2_aatypes_counts | |
866 all_aatypes[p2+"_obs_freqs"] = p2_aatypes_freqs | |
867 all_aatypes[p2+"_pvalues"] = p2_aatypes_pvalues | |
868 all_various[p2] = p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP | |
869 | |
870 index.append(p2) | |
871 | |
872 if (p1, p2) not in index_transi and p1 in sequences_for_counts and p2 in sequences_for_counts: | |
873 print "Countings transitions between {} and {}".format(p1, p2) | |
874 codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs = computeAllBiases(sequences_for_counts[p1], sequences_for_counts[p2], dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions, reversecode, reverseclassif) | |
875 | |
876 # Ajout | |
877 codons_sed, aa_sed, aatypes_sed = all_sed(p1_codons_counts, p1_aa_counts, p1_aatypes_counts, codons_transitions, aa_transitions, aatypes_transitions, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions) | |
878 | |
879 index_transi.append((p1,p2)) | |
880 | |
881 p1p2_codons_pvalues = testPvalues(codons_transitions_freqs, codons_transi_boot, args.iteration, 'p_resampling') | |
882 p1p2_aa_pvalues = testPvalues(aa_transitions_freqs, aa_transi_boot, args.iteration, 'p_resampling') | |
883 p1p2_aatypes_pvalues = testPvalues(aatypes_transitions_freqs, aatypes_transi_boot, args.iteration, 'p_resampling') | |
884 | |
885 all_codons_transitions[p1+">"+p2+"_obs_counts"] = codons_transitions | |
886 all_codons_transitions[p1+">"+p2+"_obs_freqs"] = codons_transitions_freqs | |
887 all_codons_transitions[p1+">"+p2+"_pvalues"] = p1p2_codons_pvalues | |
888 all_aa_transitions[p1+">"+p2+"_obs_counts"] = aa_transitions | |
889 all_aa_transitions[p1+">"+p2+"_obs_freqs"] = aa_transitions_freqs | |
890 all_aa_transitions[p1+">"+p2+"_pvalues"] = p1p2_aa_pvalues | |
891 all_aatypes_transitions[p1+">"+p2+"_obs_counts"] = aatypes_transitions | |
892 all_aatypes_transitions[p1+">"+p2+"_obs_freqs"] = aatypes_transitions_freqs | |
893 all_aatypes_transitions[p1+">"+p2+"_pvalues"] = p1p2_aatypes_pvalues | |
894 | |
895 all_codons_transitions[p1+">"+p2+"_sed"] = codons_sed | |
896 all_aa_transitions[p1+">"+p2+"_sed"] = aa_sed | |
897 all_aatypes_transitions[p1+">"+p2+"_sed"] = aatypes_sed | |
898 | |
899 index_transi.append((p1, p2)) | |
900 | |
901 print "\n Done.\n" | |
902 | |
903 print "Processing : creating dataframes ..." | |
904 | |
905 frame_codons = pd.DataFrame(all_codons).T.astype('object') | |
906 frame_aa = pd.DataFrame(all_aa).T.astype('object') | |
907 frame_aatypes = pd.DataFrame(all_aatypes).T.astype('object') | |
908 | |
909 frame_codons_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_codons_transitions.items()}).unstack() | |
910 frame_codons_transitions.columns = frame_codons_transitions.columns.map('>'.join) | |
911 | |
912 frame_aa_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_aa_transitions.items()}).unstack() | |
913 frame_aa_transitions.columns = frame_aa_transitions.columns.map('>'.join) | |
914 | |
915 frame_aatypes_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_aatypes_transitions.items()}).unstack() | |
916 frame_aatypes_transitions.columns = frame_aatypes_transitions.columns.map('>'.join) | |
917 | |
918 frame_various = pd.DataFrame(all_various).T | |
919 frame_various.columns = ["GC3","GC12","IVYWREL","EKQH","PAYRESDGM","purineload", "CvP"] | |
920 | |
921 frame_codons.index.name, frame_aa.index.name, frame_aatypes.index.name = "Species", "Species","Species" | |
922 frame_aa_transitions.index.name, frame_aatypes_transitions.index.name, frame_various.index.name = "Species","Species","Species" | |
923 | |
924 print "Writing dataframes to output files ...\n" | |
925 | |
926 frame_codons.round(8).to_csv("codons_freqs.csv", sep=",", encoding="utf-8") | |
927 frame_aa.round(8).to_csv("aa_freqs.csv", sep=",", encoding="utf-8") | |
928 frame_aatypes.astype('object').round(8).to_csv("aatypes_freqs.csv", sep=",", encoding="utf-8") | |
929 frame_codons_transitions.round(8).to_csv("codons_transitions_freqs.csv", sep=",", encoding="utf-8") | |
930 frame_aa_transitions.round(8).to_csv("aa_transitions_freqs.csv", sep=",", encoding="utf-8") | |
931 frame_aatypes_transitions.round(8).to_csv("aatypes_transitions_freqs.csv", sep=",", encoding="utf-8") | |
932 frame_various.round(8).to_csv("gc_and_others_freqs.csv", sep=",", encoding="utf-8") | |
933 | |
934 print "Done." | |
935 | |
936 if __name__ == "__main__": | |
937 main() |