comparison scripts/S01a_codons_counting.py @ 0:acc3674e515b draft default tip

planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:28:50 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc3674e515b
1 #!/usr/bin/env python
2 # coding: utf-8
3 # Author : Victor Mataigne
4
5 import string, os, sys, re, random, itertools, argparse, copy, math
6 import pandas as pd
7 import numpy as np
8
9 def buildDicts(list_codons, content, dict_genetic_code, dict_aa_classif):
10 """ Build dictionaries with values to 0. These dictionaries are used as starting point for each sequence counting
11
12 Args :
13 list_codons (list of str) : all codons except codons-stop
14 content (int or list) : an integer (for coutings and transitions), or an empty list (for resampling)
15 dict_genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...}
16 dict_aa_classif (dict) : the types of the amino-acids ( {type: [aa1, aa2, ...], ...}
17
18 Returns :
19 dico_codons, dico_aa, dico_aatypes (dicts) : keys : codons/amico-acids/amico-acids types, values : 0 or []
20 dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions (dicts of dicts) :
21 actually, the three first dictionaries nested as values of keys codons/amico-acids/amico-acids
22 """
23
24 # I could have make sub-routines here.
25 # the copy commands are mandatory, otherwise all dictionaries will reference the same variable(s)
26
27 dico_codons = {}
28 for codon in list_codons:
29 dico_codons[codon] = copy.deepcopy(content)
30
31 dico_aa = {}
32 for aa in dict_genetic_code.keys():
33 dico_aa[aa] = copy.deepcopy(content)
34
35 dico_aatypes = {}
36 for aatype in dict_aa_classif.keys():
37 dico_aatypes[aatype] = copy.deepcopy(content)
38
39 dico_codons_transitions=copy.deepcopy(dico_codons)
40 for key in dico_codons_transitions.keys():
41 dico_codons_transitions[key]=copy.deepcopy(dico_codons)
42
43 dico_aa_transitions = copy.deepcopy(dico_aa)
44 for key in dico_aa_transitions.keys():
45 dico_aa_transitions[key]=copy.deepcopy(dico_aa)
46
47 dico_aatypes_transitions = copy.deepcopy(dico_aatypes)
48 for key in dico_aatypes_transitions.keys():
49 dico_aatypes_transitions[key]=copy.deepcopy(dico_aatypes)
50
51 return dico_codons, dico_aa, dico_aatypes, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions
52
53 def viable(seqs, pos, method):
54 """ Compute if, among a set of sequences, either at least one of the codons at the specified position has not a "-",
55 or not any codon has a "-"
56
57 Args :
58 seqs : a list (the sequences, which must all have the same size)
59 pos : an integer (the positions, <= len(seqs) -3)
60
61 Returns:
62 bool
63 """
64 codons = [seq[pos:pos+3] for seq in seqs]
65 if method == "all":
66 return not all("-" in codon for codon in codons)
67 elif method == "any":
68 return not any("-" in codon for codon in codons)
69
70 # # # Function for codons, aa, aatypes Countings -------------------------------------------------------------------------------
71
72 def computeAllCountingsAndFreqs(seq, list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif):
73 """ Call all functions dedicated to the computation of occurences and frequencies of codons, amino-acids, amino-acids types
74
75 Args : see sub-routines
76
77 Returns : 6 dictionaries (occurences and frequencies for codons, amino-acids, amino-acids types). See sub-routines for details.
78 """
79
80 # ------ Sub-routines ------ #
81
82 def codonsCountings(seq, list_codons, init_dict_codons):
83 """ Count occurences of each codon in a sequence
84 First reading frame only : input sequence is supposed to be an ORF
85
86 Args :
87 seq (str) : the sequence
88 list_codons (list of str) : all codons except codons-stop
89 init_dict_codons (dict) : {codon1 : 0, codon2: 0, ...}
90
91 Return :
92 codon (dict) : codons (keys) and their occurences (values) in the sequence
93 """
94
95 codons = copy.deepcopy(init_dict_codons)
96
97 l = len(seq)
98
99 if l%3 == 0: max_indice = l-3
100 if l%3 == 1: max_indice = l-4
101 if l%3 == 2: max_indice = l-5
102
103 for codon in range(0,max_indice+1,3):
104 if "-" not in seq[codon:codon+3] :
105 codons[seq[codon:codon+3]] += 1
106
107 return codons
108
109 def codonsFreqs(dict_codons_counts):
110 """ Computes frequencies of each codon in a sequence
111
112 Args :
113 dict_codons (dict) : the output of codonsCounting()
114
115 Return :
116 codons_freqs (dict) : codons (keys) and their frequencies (values)
117 """
118
119 codons_freqs = {}
120
121 for key in dict_codons_counts.keys():
122 freq = float(dict_codons_counts[key])/sum(dict_codons_counts.values())
123 codons_freqs[key] = freq
124
125 return codons_freqs
126
127 def aaCountings(dict_codons, dict_genetic_code, init_dict_aa):
128 """ Count occurences of each amino-acid in a sequence, based on the countings of codons (1st ORF)
129
130 Args :
131 dict_codons (dict) : the output of codonsCounting()
132 dict_genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...}
133 init_dict_aa (dict) : {aa1 : 0, aa2: 0, ...}
134
135 Return :
136 dict_aa (dict) : amino-acids (keys) and their occurences (values)
137 """
138 dict_aa = copy.deepcopy(init_dict_aa)
139
140 for key in dict_codons.keys():
141 for value in dict_genetic_code.values():
142 if key in value:
143 dict_aa[dict_genetic_code.keys()[dict_genetic_code.values().index(value)]] += dict_codons[key]
144
145 return dict_aa
146
147 def aaFreqs(dict_aa_counts):
148 """ Computes frequencies of each amino-acid in a sequence, based on the countings of codons (1st ORF)
149
150 Args :
151 dict_aa_counts (dict) : the output of aaCountings()
152
153 Return :
154 dict_aa_freqs (dict) : amino-acids (keys) and their frequencies (values)
155 """
156
157 dict_aa_freqs = {}
158
159 for key in dict_aa_counts.keys():
160 freq = float(dict_aa_counts[key])/sum(dict_aa_counts.values())
161 dict_aa_freqs[key] = freq
162
163 return dict_aa_freqs
164
165 def aatypesCountings(dict_aa, dict_aa_classif, init_dict_classif):
166 """ Computes frequencies of each amino-acid type in a sequence, based on the countings of amino-acids (1st ORF)
167
168 Args :
169 - dict_aa (dict) : the output of aaCountings()
170 - dict_aa_classif (dict) : the types of the amino-acids ( {type: [aa1, aa2, ...], ...} )
171 - init_dict_classif (dict) : {'polar': 0, 'apolar': 0, ...}
172
173 Return :
174 dict_aatypes (dict) : amino-acids types (keys) and their occurences (values)
175 """
176 dict_aatypes = copy.deepcopy(init_dict_classif)
177
178 for key_classif in dict_aa_classif.keys():
179 for key_aa in dict_aa.keys():
180 if key_aa in dict_aa_classif[key_classif]:
181 dict_aatypes[key_classif] += dict_aa[key_aa]
182
183 return dict_aatypes
184
185 def aatypesFreqs(dict_aatypes):
186 """ Computes frequencies of each amino-acid type in a sequence, based on the countings of amino-acids (1st ORF)
187
188 Args :
189 dict_aatypes (dict) : the output of aatypesCountings()
190
191 Return :
192 dict_aatypes_freqs : amino-acids types (keys) and their frequencies (values)
193 """
194 dict_aatypes_freqs = {}
195
196 for key in dict_aatypes.keys():
197 freq = float(dict_aatypes[key])/sum(dict_aatypes.values())
198 dict_aatypes_freqs[key] = freq
199
200 return dict_aatypes_freqs
201
202 # ------ The function ------ #
203
204 codons_c = codonsCountings(seq, list_codons, init_dict_codons)
205 codons_f = codonsFreqs(codons_c)
206 aa_c = aaCountings(codons_c, dict_genetic_code, init_dict_aa)
207 aa_f = aaFreqs(aa_c)
208 aat_c = aatypesCountings(aa_c, dict_aa_classif, init_dict_classif)
209 aat_f = aatypesFreqs(aat_c)
210
211 return codons_c, codons_f, aa_c, aa_f, aat_c, aat_f
212
213 # # # Functions for various measures (ivywrel, ekqh...) -------------------------------------------------------------------------
214
215 def computeVarious(seq, dict_aa_counts, dict_aa_types):
216 """ Call al the functions for nucleic and amino-acids sequences description
217
218 Args : See sub-routines for details.
219
220 Returns : 6 integer or floats. See sub-routines for details.
221 """
222
223 # ------ Sub-routines ------ #
224
225 def nbCodons(seq):
226 """ Compute the number of full codons in a sequence
227 Arg : seq (str): the sequence
228 Return: nb_codons (int)
229 """
230 l = len(seq)
231 if l%3 == 0: nb_codons = l/3
232 if l%3 == 1: nb_codons = (l-1)/3
233 if l%3 == 2: nb_codons = (l-2)/3
234 return nb_codons
235
236 def maxIndice(seq):
237 """ Compute the highest indice for parsing a sequence
238 Arg : seq (str): the sequence
239 Return : max_indice (int)
240 """
241 l = len(seq)
242 if l%3 == 0: max_indice = l-3
243 if l%3 == 1: max_indice = l-4
244 if l%3 == 2: max_indice = l-5
245 return max_indice
246
247 def gc12Andgc3Count(seq, nb_codons, max_indice):
248 """ Compute the frequency of gc12 in a sequence
249 Arg : seq (str) : the sequence
250 Return : (float)
251 """
252
253 # TO IMPROVE ? : make this computation in the codonCountigns() function to avoid parsing twice the sequence ?
254 # But : involves a less readable code
255
256 gc12 = 0
257 gc3 = 0
258
259 for i in range(0, max_indice+1,3):
260 if seq[i] in ["c","g"]: gc12 += 1
261 if seq[i+1] in ["c","g"]: gc12 += 1
262 if seq[i+2] in ["c","g"] or seq[i+2] in ["c","g"]: gc3 += 1
263
264 return float(gc3)/nb_codons, float(gc12)/(2*nb_codons)
265
266 def ivywrelCount(nb_codons, dict_aa_counts):
267 """ Compute the sum of occurences of amino-acids IVYWREL divided by the number of codons
268
269 Args :
270 nb_codons (int) : the number of codons in the sequence
271 dict_aa_counts (dict) : the output of aaCountings()
272
273 return : (float)
274 """
275
276 IVYWREL = 0
277
278 for aa in ["I","V","Y","W","R","E","L"]: # Impossible to make a simple sum, in case one the aa is not in the dict keys
279 if aa in dict_aa_counts.keys():
280 IVYWREL += dict_aa_counts[aa]
281
282 return float(IVYWREL)/nb_codons
283
284 def ekqhCount(dict_aa_counts):
285 """ Compute the ratio of amino-acids EK/QH
286 Arg : dict_aa_counts (dict) : the output of aaCountings()
287 Return : (float)
288 """
289 ek = 0
290 qh = 0
291
292 ek = dict_aa_counts["E"] + dict_aa_counts["K"]
293 qh = dict_aa_counts["Q"] + dict_aa_counts["H"]
294
295 if qh != 0:
296 return float(ek)/qh
297 else : return ek
298
299 def payresdgmCount(dict_aa_counts):
300 """ Compute the ratio of amino-acids PASRE/SDGM
301 Arg : dict_aa_counts (dict) : the output of aaCountings()
302 Return : (float)
303 """
304 payre = 0
305 sdgm = 0
306
307 for aa in ["P","A","Y","R","E"]:
308 payre += dict_aa_counts[aa]
309 for aa in ["S","D","G","M"]:
310 sdgm += dict_aa_counts[aa]
311
312 if sdgm != 0:
313 return float(payre)/sdgm
314 else : return payre
315
316 def purineLoad(seq, nb_codons):
317 """ Compute the purine load indice of a sequence
318 Args :
319 seq (str) : the sequence
320 nb_codons (int) : the number of codons in the sequence
321
322 Return (float)
323 """
324
325 # TO IMPROVE ? : make this computation in the codonCountigns() function to avoid parsing twice the sequence ?
326 # But : involves a less readable code
327
328 g12, g3, A, c12, c3, T = 0.0,0.0,seq.count("a"),0.0,0.0,seq.count("t")
329
330 # g3 and c3 : g and c in 3rd position of a codon
331 s = ""
332 for i in range(2, len(seq), 3):
333 s += seq[i]
334 g3 = s.count("g")
335 c3 = s.count("c")
336
337 # g12 and c12 : g and c in 1st and 2d positions of a codon
338 s = ""
339 for i in range(0, len(seq), 3):
340 s += seq[i]
341 g12 = s.count("g")
342 c12 = s.count("c")
343 s = ""
344 for i in range(1, len(seq), 3):
345 s += seq[i]
346 g12 += s.count("g")
347 c12 += s.count("c")
348
349 return float(1000*(g12+g3+A-c12-c3-T))/(3*nb_codons)
350
351 def cvp(dict_aatypes):
352 """ Compute the difference nb_charged_aamino_acids - nb_polar_amino_acids
353 Return: (int)
354 """
355 return dict_aatypes["charged"] - dict_aatypes["polar"]
356
357 # ------ The function ------ #
358
359 nb_codons = nbCodons(seq)
360 max_indice = maxIndice(seq)
361 GC3, GC12 = gc12Andgc3Count(seq, nb_codons, max_indice)
362 IVYWREL, EKQH, PAYRESDGM = ivywrelCount(nb_codons, dict_aa_counts), ekqhCount(dict_aa_counts), payresdgmCount(dict_aa_counts)
363 purineload, CvP = purineLoad(seq, nb_codons), cvp(dict_aa_types)
364 return GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP
365
366 # # # Function for codons, aa, aatypes Transitions -----------------------------------------------------------------------------
367
368 def computeAllBiases(seq1, seq2, dico_codons_transi, dico_aa_transi, dico_aatypes_transi, reversecode, reverseclassif) :
369 """ Compute all biases (transisitions codon->codon, aa->-aa, aa_type->aa_type) between two sequences
370
371 Args : See sub-routines for details
372
373 Returns 3 dictionaries of dictionaries. See sub-routines for details
374 """
375
376 # ------ Sub-routines ------ #
377
378 def codons_transitions(seq1, seq2, dico_codons_transi):
379 """ Compute the number of transitions from a codon of a sequence to the codon of a second sequence
380
381 Args :
382 seq1 (str) : the first sequence
383 seq2 (str) : the second sequence
384 dico_codons_transi (dict of dicts) : { codon1 : {codon1: 0, codon2 : 0, ...}, ..}
385
386 Return :
387 codons_transi (dict of dicts) : the occurences of each codon to codon transition
388 """
389
390 codons_transi = copy.deepcopy(dico_codons_transi)
391
392 for i in range(0, len(seq1), 3):
393 # check if no indel and if len seq[i:i+3] is really 3 (check for the last codon)
394 if viable([seq1, seq2], i, "any") and len(seq1[i:i+3]) == 3 and len(seq2[i:i+3]) == 3 :
395 codons_transi[seq1[i:i+3]][seq2[i:i+3]] += 1
396
397 return codons_transi
398
399 def codons_transitions_freqs(codons_transitions_counts):
400 """ Computes frequencies of codon transitions between two sequences
401
402 Arg :
403 codons_transitions_counts (dict) : the output of codons_transitions()
404
405 Return :
406 codons_transi_freqs (dict of dicts) : the frequencies of each codon to codon transition
407 """
408
409 codons_transi_freqs = {}
410
411 for key in codons_transitions_counts.keys():
412 codons_transi_freqs[key] = {}
413 for key2 in codons_transitions_counts[key].keys():
414 if sum(codons_transitions_counts[key].values()) != 0:
415 freq = float(codons_transitions_counts[key][key2])/sum(codons_transitions_counts[key].values())
416 codons_transi_freqs[key][key2] = freq
417 else :
418 codons_transi_freqs[key][key2] = 0
419 return codons_transi_freqs
420
421 def aa_transitions(dico_codons_transi, dico_aa_transi, reversecode):
422 """ Compute the number of transitions from an amino-acid of a sequence to the amino-acid of a second sequence
423
424 Args :
425 dico_codons_transi (dict of dicts) : the codons transitions computed by codons_transitions()
426 dico_aa_transi (dict of dicts) : { aa1 : {aa1: 0, aa2 : 0, ...}, ..}
427 reversecode (dict) : the reversed genetic code {aa1 : [codons],...} -> {codon1: aa1, codon2: aa2, ...}
428
429 Return :
430 aa_transi (dict of dicts) : the occurences of each aa to aa transition
431 """
432
433 aa_transi = copy.deepcopy(dico_aa_transi)
434
435 for k in dico_codons_transi.keys():
436 newk = reversecode[k]
437 for k2 in dico_codons_transi[k].keys():
438 newk2 = reversecode[k2]
439 aa_transi[newk][newk2] += dico_codons_transi[k][k2]
440
441 return aa_transi
442
443 def aa_transitions_freqs(aa_transitions_counts):
444 """ Computes frequencies of amico-acids transitions between two sequences
445 Arg : aa_transitions_counts (dict of dicts): the output of aa_transitions()
446 Return : aa_transi_freqs (dict of dicts) : the frequencies of each aa to aa transition
447 """
448
449 aa_transi_freqs = {}
450
451 for key in aa_transitions_counts.keys():
452 aa_transi_freqs[key] = {}
453 for key2 in aa_transitions_counts[key].keys():
454 if sum(aa_transitions_counts[key].values()) != 0:
455 freq = float(aa_transitions_counts[key][key2])/sum(aa_transitions_counts[key].values())
456 aa_transi_freqs[key][key2] = freq
457 else :
458 aa_transi_freqs[key][key2] = 0
459 return aa_transi_freqs
460
461 def aatypes_transitions(dico_aa_transi, dico_aatypes_transi, reverseclassif):
462 """ Compute the number of transitions from an amino-acid type of a sequence to the amino-acid type of a second sequence
463
464 Args :
465 dico_aa_transi (dict of dicts) : the output of aa_transitions()
466 dico_aatypes_transi (dict of dicts) : { type1 : {type1: 0, type2 : 0, ...}, ..}
467 reverseclassif (dict) : the reversed amino-acid clasification {aa1: type, aa2: type, ...}
468
469 Return :
470 aatypes_transi (dict of dicts) : the occurences of each aatype to aatype transition
471 """
472
473 aatypes_transi = copy.deepcopy(dico_aatypes_transi)
474 for k in dico_aa_transi.keys():
475 newk = reverseclassif[k]
476 for k2 in dico_aa_transi[k].keys():
477 newk2 = reverseclassif[k2]
478 aatypes_transi[newk][newk2] += dico_aa_transi[k][k2]
479
480 return aatypes_transi
481
482 def aatypes_transitions_freqs(aatypes_transitions_counts):
483 """ Computes frequencies of amico-acids types transitions between two sequences
484 Args : aatypes_transitions_counts (dict of dicts) : the output of aatypes_transitions()
485 Return : aatypes_transi_freqs (dict of dicts) : the frequencies of each aatype to aatype transition
486 """
487
488 aatypes_transi_freqs = {}
489
490 for key in aatypes_transitions_counts.keys():
491 aatypes_transi_freqs[key] = {}
492 for key2 in aatypes_transitions_counts[key].keys():
493 if sum(aatypes_transitions_counts[key].values()) != 0:
494 freq = float(aatypes_transitions_counts[key][key2])/sum(aatypes_transitions_counts[key].values())
495 aatypes_transi_freqs[key][key2] = freq
496 else :
497 aatypes_transi_freqs[key][key2] = 0
498 return aatypes_transi_freqs
499
500
501
502
503 # ------ The function ------ #
504
505 codons_transitions = codons_transitions(seq1, seq2, dico_codons_transi)
506 codons_transitions_freqs = codons_transitions_freqs(codons_transitions)
507 aa_transitions = aa_transitions(codons_transitions, dico_aa_transi, reversecode)
508 aa_transitions_freqs = aa_transitions_freqs(aa_transitions)
509 aatypes_transitions = aatypes_transitions(aa_transitions, dico_aatypes_transi, reverseclassif)
510 aatypes_transitions_freqs = aatypes_transitions_freqs(aatypes_transitions)
511
512 return codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs
513
514 def all_sed(codons_c, aa_c, aat_c, codons_transitions, aa_transitions, aatypes_transitions, dico_codons_transi, dico_aa_transi, dico_aatypes_transi):
515
516 def compute_sed(transi, counts, dico):
517 """ Compute the substitution exchangeability disequilibrium (SED) from one species A to another B between codons/aa//aatypes couples
518
519 Args:
520 transi ; dict - dictionaries of all counts of transition from codon/aa/aatype X to Y from sp A to sp B
521 counts : dict - dictionaries of codons/aa/aatypes counts in species A
522 dico : dict - a dictionary (nested) with all values to 0
523
524 """
525 dict_sed = copy.deepcopy(dico)
526
527 for key in transi.keys():
528 for key2 in transi.keys():
529 if counts[key] != 0 and float(transi[key2][key])/counts[key] != 0.0:
530 x = (float(transi[key][key2])/counts[key]) / (float(transi[key2][key])/counts[key])
531 dict_sed[key][key2] = - pow(2,1-x)+1
532 else :
533 dict_sed[key][key2] = 'NA'
534
535 return dict_sed
536
537 codons_sed = compute_sed(codons_transitions, codons_c, dico_codons_transi)
538 aa_sed = compute_sed(aa_transitions, aa_c, dico_aa_transi)
539 aatypes_sed = compute_sed(aatypes_transitions, aat_c, dico_aatypes_transi)
540
541 return codons_sed, aa_sed, aatypes_sed
542
543 # # # Function for random resampling --------------------------------------------------------------------------------------------
544
545 def sampling (dict_seq, nb_iter, len_sample, list_codons, genetic_code, aa_classif, reversecode, reverseclassif):
546 """ Resample randomly codons from sequences (sort of bootsrap)
547
548 Args :
549 dict_seq (dict) : contains the species name and sequences used ( { 'sp1': seq1, 'sp2': seq2, ...}) without '-' removal
550 nb_iter (int) : number of resampling iterations (better >= 1000)
551 len_sample (int) : length (in codons) of a resampled sequence (better >= 1000)
552 list_codons (list of str): all codons except codons-stop
553 genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...}
554 aa_classif (dict) : the types of the amino-acids : {type: [aa1, aa2, ...], ...}
555 reversecode (dict) : the reversed genetic code : {codon1: aa1, codon2: aa2, ...}
556 reverseclassif (dict) : the reversed amino-acid : clasification {aa1: type, aa2: type, ...}
557
558 Returns :
559 codons_lst, aa_lst, classif_lst (dicts) : keys : codons/aa/aatypes, values : []
560 codons_transitions_lst, aa_transitions_lst, classif_transitions_lst (dict of dicts) : keys : codons/aa/aatypes, values : the 3 previous dicts
561 """
562
563 # Initialize empty dictionaries for countings and transitions. It's also possible to isntanciate these ones in the main() but it would make a function with ~14 parameters
564 codons_0, aa_0, classif_0, codons_transitions_0, aa_transitions_0, classif_transitions_0 = buildDicts(list_codons, 0, genetic_code, aa_classif)
565 codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst = buildDicts(list_codons, [], genetic_code, aa_classif)
566
567 # determine the max position where sampling is possible
568 l = len(dict_seq.values()[1])
569 if l%3 == 0: max_indice = l-3
570 if l%3 == 1: max_indice = l-4
571 if l%3 == 2: max_indice = l-5
572
573 # List of positions to resample
574 viable_positions = [pos for pos in range(0,max_indice,3) if viable(dict_seq.values(), pos, "all")]
575 sample_positions = np.random.choice(viable_positions, len_sample)
576
577 # nb_iter resampled sequences
578 for i in range(nb_iter):
579 if (i+1)%(nb_iter/10) == 0:
580 print " "+str( (i+1)*100/nb_iter)+"%"
581
582 seqa, seqb = "", ""
583 for pos in sample_positions:
584 codona, codonb = "---", "---"
585 # The sequence to be resampled in this position is randomly chosen ; no "-" resampled
586 while "-" in codona :
587 codona = dict_seq.values()[random.randrange(0, len(dict_seq.keys())-1)][pos:pos+3]
588 while "-" in codonb :
589 codonb = dict_seq.values()[random.randrange(0, len(dict_seq.keys())-1)][pos:pos+3]
590 seqa += codona
591 seqb += codonb
592
593 # dictionaries : frequences of codons, aa, aatypes (seq1)
594 codons_occ_tmp, codons_freq_tmp, aa_occ_tmp, aa_freq_tmp, aatypes_occ_tmp, aatypes_freq_tmp = computeAllCountingsAndFreqs(seqa, list_codons, codons_0, aa_0, classif_0, genetic_code, aa_classif)
595 # dictionaries frequences of transitions (seqa->seqb)
596 codons_transitions_tmp, codons_transitions_freq_tmp, aa_transition_tmp, aa_transitions_freq_tmp, aatypes_transitions_tmp, aatypes_transitions_freq_tmp = computeAllBiases(seqa, seqb, codons_transitions_0, aa_transitions_0, classif_transitions_0, reversecode, reverseclassif)
597
598 # Adding occurrences in final dicts
599 for key in codons_freq_tmp.keys():
600 codons_lst[key].append(codons_freq_tmp[key])
601 for key in aa_freq_tmp.keys():
602 aa_lst[key].append(aa_freq_tmp[key])
603 for key in aatypes_freq_tmp.keys():
604 classif_lst[key].append(aatypes_freq_tmp[key])
605
606 # Adding occurrences in final dicts (transitions)
607 for key in codons_transitions_freq_tmp.keys():
608 for key2 in codons_transitions_freq_tmp[key].keys():
609 codons_transitions_lst[key][key2].append(codons_transitions_freq_tmp[key][key2])
610 for key in aa_transitions_freq_tmp.keys():
611 for key2 in aa_transitions_freq_tmp[key].keys():
612 aa_transitions_lst[key][key2].append(aa_transitions_freq_tmp[key][key2])
613 for key in aatypes_transitions_freq_tmp.keys():
614 for key2 in aatypes_transitions_freq_tmp[key].keys():
615 classif_transitions_lst[key][key2].append(aatypes_transitions_freq_tmp[key][key2])
616
617 return codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst
618
619 def testPvalues(dict_counts, dict_resampling, nb_iter, method):
620 """ Computes where the observed value is located in the expected counting distribution
621
622 Args :
623 dict_counts (dict) : observed frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases()
624 dict_resampling (dict) : expected frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() within the sampling() function
625
626 Return :
627 pvalue (dict, dict of dicts) : the pvalues of all observed countings (dict) and transitions (dict of dicts)
628
629
630 pnorm computes the pvalue to have a value inferior to the observed value under a normal distribution
631 One sided to left tail :
632 p < 0.05 indicates significantly lower counts
633 p > 0.95 indicates significantly higher counts
634 """
635
636
637 def p_resampling(obs, values, nb_iter):
638 """ The pvalue is the proportion of bootsrapped values smaller than the observed value
639 If p = 0.025 : 2.5% of the bootstrapped values are smaller than the observed value
640 p < 0.025 : the obs value is most likely significantly lower.
641 If p = 0.975 : 97.5% of the bootstrapped values are smaller than the observed value
642 p > 0.975 the obs value is most likely significantly higher.
643
644 Args :
645 obs : int or float - the observed value
646 values : list - values of resampling (int or floats)
647 nb_iter : int - the number of resampled values (=len(values))
648
649 Return :
650 pvalue (float)
651 """
652
653 num = len([x for x in values if x < obs])
654 return float(num + 1) / (nb_iter+1)
655
656 def testPvalue(obs, exp, nb_iter):
657 """ Compute a pvalue
658
659 Args :
660 exp (list of floatsà : a list of length nb_iter, containing expected frequencies of a codon/aa/aatype at each iteration
661 obs (float) : the observed value
662 nb_iter (int) : the number of iterations for resampling
663
664 Returns :
665 pvalue (float)
666 """
667
668 max_val = nb_iter-1
669 min_val = 0
670 test_val = (max_val+min_val)/2
671
672 while max_val-min_val > 1:
673 if obs > exp[test_val]:
674 min_val = test_val
675 test_val = (max_val+min_val)/2
676 elif obs < exp[test_val]:
677 max_val = test_val
678 test_val = (max_val+min_val)/2
679 else:
680 break
681
682 pvalue = float(test_val+1)/(nb_iter+1)
683
684 return pvalue
685
686 # ------ The function ------ #
687
688 pvalues = {}
689
690 for key in dict_resampling.keys():
691 if type(dict_resampling.values()[1]) is not dict :
692 if method == 'origin':
693 pvalues[key] = testPvalue(dict_counts[key], dict_resampling[key], nb_iter)
694 elif method == 'pnorm':
695 pvalues[key] = scipy.stats.norm.cdf(dict_counts[key], np.mean(dict_resampling[key]), np.std(dict_resampling[key]))
696 elif method == 'p_resampling':
697 pvalues[key] = p_resampling(dict_counts[key], dict_resampling[key], nb_iter)
698 else :
699 pvalues[key] = {}
700 for key2 in dict_resampling[key].keys():
701 if method == 'origin':
702 pvalues[key][key2] = testPvalue(dict_counts[key][key2], dict_resampling[key][key2], nb_iter)
703 elif method == 'pnorm':
704 pvalues[key][key2] = scipy.stats.norm.cdf(dict_counts[key][key2], np.mean(dict_resampling[key][key2]), np.std(dict_resampling[key][key2]))
705 elif method == 'p_resampling':
706 pvalues[key][key2] = p_resampling(dict_counts[key][key2], dict_resampling[key][key2], nb_iter)
707
708 return pvalues
709
710 def main():
711
712 # arguments
713 parser = argparse.ArgumentParser()
714 parser.add_argument("sequences_file", help="File containing sequences (the output of the tool 'ConcatPhyl'")
715 parser.add_argument("considered_species", help="The species name, separated by commas (must be the same than in the sequences_file). It is possible to consider only a subset of species.")
716 parser.add_argument("species_for_bootstrap", help="The species which will be used for bootstrapping, separated by commas. It is possible to consider only a subset of species.")
717 parser.add_argument("iteration", help="The number of iterations for bootstrapping (better if => 1000)", type=int)
718 parser.add_argument("sample_length", help="The lenght of a bootstrapped sequences (better if >= 1000", type=int)
719 args = parser.parse_args()
720
721 print "\n ------ Occurences and frequencies of codons, amino-acids, amino-acids types -------\n"
722
723 print "The script counts the number of codons, amino acids, and types of amino acids in sequences,"
724 print "as well as the mutation bias from one item to another between 2 sequences.\n"
725
726 print "Counting are then compared to empirical p-values, obtained from bootstrapped sequences obtained from a subset of sequences."
727 print "In the output files, the pvalues indicate the position of the observed data in a distribution of empirical countings obtained from"
728 print "a resample of the data. Values above 0.95 indicate a significantly higher counting, values under 0.05 a significantly lower counting."
729
730 print " Sequences file : {}".format(args.sequences_file)
731 print " Species retained for countings : {}\n".format(args.considered_species)
732
733 print "Processing : reading input file, opening output files, building dictionaries."
734
735 # make pairs
736 list_species = str.split(args.considered_species, ",")
737 list_species_boot = str.split(args.species_for_bootstrap, ",")
738 pairs_list=list(itertools.combinations(list_species,2))
739
740 # read sequences
741 sequences_for_counts = {}
742 sequences_for_resampling = {}
743 with open(args.sequences_file, "r") as file:
744 for line1,line2 in itertools.izip_longest(*[file]*2):
745 species = line1.strip(">\r\n")
746 sequence = line2.strip("\r\n")
747 if species in list_species:
748 sequences_for_counts[species] = sequence
749 if species in list_species_boot:
750 sequences_for_resampling[species] = sequence
751
752 print " Warning : countings might be biased and show high differences between species because of high variations of the indels proportions among sequences."
753 print " Frequences are more representative."
754
755 print "\n Indels percent :"
756
757 for k,v in sequences_for_counts.items():
758 print " {} : {} %".format(k, float(v.count("-"))/len(v)*100)
759
760 # useful dictionaries
761 dict_genetic_code={"F":["ttt","ttc"],
762 "L":["tta","ttg","ctt","ctc","cta","ctg"],
763 "I":["att","atc","ata"],
764 "M":["atg"],
765 "V":["gtt","gtc","gta","gtg"],
766 "S":["tct","tcc","tca","tcg","agt","agc"],
767 "P":["cct","cca","ccg","ccc"],
768 "T":["act","acc","aca","acg"],
769 "A":["gct","gcc","gca","gcg"],
770 "Y":["tat","tac"],
771 "H":["cat","cac"],
772 "Q":["caa","cag"],
773 "N":["aat","aac"],
774 "K":["aaa","aag"],
775 "D":["gat","gac"],
776 "E":["gaa","gag"],
777 "C":["tgt","tgc"],
778 "W":["tgg"],
779 "R":["cgt","cgc","cga","cgg","aga","agg"],
780 "G":["ggt","ggc","gga","ggg"]}
781
782 dict_aa_classif={"unpolar":["G","A","V","L","M","I"],
783 "polar":["S","T","C","P","N","Q"],
784 "charged":["K","R","H","D","E"],
785 "aromatics":["F","Y","W"]}
786
787 reversecode={v:k for k in dict_genetic_code for v in dict_genetic_code[k]}
788 reverseclassif={v:k for k in dict_aa_classif for v in dict_aa_classif[k]}
789
790 # codons list (without stop codons)
791 nucleotides = ['a', 'c', 'g', 't']
792 list_codons = [''.join(comb) for comb in itertools.product(nucleotides, repeat=3)]
793 list_codons.remove("taa")
794 list_codons.remove("tag")
795 list_codons.remove("tga")
796
797 # Store already computed species + row.names in output files
798 index = []
799 index_transi = []
800
801 # Final dictionaries writed to csv files
802 all_codons = {}
803 all_aa = {}
804 all_aatypes = {}
805 all_various = {}
806 all_codons_transitions = {} # Not used because too much : 61*61 columns
807 all_aa_transitions = {}
808 all_aatypes_transitions = {}
809
810 # RUN
811
812 print "\nProcessing : resampling ..."
813 print " Parameters : {niter} iterations, {lensample} codon per resampled sequence, species used : {species}\n".format(niter=args.iteration, lensample=args.sample_length, species=args.species_for_bootstrap)
814
815 codons_boot, aa_boot, aatypes_boot, codons_transi_boot, aa_transi_boot, aatypes_transi_boot = sampling(sequences_for_resampling, args.iteration, args.sample_length, list_codons, dict_genetic_code, dict_aa_classif, reversecode, reverseclassif)
816 print " Done.\n"
817
818 print "Processing : countings....\n"
819
820 # Initialize empty dictionaries for countings and transitions
821 init_dict_codons, init_dict_aa, init_dict_classif, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions = buildDicts(list_codons, 0, dict_genetic_code, dict_aa_classif)
822
823 for pair in pairs_list:
824 p1, p2 = pair[0], pair[1]
825 if p1 not in index:
826 print "Countings on {}".format(p1)
827
828 p1_codons_counts, p1_codons_freqs, p1_aa_counts, p1_aa_freqs, p1_aatypes_counts, p1_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p1], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif)
829 p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP = computeVarious(sequences_for_counts[p1], p1_aa_counts, p1_aatypes_freqs)
830
831
832 p1_codons_pvalues = testPvalues(p1_codons_freqs, codons_boot, args.iteration, 'p_resampling')
833 p1_aa_pvalues = testPvalues(p1_aa_freqs, aa_boot, args.iteration, 'p_resampling')
834 p1_aatypes_pvalues = testPvalues(p1_aatypes_freqs, aatypes_boot, args.iteration, 'p_resampling')
835
836 all_codons[p1+"_obs_counts"] = p1_codons_counts
837 all_codons[p1+"_obs_freqs"] = p1_codons_freqs
838 all_codons[p1+"_pvalues"] = p1_codons_pvalues
839 all_aa[p1+"_obs_counts"] = p1_aa_counts
840 all_aa[p1+"_obs_freqs"] = p1_aa_freqs
841 all_aa[p1+"_pvalues"] = p1_aa_pvalues
842 all_aatypes[p1+"_obs_counts"] = p1_aatypes_counts
843 all_aatypes[p1+"_obs_freqs"] = p1_aatypes_freqs
844 all_aatypes[p1+"_pvalues"] = p1_aatypes_pvalues
845 all_various[p1] = [p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP]
846
847 index.append(p1)
848
849 if p2 not in index:
850 print "Countings on {}".format(p2)
851
852 p2_codons_counts, p2_codons_freqs, p2_aa_counts, p2_aa_freqs, p2_aatypes_counts, p2_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p2], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif)
853 p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP = computeVarious(sequences_for_counts[p2], p2_aa_counts, p2_aatypes_freqs)
854
855 p2_codons_pvalues = testPvalues(p2_codons_freqs, codons_boot, args.iteration, 'p_resampling')
856 p2_aa_pvalues = testPvalues(p2_aa_freqs, aa_boot, args.iteration, 'p_resampling')
857 p2_aatypes_pvalues = testPvalues(p2_aatypes_freqs, aatypes_boot, args.iteration, 'p_resampling')
858
859 all_codons[p2+"_obs_counts"] = p2_codons_counts
860 all_codons[p2+"_obs_freqs"] = p2_codons_freqs
861 all_codons[p2+"_pvalues"] = p2_codons_pvalues
862 all_aa[p2+"_obs_counts"] = p2_aa_counts
863 all_aa[p2+"_obs_freqs"] = p2_aa_freqs
864 all_aa[p2+"_pvalues"] = p2_aa_pvalues
865 all_aatypes[p2+"_obs_counts"] = p2_aatypes_counts
866 all_aatypes[p2+"_obs_freqs"] = p2_aatypes_freqs
867 all_aatypes[p2+"_pvalues"] = p2_aatypes_pvalues
868 all_various[p2] = p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP
869
870 index.append(p2)
871
872 if (p1, p2) not in index_transi and p1 in sequences_for_counts and p2 in sequences_for_counts:
873 print "Countings transitions between {} and {}".format(p1, p2)
874 codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs = computeAllBiases(sequences_for_counts[p1], sequences_for_counts[p2], dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions, reversecode, reverseclassif)
875
876 # Ajout
877 codons_sed, aa_sed, aatypes_sed = all_sed(p1_codons_counts, p1_aa_counts, p1_aatypes_counts, codons_transitions, aa_transitions, aatypes_transitions, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions)
878
879 index_transi.append((p1,p2))
880
881 p1p2_codons_pvalues = testPvalues(codons_transitions_freqs, codons_transi_boot, args.iteration, 'p_resampling')
882 p1p2_aa_pvalues = testPvalues(aa_transitions_freqs, aa_transi_boot, args.iteration, 'p_resampling')
883 p1p2_aatypes_pvalues = testPvalues(aatypes_transitions_freqs, aatypes_transi_boot, args.iteration, 'p_resampling')
884
885 all_codons_transitions[p1+">"+p2+"_obs_counts"] = codons_transitions
886 all_codons_transitions[p1+">"+p2+"_obs_freqs"] = codons_transitions_freqs
887 all_codons_transitions[p1+">"+p2+"_pvalues"] = p1p2_codons_pvalues
888 all_aa_transitions[p1+">"+p2+"_obs_counts"] = aa_transitions
889 all_aa_transitions[p1+">"+p2+"_obs_freqs"] = aa_transitions_freqs
890 all_aa_transitions[p1+">"+p2+"_pvalues"] = p1p2_aa_pvalues
891 all_aatypes_transitions[p1+">"+p2+"_obs_counts"] = aatypes_transitions
892 all_aatypes_transitions[p1+">"+p2+"_obs_freqs"] = aatypes_transitions_freqs
893 all_aatypes_transitions[p1+">"+p2+"_pvalues"] = p1p2_aatypes_pvalues
894
895 all_codons_transitions[p1+">"+p2+"_sed"] = codons_sed
896 all_aa_transitions[p1+">"+p2+"_sed"] = aa_sed
897 all_aatypes_transitions[p1+">"+p2+"_sed"] = aatypes_sed
898
899 index_transi.append((p1, p2))
900
901 print "\n Done.\n"
902
903 print "Processing : creating dataframes ..."
904
905 frame_codons = pd.DataFrame(all_codons).T.astype('object')
906 frame_aa = pd.DataFrame(all_aa).T.astype('object')
907 frame_aatypes = pd.DataFrame(all_aatypes).T.astype('object')
908
909 frame_codons_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_codons_transitions.items()}).unstack()
910 frame_codons_transitions.columns = frame_codons_transitions.columns.map('>'.join)
911
912 frame_aa_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_aa_transitions.items()}).unstack()
913 frame_aa_transitions.columns = frame_aa_transitions.columns.map('>'.join)
914
915 frame_aatypes_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_aatypes_transitions.items()}).unstack()
916 frame_aatypes_transitions.columns = frame_aatypes_transitions.columns.map('>'.join)
917
918 frame_various = pd.DataFrame(all_various).T
919 frame_various.columns = ["GC3","GC12","IVYWREL","EKQH","PAYRESDGM","purineload", "CvP"]
920
921 frame_codons.index.name, frame_aa.index.name, frame_aatypes.index.name = "Species", "Species","Species"
922 frame_aa_transitions.index.name, frame_aatypes_transitions.index.name, frame_various.index.name = "Species","Species","Species"
923
924 print "Writing dataframes to output files ...\n"
925
926 frame_codons.round(8).to_csv("codons_freqs.csv", sep=",", encoding="utf-8")
927 frame_aa.round(8).to_csv("aa_freqs.csv", sep=",", encoding="utf-8")
928 frame_aatypes.astype('object').round(8).to_csv("aatypes_freqs.csv", sep=",", encoding="utf-8")
929 frame_codons_transitions.round(8).to_csv("codons_transitions_freqs.csv", sep=",", encoding="utf-8")
930 frame_aa_transitions.round(8).to_csv("aa_transitions_freqs.csv", sep=",", encoding="utf-8")
931 frame_aatypes_transitions.round(8).to_csv("aatypes_transitions_freqs.csv", sep=",", encoding="utf-8")
932 frame_various.round(8).to_csv("gc_and_others_freqs.csv", sep=",", encoding="utf-8")
933
934 print "Done."
935
936 if __name__ == "__main__":
937 main()