Mercurial > repos > abims-sbr > mutcount
comparison scripts/S01a_codons_counting.py @ 0:acc3674e515b draft default tip
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
| author | abims-sbr |
|---|---|
| date | Fri, 01 Feb 2019 10:28:50 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc3674e515b |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # coding: utf-8 | |
| 3 # Author : Victor Mataigne | |
| 4 | |
| 5 import string, os, sys, re, random, itertools, argparse, copy, math | |
| 6 import pandas as pd | |
| 7 import numpy as np | |
| 8 | |
| 9 def buildDicts(list_codons, content, dict_genetic_code, dict_aa_classif): | |
| 10 """ Build dictionaries with values to 0. These dictionaries are used as starting point for each sequence counting | |
| 11 | |
| 12 Args : | |
| 13 list_codons (list of str) : all codons except codons-stop | |
| 14 content (int or list) : an integer (for coutings and transitions), or an empty list (for resampling) | |
| 15 dict_genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...} | |
| 16 dict_aa_classif (dict) : the types of the amino-acids ( {type: [aa1, aa2, ...], ...} | |
| 17 | |
| 18 Returns : | |
| 19 dico_codons, dico_aa, dico_aatypes (dicts) : keys : codons/amico-acids/amico-acids types, values : 0 or [] | |
| 20 dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions (dicts of dicts) : | |
| 21 actually, the three first dictionaries nested as values of keys codons/amico-acids/amico-acids | |
| 22 """ | |
| 23 | |
| 24 # I could have make sub-routines here. | |
| 25 # the copy commands are mandatory, otherwise all dictionaries will reference the same variable(s) | |
| 26 | |
| 27 dico_codons = {} | |
| 28 for codon in list_codons: | |
| 29 dico_codons[codon] = copy.deepcopy(content) | |
| 30 | |
| 31 dico_aa = {} | |
| 32 for aa in dict_genetic_code.keys(): | |
| 33 dico_aa[aa] = copy.deepcopy(content) | |
| 34 | |
| 35 dico_aatypes = {} | |
| 36 for aatype in dict_aa_classif.keys(): | |
| 37 dico_aatypes[aatype] = copy.deepcopy(content) | |
| 38 | |
| 39 dico_codons_transitions=copy.deepcopy(dico_codons) | |
| 40 for key in dico_codons_transitions.keys(): | |
| 41 dico_codons_transitions[key]=copy.deepcopy(dico_codons) | |
| 42 | |
| 43 dico_aa_transitions = copy.deepcopy(dico_aa) | |
| 44 for key in dico_aa_transitions.keys(): | |
| 45 dico_aa_transitions[key]=copy.deepcopy(dico_aa) | |
| 46 | |
| 47 dico_aatypes_transitions = copy.deepcopy(dico_aatypes) | |
| 48 for key in dico_aatypes_transitions.keys(): | |
| 49 dico_aatypes_transitions[key]=copy.deepcopy(dico_aatypes) | |
| 50 | |
| 51 return dico_codons, dico_aa, dico_aatypes, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions | |
| 52 | |
| 53 def viable(seqs, pos, method): | |
| 54 """ Compute if, among a set of sequences, either at least one of the codons at the specified position has not a "-", | |
| 55 or not any codon has a "-" | |
| 56 | |
| 57 Args : | |
| 58 seqs : a list (the sequences, which must all have the same size) | |
| 59 pos : an integer (the positions, <= len(seqs) -3) | |
| 60 | |
| 61 Returns: | |
| 62 bool | |
| 63 """ | |
| 64 codons = [seq[pos:pos+3] for seq in seqs] | |
| 65 if method == "all": | |
| 66 return not all("-" in codon for codon in codons) | |
| 67 elif method == "any": | |
| 68 return not any("-" in codon for codon in codons) | |
| 69 | |
| 70 # # # Function for codons, aa, aatypes Countings ------------------------------------------------------------------------------- | |
| 71 | |
| 72 def computeAllCountingsAndFreqs(seq, list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif): | |
| 73 """ Call all functions dedicated to the computation of occurences and frequencies of codons, amino-acids, amino-acids types | |
| 74 | |
| 75 Args : see sub-routines | |
| 76 | |
| 77 Returns : 6 dictionaries (occurences and frequencies for codons, amino-acids, amino-acids types). See sub-routines for details. | |
| 78 """ | |
| 79 | |
| 80 # ------ Sub-routines ------ # | |
| 81 | |
| 82 def codonsCountings(seq, list_codons, init_dict_codons): | |
| 83 """ Count occurences of each codon in a sequence | |
| 84 First reading frame only : input sequence is supposed to be an ORF | |
| 85 | |
| 86 Args : | |
| 87 seq (str) : the sequence | |
| 88 list_codons (list of str) : all codons except codons-stop | |
| 89 init_dict_codons (dict) : {codon1 : 0, codon2: 0, ...} | |
| 90 | |
| 91 Return : | |
| 92 codon (dict) : codons (keys) and their occurences (values) in the sequence | |
| 93 """ | |
| 94 | |
| 95 codons = copy.deepcopy(init_dict_codons) | |
| 96 | |
| 97 l = len(seq) | |
| 98 | |
| 99 if l%3 == 0: max_indice = l-3 | |
| 100 if l%3 == 1: max_indice = l-4 | |
| 101 if l%3 == 2: max_indice = l-5 | |
| 102 | |
| 103 for codon in range(0,max_indice+1,3): | |
| 104 if "-" not in seq[codon:codon+3] : | |
| 105 codons[seq[codon:codon+3]] += 1 | |
| 106 | |
| 107 return codons | |
| 108 | |
| 109 def codonsFreqs(dict_codons_counts): | |
| 110 """ Computes frequencies of each codon in a sequence | |
| 111 | |
| 112 Args : | |
| 113 dict_codons (dict) : the output of codonsCounting() | |
| 114 | |
| 115 Return : | |
| 116 codons_freqs (dict) : codons (keys) and their frequencies (values) | |
| 117 """ | |
| 118 | |
| 119 codons_freqs = {} | |
| 120 | |
| 121 for key in dict_codons_counts.keys(): | |
| 122 freq = float(dict_codons_counts[key])/sum(dict_codons_counts.values()) | |
| 123 codons_freqs[key] = freq | |
| 124 | |
| 125 return codons_freqs | |
| 126 | |
| 127 def aaCountings(dict_codons, dict_genetic_code, init_dict_aa): | |
| 128 """ Count occurences of each amino-acid in a sequence, based on the countings of codons (1st ORF) | |
| 129 | |
| 130 Args : | |
| 131 dict_codons (dict) : the output of codonsCounting() | |
| 132 dict_genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...} | |
| 133 init_dict_aa (dict) : {aa1 : 0, aa2: 0, ...} | |
| 134 | |
| 135 Return : | |
| 136 dict_aa (dict) : amino-acids (keys) and their occurences (values) | |
| 137 """ | |
| 138 dict_aa = copy.deepcopy(init_dict_aa) | |
| 139 | |
| 140 for key in dict_codons.keys(): | |
| 141 for value in dict_genetic_code.values(): | |
| 142 if key in value: | |
| 143 dict_aa[dict_genetic_code.keys()[dict_genetic_code.values().index(value)]] += dict_codons[key] | |
| 144 | |
| 145 return dict_aa | |
| 146 | |
| 147 def aaFreqs(dict_aa_counts): | |
| 148 """ Computes frequencies of each amino-acid in a sequence, based on the countings of codons (1st ORF) | |
| 149 | |
| 150 Args : | |
| 151 dict_aa_counts (dict) : the output of aaCountings() | |
| 152 | |
| 153 Return : | |
| 154 dict_aa_freqs (dict) : amino-acids (keys) and their frequencies (values) | |
| 155 """ | |
| 156 | |
| 157 dict_aa_freqs = {} | |
| 158 | |
| 159 for key in dict_aa_counts.keys(): | |
| 160 freq = float(dict_aa_counts[key])/sum(dict_aa_counts.values()) | |
| 161 dict_aa_freqs[key] = freq | |
| 162 | |
| 163 return dict_aa_freqs | |
| 164 | |
| 165 def aatypesCountings(dict_aa, dict_aa_classif, init_dict_classif): | |
| 166 """ Computes frequencies of each amino-acid type in a sequence, based on the countings of amino-acids (1st ORF) | |
| 167 | |
| 168 Args : | |
| 169 - dict_aa (dict) : the output of aaCountings() | |
| 170 - dict_aa_classif (dict) : the types of the amino-acids ( {type: [aa1, aa2, ...], ...} ) | |
| 171 - init_dict_classif (dict) : {'polar': 0, 'apolar': 0, ...} | |
| 172 | |
| 173 Return : | |
| 174 dict_aatypes (dict) : amino-acids types (keys) and their occurences (values) | |
| 175 """ | |
| 176 dict_aatypes = copy.deepcopy(init_dict_classif) | |
| 177 | |
| 178 for key_classif in dict_aa_classif.keys(): | |
| 179 for key_aa in dict_aa.keys(): | |
| 180 if key_aa in dict_aa_classif[key_classif]: | |
| 181 dict_aatypes[key_classif] += dict_aa[key_aa] | |
| 182 | |
| 183 return dict_aatypes | |
| 184 | |
| 185 def aatypesFreqs(dict_aatypes): | |
| 186 """ Computes frequencies of each amino-acid type in a sequence, based on the countings of amino-acids (1st ORF) | |
| 187 | |
| 188 Args : | |
| 189 dict_aatypes (dict) : the output of aatypesCountings() | |
| 190 | |
| 191 Return : | |
| 192 dict_aatypes_freqs : amino-acids types (keys) and their frequencies (values) | |
| 193 """ | |
| 194 dict_aatypes_freqs = {} | |
| 195 | |
| 196 for key in dict_aatypes.keys(): | |
| 197 freq = float(dict_aatypes[key])/sum(dict_aatypes.values()) | |
| 198 dict_aatypes_freqs[key] = freq | |
| 199 | |
| 200 return dict_aatypes_freqs | |
| 201 | |
| 202 # ------ The function ------ # | |
| 203 | |
| 204 codons_c = codonsCountings(seq, list_codons, init_dict_codons) | |
| 205 codons_f = codonsFreqs(codons_c) | |
| 206 aa_c = aaCountings(codons_c, dict_genetic_code, init_dict_aa) | |
| 207 aa_f = aaFreqs(aa_c) | |
| 208 aat_c = aatypesCountings(aa_c, dict_aa_classif, init_dict_classif) | |
| 209 aat_f = aatypesFreqs(aat_c) | |
| 210 | |
| 211 return codons_c, codons_f, aa_c, aa_f, aat_c, aat_f | |
| 212 | |
| 213 # # # Functions for various measures (ivywrel, ekqh...) ------------------------------------------------------------------------- | |
| 214 | |
| 215 def computeVarious(seq, dict_aa_counts, dict_aa_types): | |
| 216 """ Call al the functions for nucleic and amino-acids sequences description | |
| 217 | |
| 218 Args : See sub-routines for details. | |
| 219 | |
| 220 Returns : 6 integer or floats. See sub-routines for details. | |
| 221 """ | |
| 222 | |
| 223 # ------ Sub-routines ------ # | |
| 224 | |
| 225 def nbCodons(seq): | |
| 226 """ Compute the number of full codons in a sequence | |
| 227 Arg : seq (str): the sequence | |
| 228 Return: nb_codons (int) | |
| 229 """ | |
| 230 l = len(seq) | |
| 231 if l%3 == 0: nb_codons = l/3 | |
| 232 if l%3 == 1: nb_codons = (l-1)/3 | |
| 233 if l%3 == 2: nb_codons = (l-2)/3 | |
| 234 return nb_codons | |
| 235 | |
| 236 def maxIndice(seq): | |
| 237 """ Compute the highest indice for parsing a sequence | |
| 238 Arg : seq (str): the sequence | |
| 239 Return : max_indice (int) | |
| 240 """ | |
| 241 l = len(seq) | |
| 242 if l%3 == 0: max_indice = l-3 | |
| 243 if l%3 == 1: max_indice = l-4 | |
| 244 if l%3 == 2: max_indice = l-5 | |
| 245 return max_indice | |
| 246 | |
| 247 def gc12Andgc3Count(seq, nb_codons, max_indice): | |
| 248 """ Compute the frequency of gc12 in a sequence | |
| 249 Arg : seq (str) : the sequence | |
| 250 Return : (float) | |
| 251 """ | |
| 252 | |
| 253 # TO IMPROVE ? : make this computation in the codonCountigns() function to avoid parsing twice the sequence ? | |
| 254 # But : involves a less readable code | |
| 255 | |
| 256 gc12 = 0 | |
| 257 gc3 = 0 | |
| 258 | |
| 259 for i in range(0, max_indice+1,3): | |
| 260 if seq[i] in ["c","g"]: gc12 += 1 | |
| 261 if seq[i+1] in ["c","g"]: gc12 += 1 | |
| 262 if seq[i+2] in ["c","g"] or seq[i+2] in ["c","g"]: gc3 += 1 | |
| 263 | |
| 264 return float(gc3)/nb_codons, float(gc12)/(2*nb_codons) | |
| 265 | |
| 266 def ivywrelCount(nb_codons, dict_aa_counts): | |
| 267 """ Compute the sum of occurences of amino-acids IVYWREL divided by the number of codons | |
| 268 | |
| 269 Args : | |
| 270 nb_codons (int) : the number of codons in the sequence | |
| 271 dict_aa_counts (dict) : the output of aaCountings() | |
| 272 | |
| 273 return : (float) | |
| 274 """ | |
| 275 | |
| 276 IVYWREL = 0 | |
| 277 | |
| 278 for aa in ["I","V","Y","W","R","E","L"]: # Impossible to make a simple sum, in case one the aa is not in the dict keys | |
| 279 if aa in dict_aa_counts.keys(): | |
| 280 IVYWREL += dict_aa_counts[aa] | |
| 281 | |
| 282 return float(IVYWREL)/nb_codons | |
| 283 | |
| 284 def ekqhCount(dict_aa_counts): | |
| 285 """ Compute the ratio of amino-acids EK/QH | |
| 286 Arg : dict_aa_counts (dict) : the output of aaCountings() | |
| 287 Return : (float) | |
| 288 """ | |
| 289 ek = 0 | |
| 290 qh = 0 | |
| 291 | |
| 292 ek = dict_aa_counts["E"] + dict_aa_counts["K"] | |
| 293 qh = dict_aa_counts["Q"] + dict_aa_counts["H"] | |
| 294 | |
| 295 if qh != 0: | |
| 296 return float(ek)/qh | |
| 297 else : return ek | |
| 298 | |
| 299 def payresdgmCount(dict_aa_counts): | |
| 300 """ Compute the ratio of amino-acids PASRE/SDGM | |
| 301 Arg : dict_aa_counts (dict) : the output of aaCountings() | |
| 302 Return : (float) | |
| 303 """ | |
| 304 payre = 0 | |
| 305 sdgm = 0 | |
| 306 | |
| 307 for aa in ["P","A","Y","R","E"]: | |
| 308 payre += dict_aa_counts[aa] | |
| 309 for aa in ["S","D","G","M"]: | |
| 310 sdgm += dict_aa_counts[aa] | |
| 311 | |
| 312 if sdgm != 0: | |
| 313 return float(payre)/sdgm | |
| 314 else : return payre | |
| 315 | |
| 316 def purineLoad(seq, nb_codons): | |
| 317 """ Compute the purine load indice of a sequence | |
| 318 Args : | |
| 319 seq (str) : the sequence | |
| 320 nb_codons (int) : the number of codons in the sequence | |
| 321 | |
| 322 Return (float) | |
| 323 """ | |
| 324 | |
| 325 # TO IMPROVE ? : make this computation in the codonCountigns() function to avoid parsing twice the sequence ? | |
| 326 # But : involves a less readable code | |
| 327 | |
| 328 g12, g3, A, c12, c3, T = 0.0,0.0,seq.count("a"),0.0,0.0,seq.count("t") | |
| 329 | |
| 330 # g3 and c3 : g and c in 3rd position of a codon | |
| 331 s = "" | |
| 332 for i in range(2, len(seq), 3): | |
| 333 s += seq[i] | |
| 334 g3 = s.count("g") | |
| 335 c3 = s.count("c") | |
| 336 | |
| 337 # g12 and c12 : g and c in 1st and 2d positions of a codon | |
| 338 s = "" | |
| 339 for i in range(0, len(seq), 3): | |
| 340 s += seq[i] | |
| 341 g12 = s.count("g") | |
| 342 c12 = s.count("c") | |
| 343 s = "" | |
| 344 for i in range(1, len(seq), 3): | |
| 345 s += seq[i] | |
| 346 g12 += s.count("g") | |
| 347 c12 += s.count("c") | |
| 348 | |
| 349 return float(1000*(g12+g3+A-c12-c3-T))/(3*nb_codons) | |
| 350 | |
| 351 def cvp(dict_aatypes): | |
| 352 """ Compute the difference nb_charged_aamino_acids - nb_polar_amino_acids | |
| 353 Return: (int) | |
| 354 """ | |
| 355 return dict_aatypes["charged"] - dict_aatypes["polar"] | |
| 356 | |
| 357 # ------ The function ------ # | |
| 358 | |
| 359 nb_codons = nbCodons(seq) | |
| 360 max_indice = maxIndice(seq) | |
| 361 GC3, GC12 = gc12Andgc3Count(seq, nb_codons, max_indice) | |
| 362 IVYWREL, EKQH, PAYRESDGM = ivywrelCount(nb_codons, dict_aa_counts), ekqhCount(dict_aa_counts), payresdgmCount(dict_aa_counts) | |
| 363 purineload, CvP = purineLoad(seq, nb_codons), cvp(dict_aa_types) | |
| 364 return GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP | |
| 365 | |
| 366 # # # Function for codons, aa, aatypes Transitions ----------------------------------------------------------------------------- | |
| 367 | |
| 368 def computeAllBiases(seq1, seq2, dico_codons_transi, dico_aa_transi, dico_aatypes_transi, reversecode, reverseclassif) : | |
| 369 """ Compute all biases (transisitions codon->codon, aa->-aa, aa_type->aa_type) between two sequences | |
| 370 | |
| 371 Args : See sub-routines for details | |
| 372 | |
| 373 Returns 3 dictionaries of dictionaries. See sub-routines for details | |
| 374 """ | |
| 375 | |
| 376 # ------ Sub-routines ------ # | |
| 377 | |
| 378 def codons_transitions(seq1, seq2, dico_codons_transi): | |
| 379 """ Compute the number of transitions from a codon of a sequence to the codon of a second sequence | |
| 380 | |
| 381 Args : | |
| 382 seq1 (str) : the first sequence | |
| 383 seq2 (str) : the second sequence | |
| 384 dico_codons_transi (dict of dicts) : { codon1 : {codon1: 0, codon2 : 0, ...}, ..} | |
| 385 | |
| 386 Return : | |
| 387 codons_transi (dict of dicts) : the occurences of each codon to codon transition | |
| 388 """ | |
| 389 | |
| 390 codons_transi = copy.deepcopy(dico_codons_transi) | |
| 391 | |
| 392 for i in range(0, len(seq1), 3): | |
| 393 # check if no indel and if len seq[i:i+3] is really 3 (check for the last codon) | |
| 394 if viable([seq1, seq2], i, "any") and len(seq1[i:i+3]) == 3 and len(seq2[i:i+3]) == 3 : | |
| 395 codons_transi[seq1[i:i+3]][seq2[i:i+3]] += 1 | |
| 396 | |
| 397 return codons_transi | |
| 398 | |
| 399 def codons_transitions_freqs(codons_transitions_counts): | |
| 400 """ Computes frequencies of codon transitions between two sequences | |
| 401 | |
| 402 Arg : | |
| 403 codons_transitions_counts (dict) : the output of codons_transitions() | |
| 404 | |
| 405 Return : | |
| 406 codons_transi_freqs (dict of dicts) : the frequencies of each codon to codon transition | |
| 407 """ | |
| 408 | |
| 409 codons_transi_freqs = {} | |
| 410 | |
| 411 for key in codons_transitions_counts.keys(): | |
| 412 codons_transi_freqs[key] = {} | |
| 413 for key2 in codons_transitions_counts[key].keys(): | |
| 414 if sum(codons_transitions_counts[key].values()) != 0: | |
| 415 freq = float(codons_transitions_counts[key][key2])/sum(codons_transitions_counts[key].values()) | |
| 416 codons_transi_freqs[key][key2] = freq | |
| 417 else : | |
| 418 codons_transi_freqs[key][key2] = 0 | |
| 419 return codons_transi_freqs | |
| 420 | |
| 421 def aa_transitions(dico_codons_transi, dico_aa_transi, reversecode): | |
| 422 """ Compute the number of transitions from an amino-acid of a sequence to the amino-acid of a second sequence | |
| 423 | |
| 424 Args : | |
| 425 dico_codons_transi (dict of dicts) : the codons transitions computed by codons_transitions() | |
| 426 dico_aa_transi (dict of dicts) : { aa1 : {aa1: 0, aa2 : 0, ...}, ..} | |
| 427 reversecode (dict) : the reversed genetic code {aa1 : [codons],...} -> {codon1: aa1, codon2: aa2, ...} | |
| 428 | |
| 429 Return : | |
| 430 aa_transi (dict of dicts) : the occurences of each aa to aa transition | |
| 431 """ | |
| 432 | |
| 433 aa_transi = copy.deepcopy(dico_aa_transi) | |
| 434 | |
| 435 for k in dico_codons_transi.keys(): | |
| 436 newk = reversecode[k] | |
| 437 for k2 in dico_codons_transi[k].keys(): | |
| 438 newk2 = reversecode[k2] | |
| 439 aa_transi[newk][newk2] += dico_codons_transi[k][k2] | |
| 440 | |
| 441 return aa_transi | |
| 442 | |
| 443 def aa_transitions_freqs(aa_transitions_counts): | |
| 444 """ Computes frequencies of amico-acids transitions between two sequences | |
| 445 Arg : aa_transitions_counts (dict of dicts): the output of aa_transitions() | |
| 446 Return : aa_transi_freqs (dict of dicts) : the frequencies of each aa to aa transition | |
| 447 """ | |
| 448 | |
| 449 aa_transi_freqs = {} | |
| 450 | |
| 451 for key in aa_transitions_counts.keys(): | |
| 452 aa_transi_freqs[key] = {} | |
| 453 for key2 in aa_transitions_counts[key].keys(): | |
| 454 if sum(aa_transitions_counts[key].values()) != 0: | |
| 455 freq = float(aa_transitions_counts[key][key2])/sum(aa_transitions_counts[key].values()) | |
| 456 aa_transi_freqs[key][key2] = freq | |
| 457 else : | |
| 458 aa_transi_freqs[key][key2] = 0 | |
| 459 return aa_transi_freqs | |
| 460 | |
| 461 def aatypes_transitions(dico_aa_transi, dico_aatypes_transi, reverseclassif): | |
| 462 """ Compute the number of transitions from an amino-acid type of a sequence to the amino-acid type of a second sequence | |
| 463 | |
| 464 Args : | |
| 465 dico_aa_transi (dict of dicts) : the output of aa_transitions() | |
| 466 dico_aatypes_transi (dict of dicts) : { type1 : {type1: 0, type2 : 0, ...}, ..} | |
| 467 reverseclassif (dict) : the reversed amino-acid clasification {aa1: type, aa2: type, ...} | |
| 468 | |
| 469 Return : | |
| 470 aatypes_transi (dict of dicts) : the occurences of each aatype to aatype transition | |
| 471 """ | |
| 472 | |
| 473 aatypes_transi = copy.deepcopy(dico_aatypes_transi) | |
| 474 for k in dico_aa_transi.keys(): | |
| 475 newk = reverseclassif[k] | |
| 476 for k2 in dico_aa_transi[k].keys(): | |
| 477 newk2 = reverseclassif[k2] | |
| 478 aatypes_transi[newk][newk2] += dico_aa_transi[k][k2] | |
| 479 | |
| 480 return aatypes_transi | |
| 481 | |
| 482 def aatypes_transitions_freqs(aatypes_transitions_counts): | |
| 483 """ Computes frequencies of amico-acids types transitions between two sequences | |
| 484 Args : aatypes_transitions_counts (dict of dicts) : the output of aatypes_transitions() | |
| 485 Return : aatypes_transi_freqs (dict of dicts) : the frequencies of each aatype to aatype transition | |
| 486 """ | |
| 487 | |
| 488 aatypes_transi_freqs = {} | |
| 489 | |
| 490 for key in aatypes_transitions_counts.keys(): | |
| 491 aatypes_transi_freqs[key] = {} | |
| 492 for key2 in aatypes_transitions_counts[key].keys(): | |
| 493 if sum(aatypes_transitions_counts[key].values()) != 0: | |
| 494 freq = float(aatypes_transitions_counts[key][key2])/sum(aatypes_transitions_counts[key].values()) | |
| 495 aatypes_transi_freqs[key][key2] = freq | |
| 496 else : | |
| 497 aatypes_transi_freqs[key][key2] = 0 | |
| 498 return aatypes_transi_freqs | |
| 499 | |
| 500 | |
| 501 | |
| 502 | |
| 503 # ------ The function ------ # | |
| 504 | |
| 505 codons_transitions = codons_transitions(seq1, seq2, dico_codons_transi) | |
| 506 codons_transitions_freqs = codons_transitions_freqs(codons_transitions) | |
| 507 aa_transitions = aa_transitions(codons_transitions, dico_aa_transi, reversecode) | |
| 508 aa_transitions_freqs = aa_transitions_freqs(aa_transitions) | |
| 509 aatypes_transitions = aatypes_transitions(aa_transitions, dico_aatypes_transi, reverseclassif) | |
| 510 aatypes_transitions_freqs = aatypes_transitions_freqs(aatypes_transitions) | |
| 511 | |
| 512 return codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs | |
| 513 | |
| 514 def all_sed(codons_c, aa_c, aat_c, codons_transitions, aa_transitions, aatypes_transitions, dico_codons_transi, dico_aa_transi, dico_aatypes_transi): | |
| 515 | |
| 516 def compute_sed(transi, counts, dico): | |
| 517 """ Compute the substitution exchangeability disequilibrium (SED) from one species A to another B between codons/aa//aatypes couples | |
| 518 | |
| 519 Args: | |
| 520 transi ; dict - dictionaries of all counts of transition from codon/aa/aatype X to Y from sp A to sp B | |
| 521 counts : dict - dictionaries of codons/aa/aatypes counts in species A | |
| 522 dico : dict - a dictionary (nested) with all values to 0 | |
| 523 | |
| 524 """ | |
| 525 dict_sed = copy.deepcopy(dico) | |
| 526 | |
| 527 for key in transi.keys(): | |
| 528 for key2 in transi.keys(): | |
| 529 if counts[key] != 0 and float(transi[key2][key])/counts[key] != 0.0: | |
| 530 x = (float(transi[key][key2])/counts[key]) / (float(transi[key2][key])/counts[key]) | |
| 531 dict_sed[key][key2] = - pow(2,1-x)+1 | |
| 532 else : | |
| 533 dict_sed[key][key2] = 'NA' | |
| 534 | |
| 535 return dict_sed | |
| 536 | |
| 537 codons_sed = compute_sed(codons_transitions, codons_c, dico_codons_transi) | |
| 538 aa_sed = compute_sed(aa_transitions, aa_c, dico_aa_transi) | |
| 539 aatypes_sed = compute_sed(aatypes_transitions, aat_c, dico_aatypes_transi) | |
| 540 | |
| 541 return codons_sed, aa_sed, aatypes_sed | |
| 542 | |
| 543 # # # Function for random resampling -------------------------------------------------------------------------------------------- | |
| 544 | |
| 545 def sampling (dict_seq, nb_iter, len_sample, list_codons, genetic_code, aa_classif, reversecode, reverseclassif): | |
| 546 """ Resample randomly codons from sequences (sort of bootsrap) | |
| 547 | |
| 548 Args : | |
| 549 dict_seq (dict) : contains the species name and sequences used ( { 'sp1': seq1, 'sp2': seq2, ...}) without '-' removal | |
| 550 nb_iter (int) : number of resampling iterations (better >= 1000) | |
| 551 len_sample (int) : length (in codons) of a resampled sequence (better >= 1000) | |
| 552 list_codons (list of str): all codons except codons-stop | |
| 553 genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...} | |
| 554 aa_classif (dict) : the types of the amino-acids : {type: [aa1, aa2, ...], ...} | |
| 555 reversecode (dict) : the reversed genetic code : {codon1: aa1, codon2: aa2, ...} | |
| 556 reverseclassif (dict) : the reversed amino-acid : clasification {aa1: type, aa2: type, ...} | |
| 557 | |
| 558 Returns : | |
| 559 codons_lst, aa_lst, classif_lst (dicts) : keys : codons/aa/aatypes, values : [] | |
| 560 codons_transitions_lst, aa_transitions_lst, classif_transitions_lst (dict of dicts) : keys : codons/aa/aatypes, values : the 3 previous dicts | |
| 561 """ | |
| 562 | |
| 563 # Initialize empty dictionaries for countings and transitions. It's also possible to isntanciate these ones in the main() but it would make a function with ~14 parameters | |
| 564 codons_0, aa_0, classif_0, codons_transitions_0, aa_transitions_0, classif_transitions_0 = buildDicts(list_codons, 0, genetic_code, aa_classif) | |
| 565 codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst = buildDicts(list_codons, [], genetic_code, aa_classif) | |
| 566 | |
| 567 # determine the max position where sampling is possible | |
| 568 l = len(dict_seq.values()[1]) | |
| 569 if l%3 == 0: max_indice = l-3 | |
| 570 if l%3 == 1: max_indice = l-4 | |
| 571 if l%3 == 2: max_indice = l-5 | |
| 572 | |
| 573 # List of positions to resample | |
| 574 viable_positions = [pos for pos in range(0,max_indice,3) if viable(dict_seq.values(), pos, "all")] | |
| 575 sample_positions = np.random.choice(viable_positions, len_sample) | |
| 576 | |
| 577 # nb_iter resampled sequences | |
| 578 for i in range(nb_iter): | |
| 579 if (i+1)%(nb_iter/10) == 0: | |
| 580 print " "+str( (i+1)*100/nb_iter)+"%" | |
| 581 | |
| 582 seqa, seqb = "", "" | |
| 583 for pos in sample_positions: | |
| 584 codona, codonb = "---", "---" | |
| 585 # The sequence to be resampled in this position is randomly chosen ; no "-" resampled | |
| 586 while "-" in codona : | |
| 587 codona = dict_seq.values()[random.randrange(0, len(dict_seq.keys())-1)][pos:pos+3] | |
| 588 while "-" in codonb : | |
| 589 codonb = dict_seq.values()[random.randrange(0, len(dict_seq.keys())-1)][pos:pos+3] | |
| 590 seqa += codona | |
| 591 seqb += codonb | |
| 592 | |
| 593 # dictionaries : frequences of codons, aa, aatypes (seq1) | |
| 594 codons_occ_tmp, codons_freq_tmp, aa_occ_tmp, aa_freq_tmp, aatypes_occ_tmp, aatypes_freq_tmp = computeAllCountingsAndFreqs(seqa, list_codons, codons_0, aa_0, classif_0, genetic_code, aa_classif) | |
| 595 # dictionaries frequences of transitions (seqa->seqb) | |
| 596 codons_transitions_tmp, codons_transitions_freq_tmp, aa_transition_tmp, aa_transitions_freq_tmp, aatypes_transitions_tmp, aatypes_transitions_freq_tmp = computeAllBiases(seqa, seqb, codons_transitions_0, aa_transitions_0, classif_transitions_0, reversecode, reverseclassif) | |
| 597 | |
| 598 # Adding occurrences in final dicts | |
| 599 for key in codons_freq_tmp.keys(): | |
| 600 codons_lst[key].append(codons_freq_tmp[key]) | |
| 601 for key in aa_freq_tmp.keys(): | |
| 602 aa_lst[key].append(aa_freq_tmp[key]) | |
| 603 for key in aatypes_freq_tmp.keys(): | |
| 604 classif_lst[key].append(aatypes_freq_tmp[key]) | |
| 605 | |
| 606 # Adding occurrences in final dicts (transitions) | |
| 607 for key in codons_transitions_freq_tmp.keys(): | |
| 608 for key2 in codons_transitions_freq_tmp[key].keys(): | |
| 609 codons_transitions_lst[key][key2].append(codons_transitions_freq_tmp[key][key2]) | |
| 610 for key in aa_transitions_freq_tmp.keys(): | |
| 611 for key2 in aa_transitions_freq_tmp[key].keys(): | |
| 612 aa_transitions_lst[key][key2].append(aa_transitions_freq_tmp[key][key2]) | |
| 613 for key in aatypes_transitions_freq_tmp.keys(): | |
| 614 for key2 in aatypes_transitions_freq_tmp[key].keys(): | |
| 615 classif_transitions_lst[key][key2].append(aatypes_transitions_freq_tmp[key][key2]) | |
| 616 | |
| 617 return codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst | |
| 618 | |
| 619 def testPvalues(dict_counts, dict_resampling, nb_iter, method): | |
| 620 """ Computes where the observed value is located in the expected counting distribution | |
| 621 | |
| 622 Args : | |
| 623 dict_counts (dict) : observed frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() | |
| 624 dict_resampling (dict) : expected frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() within the sampling() function | |
| 625 | |
| 626 Return : | |
| 627 pvalue (dict, dict of dicts) : the pvalues of all observed countings (dict) and transitions (dict of dicts) | |
| 628 | |
| 629 | |
| 630 pnorm computes the pvalue to have a value inferior to the observed value under a normal distribution | |
| 631 One sided to left tail : | |
| 632 p < 0.05 indicates significantly lower counts | |
| 633 p > 0.95 indicates significantly higher counts | |
| 634 """ | |
| 635 | |
| 636 | |
| 637 def p_resampling(obs, values, nb_iter): | |
| 638 """ The pvalue is the proportion of bootsrapped values smaller than the observed value | |
| 639 If p = 0.025 : 2.5% of the bootstrapped values are smaller than the observed value | |
| 640 p < 0.025 : the obs value is most likely significantly lower. | |
| 641 If p = 0.975 : 97.5% of the bootstrapped values are smaller than the observed value | |
| 642 p > 0.975 the obs value is most likely significantly higher. | |
| 643 | |
| 644 Args : | |
| 645 obs : int or float - the observed value | |
| 646 values : list - values of resampling (int or floats) | |
| 647 nb_iter : int - the number of resampled values (=len(values)) | |
| 648 | |
| 649 Return : | |
| 650 pvalue (float) | |
| 651 """ | |
| 652 | |
| 653 num = len([x for x in values if x < obs]) | |
| 654 return float(num + 1) / (nb_iter+1) | |
| 655 | |
| 656 def testPvalue(obs, exp, nb_iter): | |
| 657 """ Compute a pvalue | |
| 658 | |
| 659 Args : | |
| 660 exp (list of floatsà : a list of length nb_iter, containing expected frequencies of a codon/aa/aatype at each iteration | |
| 661 obs (float) : the observed value | |
| 662 nb_iter (int) : the number of iterations for resampling | |
| 663 | |
| 664 Returns : | |
| 665 pvalue (float) | |
| 666 """ | |
| 667 | |
| 668 max_val = nb_iter-1 | |
| 669 min_val = 0 | |
| 670 test_val = (max_val+min_val)/2 | |
| 671 | |
| 672 while max_val-min_val > 1: | |
| 673 if obs > exp[test_val]: | |
| 674 min_val = test_val | |
| 675 test_val = (max_val+min_val)/2 | |
| 676 elif obs < exp[test_val]: | |
| 677 max_val = test_val | |
| 678 test_val = (max_val+min_val)/2 | |
| 679 else: | |
| 680 break | |
| 681 | |
| 682 pvalue = float(test_val+1)/(nb_iter+1) | |
| 683 | |
| 684 return pvalue | |
| 685 | |
| 686 # ------ The function ------ # | |
| 687 | |
| 688 pvalues = {} | |
| 689 | |
| 690 for key in dict_resampling.keys(): | |
| 691 if type(dict_resampling.values()[1]) is not dict : | |
| 692 if method == 'origin': | |
| 693 pvalues[key] = testPvalue(dict_counts[key], dict_resampling[key], nb_iter) | |
| 694 elif method == 'pnorm': | |
| 695 pvalues[key] = scipy.stats.norm.cdf(dict_counts[key], np.mean(dict_resampling[key]), np.std(dict_resampling[key])) | |
| 696 elif method == 'p_resampling': | |
| 697 pvalues[key] = p_resampling(dict_counts[key], dict_resampling[key], nb_iter) | |
| 698 else : | |
| 699 pvalues[key] = {} | |
| 700 for key2 in dict_resampling[key].keys(): | |
| 701 if method == 'origin': | |
| 702 pvalues[key][key2] = testPvalue(dict_counts[key][key2], dict_resampling[key][key2], nb_iter) | |
| 703 elif method == 'pnorm': | |
| 704 pvalues[key][key2] = scipy.stats.norm.cdf(dict_counts[key][key2], np.mean(dict_resampling[key][key2]), np.std(dict_resampling[key][key2])) | |
| 705 elif method == 'p_resampling': | |
| 706 pvalues[key][key2] = p_resampling(dict_counts[key][key2], dict_resampling[key][key2], nb_iter) | |
| 707 | |
| 708 return pvalues | |
| 709 | |
| 710 def main(): | |
| 711 | |
| 712 # arguments | |
| 713 parser = argparse.ArgumentParser() | |
| 714 parser.add_argument("sequences_file", help="File containing sequences (the output of the tool 'ConcatPhyl'") | |
| 715 parser.add_argument("considered_species", help="The species name, separated by commas (must be the same than in the sequences_file). It is possible to consider only a subset of species.") | |
| 716 parser.add_argument("species_for_bootstrap", help="The species which will be used for bootstrapping, separated by commas. It is possible to consider only a subset of species.") | |
| 717 parser.add_argument("iteration", help="The number of iterations for bootstrapping (better if => 1000)", type=int) | |
| 718 parser.add_argument("sample_length", help="The lenght of a bootstrapped sequences (better if >= 1000", type=int) | |
| 719 args = parser.parse_args() | |
| 720 | |
| 721 print "\n ------ Occurences and frequencies of codons, amino-acids, amino-acids types -------\n" | |
| 722 | |
| 723 print "The script counts the number of codons, amino acids, and types of amino acids in sequences," | |
| 724 print "as well as the mutation bias from one item to another between 2 sequences.\n" | |
| 725 | |
| 726 print "Counting are then compared to empirical p-values, obtained from bootstrapped sequences obtained from a subset of sequences." | |
| 727 print "In the output files, the pvalues indicate the position of the observed data in a distribution of empirical countings obtained from" | |
| 728 print "a resample of the data. Values above 0.95 indicate a significantly higher counting, values under 0.05 a significantly lower counting." | |
| 729 | |
| 730 print " Sequences file : {}".format(args.sequences_file) | |
| 731 print " Species retained for countings : {}\n".format(args.considered_species) | |
| 732 | |
| 733 print "Processing : reading input file, opening output files, building dictionaries." | |
| 734 | |
| 735 # make pairs | |
| 736 list_species = str.split(args.considered_species, ",") | |
| 737 list_species_boot = str.split(args.species_for_bootstrap, ",") | |
| 738 pairs_list=list(itertools.combinations(list_species,2)) | |
| 739 | |
| 740 # read sequences | |
| 741 sequences_for_counts = {} | |
| 742 sequences_for_resampling = {} | |
| 743 with open(args.sequences_file, "r") as file: | |
| 744 for line1,line2 in itertools.izip_longest(*[file]*2): | |
| 745 species = line1.strip(">\r\n") | |
| 746 sequence = line2.strip("\r\n") | |
| 747 if species in list_species: | |
| 748 sequences_for_counts[species] = sequence | |
| 749 if species in list_species_boot: | |
| 750 sequences_for_resampling[species] = sequence | |
| 751 | |
| 752 print " Warning : countings might be biased and show high differences between species because of high variations of the indels proportions among sequences." | |
| 753 print " Frequences are more representative." | |
| 754 | |
| 755 print "\n Indels percent :" | |
| 756 | |
| 757 for k,v in sequences_for_counts.items(): | |
| 758 print " {} : {} %".format(k, float(v.count("-"))/len(v)*100) | |
| 759 | |
| 760 # useful dictionaries | |
| 761 dict_genetic_code={"F":["ttt","ttc"], | |
| 762 "L":["tta","ttg","ctt","ctc","cta","ctg"], | |
| 763 "I":["att","atc","ata"], | |
| 764 "M":["atg"], | |
| 765 "V":["gtt","gtc","gta","gtg"], | |
| 766 "S":["tct","tcc","tca","tcg","agt","agc"], | |
| 767 "P":["cct","cca","ccg","ccc"], | |
| 768 "T":["act","acc","aca","acg"], | |
| 769 "A":["gct","gcc","gca","gcg"], | |
| 770 "Y":["tat","tac"], | |
| 771 "H":["cat","cac"], | |
| 772 "Q":["caa","cag"], | |
| 773 "N":["aat","aac"], | |
| 774 "K":["aaa","aag"], | |
| 775 "D":["gat","gac"], | |
| 776 "E":["gaa","gag"], | |
| 777 "C":["tgt","tgc"], | |
| 778 "W":["tgg"], | |
| 779 "R":["cgt","cgc","cga","cgg","aga","agg"], | |
| 780 "G":["ggt","ggc","gga","ggg"]} | |
| 781 | |
| 782 dict_aa_classif={"unpolar":["G","A","V","L","M","I"], | |
| 783 "polar":["S","T","C","P","N","Q"], | |
| 784 "charged":["K","R","H","D","E"], | |
| 785 "aromatics":["F","Y","W"]} | |
| 786 | |
| 787 reversecode={v:k for k in dict_genetic_code for v in dict_genetic_code[k]} | |
| 788 reverseclassif={v:k for k in dict_aa_classif for v in dict_aa_classif[k]} | |
| 789 | |
| 790 # codons list (without stop codons) | |
| 791 nucleotides = ['a', 'c', 'g', 't'] | |
| 792 list_codons = [''.join(comb) for comb in itertools.product(nucleotides, repeat=3)] | |
| 793 list_codons.remove("taa") | |
| 794 list_codons.remove("tag") | |
| 795 list_codons.remove("tga") | |
| 796 | |
| 797 # Store already computed species + row.names in output files | |
| 798 index = [] | |
| 799 index_transi = [] | |
| 800 | |
| 801 # Final dictionaries writed to csv files | |
| 802 all_codons = {} | |
| 803 all_aa = {} | |
| 804 all_aatypes = {} | |
| 805 all_various = {} | |
| 806 all_codons_transitions = {} # Not used because too much : 61*61 columns | |
| 807 all_aa_transitions = {} | |
| 808 all_aatypes_transitions = {} | |
| 809 | |
| 810 # RUN | |
| 811 | |
| 812 print "\nProcessing : resampling ..." | |
| 813 print " Parameters : {niter} iterations, {lensample} codon per resampled sequence, species used : {species}\n".format(niter=args.iteration, lensample=args.sample_length, species=args.species_for_bootstrap) | |
| 814 | |
| 815 codons_boot, aa_boot, aatypes_boot, codons_transi_boot, aa_transi_boot, aatypes_transi_boot = sampling(sequences_for_resampling, args.iteration, args.sample_length, list_codons, dict_genetic_code, dict_aa_classif, reversecode, reverseclassif) | |
| 816 print " Done.\n" | |
| 817 | |
| 818 print "Processing : countings....\n" | |
| 819 | |
| 820 # Initialize empty dictionaries for countings and transitions | |
| 821 init_dict_codons, init_dict_aa, init_dict_classif, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions = buildDicts(list_codons, 0, dict_genetic_code, dict_aa_classif) | |
| 822 | |
| 823 for pair in pairs_list: | |
| 824 p1, p2 = pair[0], pair[1] | |
| 825 if p1 not in index: | |
| 826 print "Countings on {}".format(p1) | |
| 827 | |
| 828 p1_codons_counts, p1_codons_freqs, p1_aa_counts, p1_aa_freqs, p1_aatypes_counts, p1_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p1], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif) | |
| 829 p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP = computeVarious(sequences_for_counts[p1], p1_aa_counts, p1_aatypes_freqs) | |
| 830 | |
| 831 | |
| 832 p1_codons_pvalues = testPvalues(p1_codons_freqs, codons_boot, args.iteration, 'p_resampling') | |
| 833 p1_aa_pvalues = testPvalues(p1_aa_freqs, aa_boot, args.iteration, 'p_resampling') | |
| 834 p1_aatypes_pvalues = testPvalues(p1_aatypes_freqs, aatypes_boot, args.iteration, 'p_resampling') | |
| 835 | |
| 836 all_codons[p1+"_obs_counts"] = p1_codons_counts | |
| 837 all_codons[p1+"_obs_freqs"] = p1_codons_freqs | |
| 838 all_codons[p1+"_pvalues"] = p1_codons_pvalues | |
| 839 all_aa[p1+"_obs_counts"] = p1_aa_counts | |
| 840 all_aa[p1+"_obs_freqs"] = p1_aa_freqs | |
| 841 all_aa[p1+"_pvalues"] = p1_aa_pvalues | |
| 842 all_aatypes[p1+"_obs_counts"] = p1_aatypes_counts | |
| 843 all_aatypes[p1+"_obs_freqs"] = p1_aatypes_freqs | |
| 844 all_aatypes[p1+"_pvalues"] = p1_aatypes_pvalues | |
| 845 all_various[p1] = [p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP] | |
| 846 | |
| 847 index.append(p1) | |
| 848 | |
| 849 if p2 not in index: | |
| 850 print "Countings on {}".format(p2) | |
| 851 | |
| 852 p2_codons_counts, p2_codons_freqs, p2_aa_counts, p2_aa_freqs, p2_aatypes_counts, p2_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p2], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif) | |
| 853 p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP = computeVarious(sequences_for_counts[p2], p2_aa_counts, p2_aatypes_freqs) | |
| 854 | |
| 855 p2_codons_pvalues = testPvalues(p2_codons_freqs, codons_boot, args.iteration, 'p_resampling') | |
| 856 p2_aa_pvalues = testPvalues(p2_aa_freqs, aa_boot, args.iteration, 'p_resampling') | |
| 857 p2_aatypes_pvalues = testPvalues(p2_aatypes_freqs, aatypes_boot, args.iteration, 'p_resampling') | |
| 858 | |
| 859 all_codons[p2+"_obs_counts"] = p2_codons_counts | |
| 860 all_codons[p2+"_obs_freqs"] = p2_codons_freqs | |
| 861 all_codons[p2+"_pvalues"] = p2_codons_pvalues | |
| 862 all_aa[p2+"_obs_counts"] = p2_aa_counts | |
| 863 all_aa[p2+"_obs_freqs"] = p2_aa_freqs | |
| 864 all_aa[p2+"_pvalues"] = p2_aa_pvalues | |
| 865 all_aatypes[p2+"_obs_counts"] = p2_aatypes_counts | |
| 866 all_aatypes[p2+"_obs_freqs"] = p2_aatypes_freqs | |
| 867 all_aatypes[p2+"_pvalues"] = p2_aatypes_pvalues | |
| 868 all_various[p2] = p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP | |
| 869 | |
| 870 index.append(p2) | |
| 871 | |
| 872 if (p1, p2) not in index_transi and p1 in sequences_for_counts and p2 in sequences_for_counts: | |
| 873 print "Countings transitions between {} and {}".format(p1, p2) | |
| 874 codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs = computeAllBiases(sequences_for_counts[p1], sequences_for_counts[p2], dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions, reversecode, reverseclassif) | |
| 875 | |
| 876 # Ajout | |
| 877 codons_sed, aa_sed, aatypes_sed = all_sed(p1_codons_counts, p1_aa_counts, p1_aatypes_counts, codons_transitions, aa_transitions, aatypes_transitions, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions) | |
| 878 | |
| 879 index_transi.append((p1,p2)) | |
| 880 | |
| 881 p1p2_codons_pvalues = testPvalues(codons_transitions_freqs, codons_transi_boot, args.iteration, 'p_resampling') | |
| 882 p1p2_aa_pvalues = testPvalues(aa_transitions_freqs, aa_transi_boot, args.iteration, 'p_resampling') | |
| 883 p1p2_aatypes_pvalues = testPvalues(aatypes_transitions_freqs, aatypes_transi_boot, args.iteration, 'p_resampling') | |
| 884 | |
| 885 all_codons_transitions[p1+">"+p2+"_obs_counts"] = codons_transitions | |
| 886 all_codons_transitions[p1+">"+p2+"_obs_freqs"] = codons_transitions_freqs | |
| 887 all_codons_transitions[p1+">"+p2+"_pvalues"] = p1p2_codons_pvalues | |
| 888 all_aa_transitions[p1+">"+p2+"_obs_counts"] = aa_transitions | |
| 889 all_aa_transitions[p1+">"+p2+"_obs_freqs"] = aa_transitions_freqs | |
| 890 all_aa_transitions[p1+">"+p2+"_pvalues"] = p1p2_aa_pvalues | |
| 891 all_aatypes_transitions[p1+">"+p2+"_obs_counts"] = aatypes_transitions | |
| 892 all_aatypes_transitions[p1+">"+p2+"_obs_freqs"] = aatypes_transitions_freqs | |
| 893 all_aatypes_transitions[p1+">"+p2+"_pvalues"] = p1p2_aatypes_pvalues | |
| 894 | |
| 895 all_codons_transitions[p1+">"+p2+"_sed"] = codons_sed | |
| 896 all_aa_transitions[p1+">"+p2+"_sed"] = aa_sed | |
| 897 all_aatypes_transitions[p1+">"+p2+"_sed"] = aatypes_sed | |
| 898 | |
| 899 index_transi.append((p1, p2)) | |
| 900 | |
| 901 print "\n Done.\n" | |
| 902 | |
| 903 print "Processing : creating dataframes ..." | |
| 904 | |
| 905 frame_codons = pd.DataFrame(all_codons).T.astype('object') | |
| 906 frame_aa = pd.DataFrame(all_aa).T.astype('object') | |
| 907 frame_aatypes = pd.DataFrame(all_aatypes).T.astype('object') | |
| 908 | |
| 909 frame_codons_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_codons_transitions.items()}).unstack() | |
| 910 frame_codons_transitions.columns = frame_codons_transitions.columns.map('>'.join) | |
| 911 | |
| 912 frame_aa_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_aa_transitions.items()}).unstack() | |
| 913 frame_aa_transitions.columns = frame_aa_transitions.columns.map('>'.join) | |
| 914 | |
| 915 frame_aatypes_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_aatypes_transitions.items()}).unstack() | |
| 916 frame_aatypes_transitions.columns = frame_aatypes_transitions.columns.map('>'.join) | |
| 917 | |
| 918 frame_various = pd.DataFrame(all_various).T | |
| 919 frame_various.columns = ["GC3","GC12","IVYWREL","EKQH","PAYRESDGM","purineload", "CvP"] | |
| 920 | |
| 921 frame_codons.index.name, frame_aa.index.name, frame_aatypes.index.name = "Species", "Species","Species" | |
| 922 frame_aa_transitions.index.name, frame_aatypes_transitions.index.name, frame_various.index.name = "Species","Species","Species" | |
| 923 | |
| 924 print "Writing dataframes to output files ...\n" | |
| 925 | |
| 926 frame_codons.round(8).to_csv("codons_freqs.csv", sep=",", encoding="utf-8") | |
| 927 frame_aa.round(8).to_csv("aa_freqs.csv", sep=",", encoding="utf-8") | |
| 928 frame_aatypes.astype('object').round(8).to_csv("aatypes_freqs.csv", sep=",", encoding="utf-8") | |
| 929 frame_codons_transitions.round(8).to_csv("codons_transitions_freqs.csv", sep=",", encoding="utf-8") | |
| 930 frame_aa_transitions.round(8).to_csv("aa_transitions_freqs.csv", sep=",", encoding="utf-8") | |
| 931 frame_aatypes_transitions.round(8).to_csv("aatypes_transitions_freqs.csv", sep=",", encoding="utf-8") | |
| 932 frame_various.round(8).to_csv("gc_and_others_freqs.csv", sep=",", encoding="utf-8") | |
| 933 | |
| 934 print "Done." | |
| 935 | |
| 936 if __name__ == "__main__": | |
| 937 main() |
