mutcount: scripts/S01a_codons_counting.py comparison

comparison scripts/S01a_codons_counting.py @ 0:acc3674e515b draft default tip

planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty

author	abims-sbr
date	Fri, 01 Feb 2019 10:28:50 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:acc3674e515b
+#!/usr/bin/env python
+# coding: utf-8
+# Author : Victor Mataigne
+import string, os, sys, re, random, itertools, argparse, copy, math
+import pandas as pd
+import numpy as np
+def buildDicts(list_codons, content, dict_genetic_code, dict_aa_classif):
+""" Build dictionaries with values to 0. These dictionaries are used as starting point for each sequence counting
+Args :
+list_codons (list of str) : all codons except codons-stop
+content (int or list) : an integer (for coutings and transitions), or an empty list (for resampling)
+dict_genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...}
+dict_aa_classif (dict) : the types of the amino-acids ( {type: [aa1, aa2, ...], ...}
+Returns :
+dico_codons, dico_aa, dico_aatypes (dicts) : keys : codons/amico-acids/amico-acids types, values : 0 or []
+dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions (dicts of dicts) :
+actually, the three first dictionaries nested as values of keys codons/amico-acids/amico-acids
+"""
+# I could have make sub-routines here.
+# the copy commands are mandatory, otherwise all dictionaries will reference the same variable(s)
+dico_codons = {}
+for codon in list_codons:
+dico_codons[codon] = copy.deepcopy(content)
+dico_aa = {}
+for aa in dict_genetic_code.keys():
+dico_aa[aa] = copy.deepcopy(content)
+dico_aatypes = {}
+for aatype in dict_aa_classif.keys():
+dico_aatypes[aatype] = copy.deepcopy(content)
+dico_codons_transitions=copy.deepcopy(dico_codons)
+for key in dico_codons_transitions.keys():
+dico_codons_transitions[key]=copy.deepcopy(dico_codons)
+dico_aa_transitions = copy.deepcopy(dico_aa)
+for key in dico_aa_transitions.keys():
+dico_aa_transitions[key]=copy.deepcopy(dico_aa)
+dico_aatypes_transitions = copy.deepcopy(dico_aatypes)
+for key in dico_aatypes_transitions.keys():
+dico_aatypes_transitions[key]=copy.deepcopy(dico_aatypes)
+return dico_codons, dico_aa, dico_aatypes, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions
+def viable(seqs, pos, method):
+""" Compute if, among a set of sequences, either at least one of the codons at the specified position has not a "-",
+or not any codon has a "-"
+Args :
+seqs : a list (the sequences, which must all have the same size)
+pos : an integer (the positions, <= len(seqs) -3)
+Returns:
+bool
+"""
+codons = [seq[pos:pos+3] for seq in seqs]
+if method == "all":
+return not all("-" in codon for codon in codons)
+elif method == "any":
+return not any("-" in codon for codon in codons)
+# # # Function for codons, aa, aatypes Countings -------------------------------------------------------------------------------
+def computeAllCountingsAndFreqs(seq, list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif):
+""" Call all functions dedicated to the computation of occurences and frequencies of codons, amino-acids, amino-acids types
+Args : see sub-routines
+Returns : 6 dictionaries (occurences and frequencies for codons, amino-acids, amino-acids types). See sub-routines for details.
+"""
+# ------ Sub-routines ------ #
+def codonsCountings(seq, list_codons, init_dict_codons):
+""" Count occurences of each codon in a sequence
+First reading frame only : input sequence is supposed to be an ORF
+Args :
+seq (str) : the sequence
+list_codons (list of str) : all codons except codons-stop
+init_dict_codons (dict) : {codon1 : 0, codon2: 0, ...}
+Return :
+codon (dict) : codons (keys) and their occurences (values) in the sequence
+"""
+codons = copy.deepcopy(init_dict_codons)
+l = len(seq)
+if l%3 == 0: max_indice = l-3
+if l%3 == 1: max_indice = l-4
+if l%3 == 2: max_indice = l-5
+for codon in range(0,max_indice+1,3):
+if "-" not in seq[codon:codon+3] :
+codons[seq[codon:codon+3]] += 1
+return codons
+def codonsFreqs(dict_codons_counts):
+""" Computes frequencies of each codon in a sequence
+Args :
+dict_codons (dict) : the output of codonsCounting()
+Return :
+codons_freqs (dict) : codons (keys) and their frequencies (values)
+"""
+codons_freqs = {}
+for key in dict_codons_counts.keys():
+freq = float(dict_codons_counts[key])/sum(dict_codons_counts.values())
+codons_freqs[key] = freq
+return codons_freqs
+def aaCountings(dict_codons, dict_genetic_code, init_dict_aa):
+""" Count occurences of each amino-acid in a sequence, based on the countings of codons (1st ORF)
+Args :
+dict_codons (dict) : the output of codonsCounting()
+dict_genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...}
+init_dict_aa (dict) : {aa1 : 0, aa2: 0, ...}
+Return :
+dict_aa (dict) : amino-acids (keys) and their occurences (values)
+"""
+dict_aa = copy.deepcopy(init_dict_aa)
+for key in dict_codons.keys():
+for value in dict_genetic_code.values():
+if key in value:
+dict_aa[dict_genetic_code.keys()[dict_genetic_code.values().index(value)]] += dict_codons[key]
+return dict_aa
+def aaFreqs(dict_aa_counts):
+""" Computes frequencies of each amino-acid in a sequence, based on the countings of codons (1st ORF)
+Args :
+dict_aa_counts (dict) : the output of aaCountings()
+Return :
+dict_aa_freqs (dict) : amino-acids (keys) and their frequencies (values)
+"""
+dict_aa_freqs = {}
+for key in dict_aa_counts.keys():
+freq = float(dict_aa_counts[key])/sum(dict_aa_counts.values())
+dict_aa_freqs[key] = freq
+return dict_aa_freqs
+def aatypesCountings(dict_aa, dict_aa_classif, init_dict_classif):
+""" Computes frequencies of each amino-acid type in a sequence, based on the countings of amino-acids (1st ORF)
+Args :
+- dict_aa (dict) : the output of aaCountings()
+- dict_aa_classif (dict) : the types of the amino-acids ( {type: [aa1, aa2, ...], ...} )
+- init_dict_classif (dict) : {'polar': 0, 'apolar': 0, ...}
+Return :
+dict_aatypes (dict) : amino-acids types (keys) and their occurences (values)
+"""
+dict_aatypes = copy.deepcopy(init_dict_classif)
+for key_classif in dict_aa_classif.keys():
+for key_aa in dict_aa.keys():
+if key_aa in dict_aa_classif[key_classif]:
+dict_aatypes[key_classif] += dict_aa[key_aa]
+return dict_aatypes
+def aatypesFreqs(dict_aatypes):
+""" Computes frequencies of each amino-acid type in a sequence, based on the countings of amino-acids (1st ORF)
+Args :
+dict_aatypes (dict) : the output of aatypesCountings()
+Return :
+dict_aatypes_freqs : amino-acids types (keys) and their frequencies (values)
+"""
+dict_aatypes_freqs = {}
+for key in dict_aatypes.keys():
+freq = float(dict_aatypes[key])/sum(dict_aatypes.values())
+dict_aatypes_freqs[key] = freq
+return dict_aatypes_freqs
+# ------ The function ------ #
+codons_c = codonsCountings(seq, list_codons, init_dict_codons)
+codons_f = codonsFreqs(codons_c)
+aa_c = aaCountings(codons_c, dict_genetic_code, init_dict_aa)
+aa_f = aaFreqs(aa_c)
+aat_c = aatypesCountings(aa_c, dict_aa_classif, init_dict_classif)
+aat_f = aatypesFreqs(aat_c)
+return codons_c, codons_f, aa_c, aa_f, aat_c, aat_f
+# # # Functions for various measures (ivywrel, ekqh...) -------------------------------------------------------------------------
+def computeVarious(seq, dict_aa_counts, dict_aa_types):
+""" Call al the functions for nucleic and amino-acids sequences description
+Args : See sub-routines for details.
+Returns : 6 integer or floats. See sub-routines for details.
+"""
+# ------ Sub-routines ------ #
+def nbCodons(seq):
+""" Compute the number of full codons in a sequence
+Arg : seq (str): the sequence
+Return: nb_codons (int)
+"""
+l = len(seq)
+if l%3 == 0: nb_codons = l/3
+if l%3 == 1: nb_codons = (l-1)/3
+if l%3 == 2: nb_codons = (l-2)/3
+return nb_codons
+def maxIndice(seq):
+""" Compute the highest indice for parsing a sequence
+Arg : seq (str): the sequence
+Return : max_indice (int)
+"""
+l = len(seq)
+if l%3 == 0: max_indice = l-3
+if l%3 == 1: max_indice = l-4
+if l%3 == 2: max_indice = l-5
+return max_indice
+def gc12Andgc3Count(seq, nb_codons, max_indice):
+""" Compute the frequency of gc12 in a sequence
+Arg : seq (str) : the sequence
+Return : (float)
+"""
+# TO IMPROVE ? : make this computation in the codonCountigns() function to avoid parsing twice the sequence ?
+# But : involves a less readable code
+gc12 = 0
+gc3 = 0
+for i in range(0, max_indice+1,3):
+if seq[i] in ["c","g"]: gc12 += 1
+if seq[i+1] in ["c","g"]: gc12 += 1
+if seq[i+2] in ["c","g"] or seq[i+2] in ["c","g"]: gc3 += 1
+return float(gc3)/nb_codons, float(gc12)/(2*nb_codons)
+def ivywrelCount(nb_codons, dict_aa_counts):
+""" Compute the sum of occurences of amino-acids IVYWREL divided by the number of codons
+Args :
+nb_codons (int) : the number of codons in the sequence
+dict_aa_counts (dict) : the output of aaCountings()
+return : (float)
+"""
+IVYWREL = 0
+for aa in ["I","V","Y","W","R","E","L"]: # Impossible to make a simple sum, in case one the aa is not in the dict keys
+if aa in dict_aa_counts.keys():
+IVYWREL += dict_aa_counts[aa]
+return float(IVYWREL)/nb_codons
+def ekqhCount(dict_aa_counts):
+""" Compute the ratio of amino-acids EK/QH
+Arg : dict_aa_counts (dict) : the output of aaCountings()
+Return : (float)
+"""
+ek = 0
+qh = 0
+ek = dict_aa_counts["E"] + dict_aa_counts["K"]
+qh = dict_aa_counts["Q"] + dict_aa_counts["H"]
+if qh != 0:
+return float(ek)/qh
+else : return ek
+def payresdgmCount(dict_aa_counts):
+""" Compute the ratio of amino-acids PASRE/SDGM
+Arg : dict_aa_counts (dict) : the output of aaCountings()
+Return : (float)
+"""
+payre = 0
+sdgm = 0
+for aa in ["P","A","Y","R","E"]:
+payre += dict_aa_counts[aa]
+for aa in ["S","D","G","M"]:
+sdgm += dict_aa_counts[aa]
+if sdgm != 0:
+return float(payre)/sdgm
+else : return payre
+def purineLoad(seq, nb_codons):
+""" Compute the purine load indice of a sequence
+Args :
+seq (str) : the sequence
+nb_codons (int) : the number of codons in the sequence
+Return  (float)
+"""
+# TO IMPROVE ? : make this computation in the codonCountigns() function to avoid parsing twice the sequence ?
+# But : involves a less readable code
+g12, g3, A, c12, c3, T = 0.0,0.0,seq.count("a"),0.0,0.0,seq.count("t")
+# g3 and c3 : g and c in 3rd position of a codon
+s = ""
+for i in range(2, len(seq), 3):
+s += seq[i]
+g3 = s.count("g")
+c3 = s.count("c")
+# g12 and c12 : g and c in 1st and 2d positions of a codon
+s = ""
+for i in range(0, len(seq), 3):
+s += seq[i]
+g12 = s.count("g")
+c12 = s.count("c")
+s = ""
+for i in range(1, len(seq), 3):
+s += seq[i]
+g12 += s.count("g")
+c12 += s.count("c")
+return float(1000*(g12+g3+A-c12-c3-T))/(3*nb_codons)
+def cvp(dict_aatypes):
+""" Compute the difference nb_charged_aamino_acids - nb_polar_amino_acids
+Return: (int)
+"""
+return dict_aatypes["charged"] - dict_aatypes["polar"]
+# ------ The function ------ #
+nb_codons = nbCodons(seq)
+max_indice = maxIndice(seq)
+GC3, GC12 = gc12Andgc3Count(seq, nb_codons, max_indice)
+IVYWREL, EKQH, PAYRESDGM = ivywrelCount(nb_codons, dict_aa_counts), ekqhCount(dict_aa_counts), payresdgmCount(dict_aa_counts)
+purineload, CvP = purineLoad(seq, nb_codons), cvp(dict_aa_types)
+return GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP
+# # # Function for codons, aa, aatypes Transitions -----------------------------------------------------------------------------
+def computeAllBiases(seq1, seq2, dico_codons_transi, dico_aa_transi, dico_aatypes_transi, reversecode, reverseclassif) :
+""" Compute all biases (transisitions codon->codon, aa->-aa, aa_type->aa_type) between two sequences
+Args : See sub-routines for details
+Returns 3 dictionaries of dictionaries. See sub-routines for details
+"""
+# ------ Sub-routines ------ #
+def codons_transitions(seq1, seq2, dico_codons_transi):
+""" Compute the number of transitions from a codon of a sequence to the codon of a second sequence
+Args :
+seq1 (str) : the first sequence
+seq2 (str) : the second sequence
+dico_codons_transi (dict of dicts) : { codon1 : {codon1: 0, codon2 : 0, ...}, ..}
+Return :
+codons_transi (dict of dicts) : the occurences of each codon to codon transition
+"""
+codons_transi = copy.deepcopy(dico_codons_transi)
+for i in range(0, len(seq1), 3):
+# check if no indel and if len seq[i:i+3] is really 3 (check for the last codon)
+if viable([seq1, seq2], i, "any") and len(seq1[i:i+3]) == 3 and len(seq2[i:i+3]) == 3 :
+codons_transi[seq1[i:i+3]][seq2[i:i+3]] += 1
+return codons_transi
+def codons_transitions_freqs(codons_transitions_counts):
+""" Computes frequencies of codon transitions between two sequences
+Arg :
+codons_transitions_counts (dict) : the output of codons_transitions()
+Return :
+codons_transi_freqs (dict of dicts) : the frequencies of each codon to codon transition
+"""
+codons_transi_freqs = {}
+for key in codons_transitions_counts.keys():
+codons_transi_freqs[key] = {}
+for key2 in codons_transitions_counts[key].keys():
+if sum(codons_transitions_counts[key].values()) != 0:
+freq = float(codons_transitions_counts[key][key2])/sum(codons_transitions_counts[key].values())
+codons_transi_freqs[key][key2] = freq
+else :
+codons_transi_freqs[key][key2] = 0
+return codons_transi_freqs
+def aa_transitions(dico_codons_transi, dico_aa_transi, reversecode):
+""" Compute the number of transitions from an amino-acid of a sequence to the amino-acid of a second sequence
+Args :
+dico_codons_transi (dict of dicts) : the codons transitions computed by codons_transitions()
+dico_aa_transi (dict of dicts) : { aa1 : {aa1: 0, aa2 : 0, ...}, ..}
+reversecode (dict) : the reversed genetic code {aa1 : [codons],...} -> {codon1: aa1, codon2: aa2, ...}
+Return :
+aa_transi (dict of dicts) : the occurences of each aa to aa transition
+"""
+aa_transi = copy.deepcopy(dico_aa_transi)
+for k in dico_codons_transi.keys():
+newk = reversecode[k]
+for k2 in dico_codons_transi[k].keys():
+newk2 = reversecode[k2]
+aa_transi[newk][newk2] += dico_codons_transi[k][k2]
+return aa_transi
+def aa_transitions_freqs(aa_transitions_counts):
+""" Computes frequencies of amico-acids transitions between two sequences
+Arg : aa_transitions_counts (dict of dicts): the output of aa_transitions()
+Return : aa_transi_freqs (dict of dicts) : the frequencies of each aa to aa transition
+"""
+aa_transi_freqs = {}
+for key in aa_transitions_counts.keys():
+aa_transi_freqs[key] = {}
+for key2 in aa_transitions_counts[key].keys():
+if sum(aa_transitions_counts[key].values()) != 0:
+freq = float(aa_transitions_counts[key][key2])/sum(aa_transitions_counts[key].values())
+aa_transi_freqs[key][key2] = freq
+else :
+aa_transi_freqs[key][key2] = 0
+return aa_transi_freqs
+def aatypes_transitions(dico_aa_transi, dico_aatypes_transi, reverseclassif):
+""" Compute the number of transitions from an amino-acid type of a sequence to the amino-acid type of a second sequence
+Args :
+dico_aa_transi (dict of dicts) : the output of aa_transitions()
+dico_aatypes_transi (dict of dicts) : { type1 : {type1: 0, type2 : 0, ...}, ..}
+reverseclassif (dict) : the reversed amino-acid clasification {aa1: type, aa2: type, ...}
+Return :
+aatypes_transi (dict of dicts) : the occurences of each aatype to aatype transition
+"""
+aatypes_transi = copy.deepcopy(dico_aatypes_transi)
+for k in dico_aa_transi.keys():
+newk = reverseclassif[k]
+for k2 in dico_aa_transi[k].keys():
+newk2 = reverseclassif[k2]
+aatypes_transi[newk][newk2] += dico_aa_transi[k][k2]
+return aatypes_transi
+def aatypes_transitions_freqs(aatypes_transitions_counts):
+""" Computes frequencies of amico-acids types transitions between two sequences
+Args : aatypes_transitions_counts (dict of dicts) : the output of aatypes_transitions()
+Return : aatypes_transi_freqs (dict of dicts) : the frequencies of each aatype to aatype transition
+"""
+aatypes_transi_freqs = {}
+for key in aatypes_transitions_counts.keys():
+aatypes_transi_freqs[key] = {}
+for key2 in aatypes_transitions_counts[key].keys():
+if sum(aatypes_transitions_counts[key].values()) != 0:
+freq = float(aatypes_transitions_counts[key][key2])/sum(aatypes_transitions_counts[key].values())
+aatypes_transi_freqs[key][key2] = freq
+else :
+aatypes_transi_freqs[key][key2] = 0
+return aatypes_transi_freqs
+# ------ The function ------ #
+codons_transitions = codons_transitions(seq1, seq2, dico_codons_transi)
+codons_transitions_freqs = codons_transitions_freqs(codons_transitions)
+aa_transitions = aa_transitions(codons_transitions, dico_aa_transi, reversecode)
+aa_transitions_freqs = aa_transitions_freqs(aa_transitions)
+aatypes_transitions = aatypes_transitions(aa_transitions, dico_aatypes_transi, reverseclassif)
+aatypes_transitions_freqs = aatypes_transitions_freqs(aatypes_transitions)
+return codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs
+def all_sed(codons_c, aa_c, aat_c, codons_transitions, aa_transitions, aatypes_transitions, dico_codons_transi, dico_aa_transi, dico_aatypes_transi):
+def compute_sed(transi, counts, dico):
+""" Compute the substitution exchangeability disequilibrium (SED) from one species A to another B between codons/aa//aatypes couples
+Args:
+transi ; dict - dictionaries of all counts of transition from codon/aa/aatype X to Y from sp A to sp B
+counts : dict - dictionaries of codons/aa/aatypes counts in species A
+dico : dict - a dictionary (nested) with all values to 0
+"""
+dict_sed = copy.deepcopy(dico)
+for key in transi.keys():
+for key2 in transi.keys():
+if counts[key] != 0 and float(transi[key2][key])/counts[key] != 0.0:
+x = (float(transi[key][key2])/counts[key]) / (float(transi[key2][key])/counts[key])
+dict_sed[key][key2] = - pow(2,1-x)+1
+else :
+dict_sed[key][key2] = 'NA'
+return dict_sed
+codons_sed = compute_sed(codons_transitions, codons_c, dico_codons_transi)
+aa_sed = compute_sed(aa_transitions, aa_c, dico_aa_transi)
+aatypes_sed = compute_sed(aatypes_transitions, aat_c, dico_aatypes_transi)
+return codons_sed, aa_sed, aatypes_sed
+# # # Function for random resampling --------------------------------------------------------------------------------------------
+def sampling (dict_seq, nb_iter, len_sample, list_codons, genetic_code, aa_classif, reversecode, reverseclassif):
+""" Resample randomly codons from sequences (sort of bootsrap)
+Args :
+dict_seq (dict) : contains the species name and sequences used ( { 'sp1': seq1, 'sp2': seq2, ...}) without '-' removal
+nb_iter (int) : number of resampling iterations (better >= 1000)
+len_sample (int) : length (in codons) of a resampled sequence (better >= 1000)
+list_codons (list of str): all codons except codons-stop
+genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...}
+aa_classif (dict) : the types of the amino-acids : {type: [aa1, aa2, ...], ...}
+reversecode (dict) : the reversed genetic code : {codon1: aa1, codon2: aa2, ...}
+reverseclassif (dict) : the reversed amino-acid : clasification {aa1: type, aa2: type, ...}
+Returns :
+codons_lst, aa_lst, classif_lst (dicts) : keys : codons/aa/aatypes, values : []
+codons_transitions_lst, aa_transitions_lst, classif_transitions_lst (dict of dicts) : keys : codons/aa/aatypes, values : the 3 previous dicts
+"""
+# Initialize empty dictionaries for countings and transitions. It's also possible to isntanciate these ones in the main() but it would make a function with ~14 parameters
+codons_0, aa_0, classif_0, codons_transitions_0, aa_transitions_0, classif_transitions_0 = buildDicts(list_codons, 0, genetic_code, aa_classif)
+codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst = buildDicts(list_codons, [], genetic_code, aa_classif)
+# determine the max position where sampling is possible
+l = len(dict_seq.values()[1])
+if l%3 == 0: max_indice = l-3
+if l%3 == 1: max_indice = l-4
+if l%3 == 2: max_indice = l-5
+# List of positions to resample
+viable_positions = [pos for pos in range(0,max_indice,3) if viable(dict_seq.values(), pos, "all")]
+sample_positions = np.random.choice(viable_positions, len_sample)
+# nb_iter resampled sequences
+for i in range(nb_iter):
+if (i+1)%(nb_iter/10) == 0:
+print "    "+str( (i+1)*100/nb_iter)+"%"
+seqa, seqb = "", ""
+for pos in sample_positions:
+codona, codonb = "---", "---"
+# The sequence to be resampled in this position is randomly chosen  ; no "-" resampled
+while "-" in codona :
+codona = dict_seq.values()[random.randrange(0, len(dict_seq.keys())-1)][pos:pos+3]
+while "-" in codonb :
+codonb = dict_seq.values()[random.randrange(0, len(dict_seq.keys())-1)][pos:pos+3]
+seqa += codona
+seqb += codonb
+# dictionaries : frequences of codons, aa, aatypes (seq1)
+codons_occ_tmp, codons_freq_tmp, aa_occ_tmp, aa_freq_tmp, aatypes_occ_tmp, aatypes_freq_tmp = computeAllCountingsAndFreqs(seqa, list_codons, codons_0, aa_0, classif_0, genetic_code, aa_classif)
+# dictionaries frequences of transitions (seqa->seqb)
+codons_transitions_tmp, codons_transitions_freq_tmp, aa_transition_tmp, aa_transitions_freq_tmp, aatypes_transitions_tmp, aatypes_transitions_freq_tmp = computeAllBiases(seqa, seqb, codons_transitions_0, aa_transitions_0, classif_transitions_0, reversecode, reverseclassif)
+# Adding occurrences in final dicts
+for key in codons_freq_tmp.keys():
+codons_lst[key].append(codons_freq_tmp[key])
+for key in aa_freq_tmp.keys():
+aa_lst[key].append(aa_freq_tmp[key])
+for key in aatypes_freq_tmp.keys():
+classif_lst[key].append(aatypes_freq_tmp[key])
+# Adding occurrences in final dicts (transitions)
+for key in codons_transitions_freq_tmp.keys():
+for key2 in codons_transitions_freq_tmp[key].keys():
+codons_transitions_lst[key][key2].append(codons_transitions_freq_tmp[key][key2])
+for key in aa_transitions_freq_tmp.keys():
+for key2 in aa_transitions_freq_tmp[key].keys():
+aa_transitions_lst[key][key2].append(aa_transitions_freq_tmp[key][key2])
+for key in aatypes_transitions_freq_tmp.keys():
+for key2 in aatypes_transitions_freq_tmp[key].keys():
+classif_transitions_lst[key][key2].append(aatypes_transitions_freq_tmp[key][key2])
+return codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst
+def testPvalues(dict_counts, dict_resampling, nb_iter, method):
+""" Computes where the observed value is located in the expected counting distribution
+Args :
+dict_counts (dict) : observed frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases()
+dict_resampling (dict) : expected frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() within the sampling() function
+Return :
+pvalue (dict, dict of dicts) : the pvalues of all observed countings (dict) and transitions (dict of dicts)
+pnorm computes the pvalue to have a value inferior to the observed value under a normal distribution
+One sided to left tail :
+p < 0.05 indicates significantly lower counts
+p > 0.95 indicates significantly higher counts
+"""
+def p_resampling(obs, values, nb_iter):
+""" The pvalue is the proportion of bootsrapped values smaller than the observed value
+If p = 0.025 : 2.5% of the bootstrapped values are smaller than the observed value
+p < 0.025 : the obs value is most likely significantly lower.
+If p = 0.975 : 97.5% of the bootstrapped values are smaller than the observed value
+p > 0.975   the obs value is most likely significantly higher.
+Args :
+obs : int or float - the observed value
+values : list - values of resampling (int or floats)
+nb_iter : int - the number of resampled values (=len(values))
+Return :
+pvalue (float)
+"""
+num = len([x for x in values if x < obs])
+return float(num + 1) / (nb_iter+1)
+def testPvalue(obs, exp, nb_iter):
+""" Compute a pvalue
+Args :
+exp (list of floatsà : a list of length nb_iter, containing expected frequencies of a codon/aa/aatype at each iteration
+obs (float) : the observed value
+nb_iter (int) : the number of iterations for resampling
+Returns :
+pvalue (float)
+"""
+max_val = nb_iter-1
+min_val = 0
+test_val = (max_val+min_val)/2
+while max_val-min_val > 1:
+if obs > exp[test_val]:
+min_val = test_val
+test_val = (max_val+min_val)/2
+elif obs < exp[test_val]:
+max_val = test_val
+test_val = (max_val+min_val)/2
+else:
+break
+pvalue = float(test_val+1)/(nb_iter+1)
+return pvalue
+# ------ The function ------ #
+pvalues = {}
+for key in dict_resampling.keys():
+if type(dict_resampling.values()[1]) is not dict :
+if method == 'origin':
+pvalues[key] = testPvalue(dict_counts[key], dict_resampling[key], nb_iter)
+elif method == 'pnorm':
+pvalues[key] = scipy.stats.norm.cdf(dict_counts[key], np.mean(dict_resampling[key]), np.std(dict_resampling[key]))
+elif method == 'p_resampling':
+pvalues[key] = p_resampling(dict_counts[key], dict_resampling[key], nb_iter)
+else :
+pvalues[key] = {}
+for key2 in dict_resampling[key].keys():
+if method == 'origin':
+pvalues[key][key2] = testPvalue(dict_counts[key][key2], dict_resampling[key][key2], nb_iter)
+elif method == 'pnorm':
+pvalues[key][key2] = scipy.stats.norm.cdf(dict_counts[key][key2], np.mean(dict_resampling[key][key2]), np.std(dict_resampling[key][key2]))
+elif method == 'p_resampling':
+pvalues[key][key2] = p_resampling(dict_counts[key][key2], dict_resampling[key][key2], nb_iter)
+return pvalues
+def main():
+# arguments
+parser = argparse.ArgumentParser()
+parser.add_argument("sequences_file", help="File containing sequences (the output of the tool 'ConcatPhyl'")
+parser.add_argument("considered_species", help="The species name, separated by commas (must be the same than in the sequences_file). It is possible to consider only a subset of species.")
+parser.add_argument("species_for_bootstrap", help="The species which will be used for bootstrapping, separated by commas. It is possible to consider only a subset of species.")
+parser.add_argument("iteration", help="The number of iterations for bootstrapping (better if => 1000)", type=int)
+parser.add_argument("sample_length", help="The lenght of a bootstrapped sequences (better if >= 1000", type=int)
+args = parser.parse_args()
+print "\n ------ Occurences and frequencies of codons, amino-acids, amino-acids types -------\n"
+print "The script counts the number of codons, amino acids, and types of amino acids in sequences,"
+print "as well as the mutation bias from one item to another between 2 sequences.\n"
+print "Counting are then compared to empirical p-values, obtained from bootstrapped sequences obtained from a subset of sequences."
+print "In the output files, the pvalues indicate the position of the observed data in a distribution of empirical countings obtained from"
+print "a resample of the data. Values above 0.95 indicate a significantly higher counting, values under 0.05 a significantly lower counting."
+print "    Sequences file : {}".format(args.sequences_file)
+print "    Species retained for countings : {}\n".format(args.considered_species)
+print "Processing : reading input file, opening output files, building dictionaries."
+# make pairs
+list_species = str.split(args.considered_species, ",")
+list_species_boot = str.split(args.species_for_bootstrap, ",")
+pairs_list=list(itertools.combinations(list_species,2))
+# read sequences
+sequences_for_counts = {}
+sequences_for_resampling = {}
+with open(args.sequences_file, "r") as file:
+for line1,line2 in itertools.izip_longest(*[file]*2):
+species = line1.strip(">\r\n")
+sequence = line2.strip("\r\n")
+if species in list_species:
+sequences_for_counts[species] = sequence
+if species in list_species_boot:
+sequences_for_resampling[species] = sequence
+print "    Warning : countings might be biased and show high differences between species because of high variations of the indels proportions among sequences."
+print "    Frequences are more representative."
+print "\n    Indels percent :"
+for k,v in sequences_for_counts.items():
+print "    {} : {} %".format(k, float(v.count("-"))/len(v)*100)
+# useful dictionaries
+dict_genetic_code={"F":["ttt","ttc"],
+"L":["tta","ttg","ctt","ctc","cta","ctg"],
+"I":["att","atc","ata"],
+"M":["atg"],
+"V":["gtt","gtc","gta","gtg"],
+"S":["tct","tcc","tca","tcg","agt","agc"],
+"P":["cct","cca","ccg","ccc"],
+"T":["act","acc","aca","acg"],
+"A":["gct","gcc","gca","gcg"],
+"Y":["tat","tac"],
+"H":["cat","cac"],
+"Q":["caa","cag"],
+"N":["aat","aac"],
+"K":["aaa","aag"],
+"D":["gat","gac"],
+"E":["gaa","gag"],
+"C":["tgt","tgc"],
+"W":["tgg"],
+"R":["cgt","cgc","cga","cgg","aga","agg"],
+"G":["ggt","ggc","gga","ggg"]}
+dict_aa_classif={"unpolar":["G","A","V","L","M","I"],
+"polar":["S","T","C","P","N","Q"],
+"charged":["K","R","H","D","E"],
+"aromatics":["F","Y","W"]}
+reversecode={v:k for k in dict_genetic_code for v in dict_genetic_code[k]}
+reverseclassif={v:k for k in dict_aa_classif for v in dict_aa_classif[k]}
+# codons list (without stop codons)
+nucleotides = ['a', 'c', 'g', 't']
+list_codons = [''.join(comb) for comb in itertools.product(nucleotides, repeat=3)]
+list_codons.remove("taa")
+list_codons.remove("tag")
+list_codons.remove("tga")
+# Store already computed species + row.names in output files
+index = []
+index_transi = []
+# Final dictionaries writed to csv files
+all_codons = {}
+all_aa = {}
+all_aatypes = {}
+all_various = {}
+all_codons_transitions = {} # Not used because too much : 61*61 columns
+all_aa_transitions = {}
+all_aatypes_transitions = {}
+# RUN
+print "\nProcessing : resampling ..."
+print "    Parameters : {niter} iterations, {lensample} codon per resampled sequence, species used : {species}\n".format(niter=args.iteration, lensample=args.sample_length, species=args.species_for_bootstrap)
+codons_boot, aa_boot, aatypes_boot, codons_transi_boot, aa_transi_boot, aatypes_transi_boot = sampling(sequences_for_resampling, args.iteration, args.sample_length, list_codons, dict_genetic_code, dict_aa_classif, reversecode, reverseclassif)
+print "    Done.\n"
+print "Processing : countings....\n"
+# Initialize empty dictionaries for countings and transitions
+init_dict_codons, init_dict_aa, init_dict_classif, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions = buildDicts(list_codons, 0, dict_genetic_code, dict_aa_classif)
+for pair in pairs_list:
+p1, p2 = pair[0], pair[1]
+if p1 not in index:
+print "Countings on {}".format(p1)
+p1_codons_counts, p1_codons_freqs, p1_aa_counts, p1_aa_freqs, p1_aatypes_counts, p1_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p1], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif)
+p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP = computeVarious(sequences_for_counts[p1], p1_aa_counts, p1_aatypes_freqs)
+p1_codons_pvalues = testPvalues(p1_codons_freqs, codons_boot, args.iteration, 'p_resampling')
+p1_aa_pvalues = testPvalues(p1_aa_freqs, aa_boot, args.iteration, 'p_resampling')
+p1_aatypes_pvalues = testPvalues(p1_aatypes_freqs, aatypes_boot, args.iteration, 'p_resampling')
+all_codons[p1+"_obs_counts"] = p1_codons_counts
+all_codons[p1+"_obs_freqs"] = p1_codons_freqs
+all_codons[p1+"_pvalues"] = p1_codons_pvalues
+all_aa[p1+"_obs_counts"] = p1_aa_counts
+all_aa[p1+"_obs_freqs"] = p1_aa_freqs
+all_aa[p1+"_pvalues"] = p1_aa_pvalues
+all_aatypes[p1+"_obs_counts"] = p1_aatypes_counts
+all_aatypes[p1+"_obs_freqs"] = p1_aatypes_freqs
+all_aatypes[p1+"_pvalues"] = p1_aatypes_pvalues
+all_various[p1] = [p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP]
+index.append(p1)
+if p2 not in index:
+print "Countings on {}".format(p2)
+p2_codons_counts, p2_codons_freqs, p2_aa_counts, p2_aa_freqs, p2_aatypes_counts, p2_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p2], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif)
+p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP = computeVarious(sequences_for_counts[p2], p2_aa_counts, p2_aatypes_freqs)
+p2_codons_pvalues = testPvalues(p2_codons_freqs, codons_boot, args.iteration, 'p_resampling')
+p2_aa_pvalues = testPvalues(p2_aa_freqs, aa_boot, args.iteration, 'p_resampling')
+p2_aatypes_pvalues = testPvalues(p2_aatypes_freqs, aatypes_boot, args.iteration, 'p_resampling')
+all_codons[p2+"_obs_counts"] = p2_codons_counts
+all_codons[p2+"_obs_freqs"] = p2_codons_freqs
+all_codons[p2+"_pvalues"] = p2_codons_pvalues
+all_aa[p2+"_obs_counts"] = p2_aa_counts
+all_aa[p2+"_obs_freqs"] = p2_aa_freqs
+all_aa[p2+"_pvalues"] = p2_aa_pvalues
+all_aatypes[p2+"_obs_counts"] = p2_aatypes_counts
+all_aatypes[p2+"_obs_freqs"] = p2_aatypes_freqs
+all_aatypes[p2+"_pvalues"] = p2_aatypes_pvalues
+all_various[p2] = p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP
+index.append(p2)
+if (p1, p2) not in index_transi and p1 in sequences_for_counts and p2 in sequences_for_counts:
+print "Countings transitions between {} and {}".format(p1, p2)
+codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs = computeAllBiases(sequences_for_counts[p1], sequences_for_counts[p2], dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions, reversecode, reverseclassif)
+# Ajout
+codons_sed, aa_sed, aatypes_sed = all_sed(p1_codons_counts, p1_aa_counts, p1_aatypes_counts, codons_transitions, aa_transitions, aatypes_transitions, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions)
+index_transi.append((p1,p2))
+p1p2_codons_pvalues = testPvalues(codons_transitions_freqs, codons_transi_boot, args.iteration, 'p_resampling')
+p1p2_aa_pvalues = testPvalues(aa_transitions_freqs, aa_transi_boot, args.iteration, 'p_resampling')
+p1p2_aatypes_pvalues = testPvalues(aatypes_transitions_freqs, aatypes_transi_boot, args.iteration, 'p_resampling')
+all_codons_transitions[p1+">"+p2+"_obs_counts"] = codons_transitions
+all_codons_transitions[p1+">"+p2+"_obs_freqs"] = codons_transitions_freqs
+all_codons_transitions[p1+">"+p2+"_pvalues"] = p1p2_codons_pvalues
+all_aa_transitions[p1+">"+p2+"_obs_counts"] = aa_transitions
+all_aa_transitions[p1+">"+p2+"_obs_freqs"] = aa_transitions_freqs
+all_aa_transitions[p1+">"+p2+"_pvalues"] = p1p2_aa_pvalues
+all_aatypes_transitions[p1+">"+p2+"_obs_counts"] = aatypes_transitions
+all_aatypes_transitions[p1+">"+p2+"_obs_freqs"] = aatypes_transitions_freqs
+all_aatypes_transitions[p1+">"+p2+"_pvalues"] = p1p2_aatypes_pvalues
+all_codons_transitions[p1+">"+p2+"_sed"] = codons_sed
+all_aa_transitions[p1+">"+p2+"_sed"] = aa_sed
+all_aatypes_transitions[p1+">"+p2+"_sed"] = aatypes_sed
+index_transi.append((p1, p2))
+print "\n    Done.\n"
+print "Processing : creating dataframes ..."
+frame_codons = pd.DataFrame(all_codons).T.astype('object')
+frame_aa = pd.DataFrame(all_aa).T.astype('object')
+frame_aatypes = pd.DataFrame(all_aatypes).T.astype('object')
+frame_codons_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_codons_transitions.items()}).unstack()
+frame_codons_transitions.columns = frame_codons_transitions.columns.map('>'.join)
+frame_aa_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_aa_transitions.items()}).unstack()
+frame_aa_transitions.columns = frame_aa_transitions.columns.map('>'.join)
+frame_aatypes_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_aatypes_transitions.items()}).unstack()
+frame_aatypes_transitions.columns = frame_aatypes_transitions.columns.map('>'.join)
+frame_various = pd.DataFrame(all_various).T
+frame_various.columns = ["GC3","GC12","IVYWREL","EKQH","PAYRESDGM","purineload", "CvP"]
+frame_codons.index.name, frame_aa.index.name, frame_aatypes.index.name = "Species", "Species","Species"
+frame_aa_transitions.index.name, frame_aatypes_transitions.index.name, frame_various.index.name = "Species","Species","Species"
+print "Writing dataframes to output files ...\n"
+frame_codons.round(8).to_csv("codons_freqs.csv", sep=",", encoding="utf-8")
+frame_aa.round(8).to_csv("aa_freqs.csv", sep=",", encoding="utf-8")
+frame_aatypes.astype('object').round(8).to_csv("aatypes_freqs.csv", sep=",", encoding="utf-8")
+frame_codons_transitions.round(8).to_csv("codons_transitions_freqs.csv", sep=",", encoding="utf-8")
+frame_aa_transitions.round(8).to_csv("aa_transitions_freqs.csv", sep=",", encoding="utf-8")
+frame_aatypes_transitions.round(8).to_csv("aatypes_transitions_freqs.csv", sep=",", encoding="utf-8")
+frame_various.round(8).to_csv("gc_and_others_freqs.csv", sep=",", encoding="utf-8")
+print "Done."
+if __name__ == "__main__":
+main()

Mercurial > repos > abims-sbr > mutcount

comparison scripts/S01a_codons_counting.py @ 0:acc3674e515b draft default tip