Mercurial > repos > abims-sbr > mutcount
diff scripts/S01a_codons_counting.py @ 0:acc3674e515b draft default tip
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:28:50 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/S01a_codons_counting.py Fri Feb 01 10:28:50 2019 -0500 @@ -0,0 +1,937 @@ +#!/usr/bin/env python +# coding: utf-8 +# Author : Victor Mataigne + +import string, os, sys, re, random, itertools, argparse, copy, math +import pandas as pd +import numpy as np + +def buildDicts(list_codons, content, dict_genetic_code, dict_aa_classif): + """ Build dictionaries with values to 0. These dictionaries are used as starting point for each sequence counting + + Args : + list_codons (list of str) : all codons except codons-stop + content (int or list) : an integer (for coutings and transitions), or an empty list (for resampling) + dict_genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...} + dict_aa_classif (dict) : the types of the amino-acids ( {type: [aa1, aa2, ...], ...} + + Returns : + dico_codons, dico_aa, dico_aatypes (dicts) : keys : codons/amico-acids/amico-acids types, values : 0 or [] + dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions (dicts of dicts) : + actually, the three first dictionaries nested as values of keys codons/amico-acids/amico-acids + """ + + # I could have make sub-routines here. + # the copy commands are mandatory, otherwise all dictionaries will reference the same variable(s) + + dico_codons = {} + for codon in list_codons: + dico_codons[codon] = copy.deepcopy(content) + + dico_aa = {} + for aa in dict_genetic_code.keys(): + dico_aa[aa] = copy.deepcopy(content) + + dico_aatypes = {} + for aatype in dict_aa_classif.keys(): + dico_aatypes[aatype] = copy.deepcopy(content) + + dico_codons_transitions=copy.deepcopy(dico_codons) + for key in dico_codons_transitions.keys(): + dico_codons_transitions[key]=copy.deepcopy(dico_codons) + + dico_aa_transitions = copy.deepcopy(dico_aa) + for key in dico_aa_transitions.keys(): + dico_aa_transitions[key]=copy.deepcopy(dico_aa) + + dico_aatypes_transitions = copy.deepcopy(dico_aatypes) + for key in dico_aatypes_transitions.keys(): + dico_aatypes_transitions[key]=copy.deepcopy(dico_aatypes) + + return dico_codons, dico_aa, dico_aatypes, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions + +def viable(seqs, pos, method): + """ Compute if, among a set of sequences, either at least one of the codons at the specified position has not a "-", + or not any codon has a "-" + + Args : + seqs : a list (the sequences, which must all have the same size) + pos : an integer (the positions, <= len(seqs) -3) + + Returns: + bool + """ + codons = [seq[pos:pos+3] for seq in seqs] + if method == "all": + return not all("-" in codon for codon in codons) + elif method == "any": + return not any("-" in codon for codon in codons) + +# # # Function for codons, aa, aatypes Countings ------------------------------------------------------------------------------- + +def computeAllCountingsAndFreqs(seq, list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif): + """ Call all functions dedicated to the computation of occurences and frequencies of codons, amino-acids, amino-acids types + + Args : see sub-routines + + Returns : 6 dictionaries (occurences and frequencies for codons, amino-acids, amino-acids types). See sub-routines for details. + """ + + # ------ Sub-routines ------ # + + def codonsCountings(seq, list_codons, init_dict_codons): + """ Count occurences of each codon in a sequence + First reading frame only : input sequence is supposed to be an ORF + + Args : + seq (str) : the sequence + list_codons (list of str) : all codons except codons-stop + init_dict_codons (dict) : {codon1 : 0, codon2: 0, ...} + + Return : + codon (dict) : codons (keys) and their occurences (values) in the sequence + """ + + codons = copy.deepcopy(init_dict_codons) + + l = len(seq) + + if l%3 == 0: max_indice = l-3 + if l%3 == 1: max_indice = l-4 + if l%3 == 2: max_indice = l-5 + + for codon in range(0,max_indice+1,3): + if "-" not in seq[codon:codon+3] : + codons[seq[codon:codon+3]] += 1 + + return codons + + def codonsFreqs(dict_codons_counts): + """ Computes frequencies of each codon in a sequence + + Args : + dict_codons (dict) : the output of codonsCounting() + + Return : + codons_freqs (dict) : codons (keys) and their frequencies (values) + """ + + codons_freqs = {} + + for key in dict_codons_counts.keys(): + freq = float(dict_codons_counts[key])/sum(dict_codons_counts.values()) + codons_freqs[key] = freq + + return codons_freqs + + def aaCountings(dict_codons, dict_genetic_code, init_dict_aa): + """ Count occurences of each amino-acid in a sequence, based on the countings of codons (1st ORF) + + Args : + dict_codons (dict) : the output of codonsCounting() + dict_genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...} + init_dict_aa (dict) : {aa1 : 0, aa2: 0, ...} + + Return : + dict_aa (dict) : amino-acids (keys) and their occurences (values) + """ + dict_aa = copy.deepcopy(init_dict_aa) + + for key in dict_codons.keys(): + for value in dict_genetic_code.values(): + if key in value: + dict_aa[dict_genetic_code.keys()[dict_genetic_code.values().index(value)]] += dict_codons[key] + + return dict_aa + + def aaFreqs(dict_aa_counts): + """ Computes frequencies of each amino-acid in a sequence, based on the countings of codons (1st ORF) + + Args : + dict_aa_counts (dict) : the output of aaCountings() + + Return : + dict_aa_freqs (dict) : amino-acids (keys) and their frequencies (values) + """ + + dict_aa_freqs = {} + + for key in dict_aa_counts.keys(): + freq = float(dict_aa_counts[key])/sum(dict_aa_counts.values()) + dict_aa_freqs[key] = freq + + return dict_aa_freqs + + def aatypesCountings(dict_aa, dict_aa_classif, init_dict_classif): + """ Computes frequencies of each amino-acid type in a sequence, based on the countings of amino-acids (1st ORF) + + Args : + - dict_aa (dict) : the output of aaCountings() + - dict_aa_classif (dict) : the types of the amino-acids ( {type: [aa1, aa2, ...], ...} ) + - init_dict_classif (dict) : {'polar': 0, 'apolar': 0, ...} + + Return : + dict_aatypes (dict) : amino-acids types (keys) and their occurences (values) + """ + dict_aatypes = copy.deepcopy(init_dict_classif) + + for key_classif in dict_aa_classif.keys(): + for key_aa in dict_aa.keys(): + if key_aa in dict_aa_classif[key_classif]: + dict_aatypes[key_classif] += dict_aa[key_aa] + + return dict_aatypes + + def aatypesFreqs(dict_aatypes): + """ Computes frequencies of each amino-acid type in a sequence, based on the countings of amino-acids (1st ORF) + + Args : + dict_aatypes (dict) : the output of aatypesCountings() + + Return : + dict_aatypes_freqs : amino-acids types (keys) and their frequencies (values) + """ + dict_aatypes_freqs = {} + + for key in dict_aatypes.keys(): + freq = float(dict_aatypes[key])/sum(dict_aatypes.values()) + dict_aatypes_freqs[key] = freq + + return dict_aatypes_freqs + + # ------ The function ------ # + + codons_c = codonsCountings(seq, list_codons, init_dict_codons) + codons_f = codonsFreqs(codons_c) + aa_c = aaCountings(codons_c, dict_genetic_code, init_dict_aa) + aa_f = aaFreqs(aa_c) + aat_c = aatypesCountings(aa_c, dict_aa_classif, init_dict_classif) + aat_f = aatypesFreqs(aat_c) + + return codons_c, codons_f, aa_c, aa_f, aat_c, aat_f + +# # # Functions for various measures (ivywrel, ekqh...) ------------------------------------------------------------------------- + +def computeVarious(seq, dict_aa_counts, dict_aa_types): + """ Call al the functions for nucleic and amino-acids sequences description + + Args : See sub-routines for details. + + Returns : 6 integer or floats. See sub-routines for details. + """ + + # ------ Sub-routines ------ # + + def nbCodons(seq): + """ Compute the number of full codons in a sequence + Arg : seq (str): the sequence + Return: nb_codons (int) + """ + l = len(seq) + if l%3 == 0: nb_codons = l/3 + if l%3 == 1: nb_codons = (l-1)/3 + if l%3 == 2: nb_codons = (l-2)/3 + return nb_codons + + def maxIndice(seq): + """ Compute the highest indice for parsing a sequence + Arg : seq (str): the sequence + Return : max_indice (int) + """ + l = len(seq) + if l%3 == 0: max_indice = l-3 + if l%3 == 1: max_indice = l-4 + if l%3 == 2: max_indice = l-5 + return max_indice + + def gc12Andgc3Count(seq, nb_codons, max_indice): + """ Compute the frequency of gc12 in a sequence + Arg : seq (str) : the sequence + Return : (float) + """ + + # TO IMPROVE ? : make this computation in the codonCountigns() function to avoid parsing twice the sequence ? + # But : involves a less readable code + + gc12 = 0 + gc3 = 0 + + for i in range(0, max_indice+1,3): + if seq[i] in ["c","g"]: gc12 += 1 + if seq[i+1] in ["c","g"]: gc12 += 1 + if seq[i+2] in ["c","g"] or seq[i+2] in ["c","g"]: gc3 += 1 + + return float(gc3)/nb_codons, float(gc12)/(2*nb_codons) + + def ivywrelCount(nb_codons, dict_aa_counts): + """ Compute the sum of occurences of amino-acids IVYWREL divided by the number of codons + + Args : + nb_codons (int) : the number of codons in the sequence + dict_aa_counts (dict) : the output of aaCountings() + + return : (float) + """ + + IVYWREL = 0 + + for aa in ["I","V","Y","W","R","E","L"]: # Impossible to make a simple sum, in case one the aa is not in the dict keys + if aa in dict_aa_counts.keys(): + IVYWREL += dict_aa_counts[aa] + + return float(IVYWREL)/nb_codons + + def ekqhCount(dict_aa_counts): + """ Compute the ratio of amino-acids EK/QH + Arg : dict_aa_counts (dict) : the output of aaCountings() + Return : (float) + """ + ek = 0 + qh = 0 + + ek = dict_aa_counts["E"] + dict_aa_counts["K"] + qh = dict_aa_counts["Q"] + dict_aa_counts["H"] + + if qh != 0: + return float(ek)/qh + else : return ek + + def payresdgmCount(dict_aa_counts): + """ Compute the ratio of amino-acids PASRE/SDGM + Arg : dict_aa_counts (dict) : the output of aaCountings() + Return : (float) + """ + payre = 0 + sdgm = 0 + + for aa in ["P","A","Y","R","E"]: + payre += dict_aa_counts[aa] + for aa in ["S","D","G","M"]: + sdgm += dict_aa_counts[aa] + + if sdgm != 0: + return float(payre)/sdgm + else : return payre + + def purineLoad(seq, nb_codons): + """ Compute the purine load indice of a sequence + Args : + seq (str) : the sequence + nb_codons (int) : the number of codons in the sequence + + Return (float) + """ + + # TO IMPROVE ? : make this computation in the codonCountigns() function to avoid parsing twice the sequence ? + # But : involves a less readable code + + g12, g3, A, c12, c3, T = 0.0,0.0,seq.count("a"),0.0,0.0,seq.count("t") + + # g3 and c3 : g and c in 3rd position of a codon + s = "" + for i in range(2, len(seq), 3): + s += seq[i] + g3 = s.count("g") + c3 = s.count("c") + + # g12 and c12 : g and c in 1st and 2d positions of a codon + s = "" + for i in range(0, len(seq), 3): + s += seq[i] + g12 = s.count("g") + c12 = s.count("c") + s = "" + for i in range(1, len(seq), 3): + s += seq[i] + g12 += s.count("g") + c12 += s.count("c") + + return float(1000*(g12+g3+A-c12-c3-T))/(3*nb_codons) + + def cvp(dict_aatypes): + """ Compute the difference nb_charged_aamino_acids - nb_polar_amino_acids + Return: (int) + """ + return dict_aatypes["charged"] - dict_aatypes["polar"] + + # ------ The function ------ # + + nb_codons = nbCodons(seq) + max_indice = maxIndice(seq) + GC3, GC12 = gc12Andgc3Count(seq, nb_codons, max_indice) + IVYWREL, EKQH, PAYRESDGM = ivywrelCount(nb_codons, dict_aa_counts), ekqhCount(dict_aa_counts), payresdgmCount(dict_aa_counts) + purineload, CvP = purineLoad(seq, nb_codons), cvp(dict_aa_types) + return GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP + +# # # Function for codons, aa, aatypes Transitions ----------------------------------------------------------------------------- + +def computeAllBiases(seq1, seq2, dico_codons_transi, dico_aa_transi, dico_aatypes_transi, reversecode, reverseclassif) : + """ Compute all biases (transisitions codon->codon, aa->-aa, aa_type->aa_type) between two sequences + + Args : See sub-routines for details + + Returns 3 dictionaries of dictionaries. See sub-routines for details + """ + + # ------ Sub-routines ------ # + + def codons_transitions(seq1, seq2, dico_codons_transi): + """ Compute the number of transitions from a codon of a sequence to the codon of a second sequence + + Args : + seq1 (str) : the first sequence + seq2 (str) : the second sequence + dico_codons_transi (dict of dicts) : { codon1 : {codon1: 0, codon2 : 0, ...}, ..} + + Return : + codons_transi (dict of dicts) : the occurences of each codon to codon transition + """ + + codons_transi = copy.deepcopy(dico_codons_transi) + + for i in range(0, len(seq1), 3): + # check if no indel and if len seq[i:i+3] is really 3 (check for the last codon) + if viable([seq1, seq2], i, "any") and len(seq1[i:i+3]) == 3 and len(seq2[i:i+3]) == 3 : + codons_transi[seq1[i:i+3]][seq2[i:i+3]] += 1 + + return codons_transi + + def codons_transitions_freqs(codons_transitions_counts): + """ Computes frequencies of codon transitions between two sequences + + Arg : + codons_transitions_counts (dict) : the output of codons_transitions() + + Return : + codons_transi_freqs (dict of dicts) : the frequencies of each codon to codon transition + """ + + codons_transi_freqs = {} + + for key in codons_transitions_counts.keys(): + codons_transi_freqs[key] = {} + for key2 in codons_transitions_counts[key].keys(): + if sum(codons_transitions_counts[key].values()) != 0: + freq = float(codons_transitions_counts[key][key2])/sum(codons_transitions_counts[key].values()) + codons_transi_freqs[key][key2] = freq + else : + codons_transi_freqs[key][key2] = 0 + return codons_transi_freqs + + def aa_transitions(dico_codons_transi, dico_aa_transi, reversecode): + """ Compute the number of transitions from an amino-acid of a sequence to the amino-acid of a second sequence + + Args : + dico_codons_transi (dict of dicts) : the codons transitions computed by codons_transitions() + dico_aa_transi (dict of dicts) : { aa1 : {aa1: 0, aa2 : 0, ...}, ..} + reversecode (dict) : the reversed genetic code {aa1 : [codons],...} -> {codon1: aa1, codon2: aa2, ...} + + Return : + aa_transi (dict of dicts) : the occurences of each aa to aa transition + """ + + aa_transi = copy.deepcopy(dico_aa_transi) + + for k in dico_codons_transi.keys(): + newk = reversecode[k] + for k2 in dico_codons_transi[k].keys(): + newk2 = reversecode[k2] + aa_transi[newk][newk2] += dico_codons_transi[k][k2] + + return aa_transi + + def aa_transitions_freqs(aa_transitions_counts): + """ Computes frequencies of amico-acids transitions between two sequences + Arg : aa_transitions_counts (dict of dicts): the output of aa_transitions() + Return : aa_transi_freqs (dict of dicts) : the frequencies of each aa to aa transition + """ + + aa_transi_freqs = {} + + for key in aa_transitions_counts.keys(): + aa_transi_freqs[key] = {} + for key2 in aa_transitions_counts[key].keys(): + if sum(aa_transitions_counts[key].values()) != 0: + freq = float(aa_transitions_counts[key][key2])/sum(aa_transitions_counts[key].values()) + aa_transi_freqs[key][key2] = freq + else : + aa_transi_freqs[key][key2] = 0 + return aa_transi_freqs + + def aatypes_transitions(dico_aa_transi, dico_aatypes_transi, reverseclassif): + """ Compute the number of transitions from an amino-acid type of a sequence to the amino-acid type of a second sequence + + Args : + dico_aa_transi (dict of dicts) : the output of aa_transitions() + dico_aatypes_transi (dict of dicts) : { type1 : {type1: 0, type2 : 0, ...}, ..} + reverseclassif (dict) : the reversed amino-acid clasification {aa1: type, aa2: type, ...} + + Return : + aatypes_transi (dict of dicts) : the occurences of each aatype to aatype transition + """ + + aatypes_transi = copy.deepcopy(dico_aatypes_transi) + for k in dico_aa_transi.keys(): + newk = reverseclassif[k] + for k2 in dico_aa_transi[k].keys(): + newk2 = reverseclassif[k2] + aatypes_transi[newk][newk2] += dico_aa_transi[k][k2] + + return aatypes_transi + + def aatypes_transitions_freqs(aatypes_transitions_counts): + """ Computes frequencies of amico-acids types transitions between two sequences + Args : aatypes_transitions_counts (dict of dicts) : the output of aatypes_transitions() + Return : aatypes_transi_freqs (dict of dicts) : the frequencies of each aatype to aatype transition + """ + + aatypes_transi_freqs = {} + + for key in aatypes_transitions_counts.keys(): + aatypes_transi_freqs[key] = {} + for key2 in aatypes_transitions_counts[key].keys(): + if sum(aatypes_transitions_counts[key].values()) != 0: + freq = float(aatypes_transitions_counts[key][key2])/sum(aatypes_transitions_counts[key].values()) + aatypes_transi_freqs[key][key2] = freq + else : + aatypes_transi_freqs[key][key2] = 0 + return aatypes_transi_freqs + + + + + # ------ The function ------ # + + codons_transitions = codons_transitions(seq1, seq2, dico_codons_transi) + codons_transitions_freqs = codons_transitions_freqs(codons_transitions) + aa_transitions = aa_transitions(codons_transitions, dico_aa_transi, reversecode) + aa_transitions_freqs = aa_transitions_freqs(aa_transitions) + aatypes_transitions = aatypes_transitions(aa_transitions, dico_aatypes_transi, reverseclassif) + aatypes_transitions_freqs = aatypes_transitions_freqs(aatypes_transitions) + + return codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs + +def all_sed(codons_c, aa_c, aat_c, codons_transitions, aa_transitions, aatypes_transitions, dico_codons_transi, dico_aa_transi, dico_aatypes_transi): + + def compute_sed(transi, counts, dico): + """ Compute the substitution exchangeability disequilibrium (SED) from one species A to another B between codons/aa//aatypes couples + + Args: + transi ; dict - dictionaries of all counts of transition from codon/aa/aatype X to Y from sp A to sp B + counts : dict - dictionaries of codons/aa/aatypes counts in species A + dico : dict - a dictionary (nested) with all values to 0 + + """ + dict_sed = copy.deepcopy(dico) + + for key in transi.keys(): + for key2 in transi.keys(): + if counts[key] != 0 and float(transi[key2][key])/counts[key] != 0.0: + x = (float(transi[key][key2])/counts[key]) / (float(transi[key2][key])/counts[key]) + dict_sed[key][key2] = - pow(2,1-x)+1 + else : + dict_sed[key][key2] = 'NA' + + return dict_sed + + codons_sed = compute_sed(codons_transitions, codons_c, dico_codons_transi) + aa_sed = compute_sed(aa_transitions, aa_c, dico_aa_transi) + aatypes_sed = compute_sed(aatypes_transitions, aat_c, dico_aatypes_transi) + + return codons_sed, aa_sed, aatypes_sed + +# # # Function for random resampling -------------------------------------------------------------------------------------------- + +def sampling (dict_seq, nb_iter, len_sample, list_codons, genetic_code, aa_classif, reversecode, reverseclassif): + """ Resample randomly codons from sequences (sort of bootsrap) + + Args : + dict_seq (dict) : contains the species name and sequences used ( { 'sp1': seq1, 'sp2': seq2, ...}) without '-' removal + nb_iter (int) : number of resampling iterations (better >= 1000) + len_sample (int) : length (in codons) of a resampled sequence (better >= 1000) + list_codons (list of str): all codons except codons-stop + genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...} + aa_classif (dict) : the types of the amino-acids : {type: [aa1, aa2, ...], ...} + reversecode (dict) : the reversed genetic code : {codon1: aa1, codon2: aa2, ...} + reverseclassif (dict) : the reversed amino-acid : clasification {aa1: type, aa2: type, ...} + + Returns : + codons_lst, aa_lst, classif_lst (dicts) : keys : codons/aa/aatypes, values : [] + codons_transitions_lst, aa_transitions_lst, classif_transitions_lst (dict of dicts) : keys : codons/aa/aatypes, values : the 3 previous dicts + """ + + # Initialize empty dictionaries for countings and transitions. It's also possible to isntanciate these ones in the main() but it would make a function with ~14 parameters + codons_0, aa_0, classif_0, codons_transitions_0, aa_transitions_0, classif_transitions_0 = buildDicts(list_codons, 0, genetic_code, aa_classif) + codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst = buildDicts(list_codons, [], genetic_code, aa_classif) + + # determine the max position where sampling is possible + l = len(dict_seq.values()[1]) + if l%3 == 0: max_indice = l-3 + if l%3 == 1: max_indice = l-4 + if l%3 == 2: max_indice = l-5 + + # List of positions to resample + viable_positions = [pos for pos in range(0,max_indice,3) if viable(dict_seq.values(), pos, "all")] + sample_positions = np.random.choice(viable_positions, len_sample) + + # nb_iter resampled sequences + for i in range(nb_iter): + if (i+1)%(nb_iter/10) == 0: + print " "+str( (i+1)*100/nb_iter)+"%" + + seqa, seqb = "", "" + for pos in sample_positions: + codona, codonb = "---", "---" + # The sequence to be resampled in this position is randomly chosen ; no "-" resampled + while "-" in codona : + codona = dict_seq.values()[random.randrange(0, len(dict_seq.keys())-1)][pos:pos+3] + while "-" in codonb : + codonb = dict_seq.values()[random.randrange(0, len(dict_seq.keys())-1)][pos:pos+3] + seqa += codona + seqb += codonb + + # dictionaries : frequences of codons, aa, aatypes (seq1) + codons_occ_tmp, codons_freq_tmp, aa_occ_tmp, aa_freq_tmp, aatypes_occ_tmp, aatypes_freq_tmp = computeAllCountingsAndFreqs(seqa, list_codons, codons_0, aa_0, classif_0, genetic_code, aa_classif) + # dictionaries frequences of transitions (seqa->seqb) + codons_transitions_tmp, codons_transitions_freq_tmp, aa_transition_tmp, aa_transitions_freq_tmp, aatypes_transitions_tmp, aatypes_transitions_freq_tmp = computeAllBiases(seqa, seqb, codons_transitions_0, aa_transitions_0, classif_transitions_0, reversecode, reverseclassif) + + # Adding occurrences in final dicts + for key in codons_freq_tmp.keys(): + codons_lst[key].append(codons_freq_tmp[key]) + for key in aa_freq_tmp.keys(): + aa_lst[key].append(aa_freq_tmp[key]) + for key in aatypes_freq_tmp.keys(): + classif_lst[key].append(aatypes_freq_tmp[key]) + + # Adding occurrences in final dicts (transitions) + for key in codons_transitions_freq_tmp.keys(): + for key2 in codons_transitions_freq_tmp[key].keys(): + codons_transitions_lst[key][key2].append(codons_transitions_freq_tmp[key][key2]) + for key in aa_transitions_freq_tmp.keys(): + for key2 in aa_transitions_freq_tmp[key].keys(): + aa_transitions_lst[key][key2].append(aa_transitions_freq_tmp[key][key2]) + for key in aatypes_transitions_freq_tmp.keys(): + for key2 in aatypes_transitions_freq_tmp[key].keys(): + classif_transitions_lst[key][key2].append(aatypes_transitions_freq_tmp[key][key2]) + + return codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst + +def testPvalues(dict_counts, dict_resampling, nb_iter, method): + """ Computes where the observed value is located in the expected counting distribution + + Args : + dict_counts (dict) : observed frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() + dict_resampling (dict) : expected frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() within the sampling() function + + Return : + pvalue (dict, dict of dicts) : the pvalues of all observed countings (dict) and transitions (dict of dicts) + + + pnorm computes the pvalue to have a value inferior to the observed value under a normal distribution + One sided to left tail : + p < 0.05 indicates significantly lower counts + p > 0.95 indicates significantly higher counts + """ + + + def p_resampling(obs, values, nb_iter): + """ The pvalue is the proportion of bootsrapped values smaller than the observed value + If p = 0.025 : 2.5% of the bootstrapped values are smaller than the observed value + p < 0.025 : the obs value is most likely significantly lower. + If p = 0.975 : 97.5% of the bootstrapped values are smaller than the observed value + p > 0.975 the obs value is most likely significantly higher. + + Args : + obs : int or float - the observed value + values : list - values of resampling (int or floats) + nb_iter : int - the number of resampled values (=len(values)) + + Return : + pvalue (float) + """ + + num = len([x for x in values if x < obs]) + return float(num + 1) / (nb_iter+1) + + def testPvalue(obs, exp, nb_iter): + """ Compute a pvalue + + Args : + exp (list of floatsà : a list of length nb_iter, containing expected frequencies of a codon/aa/aatype at each iteration + obs (float) : the observed value + nb_iter (int) : the number of iterations for resampling + + Returns : + pvalue (float) + """ + + max_val = nb_iter-1 + min_val = 0 + test_val = (max_val+min_val)/2 + + while max_val-min_val > 1: + if obs > exp[test_val]: + min_val = test_val + test_val = (max_val+min_val)/2 + elif obs < exp[test_val]: + max_val = test_val + test_val = (max_val+min_val)/2 + else: + break + + pvalue = float(test_val+1)/(nb_iter+1) + + return pvalue + + # ------ The function ------ # + + pvalues = {} + + for key in dict_resampling.keys(): + if type(dict_resampling.values()[1]) is not dict : + if method == 'origin': + pvalues[key] = testPvalue(dict_counts[key], dict_resampling[key], nb_iter) + elif method == 'pnorm': + pvalues[key] = scipy.stats.norm.cdf(dict_counts[key], np.mean(dict_resampling[key]), np.std(dict_resampling[key])) + elif method == 'p_resampling': + pvalues[key] = p_resampling(dict_counts[key], dict_resampling[key], nb_iter) + else : + pvalues[key] = {} + for key2 in dict_resampling[key].keys(): + if method == 'origin': + pvalues[key][key2] = testPvalue(dict_counts[key][key2], dict_resampling[key][key2], nb_iter) + elif method == 'pnorm': + pvalues[key][key2] = scipy.stats.norm.cdf(dict_counts[key][key2], np.mean(dict_resampling[key][key2]), np.std(dict_resampling[key][key2])) + elif method == 'p_resampling': + pvalues[key][key2] = p_resampling(dict_counts[key][key2], dict_resampling[key][key2], nb_iter) + + return pvalues + +def main(): + + # arguments + parser = argparse.ArgumentParser() + parser.add_argument("sequences_file", help="File containing sequences (the output of the tool 'ConcatPhyl'") + parser.add_argument("considered_species", help="The species name, separated by commas (must be the same than in the sequences_file). It is possible to consider only a subset of species.") + parser.add_argument("species_for_bootstrap", help="The species which will be used for bootstrapping, separated by commas. It is possible to consider only a subset of species.") + parser.add_argument("iteration", help="The number of iterations for bootstrapping (better if => 1000)", type=int) + parser.add_argument("sample_length", help="The lenght of a bootstrapped sequences (better if >= 1000", type=int) + args = parser.parse_args() + + print "\n ------ Occurences and frequencies of codons, amino-acids, amino-acids types -------\n" + + print "The script counts the number of codons, amino acids, and types of amino acids in sequences," + print "as well as the mutation bias from one item to another between 2 sequences.\n" + + print "Counting are then compared to empirical p-values, obtained from bootstrapped sequences obtained from a subset of sequences." + print "In the output files, the pvalues indicate the position of the observed data in a distribution of empirical countings obtained from" + print "a resample of the data. Values above 0.95 indicate a significantly higher counting, values under 0.05 a significantly lower counting." + + print " Sequences file : {}".format(args.sequences_file) + print " Species retained for countings : {}\n".format(args.considered_species) + + print "Processing : reading input file, opening output files, building dictionaries." + + # make pairs + list_species = str.split(args.considered_species, ",") + list_species_boot = str.split(args.species_for_bootstrap, ",") + pairs_list=list(itertools.combinations(list_species,2)) + + # read sequences + sequences_for_counts = {} + sequences_for_resampling = {} + with open(args.sequences_file, "r") as file: + for line1,line2 in itertools.izip_longest(*[file]*2): + species = line1.strip(">\r\n") + sequence = line2.strip("\r\n") + if species in list_species: + sequences_for_counts[species] = sequence + if species in list_species_boot: + sequences_for_resampling[species] = sequence + + print " Warning : countings might be biased and show high differences between species because of high variations of the indels proportions among sequences." + print " Frequences are more representative." + + print "\n Indels percent :" + + for k,v in sequences_for_counts.items(): + print " {} : {} %".format(k, float(v.count("-"))/len(v)*100) + + # useful dictionaries + dict_genetic_code={"F":["ttt","ttc"], + "L":["tta","ttg","ctt","ctc","cta","ctg"], + "I":["att","atc","ata"], + "M":["atg"], + "V":["gtt","gtc","gta","gtg"], + "S":["tct","tcc","tca","tcg","agt","agc"], + "P":["cct","cca","ccg","ccc"], + "T":["act","acc","aca","acg"], + "A":["gct","gcc","gca","gcg"], + "Y":["tat","tac"], + "H":["cat","cac"], + "Q":["caa","cag"], + "N":["aat","aac"], + "K":["aaa","aag"], + "D":["gat","gac"], + "E":["gaa","gag"], + "C":["tgt","tgc"], + "W":["tgg"], + "R":["cgt","cgc","cga","cgg","aga","agg"], + "G":["ggt","ggc","gga","ggg"]} + + dict_aa_classif={"unpolar":["G","A","V","L","M","I"], + "polar":["S","T","C","P","N","Q"], + "charged":["K","R","H","D","E"], + "aromatics":["F","Y","W"]} + + reversecode={v:k for k in dict_genetic_code for v in dict_genetic_code[k]} + reverseclassif={v:k for k in dict_aa_classif for v in dict_aa_classif[k]} + + # codons list (without stop codons) + nucleotides = ['a', 'c', 'g', 't'] + list_codons = [''.join(comb) for comb in itertools.product(nucleotides, repeat=3)] + list_codons.remove("taa") + list_codons.remove("tag") + list_codons.remove("tga") + + # Store already computed species + row.names in output files + index = [] + index_transi = [] + + # Final dictionaries writed to csv files + all_codons = {} + all_aa = {} + all_aatypes = {} + all_various = {} + all_codons_transitions = {} # Not used because too much : 61*61 columns + all_aa_transitions = {} + all_aatypes_transitions = {} + + # RUN + + print "\nProcessing : resampling ..." + print " Parameters : {niter} iterations, {lensample} codon per resampled sequence, species used : {species}\n".format(niter=args.iteration, lensample=args.sample_length, species=args.species_for_bootstrap) + + codons_boot, aa_boot, aatypes_boot, codons_transi_boot, aa_transi_boot, aatypes_transi_boot = sampling(sequences_for_resampling, args.iteration, args.sample_length, list_codons, dict_genetic_code, dict_aa_classif, reversecode, reverseclassif) + print " Done.\n" + + print "Processing : countings....\n" + + # Initialize empty dictionaries for countings and transitions + init_dict_codons, init_dict_aa, init_dict_classif, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions = buildDicts(list_codons, 0, dict_genetic_code, dict_aa_classif) + + for pair in pairs_list: + p1, p2 = pair[0], pair[1] + if p1 not in index: + print "Countings on {}".format(p1) + + p1_codons_counts, p1_codons_freqs, p1_aa_counts, p1_aa_freqs, p1_aatypes_counts, p1_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p1], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif) + p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP = computeVarious(sequences_for_counts[p1], p1_aa_counts, p1_aatypes_freqs) + + + p1_codons_pvalues = testPvalues(p1_codons_freqs, codons_boot, args.iteration, 'p_resampling') + p1_aa_pvalues = testPvalues(p1_aa_freqs, aa_boot, args.iteration, 'p_resampling') + p1_aatypes_pvalues = testPvalues(p1_aatypes_freqs, aatypes_boot, args.iteration, 'p_resampling') + + all_codons[p1+"_obs_counts"] = p1_codons_counts + all_codons[p1+"_obs_freqs"] = p1_codons_freqs + all_codons[p1+"_pvalues"] = p1_codons_pvalues + all_aa[p1+"_obs_counts"] = p1_aa_counts + all_aa[p1+"_obs_freqs"] = p1_aa_freqs + all_aa[p1+"_pvalues"] = p1_aa_pvalues + all_aatypes[p1+"_obs_counts"] = p1_aatypes_counts + all_aatypes[p1+"_obs_freqs"] = p1_aatypes_freqs + all_aatypes[p1+"_pvalues"] = p1_aatypes_pvalues + all_various[p1] = [p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP] + + index.append(p1) + + if p2 not in index: + print "Countings on {}".format(p2) + + p2_codons_counts, p2_codons_freqs, p2_aa_counts, p2_aa_freqs, p2_aatypes_counts, p2_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p2], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif) + p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP = computeVarious(sequences_for_counts[p2], p2_aa_counts, p2_aatypes_freqs) + + p2_codons_pvalues = testPvalues(p2_codons_freqs, codons_boot, args.iteration, 'p_resampling') + p2_aa_pvalues = testPvalues(p2_aa_freqs, aa_boot, args.iteration, 'p_resampling') + p2_aatypes_pvalues = testPvalues(p2_aatypes_freqs, aatypes_boot, args.iteration, 'p_resampling') + + all_codons[p2+"_obs_counts"] = p2_codons_counts + all_codons[p2+"_obs_freqs"] = p2_codons_freqs + all_codons[p2+"_pvalues"] = p2_codons_pvalues + all_aa[p2+"_obs_counts"] = p2_aa_counts + all_aa[p2+"_obs_freqs"] = p2_aa_freqs + all_aa[p2+"_pvalues"] = p2_aa_pvalues + all_aatypes[p2+"_obs_counts"] = p2_aatypes_counts + all_aatypes[p2+"_obs_freqs"] = p2_aatypes_freqs + all_aatypes[p2+"_pvalues"] = p2_aatypes_pvalues + all_various[p2] = p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP + + index.append(p2) + + if (p1, p2) not in index_transi and p1 in sequences_for_counts and p2 in sequences_for_counts: + print "Countings transitions between {} and {}".format(p1, p2) + codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs = computeAllBiases(sequences_for_counts[p1], sequences_for_counts[p2], dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions, reversecode, reverseclassif) + + # Ajout + codons_sed, aa_sed, aatypes_sed = all_sed(p1_codons_counts, p1_aa_counts, p1_aatypes_counts, codons_transitions, aa_transitions, aatypes_transitions, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions) + + index_transi.append((p1,p2)) + + p1p2_codons_pvalues = testPvalues(codons_transitions_freqs, codons_transi_boot, args.iteration, 'p_resampling') + p1p2_aa_pvalues = testPvalues(aa_transitions_freqs, aa_transi_boot, args.iteration, 'p_resampling') + p1p2_aatypes_pvalues = testPvalues(aatypes_transitions_freqs, aatypes_transi_boot, args.iteration, 'p_resampling') + + all_codons_transitions[p1+">"+p2+"_obs_counts"] = codons_transitions + all_codons_transitions[p1+">"+p2+"_obs_freqs"] = codons_transitions_freqs + all_codons_transitions[p1+">"+p2+"_pvalues"] = p1p2_codons_pvalues + all_aa_transitions[p1+">"+p2+"_obs_counts"] = aa_transitions + all_aa_transitions[p1+">"+p2+"_obs_freqs"] = aa_transitions_freqs + all_aa_transitions[p1+">"+p2+"_pvalues"] = p1p2_aa_pvalues + all_aatypes_transitions[p1+">"+p2+"_obs_counts"] = aatypes_transitions + all_aatypes_transitions[p1+">"+p2+"_obs_freqs"] = aatypes_transitions_freqs + all_aatypes_transitions[p1+">"+p2+"_pvalues"] = p1p2_aatypes_pvalues + + all_codons_transitions[p1+">"+p2+"_sed"] = codons_sed + all_aa_transitions[p1+">"+p2+"_sed"] = aa_sed + all_aatypes_transitions[p1+">"+p2+"_sed"] = aatypes_sed + + index_transi.append((p1, p2)) + + print "\n Done.\n" + + print "Processing : creating dataframes ..." + + frame_codons = pd.DataFrame(all_codons).T.astype('object') + frame_aa = pd.DataFrame(all_aa).T.astype('object') + frame_aatypes = pd.DataFrame(all_aatypes).T.astype('object') + + frame_codons_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_codons_transitions.items()}).unstack() + frame_codons_transitions.columns = frame_codons_transitions.columns.map('>'.join) + + frame_aa_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_aa_transitions.items()}).unstack() + frame_aa_transitions.columns = frame_aa_transitions.columns.map('>'.join) + + frame_aatypes_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_aatypes_transitions.items()}).unstack() + frame_aatypes_transitions.columns = frame_aatypes_transitions.columns.map('>'.join) + + frame_various = pd.DataFrame(all_various).T + frame_various.columns = ["GC3","GC12","IVYWREL","EKQH","PAYRESDGM","purineload", "CvP"] + + frame_codons.index.name, frame_aa.index.name, frame_aatypes.index.name = "Species", "Species","Species" + frame_aa_transitions.index.name, frame_aatypes_transitions.index.name, frame_various.index.name = "Species","Species","Species" + + print "Writing dataframes to output files ...\n" + + frame_codons.round(8).to_csv("codons_freqs.csv", sep=",", encoding="utf-8") + frame_aa.round(8).to_csv("aa_freqs.csv", sep=",", encoding="utf-8") + frame_aatypes.astype('object').round(8).to_csv("aatypes_freqs.csv", sep=",", encoding="utf-8") + frame_codons_transitions.round(8).to_csv("codons_transitions_freqs.csv", sep=",", encoding="utf-8") + frame_aa_transitions.round(8).to_csv("aa_transitions_freqs.csv", sep=",", encoding="utf-8") + frame_aatypes_transitions.round(8).to_csv("aatypes_transitions_freqs.csv", sep=",", encoding="utf-8") + frame_various.round(8).to_csv("gc_and_others_freqs.csv", sep=",", encoding="utf-8") + + print "Done." + +if __name__ == "__main__": + main() \ No newline at end of file