Mercurial > repos > abims-sbr > mutcount
diff scripts/S01b_extract_variable_prot.py @ 0:acc3674e515b draft default tip
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:28:50 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/S01b_extract_variable_prot.py Fri Feb 01 10:28:50 2019 -0500 @@ -0,0 +1,321 @@ +#!/usr/bin/env python +#coding: utf-8 +#Author : Eric Fontanillas (2010) - Victor Mataigne (2018) + +# TODO : + # - Deal with missing data : do not do the sign test if missing species in a group + # - Find a way to avoid the list_species argument + +import argparse, os +from functions import dico, write_output, fill_with_NaN + +def aa_properties(amino_acids_properties_file): + """ Read the file 'amino_acids_properties' and stores its content + + Args : + amino_acids_properties_file (String) : the file + + Return : + aa_properties (dict) : key/amino-acid - value/list of properties + """ + + dict_aa_properties={} + with open(amino_acids_properties_file, 'r') as f: + f.readline() #jump headers + for line in f.readlines(): + S1 = line.split(",") + aa_name = S1[1] + S2 = aa_name.split("/") + aa_code = S2[1][:-1] + + frequencies = S1[2][:-1] + residue_weight = S1[5] + residue_volume = S1[6] + partial_specific_volume = S1[7] + hydration = S1[8] + + dict_aa_properties[aa_code] = [frequencies, residue_weight, residue_volume, partial_specific_volume, hydration] + + return(dict_aa_properties) + +""" Functions for proteic format """ + +def all_aa_counts(seq): + """ Count the occurrences of all amino-acids in a sequence + + Args: + seq (String) : a proteic sequence + + Returns: a dictionary with amino-acids counts + """ + + aa_counts = {} + seqU = seq.upper() + LAA =['K','R','A','F','I','L','M','V','W','N','Q','S','T','H','Y','C','D','E','P','G'] + + for aa in LAA: + aa_counts[aa] = seqU.count(aa) + + return aa_counts + +def all_aa_props(seq_counts): + """ Converts a dictionnary of counts into a dictionnary of proportions + + Args: + seq_counts (dict) : dictionnary computed by the function all_aa_counts() + + Returns: a dictionary with counts replaced by proportions + """ + + aa_props = {} + for key in seq_counts.keys(): + aa_props[key] = float(seq_counts[key]) / sum(seq_counts.values()) + return aa_props + +def aa_variables_counts_and_props(aa_counts): + """ Computes several thermostability indices (summed occurrences of some AAs, and then various ratios) + + Args: + aa_counts (dict) : dictionnary computed by the function all_aa_counts() + + Returns: + aa_variables_counts : a dictionary with indices values + aa_variables_props : a dictionary with indices proportions (ratios excluded) + """ + + # Hyperthermophile Prokaryotes criterias + # IVYWREL : positivelly correlated with otpimal growth + # ERK : (i.e. ) => positivelly correlated with optimal growth temperature + # ERK/DNQTSHA (or DNQTSH ??) + # EK/QH + + # Mutationnal bias hypothesis => AT rich: favor FYMINK // GC rich: favor GARP + # The mutational bias model predict a linear relationship between GARP vs FYMINK + # ==> so if outliers to that, it means that the excess of GARP or FYMINK are not explained by the mutationnal bias model but by something else (selection ?) + + + # Hydophobicity hypothesis [should INCREASE with thermal adaptation] + # AL + # Only non-aromatic : AVLIM + # Only aromatic : FYW + + # Charged hypothesis => positivelly correlated with optimal growth temperature + # All charged : RHKDE + # Only positive : RHK + # Only negative : DE + + # Neutral polar hypothesis [should DECREASE with thermal adaptation] + # STNQ + + # Fontanillas' criteria + # PAYRE + # MVGDS + # PAYRE/MVGDS + + # Jollivet's criteria + # AC + # MVGDS + # AC/MVGDS + + aa_variables_counts = {} + aa_variables_props = {} + len_seq = sum(aa_counts.values()) # length of the sequence + + # counts of variables + aa_variables_counts['AC'] = aa_counts['A'] + aa_counts['C'] + aa_variables_counts['APGC'] = aa_counts['A'] + aa_counts['P'] + aa_counts['G'] + aa_counts['C'] + aa_variables_counts['AVLIM'] = aa_counts['A'] + aa_counts['V'] + aa_counts['L'] + aa_counts['I'] + aa_counts['M'] + aa_variables_counts['AVLIMFYW'] = aa_variables_counts['AVLIM'] + aa_counts['F'] + aa_counts['Y'] + aa_counts['W'] + aa_variables_counts['DE'] = aa_counts['D'] + aa_counts['E'] + aa_variables_counts['DNQTSHA'] = aa_counts['D'] + aa_counts['N'] + aa_counts['Q'] + aa_counts['T'] + aa_counts['S'] + aa_counts['H'] + aa_counts['A'] + aa_variables_counts['EK'] = aa_counts['E'] + aa_counts['K'] + aa_variables_counts['ERK'] = aa_counts['E'] + aa_counts['K'] + aa_counts['K'] + aa_variables_counts['FYMINK'] = aa_counts['F'] + aa_counts['Y'] + aa_counts['M'] + aa_counts['I'] + aa_counts['N'] + aa_counts['K'] + aa_variables_counts['FYW'] = aa_counts['F'] + aa_counts['Y'] + aa_counts['W'] + aa_variables_counts['GARP'] = aa_counts['G'] + aa_counts['A'] + aa_counts['R'] + aa_counts['P'] + aa_variables_counts['IVYWREL'] = aa_counts['I'] + aa_counts['V'] + aa_counts['Y'] + aa_counts['W'] + aa_counts['R'] + aa_counts['E'] + aa_counts['L'] + aa_variables_counts['QH'] = aa_counts['Q'] + aa_counts['H'] + aa_variables_counts['RHK'] = aa_counts['R'] + aa_counts['H'] + aa_counts['K'] + aa_variables_counts['RHKDE'] = aa_counts['R'] + aa_counts['H'] + aa_counts['K'] + aa_counts['D'] + aa_counts['E'] + aa_variables_counts['STNQ'] = aa_counts['S'] + aa_counts['T'] + aa_counts['N'] + aa_counts['Q'] + aa_variables_counts['VLIM'] = aa_counts['V'] + aa_counts['L'] + aa_counts['I'] + aa_counts['M'] + aa_variables_counts['PAYRE'] = aa_counts['P'] + aa_counts['A'] + aa_counts['Y'] + aa_counts['R'] + aa_counts['E'] + aa_variables_counts['MVGDS'] = aa_counts['M'] + aa_counts['V'] + aa_counts['G'] + aa_counts['D'] + aa_counts['S'] + + # compute proportions + for key in aa_variables_counts.keys(): + aa_variables_props[key] = float(aa_variables_counts[key]) / float(len_seq) + + if aa_variables_counts['DNQTSHA'] != 0: + ratio_ERK_DNQTSHA = float(aa_variables_counts['ERK'])/float(aa_variables_counts['DNQTSHA']) + else : + ratio_ERK_DNQTSHA = -1 + + if aa_variables_counts['QH'] != 0: + ratio_EK_QH = float(aa_variables_counts['EK'])/float(aa_variables_counts['QH']) + else : + ratio_EK_QH = -1 + + if aa_variables_counts['FYMINK'] != 0: + ratio_GARP_FYMINK = float(aa_variables_counts['EK'])/float(aa_variables_counts['FYMINK']) + else : + ratio_GARP_FYMINK = -1 + + if aa_variables_counts['VLIM'] != 0: + ratio_AC_VLIM = float(aa_variables_counts['AC'])/float(aa_variables_counts['VLIM']) + ratio_APGC_VLIM = float(aa_variables_counts['APGC'])/float(aa_variables_counts['VLIM']) + else : + ratio_AC_VLIM = -1 + ratio_APGC_VLIM = -1 + + if aa_variables_counts['MVGDS'] != 0: + ratio_PAYRE_MVGDS = float(aa_variables_counts['PAYRE'])/float(aa_variables_counts['MVGDS']) + else : + ratio_PAYRE_MVGDS = -1 + + if aa_variables_counts['MVGDS'] != 0: + ratio_AC_MVGDS = float(aa_variables_counts['AC'])/float(aa_variables_counts['MVGDS']) + else : + ratio_AC_MVGDS = -1 + + aa_variables_counts['ratio_ERK_DNQTSHA'] = ratio_ERK_DNQTSHA + aa_variables_counts['ratio_EK_QH'] = ratio_EK_QH + aa_variables_counts['ratio_GARP_FYMINK'] = ratio_GARP_FYMINK + aa_variables_counts['ratio_AC_VLIM'] = ratio_AC_VLIM + aa_variables_counts['ratio_APGC_VLIM'] = ratio_APGC_VLIM + aa_variables_counts['ratio_PAYRE_MVGDS'] = ratio_PAYRE_MVGDS + aa_variables_counts['ratio_AC_MVGDS'] = ratio_AC_MVGDS + + return aa_variables_counts, aa_variables_props + +def sequence_properties_from_aa_properties(aa_counts, aa_properties): + """ Computes a sequence properties (based on an external data file) + + Args: + - aa_counts (dict) : counts of amino-acids in the sequence + - aa_properties (dict) : key/amino-acid - value/list of properties extract from the external data file + + Returns: + - seq_props (dict) : values of the sequence properties + """ + + LS = ['total_residue_weight', 'total_residue_volume', 'total_partial_specific_volume', 'total_hydratation'] + seq_props = {} + + for i in range(1,5): + seq_props[LS[i-1]] = 0 + for key in aa_counts.keys(): + seq_props[LS[i-1]] += aa_counts[key] * float(aa_properties[key][i]) + + return seq_props + +""" Main """ + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("species_list", help="List of species separated by commas") + parser.add_argument("aa_properties", help="File with all amino-acids properties") + args = parser.parse_args() + + LAA = ['K','R','A','F','I','L','M','V','W','N','Q','S','T','H','Y','C','D','E','P','G'] + LV = ['IVYWREL','EK','ERK','DNQTSHA','QH','ratio_ERK_DNQTSHA','ratio_EK_QH','FYMINK','GARP', + 'ratio_GARP_FYMINK','AVLIM','FYW','AVLIMFYW','STNQ','RHK','DE','RHKDE','APGC','AC', + 'VLIM','ratio_AC_VLIM','ratio_APGC_VLIM'] + LS = ['total_residue_weight', 'total_residue_volume', 'total_partial_specific_volume', 'total_hydratation'] + + list_inputs = [] + + path_inputs = '01_input_files' + list_inputs = os.listdir(path_inputs) + + lsp = args.species_list.split(',') + lsp = sorted(lsp) + flsp = '' + for el in lsp: + flsp += el+',' + + path_outputs_1 = '02_tables_per_aa' + path_outputs_2 = '02_tables_per_aa_variable' + os.mkdir(path_outputs_1) + os.mkdir(path_outputs_2) + + # Init empty dicts for results + dict_for_files_aa_counts = {} # counts + dict_for_files_aa_props = {} # proportions + dict_for_files_variables_counts = {} + dict_for_files_variables_props = {} + dict_for_files_seq_properties = {} + + aa_properties_file = aa_properties(args.aa_properties) # read the aa_properties file + + # All counts and props + for file in list_inputs: + # iterate over input files + sequences = dico(file, path_inputs) + + # TEMPORARY CORRECTION FOR SEQUENCES CONTAINING ONLY INDELS + # It appears than CDS_Search can bug sometimes and return an alignement where a species' sequence is made of indels only + # This causes a crash here (in the ratios function). The correction skip the whole file. + # When CDS_Search is corrected, lines with 'skip' can be removed + skip = False + for key in sequences.keys(): + if all(x == '-' for x in sequences[key]): + skip = True + + if not skip: + + aa_counts_per_seq = {} + aa_props_per_seq = {} + aa_variables_counts_per_seq = {} + aa_variables_props_per_seq = {} + seq_properties = {} + + for key in sequences.keys(): + # iterate over sequences in the file + aa_counts_per_seq[key] = all_aa_counts(sequences[key]) + aa_props_per_seq[key] = all_aa_props(aa_counts_per_seq[key]) + aa_variables_counts_per_seq[key], aa_variables_props_per_seq[key] = aa_variables_counts_and_props(aa_counts_per_seq[key]) + seq_properties[key] = sequence_properties_from_aa_properties(aa_counts_per_seq[key], aa_properties_file) + + # Add NaN for missing species + for key in set(lsp).difference(set(sequences.keys())): + aa_counts_per_seq[key] = fill_with_NaN(LAA) + aa_props_per_seq[key] = fill_with_NaN(LAA) + aa_variables_counts_per_seq[key] = fill_with_NaN(LV) + seq_properties[key] = fill_with_NaN(LS) + + # Add computations to final dicts + dict_for_files_aa_counts[file] = aa_counts_per_seq + dict_for_files_aa_props[file] = aa_props_per_seq + dict_for_files_variables_counts[file] = aa_variables_counts_per_seq + dict_for_files_variables_props[file] = aa_variables_props_per_seq + dict_for_files_seq_properties[file] = seq_properties + + # Try with pandas ? + write_output(LAA, flsp, path_outputs_1, dict_for_files_aa_counts) # one file per AA + write_output(LV, flsp, path_outputs_2, dict_for_files_variables_counts) # one file per aa_variable + write_output(LS, flsp, path_outputs_2, dict_for_files_seq_properties) #one file per seq properties + + # { file_name1 : { seq1: {'A' : 0, 'C':0, 'E':0, ...}, + # seq2: {'A' : 0, 'C':0, 'E':0, ...}, ... + # }, + # { file_name2 : { seq1: {'A' : 0, 'C':0, 'E':0, ...}, + # seq2: {'A' : 0, 'C':0, 'E':0, ...}, + # }, + # ... } + + # { file_name1 : {seq1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, + # seq2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, ... + # }, + # file_name2 : {seq1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, + # seq2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, + # }, + # ... } + + # { file_name1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, + # file_name2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, + # ... + # } + +if __name__ == '__main__': + main()