Mercurial > repos > abims-sbr > mutcount
comparison scripts/S01b_extract_variable_prot.py @ 0:acc3674e515b draft default tip
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
| author | abims-sbr |
|---|---|
| date | Fri, 01 Feb 2019 10:28:50 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc3674e515b |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 #coding: utf-8 | |
| 3 #Author : Eric Fontanillas (2010) - Victor Mataigne (2018) | |
| 4 | |
| 5 # TODO : | |
| 6 # - Deal with missing data : do not do the sign test if missing species in a group | |
| 7 # - Find a way to avoid the list_species argument | |
| 8 | |
| 9 import argparse, os | |
| 10 from functions import dico, write_output, fill_with_NaN | |
| 11 | |
| 12 def aa_properties(amino_acids_properties_file): | |
| 13 """ Read the file 'amino_acids_properties' and stores its content | |
| 14 | |
| 15 Args : | |
| 16 amino_acids_properties_file (String) : the file | |
| 17 | |
| 18 Return : | |
| 19 aa_properties (dict) : key/amino-acid - value/list of properties | |
| 20 """ | |
| 21 | |
| 22 dict_aa_properties={} | |
| 23 with open(amino_acids_properties_file, 'r') as f: | |
| 24 f.readline() #jump headers | |
| 25 for line in f.readlines(): | |
| 26 S1 = line.split(",") | |
| 27 aa_name = S1[1] | |
| 28 S2 = aa_name.split("/") | |
| 29 aa_code = S2[1][:-1] | |
| 30 | |
| 31 frequencies = S1[2][:-1] | |
| 32 residue_weight = S1[5] | |
| 33 residue_volume = S1[6] | |
| 34 partial_specific_volume = S1[7] | |
| 35 hydration = S1[8] | |
| 36 | |
| 37 dict_aa_properties[aa_code] = [frequencies, residue_weight, residue_volume, partial_specific_volume, hydration] | |
| 38 | |
| 39 return(dict_aa_properties) | |
| 40 | |
| 41 """ Functions for proteic format """ | |
| 42 | |
| 43 def all_aa_counts(seq): | |
| 44 """ Count the occurrences of all amino-acids in a sequence | |
| 45 | |
| 46 Args: | |
| 47 seq (String) : a proteic sequence | |
| 48 | |
| 49 Returns: a dictionary with amino-acids counts | |
| 50 """ | |
| 51 | |
| 52 aa_counts = {} | |
| 53 seqU = seq.upper() | |
| 54 LAA =['K','R','A','F','I','L','M','V','W','N','Q','S','T','H','Y','C','D','E','P','G'] | |
| 55 | |
| 56 for aa in LAA: | |
| 57 aa_counts[aa] = seqU.count(aa) | |
| 58 | |
| 59 return aa_counts | |
| 60 | |
| 61 def all_aa_props(seq_counts): | |
| 62 """ Converts a dictionnary of counts into a dictionnary of proportions | |
| 63 | |
| 64 Args: | |
| 65 seq_counts (dict) : dictionnary computed by the function all_aa_counts() | |
| 66 | |
| 67 Returns: a dictionary with counts replaced by proportions | |
| 68 """ | |
| 69 | |
| 70 aa_props = {} | |
| 71 for key in seq_counts.keys(): | |
| 72 aa_props[key] = float(seq_counts[key]) / sum(seq_counts.values()) | |
| 73 return aa_props | |
| 74 | |
| 75 def aa_variables_counts_and_props(aa_counts): | |
| 76 """ Computes several thermostability indices (summed occurrences of some AAs, and then various ratios) | |
| 77 | |
| 78 Args: | |
| 79 aa_counts (dict) : dictionnary computed by the function all_aa_counts() | |
| 80 | |
| 81 Returns: | |
| 82 aa_variables_counts : a dictionary with indices values | |
| 83 aa_variables_props : a dictionary with indices proportions (ratios excluded) | |
| 84 """ | |
| 85 | |
| 86 # Hyperthermophile Prokaryotes criterias | |
| 87 # IVYWREL : positivelly correlated with otpimal growth | |
| 88 # ERK : (i.e. ) => positivelly correlated with optimal growth temperature | |
| 89 # ERK/DNQTSHA (or DNQTSH ??) | |
| 90 # EK/QH | |
| 91 | |
| 92 # Mutationnal bias hypothesis => AT rich: favor FYMINK // GC rich: favor GARP | |
| 93 # The mutational bias model predict a linear relationship between GARP vs FYMINK | |
| 94 # ==> so if outliers to that, it means that the excess of GARP or FYMINK are not explained by the mutationnal bias model but by something else (selection ?) | |
| 95 | |
| 96 | |
| 97 # Hydophobicity hypothesis [should INCREASE with thermal adaptation] | |
| 98 # AL | |
| 99 # Only non-aromatic : AVLIM | |
| 100 # Only aromatic : FYW | |
| 101 | |
| 102 # Charged hypothesis => positivelly correlated with optimal growth temperature | |
| 103 # All charged : RHKDE | |
| 104 # Only positive : RHK | |
| 105 # Only negative : DE | |
| 106 | |
| 107 # Neutral polar hypothesis [should DECREASE with thermal adaptation] | |
| 108 # STNQ | |
| 109 | |
| 110 # Fontanillas' criteria | |
| 111 # PAYRE | |
| 112 # MVGDS | |
| 113 # PAYRE/MVGDS | |
| 114 | |
| 115 # Jollivet's criteria | |
| 116 # AC | |
| 117 # MVGDS | |
| 118 # AC/MVGDS | |
| 119 | |
| 120 aa_variables_counts = {} | |
| 121 aa_variables_props = {} | |
| 122 len_seq = sum(aa_counts.values()) # length of the sequence | |
| 123 | |
| 124 # counts of variables | |
| 125 aa_variables_counts['AC'] = aa_counts['A'] + aa_counts['C'] | |
| 126 aa_variables_counts['APGC'] = aa_counts['A'] + aa_counts['P'] + aa_counts['G'] + aa_counts['C'] | |
| 127 aa_variables_counts['AVLIM'] = aa_counts['A'] + aa_counts['V'] + aa_counts['L'] + aa_counts['I'] + aa_counts['M'] | |
| 128 aa_variables_counts['AVLIMFYW'] = aa_variables_counts['AVLIM'] + aa_counts['F'] + aa_counts['Y'] + aa_counts['W'] | |
| 129 aa_variables_counts['DE'] = aa_counts['D'] + aa_counts['E'] | |
| 130 aa_variables_counts['DNQTSHA'] = aa_counts['D'] + aa_counts['N'] + aa_counts['Q'] + aa_counts['T'] + aa_counts['S'] + aa_counts['H'] + aa_counts['A'] | |
| 131 aa_variables_counts['EK'] = aa_counts['E'] + aa_counts['K'] | |
| 132 aa_variables_counts['ERK'] = aa_counts['E'] + aa_counts['K'] + aa_counts['K'] | |
| 133 aa_variables_counts['FYMINK'] = aa_counts['F'] + aa_counts['Y'] + aa_counts['M'] + aa_counts['I'] + aa_counts['N'] + aa_counts['K'] | |
| 134 aa_variables_counts['FYW'] = aa_counts['F'] + aa_counts['Y'] + aa_counts['W'] | |
| 135 aa_variables_counts['GARP'] = aa_counts['G'] + aa_counts['A'] + aa_counts['R'] + aa_counts['P'] | |
| 136 aa_variables_counts['IVYWREL'] = aa_counts['I'] + aa_counts['V'] + aa_counts['Y'] + aa_counts['W'] + aa_counts['R'] + aa_counts['E'] + aa_counts['L'] | |
| 137 aa_variables_counts['QH'] = aa_counts['Q'] + aa_counts['H'] | |
| 138 aa_variables_counts['RHK'] = aa_counts['R'] + aa_counts['H'] + aa_counts['K'] | |
| 139 aa_variables_counts['RHKDE'] = aa_counts['R'] + aa_counts['H'] + aa_counts['K'] + aa_counts['D'] + aa_counts['E'] | |
| 140 aa_variables_counts['STNQ'] = aa_counts['S'] + aa_counts['T'] + aa_counts['N'] + aa_counts['Q'] | |
| 141 aa_variables_counts['VLIM'] = aa_counts['V'] + aa_counts['L'] + aa_counts['I'] + aa_counts['M'] | |
| 142 aa_variables_counts['PAYRE'] = aa_counts['P'] + aa_counts['A'] + aa_counts['Y'] + aa_counts['R'] + aa_counts['E'] | |
| 143 aa_variables_counts['MVGDS'] = aa_counts['M'] + aa_counts['V'] + aa_counts['G'] + aa_counts['D'] + aa_counts['S'] | |
| 144 | |
| 145 # compute proportions | |
| 146 for key in aa_variables_counts.keys(): | |
| 147 aa_variables_props[key] = float(aa_variables_counts[key]) / float(len_seq) | |
| 148 | |
| 149 if aa_variables_counts['DNQTSHA'] != 0: | |
| 150 ratio_ERK_DNQTSHA = float(aa_variables_counts['ERK'])/float(aa_variables_counts['DNQTSHA']) | |
| 151 else : | |
| 152 ratio_ERK_DNQTSHA = -1 | |
| 153 | |
| 154 if aa_variables_counts['QH'] != 0: | |
| 155 ratio_EK_QH = float(aa_variables_counts['EK'])/float(aa_variables_counts['QH']) | |
| 156 else : | |
| 157 ratio_EK_QH = -1 | |
| 158 | |
| 159 if aa_variables_counts['FYMINK'] != 0: | |
| 160 ratio_GARP_FYMINK = float(aa_variables_counts['EK'])/float(aa_variables_counts['FYMINK']) | |
| 161 else : | |
| 162 ratio_GARP_FYMINK = -1 | |
| 163 | |
| 164 if aa_variables_counts['VLIM'] != 0: | |
| 165 ratio_AC_VLIM = float(aa_variables_counts['AC'])/float(aa_variables_counts['VLIM']) | |
| 166 ratio_APGC_VLIM = float(aa_variables_counts['APGC'])/float(aa_variables_counts['VLIM']) | |
| 167 else : | |
| 168 ratio_AC_VLIM = -1 | |
| 169 ratio_APGC_VLIM = -1 | |
| 170 | |
| 171 if aa_variables_counts['MVGDS'] != 0: | |
| 172 ratio_PAYRE_MVGDS = float(aa_variables_counts['PAYRE'])/float(aa_variables_counts['MVGDS']) | |
| 173 else : | |
| 174 ratio_PAYRE_MVGDS = -1 | |
| 175 | |
| 176 if aa_variables_counts['MVGDS'] != 0: | |
| 177 ratio_AC_MVGDS = float(aa_variables_counts['AC'])/float(aa_variables_counts['MVGDS']) | |
| 178 else : | |
| 179 ratio_AC_MVGDS = -1 | |
| 180 | |
| 181 aa_variables_counts['ratio_ERK_DNQTSHA'] = ratio_ERK_DNQTSHA | |
| 182 aa_variables_counts['ratio_EK_QH'] = ratio_EK_QH | |
| 183 aa_variables_counts['ratio_GARP_FYMINK'] = ratio_GARP_FYMINK | |
| 184 aa_variables_counts['ratio_AC_VLIM'] = ratio_AC_VLIM | |
| 185 aa_variables_counts['ratio_APGC_VLIM'] = ratio_APGC_VLIM | |
| 186 aa_variables_counts['ratio_PAYRE_MVGDS'] = ratio_PAYRE_MVGDS | |
| 187 aa_variables_counts['ratio_AC_MVGDS'] = ratio_AC_MVGDS | |
| 188 | |
| 189 return aa_variables_counts, aa_variables_props | |
| 190 | |
| 191 def sequence_properties_from_aa_properties(aa_counts, aa_properties): | |
| 192 """ Computes a sequence properties (based on an external data file) | |
| 193 | |
| 194 Args: | |
| 195 - aa_counts (dict) : counts of amino-acids in the sequence | |
| 196 - aa_properties (dict) : key/amino-acid - value/list of properties extract from the external data file | |
| 197 | |
| 198 Returns: | |
| 199 - seq_props (dict) : values of the sequence properties | |
| 200 """ | |
| 201 | |
| 202 LS = ['total_residue_weight', 'total_residue_volume', 'total_partial_specific_volume', 'total_hydratation'] | |
| 203 seq_props = {} | |
| 204 | |
| 205 for i in range(1,5): | |
| 206 seq_props[LS[i-1]] = 0 | |
| 207 for key in aa_counts.keys(): | |
| 208 seq_props[LS[i-1]] += aa_counts[key] * float(aa_properties[key][i]) | |
| 209 | |
| 210 return seq_props | |
| 211 | |
| 212 """ Main """ | |
| 213 | |
| 214 def main(): | |
| 215 parser = argparse.ArgumentParser() | |
| 216 parser.add_argument("species_list", help="List of species separated by commas") | |
| 217 parser.add_argument("aa_properties", help="File with all amino-acids properties") | |
| 218 args = parser.parse_args() | |
| 219 | |
| 220 LAA = ['K','R','A','F','I','L','M','V','W','N','Q','S','T','H','Y','C','D','E','P','G'] | |
| 221 LV = ['IVYWREL','EK','ERK','DNQTSHA','QH','ratio_ERK_DNQTSHA','ratio_EK_QH','FYMINK','GARP', | |
| 222 'ratio_GARP_FYMINK','AVLIM','FYW','AVLIMFYW','STNQ','RHK','DE','RHKDE','APGC','AC', | |
| 223 'VLIM','ratio_AC_VLIM','ratio_APGC_VLIM'] | |
| 224 LS = ['total_residue_weight', 'total_residue_volume', 'total_partial_specific_volume', 'total_hydratation'] | |
| 225 | |
| 226 list_inputs = [] | |
| 227 | |
| 228 path_inputs = '01_input_files' | |
| 229 list_inputs = os.listdir(path_inputs) | |
| 230 | |
| 231 lsp = args.species_list.split(',') | |
| 232 lsp = sorted(lsp) | |
| 233 flsp = '' | |
| 234 for el in lsp: | |
| 235 flsp += el+',' | |
| 236 | |
| 237 path_outputs_1 = '02_tables_per_aa' | |
| 238 path_outputs_2 = '02_tables_per_aa_variable' | |
| 239 os.mkdir(path_outputs_1) | |
| 240 os.mkdir(path_outputs_2) | |
| 241 | |
| 242 # Init empty dicts for results | |
| 243 dict_for_files_aa_counts = {} # counts | |
| 244 dict_for_files_aa_props = {} # proportions | |
| 245 dict_for_files_variables_counts = {} | |
| 246 dict_for_files_variables_props = {} | |
| 247 dict_for_files_seq_properties = {} | |
| 248 | |
| 249 aa_properties_file = aa_properties(args.aa_properties) # read the aa_properties file | |
| 250 | |
| 251 # All counts and props | |
| 252 for file in list_inputs: | |
| 253 # iterate over input files | |
| 254 sequences = dico(file, path_inputs) | |
| 255 | |
| 256 # TEMPORARY CORRECTION FOR SEQUENCES CONTAINING ONLY INDELS | |
| 257 # It appears than CDS_Search can bug sometimes and return an alignement where a species' sequence is made of indels only | |
| 258 # This causes a crash here (in the ratios function). The correction skip the whole file. | |
| 259 # When CDS_Search is corrected, lines with 'skip' can be removed | |
| 260 skip = False | |
| 261 for key in sequences.keys(): | |
| 262 if all(x == '-' for x in sequences[key]): | |
| 263 skip = True | |
| 264 | |
| 265 if not skip: | |
| 266 | |
| 267 aa_counts_per_seq = {} | |
| 268 aa_props_per_seq = {} | |
| 269 aa_variables_counts_per_seq = {} | |
| 270 aa_variables_props_per_seq = {} | |
| 271 seq_properties = {} | |
| 272 | |
| 273 for key in sequences.keys(): | |
| 274 # iterate over sequences in the file | |
| 275 aa_counts_per_seq[key] = all_aa_counts(sequences[key]) | |
| 276 aa_props_per_seq[key] = all_aa_props(aa_counts_per_seq[key]) | |
| 277 aa_variables_counts_per_seq[key], aa_variables_props_per_seq[key] = aa_variables_counts_and_props(aa_counts_per_seq[key]) | |
| 278 seq_properties[key] = sequence_properties_from_aa_properties(aa_counts_per_seq[key], aa_properties_file) | |
| 279 | |
| 280 # Add NaN for missing species | |
| 281 for key in set(lsp).difference(set(sequences.keys())): | |
| 282 aa_counts_per_seq[key] = fill_with_NaN(LAA) | |
| 283 aa_props_per_seq[key] = fill_with_NaN(LAA) | |
| 284 aa_variables_counts_per_seq[key] = fill_with_NaN(LV) | |
| 285 seq_properties[key] = fill_with_NaN(LS) | |
| 286 | |
| 287 # Add computations to final dicts | |
| 288 dict_for_files_aa_counts[file] = aa_counts_per_seq | |
| 289 dict_for_files_aa_props[file] = aa_props_per_seq | |
| 290 dict_for_files_variables_counts[file] = aa_variables_counts_per_seq | |
| 291 dict_for_files_variables_props[file] = aa_variables_props_per_seq | |
| 292 dict_for_files_seq_properties[file] = seq_properties | |
| 293 | |
| 294 # Try with pandas ? | |
| 295 write_output(LAA, flsp, path_outputs_1, dict_for_files_aa_counts) # one file per AA | |
| 296 write_output(LV, flsp, path_outputs_2, dict_for_files_variables_counts) # one file per aa_variable | |
| 297 write_output(LS, flsp, path_outputs_2, dict_for_files_seq_properties) #one file per seq properties | |
| 298 | |
| 299 # { file_name1 : { seq1: {'A' : 0, 'C':0, 'E':0, ...}, | |
| 300 # seq2: {'A' : 0, 'C':0, 'E':0, ...}, ... | |
| 301 # }, | |
| 302 # { file_name2 : { seq1: {'A' : 0, 'C':0, 'E':0, ...}, | |
| 303 # seq2: {'A' : 0, 'C':0, 'E':0, ...}, | |
| 304 # }, | |
| 305 # ... } | |
| 306 | |
| 307 # { file_name1 : {seq1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, | |
| 308 # seq2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, ... | |
| 309 # }, | |
| 310 # file_name2 : {seq1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, | |
| 311 # seq2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, | |
| 312 # }, | |
| 313 # ... } | |
| 314 | |
| 315 # { file_name1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, | |
| 316 # file_name2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, | |
| 317 # ... | |
| 318 # } | |
| 319 | |
| 320 if __name__ == '__main__': | |
| 321 main() |
