Mercurial > repos > abims-sbr > mutcount
comparison scripts/S01b_extract_variable_prot.py @ 0:acc3674e515b draft default tip
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:28:50 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc3674e515b |
---|---|
1 #!/usr/bin/env python | |
2 #coding: utf-8 | |
3 #Author : Eric Fontanillas (2010) - Victor Mataigne (2018) | |
4 | |
5 # TODO : | |
6 # - Deal with missing data : do not do the sign test if missing species in a group | |
7 # - Find a way to avoid the list_species argument | |
8 | |
9 import argparse, os | |
10 from functions import dico, write_output, fill_with_NaN | |
11 | |
12 def aa_properties(amino_acids_properties_file): | |
13 """ Read the file 'amino_acids_properties' and stores its content | |
14 | |
15 Args : | |
16 amino_acids_properties_file (String) : the file | |
17 | |
18 Return : | |
19 aa_properties (dict) : key/amino-acid - value/list of properties | |
20 """ | |
21 | |
22 dict_aa_properties={} | |
23 with open(amino_acids_properties_file, 'r') as f: | |
24 f.readline() #jump headers | |
25 for line in f.readlines(): | |
26 S1 = line.split(",") | |
27 aa_name = S1[1] | |
28 S2 = aa_name.split("/") | |
29 aa_code = S2[1][:-1] | |
30 | |
31 frequencies = S1[2][:-1] | |
32 residue_weight = S1[5] | |
33 residue_volume = S1[6] | |
34 partial_specific_volume = S1[7] | |
35 hydration = S1[8] | |
36 | |
37 dict_aa_properties[aa_code] = [frequencies, residue_weight, residue_volume, partial_specific_volume, hydration] | |
38 | |
39 return(dict_aa_properties) | |
40 | |
41 """ Functions for proteic format """ | |
42 | |
43 def all_aa_counts(seq): | |
44 """ Count the occurrences of all amino-acids in a sequence | |
45 | |
46 Args: | |
47 seq (String) : a proteic sequence | |
48 | |
49 Returns: a dictionary with amino-acids counts | |
50 """ | |
51 | |
52 aa_counts = {} | |
53 seqU = seq.upper() | |
54 LAA =['K','R','A','F','I','L','M','V','W','N','Q','S','T','H','Y','C','D','E','P','G'] | |
55 | |
56 for aa in LAA: | |
57 aa_counts[aa] = seqU.count(aa) | |
58 | |
59 return aa_counts | |
60 | |
61 def all_aa_props(seq_counts): | |
62 """ Converts a dictionnary of counts into a dictionnary of proportions | |
63 | |
64 Args: | |
65 seq_counts (dict) : dictionnary computed by the function all_aa_counts() | |
66 | |
67 Returns: a dictionary with counts replaced by proportions | |
68 """ | |
69 | |
70 aa_props = {} | |
71 for key in seq_counts.keys(): | |
72 aa_props[key] = float(seq_counts[key]) / sum(seq_counts.values()) | |
73 return aa_props | |
74 | |
75 def aa_variables_counts_and_props(aa_counts): | |
76 """ Computes several thermostability indices (summed occurrences of some AAs, and then various ratios) | |
77 | |
78 Args: | |
79 aa_counts (dict) : dictionnary computed by the function all_aa_counts() | |
80 | |
81 Returns: | |
82 aa_variables_counts : a dictionary with indices values | |
83 aa_variables_props : a dictionary with indices proportions (ratios excluded) | |
84 """ | |
85 | |
86 # Hyperthermophile Prokaryotes criterias | |
87 # IVYWREL : positivelly correlated with otpimal growth | |
88 # ERK : (i.e. ) => positivelly correlated with optimal growth temperature | |
89 # ERK/DNQTSHA (or DNQTSH ??) | |
90 # EK/QH | |
91 | |
92 # Mutationnal bias hypothesis => AT rich: favor FYMINK // GC rich: favor GARP | |
93 # The mutational bias model predict a linear relationship between GARP vs FYMINK | |
94 # ==> so if outliers to that, it means that the excess of GARP or FYMINK are not explained by the mutationnal bias model but by something else (selection ?) | |
95 | |
96 | |
97 # Hydophobicity hypothesis [should INCREASE with thermal adaptation] | |
98 # AL | |
99 # Only non-aromatic : AVLIM | |
100 # Only aromatic : FYW | |
101 | |
102 # Charged hypothesis => positivelly correlated with optimal growth temperature | |
103 # All charged : RHKDE | |
104 # Only positive : RHK | |
105 # Only negative : DE | |
106 | |
107 # Neutral polar hypothesis [should DECREASE with thermal adaptation] | |
108 # STNQ | |
109 | |
110 # Fontanillas' criteria | |
111 # PAYRE | |
112 # MVGDS | |
113 # PAYRE/MVGDS | |
114 | |
115 # Jollivet's criteria | |
116 # AC | |
117 # MVGDS | |
118 # AC/MVGDS | |
119 | |
120 aa_variables_counts = {} | |
121 aa_variables_props = {} | |
122 len_seq = sum(aa_counts.values()) # length of the sequence | |
123 | |
124 # counts of variables | |
125 aa_variables_counts['AC'] = aa_counts['A'] + aa_counts['C'] | |
126 aa_variables_counts['APGC'] = aa_counts['A'] + aa_counts['P'] + aa_counts['G'] + aa_counts['C'] | |
127 aa_variables_counts['AVLIM'] = aa_counts['A'] + aa_counts['V'] + aa_counts['L'] + aa_counts['I'] + aa_counts['M'] | |
128 aa_variables_counts['AVLIMFYW'] = aa_variables_counts['AVLIM'] + aa_counts['F'] + aa_counts['Y'] + aa_counts['W'] | |
129 aa_variables_counts['DE'] = aa_counts['D'] + aa_counts['E'] | |
130 aa_variables_counts['DNQTSHA'] = aa_counts['D'] + aa_counts['N'] + aa_counts['Q'] + aa_counts['T'] + aa_counts['S'] + aa_counts['H'] + aa_counts['A'] | |
131 aa_variables_counts['EK'] = aa_counts['E'] + aa_counts['K'] | |
132 aa_variables_counts['ERK'] = aa_counts['E'] + aa_counts['K'] + aa_counts['K'] | |
133 aa_variables_counts['FYMINK'] = aa_counts['F'] + aa_counts['Y'] + aa_counts['M'] + aa_counts['I'] + aa_counts['N'] + aa_counts['K'] | |
134 aa_variables_counts['FYW'] = aa_counts['F'] + aa_counts['Y'] + aa_counts['W'] | |
135 aa_variables_counts['GARP'] = aa_counts['G'] + aa_counts['A'] + aa_counts['R'] + aa_counts['P'] | |
136 aa_variables_counts['IVYWREL'] = aa_counts['I'] + aa_counts['V'] + aa_counts['Y'] + aa_counts['W'] + aa_counts['R'] + aa_counts['E'] + aa_counts['L'] | |
137 aa_variables_counts['QH'] = aa_counts['Q'] + aa_counts['H'] | |
138 aa_variables_counts['RHK'] = aa_counts['R'] + aa_counts['H'] + aa_counts['K'] | |
139 aa_variables_counts['RHKDE'] = aa_counts['R'] + aa_counts['H'] + aa_counts['K'] + aa_counts['D'] + aa_counts['E'] | |
140 aa_variables_counts['STNQ'] = aa_counts['S'] + aa_counts['T'] + aa_counts['N'] + aa_counts['Q'] | |
141 aa_variables_counts['VLIM'] = aa_counts['V'] + aa_counts['L'] + aa_counts['I'] + aa_counts['M'] | |
142 aa_variables_counts['PAYRE'] = aa_counts['P'] + aa_counts['A'] + aa_counts['Y'] + aa_counts['R'] + aa_counts['E'] | |
143 aa_variables_counts['MVGDS'] = aa_counts['M'] + aa_counts['V'] + aa_counts['G'] + aa_counts['D'] + aa_counts['S'] | |
144 | |
145 # compute proportions | |
146 for key in aa_variables_counts.keys(): | |
147 aa_variables_props[key] = float(aa_variables_counts[key]) / float(len_seq) | |
148 | |
149 if aa_variables_counts['DNQTSHA'] != 0: | |
150 ratio_ERK_DNQTSHA = float(aa_variables_counts['ERK'])/float(aa_variables_counts['DNQTSHA']) | |
151 else : | |
152 ratio_ERK_DNQTSHA = -1 | |
153 | |
154 if aa_variables_counts['QH'] != 0: | |
155 ratio_EK_QH = float(aa_variables_counts['EK'])/float(aa_variables_counts['QH']) | |
156 else : | |
157 ratio_EK_QH = -1 | |
158 | |
159 if aa_variables_counts['FYMINK'] != 0: | |
160 ratio_GARP_FYMINK = float(aa_variables_counts['EK'])/float(aa_variables_counts['FYMINK']) | |
161 else : | |
162 ratio_GARP_FYMINK = -1 | |
163 | |
164 if aa_variables_counts['VLIM'] != 0: | |
165 ratio_AC_VLIM = float(aa_variables_counts['AC'])/float(aa_variables_counts['VLIM']) | |
166 ratio_APGC_VLIM = float(aa_variables_counts['APGC'])/float(aa_variables_counts['VLIM']) | |
167 else : | |
168 ratio_AC_VLIM = -1 | |
169 ratio_APGC_VLIM = -1 | |
170 | |
171 if aa_variables_counts['MVGDS'] != 0: | |
172 ratio_PAYRE_MVGDS = float(aa_variables_counts['PAYRE'])/float(aa_variables_counts['MVGDS']) | |
173 else : | |
174 ratio_PAYRE_MVGDS = -1 | |
175 | |
176 if aa_variables_counts['MVGDS'] != 0: | |
177 ratio_AC_MVGDS = float(aa_variables_counts['AC'])/float(aa_variables_counts['MVGDS']) | |
178 else : | |
179 ratio_AC_MVGDS = -1 | |
180 | |
181 aa_variables_counts['ratio_ERK_DNQTSHA'] = ratio_ERK_DNQTSHA | |
182 aa_variables_counts['ratio_EK_QH'] = ratio_EK_QH | |
183 aa_variables_counts['ratio_GARP_FYMINK'] = ratio_GARP_FYMINK | |
184 aa_variables_counts['ratio_AC_VLIM'] = ratio_AC_VLIM | |
185 aa_variables_counts['ratio_APGC_VLIM'] = ratio_APGC_VLIM | |
186 aa_variables_counts['ratio_PAYRE_MVGDS'] = ratio_PAYRE_MVGDS | |
187 aa_variables_counts['ratio_AC_MVGDS'] = ratio_AC_MVGDS | |
188 | |
189 return aa_variables_counts, aa_variables_props | |
190 | |
191 def sequence_properties_from_aa_properties(aa_counts, aa_properties): | |
192 """ Computes a sequence properties (based on an external data file) | |
193 | |
194 Args: | |
195 - aa_counts (dict) : counts of amino-acids in the sequence | |
196 - aa_properties (dict) : key/amino-acid - value/list of properties extract from the external data file | |
197 | |
198 Returns: | |
199 - seq_props (dict) : values of the sequence properties | |
200 """ | |
201 | |
202 LS = ['total_residue_weight', 'total_residue_volume', 'total_partial_specific_volume', 'total_hydratation'] | |
203 seq_props = {} | |
204 | |
205 for i in range(1,5): | |
206 seq_props[LS[i-1]] = 0 | |
207 for key in aa_counts.keys(): | |
208 seq_props[LS[i-1]] += aa_counts[key] * float(aa_properties[key][i]) | |
209 | |
210 return seq_props | |
211 | |
212 """ Main """ | |
213 | |
214 def main(): | |
215 parser = argparse.ArgumentParser() | |
216 parser.add_argument("species_list", help="List of species separated by commas") | |
217 parser.add_argument("aa_properties", help="File with all amino-acids properties") | |
218 args = parser.parse_args() | |
219 | |
220 LAA = ['K','R','A','F','I','L','M','V','W','N','Q','S','T','H','Y','C','D','E','P','G'] | |
221 LV = ['IVYWREL','EK','ERK','DNQTSHA','QH','ratio_ERK_DNQTSHA','ratio_EK_QH','FYMINK','GARP', | |
222 'ratio_GARP_FYMINK','AVLIM','FYW','AVLIMFYW','STNQ','RHK','DE','RHKDE','APGC','AC', | |
223 'VLIM','ratio_AC_VLIM','ratio_APGC_VLIM'] | |
224 LS = ['total_residue_weight', 'total_residue_volume', 'total_partial_specific_volume', 'total_hydratation'] | |
225 | |
226 list_inputs = [] | |
227 | |
228 path_inputs = '01_input_files' | |
229 list_inputs = os.listdir(path_inputs) | |
230 | |
231 lsp = args.species_list.split(',') | |
232 lsp = sorted(lsp) | |
233 flsp = '' | |
234 for el in lsp: | |
235 flsp += el+',' | |
236 | |
237 path_outputs_1 = '02_tables_per_aa' | |
238 path_outputs_2 = '02_tables_per_aa_variable' | |
239 os.mkdir(path_outputs_1) | |
240 os.mkdir(path_outputs_2) | |
241 | |
242 # Init empty dicts for results | |
243 dict_for_files_aa_counts = {} # counts | |
244 dict_for_files_aa_props = {} # proportions | |
245 dict_for_files_variables_counts = {} | |
246 dict_for_files_variables_props = {} | |
247 dict_for_files_seq_properties = {} | |
248 | |
249 aa_properties_file = aa_properties(args.aa_properties) # read the aa_properties file | |
250 | |
251 # All counts and props | |
252 for file in list_inputs: | |
253 # iterate over input files | |
254 sequences = dico(file, path_inputs) | |
255 | |
256 # TEMPORARY CORRECTION FOR SEQUENCES CONTAINING ONLY INDELS | |
257 # It appears than CDS_Search can bug sometimes and return an alignement where a species' sequence is made of indels only | |
258 # This causes a crash here (in the ratios function). The correction skip the whole file. | |
259 # When CDS_Search is corrected, lines with 'skip' can be removed | |
260 skip = False | |
261 for key in sequences.keys(): | |
262 if all(x == '-' for x in sequences[key]): | |
263 skip = True | |
264 | |
265 if not skip: | |
266 | |
267 aa_counts_per_seq = {} | |
268 aa_props_per_seq = {} | |
269 aa_variables_counts_per_seq = {} | |
270 aa_variables_props_per_seq = {} | |
271 seq_properties = {} | |
272 | |
273 for key in sequences.keys(): | |
274 # iterate over sequences in the file | |
275 aa_counts_per_seq[key] = all_aa_counts(sequences[key]) | |
276 aa_props_per_seq[key] = all_aa_props(aa_counts_per_seq[key]) | |
277 aa_variables_counts_per_seq[key], aa_variables_props_per_seq[key] = aa_variables_counts_and_props(aa_counts_per_seq[key]) | |
278 seq_properties[key] = sequence_properties_from_aa_properties(aa_counts_per_seq[key], aa_properties_file) | |
279 | |
280 # Add NaN for missing species | |
281 for key in set(lsp).difference(set(sequences.keys())): | |
282 aa_counts_per_seq[key] = fill_with_NaN(LAA) | |
283 aa_props_per_seq[key] = fill_with_NaN(LAA) | |
284 aa_variables_counts_per_seq[key] = fill_with_NaN(LV) | |
285 seq_properties[key] = fill_with_NaN(LS) | |
286 | |
287 # Add computations to final dicts | |
288 dict_for_files_aa_counts[file] = aa_counts_per_seq | |
289 dict_for_files_aa_props[file] = aa_props_per_seq | |
290 dict_for_files_variables_counts[file] = aa_variables_counts_per_seq | |
291 dict_for_files_variables_props[file] = aa_variables_props_per_seq | |
292 dict_for_files_seq_properties[file] = seq_properties | |
293 | |
294 # Try with pandas ? | |
295 write_output(LAA, flsp, path_outputs_1, dict_for_files_aa_counts) # one file per AA | |
296 write_output(LV, flsp, path_outputs_2, dict_for_files_variables_counts) # one file per aa_variable | |
297 write_output(LS, flsp, path_outputs_2, dict_for_files_seq_properties) #one file per seq properties | |
298 | |
299 # { file_name1 : { seq1: {'A' : 0, 'C':0, 'E':0, ...}, | |
300 # seq2: {'A' : 0, 'C':0, 'E':0, ...}, ... | |
301 # }, | |
302 # { file_name2 : { seq1: {'A' : 0, 'C':0, 'E':0, ...}, | |
303 # seq2: {'A' : 0, 'C':0, 'E':0, ...}, | |
304 # }, | |
305 # ... } | |
306 | |
307 # { file_name1 : {seq1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, | |
308 # seq2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, ... | |
309 # }, | |
310 # file_name2 : {seq1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, | |
311 # seq2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, | |
312 # }, | |
313 # ... } | |
314 | |
315 # { file_name1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, | |
316 # file_name2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, | |
317 # ... | |
318 # } | |
319 | |
320 if __name__ == '__main__': | |
321 main() |