comparison scripts/S01b_extract_variable_prot.py @ 0:acc3674e515b draft default tip

planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:28:50 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc3674e515b
1 #!/usr/bin/env python
2 #coding: utf-8
3 #Author : Eric Fontanillas (2010) - Victor Mataigne (2018)
4
5 # TODO :
6 # - Deal with missing data : do not do the sign test if missing species in a group
7 # - Find a way to avoid the list_species argument
8
9 import argparse, os
10 from functions import dico, write_output, fill_with_NaN
11
12 def aa_properties(amino_acids_properties_file):
13 """ Read the file 'amino_acids_properties' and stores its content
14
15 Args :
16 amino_acids_properties_file (String) : the file
17
18 Return :
19 aa_properties (dict) : key/amino-acid - value/list of properties
20 """
21
22 dict_aa_properties={}
23 with open(amino_acids_properties_file, 'r') as f:
24 f.readline() #jump headers
25 for line in f.readlines():
26 S1 = line.split(",")
27 aa_name = S1[1]
28 S2 = aa_name.split("/")
29 aa_code = S2[1][:-1]
30
31 frequencies = S1[2][:-1]
32 residue_weight = S1[5]
33 residue_volume = S1[6]
34 partial_specific_volume = S1[7]
35 hydration = S1[8]
36
37 dict_aa_properties[aa_code] = [frequencies, residue_weight, residue_volume, partial_specific_volume, hydration]
38
39 return(dict_aa_properties)
40
41 """ Functions for proteic format """
42
43 def all_aa_counts(seq):
44 """ Count the occurrences of all amino-acids in a sequence
45
46 Args:
47 seq (String) : a proteic sequence
48
49 Returns: a dictionary with amino-acids counts
50 """
51
52 aa_counts = {}
53 seqU = seq.upper()
54 LAA =['K','R','A','F','I','L','M','V','W','N','Q','S','T','H','Y','C','D','E','P','G']
55
56 for aa in LAA:
57 aa_counts[aa] = seqU.count(aa)
58
59 return aa_counts
60
61 def all_aa_props(seq_counts):
62 """ Converts a dictionnary of counts into a dictionnary of proportions
63
64 Args:
65 seq_counts (dict) : dictionnary computed by the function all_aa_counts()
66
67 Returns: a dictionary with counts replaced by proportions
68 """
69
70 aa_props = {}
71 for key in seq_counts.keys():
72 aa_props[key] = float(seq_counts[key]) / sum(seq_counts.values())
73 return aa_props
74
75 def aa_variables_counts_and_props(aa_counts):
76 """ Computes several thermostability indices (summed occurrences of some AAs, and then various ratios)
77
78 Args:
79 aa_counts (dict) : dictionnary computed by the function all_aa_counts()
80
81 Returns:
82 aa_variables_counts : a dictionary with indices values
83 aa_variables_props : a dictionary with indices proportions (ratios excluded)
84 """
85
86 # Hyperthermophile Prokaryotes criterias
87 # IVYWREL : positivelly correlated with otpimal growth
88 # ERK : (i.e. ) => positivelly correlated with optimal growth temperature
89 # ERK/DNQTSHA (or DNQTSH ??)
90 # EK/QH
91
92 # Mutationnal bias hypothesis => AT rich: favor FYMINK // GC rich: favor GARP
93 # The mutational bias model predict a linear relationship between GARP vs FYMINK
94 # ==> so if outliers to that, it means that the excess of GARP or FYMINK are not explained by the mutationnal bias model but by something else (selection ?)
95
96
97 # Hydophobicity hypothesis [should INCREASE with thermal adaptation]
98 # AL
99 # Only non-aromatic : AVLIM
100 # Only aromatic : FYW
101
102 # Charged hypothesis => positivelly correlated with optimal growth temperature
103 # All charged : RHKDE
104 # Only positive : RHK
105 # Only negative : DE
106
107 # Neutral polar hypothesis [should DECREASE with thermal adaptation]
108 # STNQ
109
110 # Fontanillas' criteria
111 # PAYRE
112 # MVGDS
113 # PAYRE/MVGDS
114
115 # Jollivet's criteria
116 # AC
117 # MVGDS
118 # AC/MVGDS
119
120 aa_variables_counts = {}
121 aa_variables_props = {}
122 len_seq = sum(aa_counts.values()) # length of the sequence
123
124 # counts of variables
125 aa_variables_counts['AC'] = aa_counts['A'] + aa_counts['C']
126 aa_variables_counts['APGC'] = aa_counts['A'] + aa_counts['P'] + aa_counts['G'] + aa_counts['C']
127 aa_variables_counts['AVLIM'] = aa_counts['A'] + aa_counts['V'] + aa_counts['L'] + aa_counts['I'] + aa_counts['M']
128 aa_variables_counts['AVLIMFYW'] = aa_variables_counts['AVLIM'] + aa_counts['F'] + aa_counts['Y'] + aa_counts['W']
129 aa_variables_counts['DE'] = aa_counts['D'] + aa_counts['E']
130 aa_variables_counts['DNQTSHA'] = aa_counts['D'] + aa_counts['N'] + aa_counts['Q'] + aa_counts['T'] + aa_counts['S'] + aa_counts['H'] + aa_counts['A']
131 aa_variables_counts['EK'] = aa_counts['E'] + aa_counts['K']
132 aa_variables_counts['ERK'] = aa_counts['E'] + aa_counts['K'] + aa_counts['K']
133 aa_variables_counts['FYMINK'] = aa_counts['F'] + aa_counts['Y'] + aa_counts['M'] + aa_counts['I'] + aa_counts['N'] + aa_counts['K']
134 aa_variables_counts['FYW'] = aa_counts['F'] + aa_counts['Y'] + aa_counts['W']
135 aa_variables_counts['GARP'] = aa_counts['G'] + aa_counts['A'] + aa_counts['R'] + aa_counts['P']
136 aa_variables_counts['IVYWREL'] = aa_counts['I'] + aa_counts['V'] + aa_counts['Y'] + aa_counts['W'] + aa_counts['R'] + aa_counts['E'] + aa_counts['L']
137 aa_variables_counts['QH'] = aa_counts['Q'] + aa_counts['H']
138 aa_variables_counts['RHK'] = aa_counts['R'] + aa_counts['H'] + aa_counts['K']
139 aa_variables_counts['RHKDE'] = aa_counts['R'] + aa_counts['H'] + aa_counts['K'] + aa_counts['D'] + aa_counts['E']
140 aa_variables_counts['STNQ'] = aa_counts['S'] + aa_counts['T'] + aa_counts['N'] + aa_counts['Q']
141 aa_variables_counts['VLIM'] = aa_counts['V'] + aa_counts['L'] + aa_counts['I'] + aa_counts['M']
142 aa_variables_counts['PAYRE'] = aa_counts['P'] + aa_counts['A'] + aa_counts['Y'] + aa_counts['R'] + aa_counts['E']
143 aa_variables_counts['MVGDS'] = aa_counts['M'] + aa_counts['V'] + aa_counts['G'] + aa_counts['D'] + aa_counts['S']
144
145 # compute proportions
146 for key in aa_variables_counts.keys():
147 aa_variables_props[key] = float(aa_variables_counts[key]) / float(len_seq)
148
149 if aa_variables_counts['DNQTSHA'] != 0:
150 ratio_ERK_DNQTSHA = float(aa_variables_counts['ERK'])/float(aa_variables_counts['DNQTSHA'])
151 else :
152 ratio_ERK_DNQTSHA = -1
153
154 if aa_variables_counts['QH'] != 0:
155 ratio_EK_QH = float(aa_variables_counts['EK'])/float(aa_variables_counts['QH'])
156 else :
157 ratio_EK_QH = -1
158
159 if aa_variables_counts['FYMINK'] != 0:
160 ratio_GARP_FYMINK = float(aa_variables_counts['EK'])/float(aa_variables_counts['FYMINK'])
161 else :
162 ratio_GARP_FYMINK = -1
163
164 if aa_variables_counts['VLIM'] != 0:
165 ratio_AC_VLIM = float(aa_variables_counts['AC'])/float(aa_variables_counts['VLIM'])
166 ratio_APGC_VLIM = float(aa_variables_counts['APGC'])/float(aa_variables_counts['VLIM'])
167 else :
168 ratio_AC_VLIM = -1
169 ratio_APGC_VLIM = -1
170
171 if aa_variables_counts['MVGDS'] != 0:
172 ratio_PAYRE_MVGDS = float(aa_variables_counts['PAYRE'])/float(aa_variables_counts['MVGDS'])
173 else :
174 ratio_PAYRE_MVGDS = -1
175
176 if aa_variables_counts['MVGDS'] != 0:
177 ratio_AC_MVGDS = float(aa_variables_counts['AC'])/float(aa_variables_counts['MVGDS'])
178 else :
179 ratio_AC_MVGDS = -1
180
181 aa_variables_counts['ratio_ERK_DNQTSHA'] = ratio_ERK_DNQTSHA
182 aa_variables_counts['ratio_EK_QH'] = ratio_EK_QH
183 aa_variables_counts['ratio_GARP_FYMINK'] = ratio_GARP_FYMINK
184 aa_variables_counts['ratio_AC_VLIM'] = ratio_AC_VLIM
185 aa_variables_counts['ratio_APGC_VLIM'] = ratio_APGC_VLIM
186 aa_variables_counts['ratio_PAYRE_MVGDS'] = ratio_PAYRE_MVGDS
187 aa_variables_counts['ratio_AC_MVGDS'] = ratio_AC_MVGDS
188
189 return aa_variables_counts, aa_variables_props
190
191 def sequence_properties_from_aa_properties(aa_counts, aa_properties):
192 """ Computes a sequence properties (based on an external data file)
193
194 Args:
195 - aa_counts (dict) : counts of amino-acids in the sequence
196 - aa_properties (dict) : key/amino-acid - value/list of properties extract from the external data file
197
198 Returns:
199 - seq_props (dict) : values of the sequence properties
200 """
201
202 LS = ['total_residue_weight', 'total_residue_volume', 'total_partial_specific_volume', 'total_hydratation']
203 seq_props = {}
204
205 for i in range(1,5):
206 seq_props[LS[i-1]] = 0
207 for key in aa_counts.keys():
208 seq_props[LS[i-1]] += aa_counts[key] * float(aa_properties[key][i])
209
210 return seq_props
211
212 """ Main """
213
214 def main():
215 parser = argparse.ArgumentParser()
216 parser.add_argument("species_list", help="List of species separated by commas")
217 parser.add_argument("aa_properties", help="File with all amino-acids properties")
218 args = parser.parse_args()
219
220 LAA = ['K','R','A','F','I','L','M','V','W','N','Q','S','T','H','Y','C','D','E','P','G']
221 LV = ['IVYWREL','EK','ERK','DNQTSHA','QH','ratio_ERK_DNQTSHA','ratio_EK_QH','FYMINK','GARP',
222 'ratio_GARP_FYMINK','AVLIM','FYW','AVLIMFYW','STNQ','RHK','DE','RHKDE','APGC','AC',
223 'VLIM','ratio_AC_VLIM','ratio_APGC_VLIM']
224 LS = ['total_residue_weight', 'total_residue_volume', 'total_partial_specific_volume', 'total_hydratation']
225
226 list_inputs = []
227
228 path_inputs = '01_input_files'
229 list_inputs = os.listdir(path_inputs)
230
231 lsp = args.species_list.split(',')
232 lsp = sorted(lsp)
233 flsp = ''
234 for el in lsp:
235 flsp += el+','
236
237 path_outputs_1 = '02_tables_per_aa'
238 path_outputs_2 = '02_tables_per_aa_variable'
239 os.mkdir(path_outputs_1)
240 os.mkdir(path_outputs_2)
241
242 # Init empty dicts for results
243 dict_for_files_aa_counts = {} # counts
244 dict_for_files_aa_props = {} # proportions
245 dict_for_files_variables_counts = {}
246 dict_for_files_variables_props = {}
247 dict_for_files_seq_properties = {}
248
249 aa_properties_file = aa_properties(args.aa_properties) # read the aa_properties file
250
251 # All counts and props
252 for file in list_inputs:
253 # iterate over input files
254 sequences = dico(file, path_inputs)
255
256 # TEMPORARY CORRECTION FOR SEQUENCES CONTAINING ONLY INDELS
257 # It appears than CDS_Search can bug sometimes and return an alignement where a species' sequence is made of indels only
258 # This causes a crash here (in the ratios function). The correction skip the whole file.
259 # When CDS_Search is corrected, lines with 'skip' can be removed
260 skip = False
261 for key in sequences.keys():
262 if all(x == '-' for x in sequences[key]):
263 skip = True
264
265 if not skip:
266
267 aa_counts_per_seq = {}
268 aa_props_per_seq = {}
269 aa_variables_counts_per_seq = {}
270 aa_variables_props_per_seq = {}
271 seq_properties = {}
272
273 for key in sequences.keys():
274 # iterate over sequences in the file
275 aa_counts_per_seq[key] = all_aa_counts(sequences[key])
276 aa_props_per_seq[key] = all_aa_props(aa_counts_per_seq[key])
277 aa_variables_counts_per_seq[key], aa_variables_props_per_seq[key] = aa_variables_counts_and_props(aa_counts_per_seq[key])
278 seq_properties[key] = sequence_properties_from_aa_properties(aa_counts_per_seq[key], aa_properties_file)
279
280 # Add NaN for missing species
281 for key in set(lsp).difference(set(sequences.keys())):
282 aa_counts_per_seq[key] = fill_with_NaN(LAA)
283 aa_props_per_seq[key] = fill_with_NaN(LAA)
284 aa_variables_counts_per_seq[key] = fill_with_NaN(LV)
285 seq_properties[key] = fill_with_NaN(LS)
286
287 # Add computations to final dicts
288 dict_for_files_aa_counts[file] = aa_counts_per_seq
289 dict_for_files_aa_props[file] = aa_props_per_seq
290 dict_for_files_variables_counts[file] = aa_variables_counts_per_seq
291 dict_for_files_variables_props[file] = aa_variables_props_per_seq
292 dict_for_files_seq_properties[file] = seq_properties
293
294 # Try with pandas ?
295 write_output(LAA, flsp, path_outputs_1, dict_for_files_aa_counts) # one file per AA
296 write_output(LV, flsp, path_outputs_2, dict_for_files_variables_counts) # one file per aa_variable
297 write_output(LS, flsp, path_outputs_2, dict_for_files_seq_properties) #one file per seq properties
298
299 # { file_name1 : { seq1: {'A' : 0, 'C':0, 'E':0, ...},
300 # seq2: {'A' : 0, 'C':0, 'E':0, ...}, ...
301 # },
302 # { file_name2 : { seq1: {'A' : 0, 'C':0, 'E':0, ...},
303 # seq2: {'A' : 0, 'C':0, 'E':0, ...},
304 # },
305 # ... }
306
307 # { file_name1 : {seq1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...},
308 # seq2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, ...
309 # },
310 # file_name2 : {seq1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...},
311 # seq2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...},
312 # },
313 # ... }
314
315 # { file_name1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...},
316 # file_name2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...},
317 # ...
318 # }
319
320 if __name__ == '__main__':
321 main()