Mercurial > repos > abims-sbr > cds_search
comparison scripts/S01_find_orf_on_multiple_alignment.py @ 0:eb95bf7f90ae draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:26:37 -0500 |
parents | |
children | c79bdda8abfb |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:eb95bf7f90ae |
---|---|
1 #!/usr/bin/env python | |
2 # coding: utf8 | |
3 # Author: Eric Fontanillas | |
4 # Modification: 03/09/14 by Julie BAFFARD | |
5 # Last modification : 25/07/18 by Victor Mataigne | |
6 | |
7 # Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria | |
8 # CRITERIA 1 - Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF | |
9 # CRITERIA 2 - This longest part should be > 150nc or 50aa | |
10 # CRITERIA 3 - [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa | |
11 # OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA | |
12 # OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA | |
13 | |
14 import string, os, time, re, zipfile, sys, argparse | |
15 from dico import dico | |
16 | |
17 def code_universel(F1): | |
18 """ Creates bash for genetic code (key : codon ; value : amino-acid) """ | |
19 bash_codeUniversel = {} | |
20 | |
21 with open(F1, "r") as file: | |
22 for line in file.readlines(): | |
23 L1 = string.split(line, " ") | |
24 length1 = len(L1) | |
25 if length1 == 3: | |
26 key = L1[0] | |
27 value = L1[2][:-1] | |
28 bash_codeUniversel[key] = value | |
29 else: | |
30 key = L1[0] | |
31 value = L1[2] | |
32 bash_codeUniversel[key] = value | |
33 | |
34 return(bash_codeUniversel) | |
35 | |
36 def multiple3(seq): | |
37 """ Tests if the sequence is a multiple of 3, and if not removes extra-bases | |
38 !! Possible to lost a codon, when I test ORF (as I will decay the ORF) """ | |
39 | |
40 m = len(seq)%3 | |
41 if m != 0 : | |
42 return seq[:-m], m | |
43 else : | |
44 return seq, m | |
45 | |
46 def detect_Methionine(seq_aa, Ortho, minimal_cds_length): | |
47 """ Detects if methionin in the aa sequence """ | |
48 | |
49 ln = len(seq_aa) | |
50 CUTOFF_Last_50aa = ln - minimal_cds_length | |
51 | |
52 # Find all indices of occurances of "M" in a string of aa | |
53 list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"] | |
54 | |
55 # If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS | |
56 if list_indices != []: | |
57 first_M = list_indices[0] | |
58 if first_M < CUTOFF_Last_50aa: | |
59 Ortho = 1 # means orthologs found | |
60 | |
61 return(Ortho) | |
62 | |
63 def ReverseComplement2(seq): | |
64 """ Reverse complement DNA sequence """ | |
65 seq1 = 'ATCGN-TAGCN-atcgn-tagcn-' | |
66 seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 } | |
67 | |
68 return "".join([seq_dict[base] for base in reversed(seq)]) | |
69 | |
70 def simply_get_ORF(seq_dna, gen_code): | |
71 seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i+3] for i in range(0, len(seq_dna), 3)] | |
72 seq_by_aa = [gen_code[codon] if codon in gen_code.keys() else '?' for codon in seq_by_codons] | |
73 | |
74 return ''.join(seq_by_aa) | |
75 | |
76 def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel, minimal_cds_length, min_spec): | |
77 # Multiple sequence based : Based on the alignment of several sequences (orthogroup) | |
78 # Criteria 1 : Get the segment in the alignment with no codon stop | |
79 | |
80 # 1 - Get the list of aligned aa seq for the 3 ORF: | |
81 bash_of_aligned_aa_seq_3ORF = {} | |
82 bash_of_aligned_nuc_seq_3ORF = {} | |
83 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = [] | |
84 | |
85 for fasta_name in bash_aligned_nc_seq.keys(): | |
86 # Get sequence, chek if multiple 3, then get 6 orfs | |
87 sequence_nc = bash_aligned_nc_seq[fasta_name] | |
88 new_sequence_nc, modulo = multiple3(sequence_nc) | |
89 new_sequence_rev = ReverseComplement2(new_sequence_nc) | |
90 # For each seq of the multialignment => give the 6 ORFs (in nuc) | |
91 bash_of_aligned_nuc_seq_3ORF[fasta_name] = [new_sequence_nc, new_sequence_nc[1:-2], new_sequence_nc[2:-1], new_sequence_rev, new_sequence_rev[1:-2], new_sequence_rev[2:-1]] | |
92 | |
93 seq_prot_ORF1 = simply_get_ORF(new_sequence_nc, bash_codeUniversel) | |
94 seq_prot_ORF2 = simply_get_ORF(new_sequence_nc[1:-2], bash_codeUniversel) | |
95 seq_prot_ORF3 = simply_get_ORF(new_sequence_nc[2:-1], bash_codeUniversel) | |
96 seq_prot_ORF4 = simply_get_ORF(new_sequence_rev, bash_codeUniversel) | |
97 seq_prot_ORF5 = simply_get_ORF(new_sequence_rev[1:-2], bash_codeUniversel) | |
98 seq_prot_ORF6 = simply_get_ORF(new_sequence_rev[2:-1], bash_codeUniversel) | |
99 | |
100 # For each seq of the multialignment => give the 6 ORFs (in aa) | |
101 bash_of_aligned_aa_seq_3ORF[fasta_name] = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3, seq_prot_ORF4, seq_prot_ORF5, seq_prot_ORF6] | |
102 | |
103 # 2 - Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) | |
104 BEST_MAX = 0 | |
105 | |
106 for i in [0,1,2,3,4,5]: # Test the 6 ORFs | |
107 ORF_Aligned_aa = [] | |
108 ORF_Aligned_nuc = [] | |
109 | |
110 # 2.1 - Get the alignment of sequence for a given ORF | |
111 # Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd | |
112 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): | |
113 ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i] | |
114 aa_length = len(ORFsequence) | |
115 ORF_Aligned_aa.append(ORFsequence) ### List of all sequences in the ORF nb "i" = | |
116 | |
117 n = i+1 | |
118 | |
119 for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys(): | |
120 ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i] | |
121 nuc_length = len(ORFsequence) | |
122 ORF_Aligned_nuc.append(ORFsequence) # List of all sequences in the ORF nb "i" = | |
123 | |
124 # 2.2 - Get the list of sublist of positions whithout codon stop in the alignment | |
125 # For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF) | |
126 # Next step is to get the longuest subsequence whithout stop | |
127 # We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*" | |
128 MAX_LENGTH = 0 | |
129 LONGUEST_SEGMENT_UNSTOPPED = "" | |
130 j = 0 # Start from first position in alignment | |
131 List_of_List_subsequences = [] | |
132 List_positions_subsequence = [] | |
133 while j < aa_length: | |
134 column = [] | |
135 for seq in ORF_Aligned_aa: | |
136 column.append(seq[j]) | |
137 j = j+1 | |
138 if "*" in column: | |
139 List_of_List_subsequences.append(List_positions_subsequence) # Add previous list of positions | |
140 List_positions_subsequence = [] # Re-initialyse list of positions | |
141 else: | |
142 List_positions_subsequence.append(j) | |
143 | |
144 # 2.3 - Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF) | |
145 LONGUEST_SUBSEQUENCE_LIST_POSITION = [] | |
146 MAX=0 | |
147 for sublist in List_of_List_subsequences: | |
148 if len(sublist) > MAX and len(sublist) > minimal_cds_length: | |
149 MAX = len(sublist) | |
150 LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist | |
151 | |
152 # 2.4. - Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated) | |
153 if LONGUEST_SUBSEQUENCE_LIST_POSITION != []: | |
154 if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0: | |
155 CDS_maybe_truncated = 1 | |
156 else: | |
157 CDS_maybe_truncated = 0 | |
158 else: | |
159 CDS_maybe_truncated = 0 | |
160 | |
161 | |
162 # 2.5 - Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF) | |
163 # Test whether it is the better ORF | |
164 if MAX > BEST_MAX: | |
165 BEST_MAX = MAX | |
166 BEST_ORF = i+1 | |
167 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = LONGUEST_SUBSEQUENCE_LIST_POSITION | |
168 | |
169 | |
170 # 3 - ONCE we have this better segment (BEST CODING SEGMENT) | |
171 # ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position) | |
172 # And get the INDEX of the best ORF [0, 1, or 2] | |
173 if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []: | |
174 pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0] | |
175 pos_MIN_aa = pos_MIN_aa - 1 | |
176 pos_MAX_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[-1] | |
177 | |
178 | |
179 BESTORF_bash_of_aligned_aa_seq = {} | |
180 BESTORF_bash_of_aligned_aa_seq_CODING = {} | |
181 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): | |
182 index_BEST_ORF = BEST_ORF-1 # cause list going from 0 to 2 in LIST_3_ORF, while the ORF nb is indexed from 1 to 3 | |
183 seq = bash_of_aligned_aa_seq_3ORF[fasta_name][index_BEST_ORF] | |
184 seq_coding = seq[pos_MIN_aa:pos_MAX_aa] | |
185 BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq | |
186 BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding | |
187 | |
188 # 4 - Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment | |
189 pos_MIN_nuc = pos_MIN_aa * 3 | |
190 pos_MAX_nuc = pos_MAX_aa * 3 | |
191 | |
192 BESTORF_bash_aligned_nc_seq = {} | |
193 BESTORF_bash_aligned_nc_seq_CODING = {} | |
194 for fasta_name in bash_aligned_nc_seq.keys(): | |
195 seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF] | |
196 seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc] | |
197 BESTORF_bash_aligned_nc_seq[fasta_name] = seq | |
198 BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding | |
199 | |
200 else: # no CDS found | |
201 BESTORF_bash_aligned_nc_seq = {} | |
202 BESTORF_bash_aligned_nc_seq_CODING = {} | |
203 BESTORF_bash_of_aligned_aa_seq = {} | |
204 BESTORF_bash_of_aligned_aa_seq_CODING ={} | |
205 | |
206 # Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa | |
207 | |
208 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} | |
209 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} | |
210 | |
211 Ortho = 0 | |
212 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): | |
213 seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] | |
214 Ortho = detect_Methionine(seq_aa, Ortho, minimal_cds_length) ### DEF6 ### | |
215 | |
216 # CASE 1: A "M" is present and correctly localized (not in last 50 aa) | |
217 if Ortho == 1: | |
218 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING | |
219 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING | |
220 | |
221 # CASE 2: in case the CDS is truncated, so the "M" is maybe missing: | |
222 if Ortho == 0 and CDS_maybe_truncated == 1: | |
223 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING | |
224 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING | |
225 | |
226 # CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa): | |
227 ## => the 2 bash "CDS_with_M" are left empty ("{}") | |
228 | |
229 return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M) | |
230 | |
231 def write_output_file(results_dict, name_elems, path_out): | |
232 if results_dict != {}: | |
233 name_elems[3] = str(len(results_dict.keys())) | |
234 new_name = "_".join(name_elems) | |
235 | |
236 out1 = open("%s/%s" %(path_out,new_name), "w") | |
237 for fasta_name in results_dict.keys(): | |
238 seq = results_dict[fasta_name] | |
239 out1.write("%s\n" %fasta_name) | |
240 out1.write("%s\n" %seq) | |
241 out1.close() | |
242 | |
243 def main(): | |
244 parser = argparse.ArgumentParser() | |
245 parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt") | |
246 parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int) | |
247 parser.add_argument("min_spec", help="Minimal number of species per alignment") | |
248 parser.add_argument("list_files", help="File with all input files names") | |
249 args = parser.parse_args() | |
250 | |
251 minimal_cds_length = int(args.min_cds_len) # in aa number | |
252 bash_codeUniversel = code_universel(args.codeUniversel) | |
253 minimum_species = int(args.min_spec) | |
254 | |
255 # Inputs from file containing list of species | |
256 list_files = [] | |
257 with open(args.list_files, 'r') as f: | |
258 for line in f.readlines(): | |
259 list_files.append(line.strip('\n')) | |
260 | |
261 # Directories for results | |
262 dirs = ["04_BEST_ORF_nuc", "04_BEST_ORF_aa", "05_CDS_nuc", "05_CDS_aa", "06_CDS_with_M_nuc", "06_CDS_with_M_aa"] | |
263 for directory in dirs: | |
264 os.mkdir(directory) | |
265 | |
266 count_file_processed, count_file_with_CDS, count_file_without_CDS, count_file_with_CDS_plus_M = 0, 0, 0, 0 | |
267 count_file_with_cds_and_enought_species, count_file_with_cds_M_and_enought_species = 0, 0 | |
268 | |
269 # ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name), | |
270 # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified. | |
271 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"] | |
272 | |
273 # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond | |
274 #n0 = 0 | |
275 | |
276 for file in list_files: | |
277 #n0 += 1 | |
278 | |
279 count_file_processed = count_file_processed + 1 | |
280 nb_gp = file.split('_')[1] # Keep trace of the orthogroup number | |
281 fasta_file_path = "./%s" %file | |
282 bash_fasta = dico(fasta_file_path) | |
283 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species) | |
284 | |
285 name_elems[1] = nb_gp | |
286 | |
287 # Update counts and write group in corresponding output directory | |
288 if BESTORF_nuc != {}: | |
289 count_file_with_CDS += 1 | |
290 if len(BESTORF_nuc.keys()) >= minimum_species : | |
291 count_file_with_cds_and_enought_species += 1 | |
292 write_output_file(BESTORF_nuc, name_elems, dirs[0]) # OUTPUT BESTORF_nuc | |
293 write_output_file(BESTORF_aa, name_elems, dirs[1]) # The most interesting | |
294 else: | |
295 count_file_without_CDS += 1 | |
296 | |
297 if BESTORF_nuc_CODING != {} and len(BESTORF_nuc_CODING.keys()) >= minimum_species: | |
298 write_output_file(BESTORF_nuc_CODING, name_elems, dirs[2]) | |
299 write_output_file(BESTORF_aa_CODING, name_elems, dirs[3]) | |
300 | |
301 if BESTORF_nuc_CDS_with_M != {}: | |
302 count_file_with_CDS_plus_M += 1 | |
303 if len(BESTORF_nuc_CDS_with_M.keys()) >= minimum_species : | |
304 count_file_with_cds_M_and_enought_species += 1 | |
305 write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4]) | |
306 write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5]) | |
307 | |
308 print "*************** CDS detection ***************" | |
309 print "\nFiles processed: %d" %count_file_processed | |
310 print "\tFiles with CDS: %d" %count_file_with_CDS | |
311 print "\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species) | |
312 print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M | |
313 print "\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) | |
314 print "\tFiles without CDS: %d \n" %count_file_without_CDS | |
315 print "" | |
316 | |
317 if __name__ == '__main__': | |
318 main() |