Mercurial > repos > abims-sbr > cds_search
comparison scripts/S01_find_orf_on_multiple_alignment.py @ 1:c79bdda8abfb draft default tip
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3a118aa934e6406cc8b0b24d006af6365c277519
author | abims-sbr |
---|---|
date | Thu, 09 Jun 2022 12:40:00 +0000 |
parents | eb95bf7f90ae |
children |
comparison
equal
deleted
inserted
replaced
0:eb95bf7f90ae | 1:c79bdda8abfb |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # coding: utf8 | 2 # coding: utf8 |
3 # Author: Eric Fontanillas | 3 # Author: Eric Fontanillas |
4 # Modification: 03/09/14 by Julie BAFFARD | 4 # Modification: 03/09/14 by Julie BAFFARD |
5 # Last modification : 25/07/18 by Victor Mataigne | 5 # Last modification : 10/09/21 by Charlotte Berthelier |
6 | 6 |
7 # Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria | 7 """ |
8 # CRITERIA 1 - Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF | 8 Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria |
9 # CRITERIA 2 - This longest part should be > 150nc or 50aa | 9 |
10 # CRITERIA 3 - [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa | 10 - CRITERIA 1 |
11 # OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA | 11 |
12 # OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA | 12 Get the longest part of the sequence alignemen without codon stop "*", |
13 | 13 and test in the 3 potential ORF and check with a Blast the best coding sequence |
14 import string, os, time, re, zipfile, sys, argparse | 14 |
15 - CRITERIA 2 | |
16 | |
17 This longest part should be > 150nc or 50aa | |
18 | |
19 - CRITERIA 3 [OPTIONNAL] | |
20 | |
21 A codon start "M" should be present in this longuest part, before the last 50 aa | |
22 | |
23 OUTPUTs: | |
24 "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA | |
25 "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA | |
26 """ | |
27 | |
28 import os | |
29 import re | |
30 import argparse | |
31 | |
32 from Bio.Blast import NCBIWWW | |
33 from Bio.Blast import NCBIXML | |
15 from dico import dico | 34 from dico import dico |
16 | 35 |
17 def code_universel(F1): | 36 |
37 def code_universel(file1): | |
18 """ Creates bash for genetic code (key : codon ; value : amino-acid) """ | 38 """ Creates bash for genetic code (key : codon ; value : amino-acid) """ |
19 bash_codeUniversel = {} | 39 bash_code_universel = {} |
20 | 40 |
21 with open(F1, "r") as file: | 41 with open(file1, "r") as file: |
22 for line in file.readlines(): | 42 for line in file.readlines(): |
23 L1 = string.split(line, " ") | 43 item = str.split(line, " ") |
24 length1 = len(L1) | 44 length1 = len(item) |
25 if length1 == 3: | 45 if length1 == 3: |
26 key = L1[0] | 46 key = item[0] |
27 value = L1[2][:-1] | 47 value = item[2][:-1] |
28 bash_codeUniversel[key] = value | 48 bash_code_universel[key] = value |
29 else: | 49 else: |
30 key = L1[0] | 50 key = item[0] |
31 value = L1[2] | 51 value = item[2] |
32 bash_codeUniversel[key] = value | 52 bash_code_universel[key] = value |
33 | 53 |
34 return(bash_codeUniversel) | 54 return bash_code_universel |
55 | |
35 | 56 |
36 def multiple3(seq): | 57 def multiple3(seq): |
37 """ Tests if the sequence is a multiple of 3, and if not removes extra-bases | 58 """ |
38 !! Possible to lost a codon, when I test ORF (as I will decay the ORF) """ | 59 Tests if the sequence is a multiple of 3, and if not removes extra-bases |
39 | 60 Possible to lost a codon, when I test ORF (as I will decay the ORF) |
40 m = len(seq)%3 | 61 """ |
41 if m != 0 : | 62 |
42 return seq[:-m], m | 63 multiple = len(seq) % 3 |
43 else : | 64 if multiple != 0: |
44 return seq, m | 65 return seq[:-multiple], multiple |
45 | 66 return seq, multiple |
46 def detect_Methionine(seq_aa, Ortho, minimal_cds_length): | 67 |
68 | |
69 def detect_methionine(seq_aa, ortho, minimal_cds_length): | |
47 """ Detects if methionin in the aa sequence """ | 70 """ Detects if methionin in the aa sequence """ |
48 | 71 |
49 ln = len(seq_aa) | 72 size = len(seq_aa) |
50 CUTOFF_Last_50aa = ln - minimal_cds_length | 73 cutoff_last_50aa = size - minimal_cds_length |
51 | 74 |
52 # Find all indices of occurances of "M" in a string of aa | 75 # Find all indices of occurances of "M" in a string of aa |
53 list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"] | 76 list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"] |
54 | 77 |
55 # If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS | 78 # If some "M" are present, find whether the first "M" found is not in the |
79 # 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS | |
56 if list_indices != []: | 80 if list_indices != []: |
57 first_M = list_indices[0] | 81 first_m = list_indices[0] |
58 if first_M < CUTOFF_Last_50aa: | 82 if first_m < cutoff_last_50aa: |
59 Ortho = 1 # means orthologs found | 83 ortho = 1 # means orthologs found |
60 | 84 |
61 return(Ortho) | 85 return ortho |
62 | 86 |
63 def ReverseComplement2(seq): | 87 |
88 def reverse_complement2(seq): | |
64 """ Reverse complement DNA sequence """ | 89 """ Reverse complement DNA sequence """ |
65 seq1 = 'ATCGN-TAGCN-atcgn-tagcn-' | 90 seq1 = 'ATCGN-TAGCN-atcgn-tagcn-' |
66 seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 } | 91 seq_dict = {seq1[i]: seq1[i + 6] |
92 for i in range(24) if i < 6 or 12 <= i <= 16} | |
67 | 93 |
68 return "".join([seq_dict[base] for base in reversed(seq)]) | 94 return "".join([seq_dict[base] for base in reversed(seq)]) |
69 | 95 |
70 def simply_get_ORF(seq_dna, gen_code): | 96 |
71 seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i+3] for i in range(0, len(seq_dna), 3)] | 97 def simply_get_orf(seq_dna, gen_code): |
72 seq_by_aa = [gen_code[codon] if codon in gen_code.keys() else '?' for codon in seq_by_codons] | 98 """ Generate the ORF sequence from DNA sequence """ |
99 seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i + 3] | |
100 for i in range(0, len(seq_dna), 3)] | |
101 seq_by_aa = [gen_code[codon] if codon in gen_code.keys() | |
102 else '?' for codon in seq_by_codons] | |
73 | 103 |
74 return ''.join(seq_by_aa) | 104 return ''.join(seq_by_aa) |
75 | 105 |
76 def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel, minimal_cds_length, min_spec): | 106 def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel, minimal_cds_length, min_spec): |
77 # Multiple sequence based : Based on the alignment of several sequences (orthogroup) | 107 # Multiple sequence based : Based on the alignment of several sequences (orthogroup) |
78 # Criteria 1 : Get the segment in the alignment with no codon stop | 108 # Criteria 1 : Get the segment in the alignment with no codon stop |
79 | 109 |
80 # 1 - Get the list of aligned aa seq for the 3 ORF: | 110 # 1 - Get the list of aligned aa seq for the 3 ORF: |
111 print("1 - Get the list of aligned aa seq for the 3 ORF") | |
81 bash_of_aligned_aa_seq_3ORF = {} | 112 bash_of_aligned_aa_seq_3ORF = {} |
82 bash_of_aligned_nuc_seq_3ORF = {} | 113 bash_of_aligned_nuc_seq_3ORF = {} |
83 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = [] | 114 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = [] |
84 | 115 |
85 for fasta_name in bash_aligned_nc_seq.keys(): | 116 for fasta_name in bash_aligned_nc_seq.keys(): |
86 # Get sequence, chek if multiple 3, then get 6 orfs | 117 # Get sequence, chek if multiple 3, then get 6 orfs |
87 sequence_nc = bash_aligned_nc_seq[fasta_name] | 118 sequence_nc = bash_aligned_nc_seq[fasta_name] |
88 new_sequence_nc, modulo = multiple3(sequence_nc) | 119 new_sequence_nc, modulo = multiple3(sequence_nc) |
89 new_sequence_rev = ReverseComplement2(new_sequence_nc) | 120 new_sequence_rev = reverse_complement2(new_sequence_nc) |
90 # For each seq of the multialignment => give the 6 ORFs (in nuc) | 121 # For each seq of the multialignment => give the 6 ORFs (in nuc) |
91 bash_of_aligned_nuc_seq_3ORF[fasta_name] = [new_sequence_nc, new_sequence_nc[1:-2], new_sequence_nc[2:-1], new_sequence_rev, new_sequence_rev[1:-2], new_sequence_rev[2:-1]] | 122 bash_of_aligned_nuc_seq_3ORF[fasta_name] = [new_sequence_nc, new_sequence_nc[1:-2], new_sequence_nc[2:-1], new_sequence_rev, new_sequence_rev[1:-2], new_sequence_rev[2:-1]] |
92 | 123 |
93 seq_prot_ORF1 = simply_get_ORF(new_sequence_nc, bash_codeUniversel) | 124 seq_prot_ORF1 = simply_get_orf(new_sequence_nc, bash_codeUniversel) |
94 seq_prot_ORF2 = simply_get_ORF(new_sequence_nc[1:-2], bash_codeUniversel) | 125 seq_prot_ORF2 = simply_get_orf(new_sequence_nc[1:-2], bash_codeUniversel) |
95 seq_prot_ORF3 = simply_get_ORF(new_sequence_nc[2:-1], bash_codeUniversel) | 126 seq_prot_ORF3 = simply_get_orf(new_sequence_nc[2:-1], bash_codeUniversel) |
96 seq_prot_ORF4 = simply_get_ORF(new_sequence_rev, bash_codeUniversel) | 127 seq_prot_ORF4 = simply_get_orf(new_sequence_rev, bash_codeUniversel) |
97 seq_prot_ORF5 = simply_get_ORF(new_sequence_rev[1:-2], bash_codeUniversel) | 128 seq_prot_ORF5 = simply_get_orf(new_sequence_rev[1:-2], bash_codeUniversel) |
98 seq_prot_ORF6 = simply_get_ORF(new_sequence_rev[2:-1], bash_codeUniversel) | 129 seq_prot_ORF6 = simply_get_orf(new_sequence_rev[2:-1], bash_codeUniversel) |
99 | 130 |
100 # For each seq of the multialignment => give the 6 ORFs (in aa) | 131 # For each seq of the multialignment => give the 6 ORFs (in aa) |
101 bash_of_aligned_aa_seq_3ORF[fasta_name] = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3, seq_prot_ORF4, seq_prot_ORF5, seq_prot_ORF6] | 132 bash_of_aligned_aa_seq_3ORF[fasta_name] = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3, seq_prot_ORF4, seq_prot_ORF5, seq_prot_ORF6] |
102 | 133 |
103 # 2 - Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) | 134 # 2 - Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) |
135 print("2 - Test for the best ORF") | |
104 BEST_MAX = 0 | 136 BEST_MAX = 0 |
105 | 137 |
106 for i in [0,1,2,3,4,5]: # Test the 6 ORFs | 138 for i in [0,1,2,3,4,5]: # Test the 6 ORFs |
107 ORF_Aligned_aa = [] | 139 ORF_Aligned_aa = [] |
108 ORF_Aligned_nuc = [] | 140 ORF_Aligned_nuc = [] |
168 | 200 |
169 | 201 |
170 # 3 - ONCE we have this better segment (BEST CODING SEGMENT) | 202 # 3 - ONCE we have this better segment (BEST CODING SEGMENT) |
171 # ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position) | 203 # ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position) |
172 # And get the INDEX of the best ORF [0, 1, or 2] | 204 # And get the INDEX of the best ORF [0, 1, or 2] |
205 print("3 - ONCE we have this better segment") | |
173 if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []: | 206 if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []: |
174 pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0] | 207 pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0] |
175 pos_MIN_aa = pos_MIN_aa - 1 | 208 pos_MIN_aa = pos_MIN_aa - 1 |
176 pos_MAX_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[-1] | 209 pos_MAX_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[-1] |
177 | 210 |
184 seq_coding = seq[pos_MIN_aa:pos_MAX_aa] | 217 seq_coding = seq[pos_MIN_aa:pos_MAX_aa] |
185 BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq | 218 BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq |
186 BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding | 219 BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding |
187 | 220 |
188 # 4 - Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment | 221 # 4 - Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment |
222 print("4 - Get the corresponding position") | |
189 pos_MIN_nuc = pos_MIN_aa * 3 | 223 pos_MIN_nuc = pos_MIN_aa * 3 |
190 pos_MAX_nuc = pos_MAX_aa * 3 | 224 pos_MAX_nuc = pos_MAX_aa * 3 |
191 | 225 |
192 BESTORF_bash_aligned_nc_seq = {} | 226 BESTORF_bash_aligned_nc_seq = {} |
193 BESTORF_bash_aligned_nc_seq_CODING = {} | 227 BESTORF_bash_aligned_nc_seq_CODING = {} |
194 for fasta_name in bash_aligned_nc_seq.keys(): | 228 for fasta_name in bash_aligned_nc_seq.keys(): |
195 seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF] | 229 seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF] |
196 seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc] | 230 seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc] |
197 BESTORF_bash_aligned_nc_seq[fasta_name] = seq | 231 BESTORF_bash_aligned_nc_seq[fasta_name] = seq |
198 BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding | 232 BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding |
233 seq_cutted = re.sub(r'^.*?[a-zA-Z]', '', seq) | |
234 sequence_for_blast=(fasta_name+'\n'+seq_cutted+'\n') | |
235 good_ORF_found = False | |
236 try: | |
237 #result_handle = "" | |
238 #blast_records = "" | |
239 # logger.debug("sequence_for_blast = %s ", sequence_for_blast) | |
240 print('sequence_for_blast = %s ',sequence_for_blast, end=' ', flush=True) | |
241 result_handle = NCBIWWW.qblast("blastn", "/db/nt/current/fasta/nt.fsa", sequence_for_blast, expect=0.001, hitlist_size=1) | |
242 blast_records = NCBIXML.parse(result_handle) | |
243 except: | |
244 good_ORF_found = False | |
245 else: | |
246 for blast_record in blast_records: | |
247 for alignment in blast_record.alignments: | |
248 for hsp in alignment.hsps: | |
249 if hsp.expect < 0.001: | |
250 good_ORF_found = True | |
251 print("good_ORF_found = %s" %good_ORF_found) | |
199 | 252 |
200 else: # no CDS found | 253 else: # no CDS found |
201 BESTORF_bash_aligned_nc_seq = {} | 254 BESTORF_bash_aligned_nc_seq = {} |
202 BESTORF_bash_aligned_nc_seq_CODING = {} | 255 BESTORF_bash_aligned_nc_seq_CODING = {} |
203 BESTORF_bash_of_aligned_aa_seq = {} | 256 BESTORF_bash_of_aligned_aa_seq = {} |
204 BESTORF_bash_of_aligned_aa_seq_CODING ={} | 257 BESTORF_bash_of_aligned_aa_seq_CODING ={} |
205 | 258 |
206 # Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa | 259 # Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa |
207 | 260 |
208 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} | 261 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} |
209 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} | 262 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} |
210 | 263 |
211 Ortho = 0 | 264 Ortho = 0 |
212 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): | 265 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): |
213 seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] | 266 seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] |
214 Ortho = detect_Methionine(seq_aa, Ortho, minimal_cds_length) ### DEF6 ### | 267 Ortho = detect_methionine(seq_aa, Ortho, minimal_cds_length) ### DEF6 ### |
215 | 268 |
216 # CASE 1: A "M" is present and correctly localized (not in last 50 aa) | 269 # CASE 1: A "M" is present and correctly localized (not in last 50 aa) |
217 if Ortho == 1: | 270 if Ortho == 1: |
218 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING | 271 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING |
219 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING | 272 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING |
241 out1.close() | 294 out1.close() |
242 | 295 |
243 def main(): | 296 def main(): |
244 parser = argparse.ArgumentParser() | 297 parser = argparse.ArgumentParser() |
245 parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt") | 298 parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt") |
246 parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int) | 299 parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int) |
247 parser.add_argument("min_spec", help="Minimal number of species per alignment") | 300 parser.add_argument("min_spec", help="Minimal number of species per alignment") |
248 parser.add_argument("list_files", help="File with all input files names") | 301 parser.add_argument("list_files", help="File with all input files names") |
249 args = parser.parse_args() | 302 args = parser.parse_args() |
250 | 303 |
251 minimal_cds_length = int(args.min_cds_len) # in aa number | 304 minimal_cds_length = int(args.min_cds_len) # in aa number |
252 bash_codeUniversel = code_universel(args.codeUniversel) | 305 bash_codeUniversel = code_universel(args.codeUniversel) |
253 minimum_species = int(args.min_spec) | 306 minimum_species = int(args.min_spec) |
254 | 307 |
255 # Inputs from file containing list of species | 308 # Inputs from file containing list of species |
256 list_files = [] | 309 list_files = [] |
257 with open(args.list_files, 'r') as f: | 310 with open(args.list_files, 'r') as f: |
258 for line in f.readlines(): | 311 for line in f.readlines(): |
259 list_files.append(line.strip('\n')) | 312 list_files.append(line.strip('\n')) |
276 for file in list_files: | 329 for file in list_files: |
277 #n0 += 1 | 330 #n0 += 1 |
278 | 331 |
279 count_file_processed = count_file_processed + 1 | 332 count_file_processed = count_file_processed + 1 |
280 nb_gp = file.split('_')[1] # Keep trace of the orthogroup number | 333 nb_gp = file.split('_')[1] # Keep trace of the orthogroup number |
281 fasta_file_path = "./%s" %file | 334 fasta_file_path = "./%s" %file |
282 bash_fasta = dico(fasta_file_path) | 335 bash_fasta = dico(fasta_file_path) |
283 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species) | 336 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species) |
284 | 337 |
285 name_elems[1] = nb_gp | 338 name_elems[1] = nb_gp |
286 | 339 |
287 # Update counts and write group in corresponding output directory | 340 # Update counts and write group in corresponding output directory |
288 if BESTORF_nuc != {}: | 341 if BESTORF_nuc != {}: |
289 count_file_with_CDS += 1 | 342 count_file_with_CDS += 1 |
290 if len(BESTORF_nuc.keys()) >= minimum_species : | 343 if len(BESTORF_nuc.keys()) >= minimum_species : |
291 count_file_with_cds_and_enought_species += 1 | 344 count_file_with_cds_and_enought_species += 1 |
292 write_output_file(BESTORF_nuc, name_elems, dirs[0]) # OUTPUT BESTORF_nuc | 345 write_output_file(BESTORF_nuc, name_elems, dirs[0]) # OUTPUT BESTORF_nuc |
293 write_output_file(BESTORF_aa, name_elems, dirs[1]) # The most interesting | 346 write_output_file(BESTORF_aa, name_elems, dirs[1]) # The most interesting |
303 if len(BESTORF_nuc_CDS_with_M.keys()) >= minimum_species : | 356 if len(BESTORF_nuc_CDS_with_M.keys()) >= minimum_species : |
304 count_file_with_cds_M_and_enought_species += 1 | 357 count_file_with_cds_M_and_enought_species += 1 |
305 write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4]) | 358 write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4]) |
306 write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5]) | 359 write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5]) |
307 | 360 |
308 print "*************** CDS detection ***************" | 361 print("*************** CDS detection ***************") |
309 print "\nFiles processed: %d" %count_file_processed | 362 print("\nFiles processed: %d" %count_file_processed) |
310 print "\tFiles with CDS: %d" %count_file_with_CDS | 363 print("\tFiles with CDS: %d" %count_file_with_CDS) |
311 print "\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species) | 364 print("\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species)) |
312 print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M | 365 print("\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M) |
313 print "\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) | 366 print("\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) ) |
314 print "\tFiles without CDS: %d \n" %count_file_without_CDS | 367 print("\tFiles without CDS: %d \n" %count_file_without_CDS) |
315 print "" | 368 print("") |
316 | 369 |
317 if __name__ == '__main__': | 370 if __name__ == '__main__': |
318 main() | 371 main() |