changeset 1:c79bdda8abfb draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3a118aa934e6406cc8b0b24d006af6365c277519
author abims-sbr
date Thu, 09 Jun 2022 12:40:00 +0000
parents eb95bf7f90ae
children
files CDS_search.xml macros.xml scripts/S01_find_orf_on_multiple_alignment.py scripts/S02_remove_too_short_bit_or_whole_sequence.py scripts/S03_remove_site_with_not_enough_species_represented.py scripts/dico.py
diffstat 6 files changed, 153 insertions(+), 96 deletions(-) [+]
line wrap: on
line diff
--- a/CDS_search.xml	Fri Feb 01 10:26:37 2019 -0500
+++ b/CDS_search.xml	Thu Jun 09 12:40:00 2022 +0000
@@ -1,4 +1,4 @@
-<tool name="CDS_search" id="cds_search" version="2.1.2">
+<tool name="CDS_search" id="cds_search" version="2.2.2">
 
 	<description>
 		ORF and CDS search
@@ -9,7 +9,7 @@
 	</macros>
 
 	<requirements>
-		<expand macro="python_required" />
+		<requirement type="package" version="1.79">biopython</requirement>
 	</requirements>
 
   	<command><![CDATA[        
--- a/macros.xml	Fri Feb 01 10:26:37 2019 -0500
+++ b/macros.xml	Thu Jun 09 12:40:00 2022 +0000
@@ -4,6 +4,10 @@
 			<requirement type="package" version="2.7">python</requirement>
 	</xml>
 
+	<xml name="python3_required">
+                        <requirement type="package" version="1.79">biopython</requirement>
+        </xml>
+
     <token name="@HELP_AUTHORS@">
 .. class:: infomark
 
--- a/scripts/S01_find_orf_on_multiple_alignment.py	Fri Feb 01 10:26:37 2019 -0500
+++ b/scripts/S01_find_orf_on_multiple_alignment.py	Thu Jun 09 12:40:00 2022 +0000
@@ -2,74 +2,104 @@
 # coding: utf8
 # Author: Eric Fontanillas
 # Modification: 03/09/14 by Julie BAFFARD
-# Last modification : 25/07/18 by Victor Mataigne
+# Last modification : 10/09/21 by Charlotte Berthelier
+
+"""
+Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria
+
+- CRITERIA 1
+
+Get the longest part of the sequence alignemen without codon stop "*",
+and test in the 3 potential ORF and check with a Blast the best coding sequence
+
+- CRITERIA 2
+
+This longest part should be > 150nc or 50aa
 
-# Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria
-                # CRITERIA 1 - Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF
-                # CRITERIA 2 - This longest part should be > 150nc or 50aa
-                # CRITERIA 3 - [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa
-                                 # OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA
-                                 # OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA
+- CRITERIA 3 [OPTIONNAL]
+
+A codon start "M" should be present in this longuest part, before the last 50 aa
 
-import string, os, time, re, zipfile, sys, argparse
+OUTPUTs:
+"05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA
+"06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA
+"""
+
+import os
+import re
+import argparse
+
+from Bio.Blast import NCBIWWW
+from Bio.Blast import NCBIXML
 from dico import dico
 
-def code_universel(F1):
+
+def code_universel(file1):
     """ Creates bash for genetic code (key : codon ; value : amino-acid) """
-    bash_codeUniversel = {}
+    bash_code_universel = {}
 
-    with open(F1, "r") as file:
+    with open(file1, "r") as file:
         for line in file.readlines():
-            L1 = string.split(line, " ")
-            length1 = len(L1)
+            item = str.split(line, " ")
+            length1 = len(item)
             if length1 == 3:
-                key = L1[0]
-                value = L1[2][:-1]
-                bash_codeUniversel[key] = value
+                key = item[0]
+                value = item[2][:-1]
+                bash_code_universel[key] = value
             else:
-                key = L1[0]
-                value = L1[2]
-                bash_codeUniversel[key] = value
+                key = item[0]
+                value = item[2]
+                bash_code_universel[key] = value
 
-    return(bash_codeUniversel)
+    return bash_code_universel
+
 
 def multiple3(seq):
-    """ Tests if the sequence is a multiple of 3, and if not removes extra-bases 
-        !! Possible to lost a codon, when I test ORF (as I will decay the ORF) """
-    
-    m = len(seq)%3
-    if m != 0 :
-        return seq[:-m], m
-    else :
-        return seq, m
+    """
+    Tests if the sequence is a multiple of 3, and if not removes extra-bases
+    Possible to lost a codon, when I test ORF (as I will decay the ORF)
+    """
 
-def detect_Methionine(seq_aa, Ortho, minimal_cds_length):
+    multiple = len(seq) % 3
+    if multiple != 0:
+        return seq[:-multiple], multiple
+    return seq, multiple
+
+
+def detect_methionine(seq_aa, ortho, minimal_cds_length):
     """ Detects if methionin in the aa sequence """
 
-    ln = len(seq_aa)
-    CUTOFF_Last_50aa = ln - minimal_cds_length
+    size = len(seq_aa)
+    cutoff_last_50aa = size - minimal_cds_length
 
-    # Find all indices of occurances of "M" in a string of aa   
+    # Find all indices of occurances of "M" in a string of aa
     list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"]
 
-    # If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS
+    # If some "M" are present, find whether the first "M" found is not in the
+    # 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS
     if list_indices != []:
-        first_M = list_indices[0]
-        if first_M < CUTOFF_Last_50aa:
-            Ortho = 1  # means orthologs found
+        first_m = list_indices[0]
+        if first_m < cutoff_last_50aa:
+            ortho = 1  # means orthologs found
 
-    return(Ortho)
+    return ortho
 
-def ReverseComplement2(seq):
+
+def reverse_complement2(seq):
     """ Reverse complement DNA sequence """
     seq1 = 'ATCGN-TAGCN-atcgn-tagcn-'
-    seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 }
+    seq_dict = {seq1[i]: seq1[i + 6]
+                for i in range(24) if i < 6 or 12 <= i <= 16}
 
     return "".join([seq_dict[base] for base in reversed(seq)])
 
-def simply_get_ORF(seq_dna, gen_code):
-    seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i+3] for i in range(0, len(seq_dna), 3)]
-    seq_by_aa = [gen_code[codon] if codon in gen_code.keys() else '?' for codon in seq_by_codons]
+
+def simply_get_orf(seq_dna, gen_code):
+    """ Generate the ORF sequence from DNA sequence """
+    seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i + 3]
+                     for i in range(0, len(seq_dna), 3)]
+    seq_by_aa = [gen_code[codon] if codon in gen_code.keys()
+                 else '?' for codon in seq_by_codons]
 
     return ''.join(seq_by_aa)
 
@@ -78,29 +108,31 @@
     # Criteria 1 : Get the segment in the alignment with no codon stop
 
     # 1 - Get the list of aligned aa seq for the 3 ORF:
+    print("1 - Get the list of aligned aa seq for the 3 ORF")
     bash_of_aligned_aa_seq_3ORF = {}
     bash_of_aligned_nuc_seq_3ORF = {}
     BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = []
-    
+
     for fasta_name in bash_aligned_nc_seq.keys():
         # Get sequence, chek if multiple 3, then get 6 orfs
         sequence_nc = bash_aligned_nc_seq[fasta_name]
         new_sequence_nc, modulo = multiple3(sequence_nc)
-        new_sequence_rev = ReverseComplement2(new_sequence_nc)        
+        new_sequence_rev = reverse_complement2(new_sequence_nc)
         # For each seq of the multialignment => give the 6 ORFs (in nuc)
         bash_of_aligned_nuc_seq_3ORF[fasta_name] = [new_sequence_nc, new_sequence_nc[1:-2], new_sequence_nc[2:-1], new_sequence_rev, new_sequence_rev[1:-2], new_sequence_rev[2:-1]]
 
-        seq_prot_ORF1 = simply_get_ORF(new_sequence_nc, bash_codeUniversel)
-        seq_prot_ORF2 = simply_get_ORF(new_sequence_nc[1:-2], bash_codeUniversel)
-        seq_prot_ORF3 = simply_get_ORF(new_sequence_nc[2:-1], bash_codeUniversel)
-        seq_prot_ORF4 = simply_get_ORF(new_sequence_rev, bash_codeUniversel)
-        seq_prot_ORF5 = simply_get_ORF(new_sequence_rev[1:-2], bash_codeUniversel)
-        seq_prot_ORF6 = simply_get_ORF(new_sequence_rev[2:-1], bash_codeUniversel)
-        
+        seq_prot_ORF1 = simply_get_orf(new_sequence_nc, bash_codeUniversel)
+        seq_prot_ORF2 = simply_get_orf(new_sequence_nc[1:-2], bash_codeUniversel)
+        seq_prot_ORF3 = simply_get_orf(new_sequence_nc[2:-1], bash_codeUniversel)
+        seq_prot_ORF4 = simply_get_orf(new_sequence_rev, bash_codeUniversel)
+        seq_prot_ORF5 = simply_get_orf(new_sequence_rev[1:-2], bash_codeUniversel)
+        seq_prot_ORF6 = simply_get_orf(new_sequence_rev[2:-1], bash_codeUniversel)
+
         # For each seq of the multialignment => give the 6 ORFs (in aa)
         bash_of_aligned_aa_seq_3ORF[fasta_name] = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3, seq_prot_ORF4, seq_prot_ORF5, seq_prot_ORF6]
 
     # 2 - Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF)
+    print("2 - Test for the best ORF")
     BEST_MAX = 0
 
     for i in [0,1,2,3,4,5]: # Test the 6 ORFs
@@ -170,6 +202,7 @@
     # 3 - ONCE we have this better segment (BEST CODING SEGMENT)
     # ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position)
     # And get the INDEX of the best ORF [0, 1, or 2]
+    print("3 - ONCE we have this better segment")
     if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []:
         pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0]
         pos_MIN_aa = pos_MIN_aa - 1
@@ -186,6 +219,7 @@
             BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding
 
         # 4 - Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment
+        print("4 - Get the corresponding position")
         pos_MIN_nuc = pos_MIN_aa * 3
         pos_MAX_nuc = pos_MAX_aa * 3
 
@@ -196,6 +230,25 @@
             seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc]
             BESTORF_bash_aligned_nc_seq[fasta_name] = seq
             BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding
+            seq_cutted = re.sub(r'^.*?[a-zA-Z]', '', seq)
+            sequence_for_blast=(fasta_name+'\n'+seq_cutted+'\n')
+            good_ORF_found = False
+            try:
+                #result_handle = ""
+                #blast_records = ""
+                # logger.debug("sequence_for_blast = %s ", sequence_for_blast)
+                print('sequence_for_blast = %s ',sequence_for_blast, end=' ', flush=True)
+                result_handle = NCBIWWW.qblast("blastn", "/db/nt/current/fasta/nt.fsa", sequence_for_blast, expect=0.001, hitlist_size=1)
+                blast_records = NCBIXML.parse(result_handle)
+            except:
+                good_ORF_found = False
+            else:
+                for blast_record in blast_records:
+                    for alignment in blast_record.alignments:
+                        for hsp in alignment.hsps:
+                            if hsp.expect < 0.001:
+                                good_ORF_found = True
+            print("good_ORF_found = %s" %good_ORF_found)
 
     else: # no CDS found
         BESTORF_bash_aligned_nc_seq = {}
@@ -204,14 +257,14 @@
         BESTORF_bash_of_aligned_aa_seq_CODING ={}
 
     # Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa
-    
+
     BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {}
     BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {}
 
     Ortho = 0
     for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys():
         seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name]
-        Ortho = detect_Methionine(seq_aa, Ortho, minimal_cds_length)   ### DEF6 ###
+        Ortho = detect_methionine(seq_aa, Ortho, minimal_cds_length)   ### DEF6 ###
 
     # CASE 1: A "M" is present and correctly localized (not in last 50 aa)
     if Ortho == 1:
@@ -243,7 +296,7 @@
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt")
-    parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int)    
+    parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int)
     parser.add_argument("min_spec", help="Minimal number of species per alignment")
     parser.add_argument("list_files", help="File with all input files names")
     args = parser.parse_args()
@@ -251,7 +304,7 @@
     minimal_cds_length = int(args.min_cds_len)  # in aa number
     bash_codeUniversel = code_universel(args.codeUniversel)
     minimum_species = int(args.min_spec)
-    
+
     # Inputs from file containing list of species
     list_files = []
     with open(args.list_files, 'r') as f:
@@ -278,14 +331,14 @@
 
         count_file_processed = count_file_processed + 1
         nb_gp = file.split('_')[1] # Keep trace of the orthogroup number
-        fasta_file_path = "./%s" %file    
+        fasta_file_path = "./%s" %file
         bash_fasta = dico(fasta_file_path)
         BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M  = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species)
-        
+
         name_elems[1] = nb_gp
 
         # Update counts and write group in corresponding output directory
-        if BESTORF_nuc != {}: 
+        if BESTORF_nuc != {}:
             count_file_with_CDS += 1
             if len(BESTORF_nuc.keys()) >= minimum_species :
                 count_file_with_cds_and_enought_species += 1
@@ -305,14 +358,14 @@
                 write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4])
                 write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5])
 
-    print "*************** CDS detection ***************"
-    print "\nFiles processed: %d" %count_file_processed
-    print "\tFiles with CDS: %d" %count_file_with_CDS
-    print "\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species)
-    print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M
-    print "\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) 
-    print "\tFiles without CDS: %d \n" %count_file_without_CDS
-    print ""
+    print("*************** CDS detection ***************")
+    print("\nFiles processed: %d" %count_file_processed)
+    print("\tFiles with CDS: %d" %count_file_with_CDS)
+    print("\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species))
+    print("\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M)
+    print("\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) )
+    print("\tFiles without CDS: %d \n" %count_file_without_CDS)
+    print("")
 
 if __name__ == '__main__':
     main()
--- a/scripts/S02_remove_too_short_bit_or_whole_sequence.py	Fri Feb 01 10:26:37 2019 -0500
+++ b/scripts/S02_remove_too_short_bit_or_whole_sequence.py	Thu Jun 09 12:40:00 2022 +0000
@@ -89,9 +89,9 @@
         seq_nuc = dico_nuc[fasta_name]
 
         if "?" in seq:
-            seq = string.replace(seq, "?", "-")
+            seq = str.replace(seq, "?", "-")
         if "?" in seq_nuc:
-            seq_nuc = string.replace(seq_nuc, "?", "-")
+            seq_nuc = str.replace(seq_nuc, "?", "-")
         
         ## 4.1 ## [FILTER 1] : Detect and Replace short internal indel symbole (= "-" as for other longer gaps) by a "?"
         ## aa
@@ -107,7 +107,7 @@
         
         ## 4.2 ## [FILTER 2] : Remove short bits of sequence (<"MIN_LENGTH_BIT_OF_SEQUENCE_aa")
         LIST_sublist_aa=[]
-        S1 = string.split(seq, "-")
+        S1 = str.split(seq, "-")
         for element in S1:
             if len(element) > MIN_LENGTH_BIT_OF_SEQUENCE_aa:
                 LIST_sublist_aa.append(element)
@@ -126,7 +126,7 @@
         
         for subsequence in LIST_sublist_aa:
             ## aa
-            START = string.find(seq, subsequence)
+            START = str.find(seq, subsequence)
             END = START + len(subsequence)
             seq_gap = seq_gap[:START] + seq[START:END] + seq_gap[END:]  ## 4.4.2 ## and then replace the correponding gaps by coding subsequence in the sequence
             ## nuc
@@ -135,11 +135,11 @@
             seq_gap_nuc = seq_gap_nuc[:START_nuc] + seq_nuc[START_nuc:END_nuc] + seq_gap_nuc[END_nuc:]
         
         ## 4.5 ## Save new sequence in bash if not empty
-        seq_empty_test = string.replace(seq_gap, "-", "")
+        seq_empty_test = str.replace(seq_gap, "-", "")
         if seq_empty_test != "":
             new_bash_aa[fasta_name] = seq_gap
 
-        seq_empty_test = string.replace(seq_gap_nuc, "-", "")
+        seq_empty_test = str.replace(seq_gap_nuc, "-", "")
         if seq_empty_test != "":
             new_bash_nuc[fasta_name] = seq_gap_nuc
 
@@ -180,8 +180,8 @@
 
 ###Print
 if sys.argv[2] == "oui" :
-    print "\nIn locus with CDS considering Methionine : \n"
+    print("\nIn locus with CDS considering Methionine : \n")
 else :
-    print "\nIn locus with CDS regardless of the Methionine : \n"
+    print("\nIn locus with CDS regardless of the Methionine : \n")
 
-print "\nTotal number of locus recorded  = %d" %n0
\ No newline at end of file
+print("\nTotal number of locus recorded  = %d" %n0)
--- a/scripts/S03_remove_site_with_not_enough_species_represented.py	Fri Feb 01 10:26:37 2019 -0500
+++ b/scripts/S03_remove_site_with_not_enough_species_represented.py	Thu Jun 09 12:40:00 2022 +0000
@@ -12,7 +12,7 @@
 def remove_position_with_too_much_missing_data(bash_aa, bash_nuc, MIN_SPECIES_NB):
 
     ## 1 ## Get alignment length
-    fasta_name0 = bash_aa.keys()[0]
+    fasta_name0 = list(bash_aa.keys())[0]
     ln_aa = len(bash_aa[fasta_name0])
 
     ln_nuc = len(bash_nuc[fasta_name0])
@@ -23,7 +23,7 @@
     i=0
     while i < ln_aa:
         site = []
-        for fasta_name in bash_aa.keys():
+        for fasta_name in list(bash_aa.keys()):
             pos = bash_aa[fasta_name][i]
 
             if pos != "-" and pos != "?" and pos != "X":
@@ -45,15 +45,15 @@
     ## 4 ## Create entries for "filtered_bash" for aa & nuc
     filtered_bash_aa = {}
     filtered_bash_nuc = {}
-    for fasta_name in bash_aa.keys():
+    for fasta_name in list(bash_aa.keys()):
         filtered_bash_aa[fasta_name] = ""
-    for fasta_name in bash_nuc.keys():
+    for fasta_name in list(bash_nuc.keys()):
         filtered_bash_nuc[fasta_name] = ""
 
     ## 5 ## Write "filtered_bash" for aa
     j=0
     while j < ln_aa:
-        for fasta_name in bash_aa.keys():
+        for fasta_name in list(bash_aa.keys()):
             seq=filtered_bash_aa[fasta_name]
             pos=bash_aa[fasta_name][j]
 
@@ -63,7 +63,7 @@
         j = j + 1
 
     ## 6 ## Remove empty sequence
-    for name in filtered_bash_aa.keys():
+    for name in list(filtered_bash_aa.keys()):
         seq = filtered_bash_aa[name]
         if seq == '':
             del filtered_bash_aa[name]
@@ -72,7 +72,7 @@
     ## 7 ## Write "filtered_bash" for nuc
     j=0
     while j < ln_nuc:
-        for fasta_name in bash_nuc.keys():
+        for fasta_name in list(bash_nuc.keys()):
             seq=filtered_bash_nuc[fasta_name]
             #print seq
             pos=bash_nuc[fasta_name][j]
@@ -83,7 +83,7 @@
         j = j + 1
 
     ## 8 ## Remove empty sequence
-    for name in filtered_bash_nuc.keys():
+    for name in list(filtered_bash_nuc.keys()):
         seq = filtered_bash_nuc[name]
         if seq == '':
             del filtered_bash_nuc[name]
@@ -147,7 +147,7 @@
     ## 4.1 ## REMOVE POSITIONS WITH TOO MUCH MISSING DATA (i.e. not enough taxa represented at each position in the alignment)
     filtered_bash_aa, filtered_bash_nuc = remove_position_with_too_much_missing_data(dico_aa, dico_nuc, MIN_SPECIES_NB)   ### DEF 2 ###
 
-    k = filtered_bash_nuc.keys()
+    k = list(filtered_bash_nuc.keys())
     new_leng_nuc = 0
     if k != []:
         k0 = k[0]
@@ -158,14 +158,14 @@
     n0+=1
     #name_elems[1] = str(n0)
     name_elems[1] = file.split('_')[1]
-    name_elems[3] =  str(len(filtered_bash_aa.keys()))
+    name_elems[3] =  str(len(list(filtered_bash_aa.keys())))
     new_name = "_".join(name_elems)
 
     ## 4.5 ## Write filtered alignment in OUTPUTs
     ## aa
     if filtered_bash_aa != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC:
         OUTaa=open("%s/%s" %(path_OUT1, new_name), "w")
-        for fasta_name in filtered_bash_aa.keys():
+        for fasta_name in list(filtered_bash_aa.keys()):
             seq_aa = filtered_bash_aa[fasta_name]
             OUTaa.write("%s\n" %fasta_name)
             OUTaa.write("%s\n" %seq_aa)
@@ -174,7 +174,7 @@
     if filtered_bash_nuc != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC:
         good+=1
         OUTnuc=open("%s/%s" %(path_OUT2, new_name), "w")
-        for fasta_name in filtered_bash_nuc.keys():
+        for fasta_name in list(filtered_bash_nuc.keys()):
             seq_nuc = filtered_bash_nuc[fasta_name]
             OUTnuc.write("%s\n" %fasta_name)
             OUTnuc.write("%s\n" %seq_nuc)
@@ -184,8 +184,8 @@
 
 
 ## 5 ## Print
-print "*************** 2nd Filter : removal of the indel ***************"
-print "\nTotal number of locus recorded  = %d" %n0
-print "\tTotal number of locus with no indels (SAVED) = %d" %good
-print "\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad
-print ""
\ No newline at end of file
+print("*************** 2nd Filter : removal of the indel ***************")
+print("\nTotal number of locus recorded  = %d" %n0)
+print("\tTotal number of locus with no indels (SAVED) = %d" %good)
+print("\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad)
+print("")
--- a/scripts/dico.py	Fri Feb 01 10:26:37 2019 -0500
+++ b/scripts/dico.py	Thu Jun 09 12:40:00 2022 +0000
@@ -3,10 +3,10 @@
 def dico(F1):
     dicoco = {}
     with open(F1, "r") as file:
-        for name, query in itertools.izip_longest(*[file]*2):
+        for name, query in itertools.zip_longest(*[file]*2):
             if name[0] == ">":
                 fasta_name_query = name[:-1]
-                Sn = string.split(fasta_name_query, "||")
+                Sn = str.split(fasta_name_query, "||")
                 fasta_name_query = Sn[0]                
                 fasta_seq_query = query[:-1]
                 dicoco[fasta_name_query] = fasta_seq_query