Galaxy |

Changeset 1:c79bdda8abfb (2022-06-09)

Previous changeset 0:eb95bf7f90ae (2019-02-01)

Commit message:
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3a118aa934e6406cc8b0b24d006af6365c277519

modified:
CDS_search.xml
macros.xml
scripts/S01_find_orf_on_multiple_alignment.py
scripts/S02_remove_too_short_bit_or_whole_sequence.py
scripts/S03_remove_site_with_not_enough_species_represented.py
scripts/dico.py

diff -r eb95bf7f90ae -r c79bdda8abfb CDS_search.xml
--- a/CDS_search.xml Fri Feb 01 10:26:37 2019 -0500
+++ b/CDS_search.xml Thu Jun 09 12:40:00 2022 +0000

[

@@ -1,4 +1,4 @@
-<tool name="CDS_search" id="cds_search" version="2.1.2">
+<tool name="CDS_search" id="cds_search" version="2.2.2">

<description>
ORF and CDS search
@@ -9,7 +9,7 @@
</macros>

<requirements>
- <expand macro="python_required" />
+ <requirement type="package" version="1.79">biopython</requirement>
</requirements>

<command><![CDATA[

diff -r eb95bf7f90ae -r c79bdda8abfb macros.xml
--- a/macros.xml Fri Feb 01 10:26:37 2019 -0500
+++ b/macros.xml Thu Jun 09 12:40:00 2022 +0000

@@ -4,6 +4,10 @@
<requirement type="package" version="2.7">python</requirement>
</xml>

+ <xml name="python3_required">
+                        <requirement type="package" version="1.79">biopython</requirement>
+        </xml>
+
     <token name="@HELP_AUTHORS@">
.. class:: infomark

diff -r eb95bf7f90ae -r c79bdda8abfb scripts/S01_find_orf_on_multiple_alignment.py
--- a/scripts/S01_find_orf_on_multiple_alignment.py Fri Feb 01 10:26:37 2019 -0500
+++ b/scripts/S01_find_orf_on_multiple_alignment.py Thu Jun 09 12:40:00 2022 +0000

[

b'@@ -2,74 +2,104 @@\n # coding: utf8\n # Author: Eric Fontanillas\n # Modification: 03/09/14 by Julie BAFFARD\n-# Last modification : 25/07/18 by Victor Mataigne\n+# Last modification : 10/09/21 by Charlotte Berthelier\n+\n+"""\n+Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria\n+\n+- CRITERIA 1\n+\n+Get the longest part of the sequence alignemen without codon stop "*",\n+and test in the 3 potential ORF and check with a Blast the best coding sequence\n+\n+- CRITERIA 2\n+\n+This longest part should be > 150nc or 50aa\n \n-# Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria\n- # CRITERIA 1 - Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF\n- # CRITERIA 2 - This longest part should be > 150nc or 50aa\n- # CRITERIA 3 - [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa\n- # OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA\n- # OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA\n+- CRITERIA 3 [OPTIONNAL]\n+\n+A codon start "M" should be present in this longuest part, before the last 50 aa\n \n-import string, os, time, re, zipfile, sys, argparse\n+OUTPUTs:\n+"05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA\n+"06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA\n+"""\n+\n+import os\n+import re\n+import argparse\n+\n+from Bio.Blast import NCBIWWW\n+from Bio.Blast import NCBIXML\n from dico import dico\n \n-def code_universel(F1):\n+\n+def code_universel(file1):\n """ Creates bash for genetic code (key : codon ; value : amino-acid) """\n- bash_codeUniversel = {}\n+ bash_code_universel = {}\n \n- with open(F1, "r") as file:\n+ with open(file1, "r") as file:\n for line in file.readlines():\n- L1 = string.split(line, " ")\n- length1 = len(L1)\n+ item = str.split(line, " ")\n+ length1 = len(item)\n if length1 == 3:\n- key = L1[0]\n- value = L1[2][:-1]\n- bash_codeUniversel[key] = value\n+ key = item[0]\n+ value = item[2][:-1]\n+ bash_code_universel[key] = value\n else:\n- key = L1[0]\n- value = L1[2]\n- bash_codeUniversel[key] = value\n+ key = item[0]\n+ value = item[2]\n+ bash_code_universel[key] = value\n \n- return(bash_codeUniversel)\n+ return bash_code_universel\n+\n \n def multiple3(seq):\n- """ Tests if the sequence is a multiple of 3, and if not removes extra-bases \n- !! Possible to lost a codon, when I test ORF (as I will decay the ORF) """\n- \n- m = len(seq)%3\n- if m != 0 :\n- return seq[:-m], m\n- else :\n- return seq, m\n+ """\n+ Tests if the sequence is a multiple of 3, and if not removes extra-bases\n+ Possible to lost a codon, when I test ORF (as I will decay the ORF)\n+ """\n \n-def detect_Methionine(seq_aa, Ortho, minimal_cds_length):\n+ multiple = len(seq) % 3\n+ if multiple != 0:\n+ return seq[:-multiple], multiple\n+ return seq, multiple\n+\n+\n+def detect_methionine(seq_aa, ortho, minimal_cds_length):\n """ Detects if methionin in the aa sequence """\n \n- ln = len(seq_aa)\n- CUTOFF_Last_50aa = ln - minimal_cds_length\n+ size = len(seq_aa)\n+ cutoff_last_50aa = size - minimal_cds_length\n \n- # Find all indices of occurances of "M" in a string of aa \n+ # Find all indices of occurances of "M" in a string of aa\n list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"]\n \n- # If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS\n+ # If some "M" are present, find whether the first "M" found is not in the\n+ # 50 last aa (indice < C'..b' print("good_ORF_found = %s" %good_ORF_found)\n \n else: # no CDS found\n BESTORF_bash_aligned_nc_seq = {}\n@@ -204,14 +257,14 @@\n BESTORF_bash_of_aligned_aa_seq_CODING ={}\n \n # Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa\n- \n+\n BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {}\n BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {}\n \n Ortho = 0\n for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys():\n seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name]\n- Ortho = detect_Methionine(seq_aa, Ortho, minimal_cds_length) ### DEF6 ###\n+ Ortho = detect_methionine(seq_aa, Ortho, minimal_cds_length) ### DEF6 ###\n \n # CASE 1: A "M" is present and correctly localized (not in last 50 aa)\n if Ortho == 1:\n@@ -243,7 +296,7 @@\n def main():\n parser = argparse.ArgumentParser()\n parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt")\n- parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int) \n+ parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int)\n parser.add_argument("min_spec", help="Minimal number of species per alignment")\n parser.add_argument("list_files", help="File with all input files names")\n args = parser.parse_args()\n@@ -251,7 +304,7 @@\n minimal_cds_length = int(args.min_cds_len) # in aa number\n bash_codeUniversel = code_universel(args.codeUniversel)\n minimum_species = int(args.min_spec)\n- \n+\n # Inputs from file containing list of species\n list_files = []\n with open(args.list_files, \'r\') as f:\n@@ -278,14 +331,14 @@\n \n count_file_processed = count_file_processed + 1\n nb_gp = file.split(\'_\')[1] # Keep trace of the orthogroup number\n- fasta_file_path = "./%s" %file \n+ fasta_file_path = "./%s" %file\n bash_fasta = dico(fasta_file_path)\n BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species)\n- \n+\n name_elems[1] = nb_gp\n \n # Update counts and write group in corresponding output directory\n- if BESTORF_nuc != {}: \n+ if BESTORF_nuc != {}:\n count_file_with_CDS += 1\n if len(BESTORF_nuc.keys()) >= minimum_species :\n count_file_with_cds_and_enought_species += 1\n@@ -305,14 +358,14 @@\n write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4])\n write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5])\n \n- print "*************** CDS detection ***************"\n- print "\\nFiles processed: %d" %count_file_processed\n- print "\\tFiles with CDS: %d" %count_file_with_CDS\n- print "\\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species)\n- print "\\t\\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M\n- print "\\t\\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) \n- print "\\tFiles without CDS: %d \\n" %count_file_without_CDS\n- print ""\n+ print("*************** CDS detection ***************")\n+ print("\\nFiles processed: %d" %count_file_processed)\n+ print("\\tFiles with CDS: %d" %count_file_with_CDS)\n+ print("\\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species))\n+ print("\\t\\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M)\n+ print("\\t\\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) )\n+ print("\\tFiles without CDS: %d \\n" %count_file_without_CDS)\n+ print("")\n \n if __name__ == \'__main__\':\n main()\n'

diff -r eb95bf7f90ae -r c79bdda8abfb scripts/S02_remove_too_short_bit_or_whole_sequence.py
--- a/scripts/S02_remove_too_short_bit_or_whole_sequence.py Fri Feb 01 10:26:37 2019 -0500
+++ b/scripts/S02_remove_too_short_bit_or_whole_sequence.py Thu Jun 09 12:40:00 2022 +0000

[

@@ -89,9 +89,9 @@
         seq_nuc = dico_nuc[fasta_name]

         if "?" in seq:
-            seq = string.replace(seq, "?", "-")
+            seq = str.replace(seq, "?", "-")
         if "?" in seq_nuc:
-            seq_nuc = string.replace(seq_nuc, "?", "-")
+            seq_nuc = str.replace(seq_nuc, "?", "-")

         ## 4.1 ## [FILTER 1] : Detect and Replace short internal indel symbole (= "-" as for other longer gaps) by a "?"
         ## aa
@@ -107,7 +107,7 @@

         ## 4.2 ## [FILTER 2] : Remove short bits of sequence (<"MIN_LENGTH_BIT_OF_SEQUENCE_aa")
         LIST_sublist_aa=[]
-        S1 = string.split(seq, "-")
+        S1 = str.split(seq, "-")
         for element in S1:
             if len(element) > MIN_LENGTH_BIT_OF_SEQUENCE_aa:
                 LIST_sublist_aa.append(element)
@@ -126,7 +126,7 @@

         for subsequence in LIST_sublist_aa:
             ## aa
-            START = string.find(seq, subsequence)
+            START = str.find(seq, subsequence)
             END = START + len(subsequence)
             seq_gap = seq_gap[:START] + seq[START:END] + seq_gap[END:]  ## 4.4.2 ## and then replace the correponding gaps by coding subsequence in the sequence
             ## nuc
@@ -135,11 +135,11 @@
             seq_gap_nuc = seq_gap_nuc[:START_nuc] + seq_nuc[START_nuc:END_nuc] + seq_gap_nuc[END_nuc:]

         ## 4.5 ## Save new sequence in bash if not empty
-        seq_empty_test = string.replace(seq_gap, "-", "")
+        seq_empty_test = str.replace(seq_gap, "-", "")
         if seq_empty_test != "":
             new_bash_aa[fasta_name] = seq_gap

-        seq_empty_test = string.replace(seq_gap_nuc, "-", "")
+        seq_empty_test = str.replace(seq_gap_nuc, "-", "")
         if seq_empty_test != "":
             new_bash_nuc[fasta_name] = seq_gap_nuc

@@ -180,8 +180,8 @@

###Print
if sys.argv[2] == "oui" :
-    print "\nIn locus with CDS considering Methionine : \n"
+    print("\nIn locus with CDS considering Methionine : \n")
else :
-    print "\nIn locus with CDS regardless of the Methionine : \n"
+    print("\nIn locus with CDS regardless of the Methionine : \n")

-print "\nTotal number of locus recorded  = %d" %n0
\ No newline at end of file
+print("\nTotal number of locus recorded  = %d" %n0)

diff -r eb95bf7f90ae -r c79bdda8abfb scripts/S03_remove_site_with_not_enough_species_represented.py
--- a/scripts/S03_remove_site_with_not_enough_species_represented.py Fri Feb 01 10:26:37 2019 -0500
+++ b/scripts/S03_remove_site_with_not_enough_species_represented.py Thu Jun 09 12:40:00 2022 +0000

[

@@ -12,7 +12,7 @@
def remove_position_with_too_much_missing_data(bash_aa, bash_nuc, MIN_SPECIES_NB):

     ## 1 ## Get alignment length
-    fasta_name0 = bash_aa.keys()[0]
+    fasta_name0 = list(bash_aa.keys())[0]
     ln_aa = len(bash_aa[fasta_name0])

     ln_nuc = len(bash_nuc[fasta_name0])
@@ -23,7 +23,7 @@
     i=0
     while i < ln_aa:
         site = []
-        for fasta_name in bash_aa.keys():
+        for fasta_name in list(bash_aa.keys()):
             pos = bash_aa[fasta_name][i]

             if pos != "-" and pos != "?" and pos != "X":
@@ -45,15 +45,15 @@
     ## 4 ## Create entries for "filtered_bash" for aa & nuc
     filtered_bash_aa = {}
     filtered_bash_nuc = {}
-    for fasta_name in bash_aa.keys():
+    for fasta_name in list(bash_aa.keys()):
         filtered_bash_aa[fasta_name] = ""
-    for fasta_name in bash_nuc.keys():
+    for fasta_name in list(bash_nuc.keys()):
         filtered_bash_nuc[fasta_name] = ""

     ## 5 ## Write "filtered_bash" for aa
     j=0
     while j < ln_aa:
-        for fasta_name in bash_aa.keys():
+        for fasta_name in list(bash_aa.keys()):
             seq=filtered_bash_aa[fasta_name]
             pos=bash_aa[fasta_name][j]

@@ -63,7 +63,7 @@
         j = j + 1

     ## 6 ## Remove empty sequence
-    for name in filtered_bash_aa.keys():
+    for name in list(filtered_bash_aa.keys()):
         seq = filtered_bash_aa[name]
         if seq == '':
             del filtered_bash_aa[name]
@@ -72,7 +72,7 @@
     ## 7 ## Write "filtered_bash" for nuc
     j=0
     while j < ln_nuc:
-        for fasta_name in bash_nuc.keys():
+        for fasta_name in list(bash_nuc.keys()):
             seq=filtered_bash_nuc[fasta_name]
             #print seq
             pos=bash_nuc[fasta_name][j]
@@ -83,7 +83,7 @@
         j = j + 1

     ## 8 ## Remove empty sequence
-    for name in filtered_bash_nuc.keys():
+    for name in list(filtered_bash_nuc.keys()):
         seq = filtered_bash_nuc[name]
         if seq == '':
             del filtered_bash_nuc[name]
@@ -147,7 +147,7 @@
     ## 4.1 ## REMOVE POSITIONS WITH TOO MUCH MISSING DATA (i.e. not enough taxa represented at each position in the alignment)
     filtered_bash_aa, filtered_bash_nuc = remove_position_with_too_much_missing_data(dico_aa, dico_nuc, MIN_SPECIES_NB)   ### DEF 2 ###

-    k = filtered_bash_nuc.keys()
+    k = list(filtered_bash_nuc.keys())
     new_leng_nuc = 0
     if k != []:
         k0 = k[0]
@@ -158,14 +158,14 @@
     n0+=1
     #name_elems[1] = str(n0)
     name_elems[1] = file.split('_')[1]
-    name_elems[3] =  str(len(filtered_bash_aa.keys()))
+    name_elems[3] =  str(len(list(filtered_bash_aa.keys())))
     new_name = "_".join(name_elems)

     ## 4.5 ## Write filtered alignment in OUTPUTs
     ## aa
     if filtered_bash_aa != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC:
         OUTaa=open("%s/%s" %(path_OUT1, new_name), "w")
-        for fasta_name in filtered_bash_aa.keys():
+        for fasta_name in list(filtered_bash_aa.keys()):
             seq_aa = filtered_bash_aa[fasta_name]
             OUTaa.write("%s\n" %fasta_name)
             OUTaa.write("%s\n" %seq_aa)
@@ -174,7 +174,7 @@
     if filtered_bash_nuc != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC:
         good+=1
         OUTnuc=open("%s/%s" %(path_OUT2, new_name), "w")
-        for fasta_name in filtered_bash_nuc.keys():
+        for fasta_name in list(filtered_bash_nuc.keys()):
             seq_nuc = filtered_bash_nuc[fasta_name]
             OUTnuc.write("%s\n" %fasta_name)
             OUTnuc.write("%s\n" %seq_nuc)
@@ -184,8 +184,8 @@

## 5 ## Print
-print "*************** 2nd Filter : removal of the indel ***************"
-print "\nTotal number of locus recorded  = %d" %n0
-print "\tTotal number of locus with no indels (SAVED) = %d" %good
-print "\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad
-print ""
\ No newline at end of file
+print("*************** 2nd Filter : removal of the indel ***************")
+print("\nTotal number of locus recorded  = %d" %n0)
+print("\tTotal number of locus with no indels (SAVED) = %d" %good)
+print("\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad)
+print("")

diff -r eb95bf7f90ae -r c79bdda8abfb scripts/dico.py
--- a/scripts/dico.py Fri Feb 01 10:26:37 2019 -0500
+++ b/scripts/dico.py Thu Jun 09 12:40:00 2022 +0000

[

@@ -3,10 +3,10 @@
def dico(F1):
     dicoco = {}
     with open(F1, "r") as file:
-        for name, query in itertools.izip_longest(*[file]*2):
+        for name, query in itertools.zip_longest(*[file]*2):
             if name[0] == ">":
                 fasta_name_query = name[:-1]
-                Sn = string.split(fasta_name_query, "||")
+                Sn = str.split(fasta_name_query, "||")
                 fasta_name_query = Sn[0]
                 fasta_seq_query = query[:-1]
                 dicoco[fasta_name_query] = fasta_seq_query