Previous changeset 0:eb95bf7f90ae (2019-02-01) |
Commit message:
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3a118aa934e6406cc8b0b24d006af6365c277519 |
modified:
CDS_search.xml macros.xml scripts/S01_find_orf_on_multiple_alignment.py scripts/S02_remove_too_short_bit_or_whole_sequence.py scripts/S03_remove_site_with_not_enough_species_represented.py scripts/dico.py |
b |
diff -r eb95bf7f90ae -r c79bdda8abfb CDS_search.xml --- a/CDS_search.xml Fri Feb 01 10:26:37 2019 -0500 +++ b/CDS_search.xml Thu Jun 09 12:40:00 2022 +0000 |
[ |
@@ -1,4 +1,4 @@ -<tool name="CDS_search" id="cds_search" version="2.1.2"> +<tool name="CDS_search" id="cds_search" version="2.2.2"> <description> ORF and CDS search @@ -9,7 +9,7 @@ </macros> <requirements> - <expand macro="python_required" /> + <requirement type="package" version="1.79">biopython</requirement> </requirements> <command><![CDATA[ |
b |
diff -r eb95bf7f90ae -r c79bdda8abfb macros.xml --- a/macros.xml Fri Feb 01 10:26:37 2019 -0500 +++ b/macros.xml Thu Jun 09 12:40:00 2022 +0000 |
b |
@@ -4,6 +4,10 @@ <requirement type="package" version="2.7">python</requirement> </xml> + <xml name="python3_required"> + <requirement type="package" version="1.79">biopython</requirement> + </xml> + <token name="@HELP_AUTHORS@"> .. class:: infomark |
b |
diff -r eb95bf7f90ae -r c79bdda8abfb scripts/S01_find_orf_on_multiple_alignment.py --- a/scripts/S01_find_orf_on_multiple_alignment.py Fri Feb 01 10:26:37 2019 -0500 +++ b/scripts/S01_find_orf_on_multiple_alignment.py Thu Jun 09 12:40:00 2022 +0000 |
[ |
b'@@ -2,74 +2,104 @@\n # coding: utf8\n # Author: Eric Fontanillas\n # Modification: 03/09/14 by Julie BAFFARD\n-# Last modification : 25/07/18 by Victor Mataigne\n+# Last modification : 10/09/21 by Charlotte Berthelier\n+\n+"""\n+Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria\n+\n+- CRITERIA 1\n+\n+Get the longest part of the sequence alignemen without codon stop "*",\n+and test in the 3 potential ORF and check with a Blast the best coding sequence\n+\n+- CRITERIA 2\n+\n+This longest part should be > 150nc or 50aa\n \n-# Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria\n- # CRITERIA 1 - Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF\n- # CRITERIA 2 - This longest part should be > 150nc or 50aa\n- # CRITERIA 3 - [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa\n- # OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA\n- # OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA\n+- CRITERIA 3 [OPTIONNAL]\n+\n+A codon start "M" should be present in this longuest part, before the last 50 aa\n \n-import string, os, time, re, zipfile, sys, argparse\n+OUTPUTs:\n+"05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA\n+"06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA\n+"""\n+\n+import os\n+import re\n+import argparse\n+\n+from Bio.Blast import NCBIWWW\n+from Bio.Blast import NCBIXML\n from dico import dico\n \n-def code_universel(F1):\n+\n+def code_universel(file1):\n """ Creates bash for genetic code (key : codon ; value : amino-acid) """\n- bash_codeUniversel = {}\n+ bash_code_universel = {}\n \n- with open(F1, "r") as file:\n+ with open(file1, "r") as file:\n for line in file.readlines():\n- L1 = string.split(line, " ")\n- length1 = len(L1)\n+ item = str.split(line, " ")\n+ length1 = len(item)\n if length1 == 3:\n- key = L1[0]\n- value = L1[2][:-1]\n- bash_codeUniversel[key] = value\n+ key = item[0]\n+ value = item[2][:-1]\n+ bash_code_universel[key] = value\n else:\n- key = L1[0]\n- value = L1[2]\n- bash_codeUniversel[key] = value\n+ key = item[0]\n+ value = item[2]\n+ bash_code_universel[key] = value\n \n- return(bash_codeUniversel)\n+ return bash_code_universel\n+\n \n def multiple3(seq):\n- """ Tests if the sequence is a multiple of 3, and if not removes extra-bases \n- !! Possible to lost a codon, when I test ORF (as I will decay the ORF) """\n- \n- m = len(seq)%3\n- if m != 0 :\n- return seq[:-m], m\n- else :\n- return seq, m\n+ """\n+ Tests if the sequence is a multiple of 3, and if not removes extra-bases\n+ Possible to lost a codon, when I test ORF (as I will decay the ORF)\n+ """\n \n-def detect_Methionine(seq_aa, Ortho, minimal_cds_length):\n+ multiple = len(seq) % 3\n+ if multiple != 0:\n+ return seq[:-multiple], multiple\n+ return seq, multiple\n+\n+\n+def detect_methionine(seq_aa, ortho, minimal_cds_length):\n """ Detects if methionin in the aa sequence """\n \n- ln = len(seq_aa)\n- CUTOFF_Last_50aa = ln - minimal_cds_length\n+ size = len(seq_aa)\n+ cutoff_last_50aa = size - minimal_cds_length\n \n- # Find all indices of occurances of "M" in a string of aa \n+ # Find all indices of occurances of "M" in a string of aa\n list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"]\n \n- # If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS\n+ # If some "M" are present, find whether the first "M" found is not in the\n+ # 50 last aa (indice < C'..b' print("good_ORF_found = %s" %good_ORF_found)\n \n else: # no CDS found\n BESTORF_bash_aligned_nc_seq = {}\n@@ -204,14 +257,14 @@\n BESTORF_bash_of_aligned_aa_seq_CODING ={}\n \n # Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa\n- \n+\n BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {}\n BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {}\n \n Ortho = 0\n for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys():\n seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name]\n- Ortho = detect_Methionine(seq_aa, Ortho, minimal_cds_length) ### DEF6 ###\n+ Ortho = detect_methionine(seq_aa, Ortho, minimal_cds_length) ### DEF6 ###\n \n # CASE 1: A "M" is present and correctly localized (not in last 50 aa)\n if Ortho == 1:\n@@ -243,7 +296,7 @@\n def main():\n parser = argparse.ArgumentParser()\n parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt")\n- parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int) \n+ parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int)\n parser.add_argument("min_spec", help="Minimal number of species per alignment")\n parser.add_argument("list_files", help="File with all input files names")\n args = parser.parse_args()\n@@ -251,7 +304,7 @@\n minimal_cds_length = int(args.min_cds_len) # in aa number\n bash_codeUniversel = code_universel(args.codeUniversel)\n minimum_species = int(args.min_spec)\n- \n+\n # Inputs from file containing list of species\n list_files = []\n with open(args.list_files, \'r\') as f:\n@@ -278,14 +331,14 @@\n \n count_file_processed = count_file_processed + 1\n nb_gp = file.split(\'_\')[1] # Keep trace of the orthogroup number\n- fasta_file_path = "./%s" %file \n+ fasta_file_path = "./%s" %file\n bash_fasta = dico(fasta_file_path)\n BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species)\n- \n+\n name_elems[1] = nb_gp\n \n # Update counts and write group in corresponding output directory\n- if BESTORF_nuc != {}: \n+ if BESTORF_nuc != {}:\n count_file_with_CDS += 1\n if len(BESTORF_nuc.keys()) >= minimum_species :\n count_file_with_cds_and_enought_species += 1\n@@ -305,14 +358,14 @@\n write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4])\n write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5])\n \n- print "*************** CDS detection ***************"\n- print "\\nFiles processed: %d" %count_file_processed\n- print "\\tFiles with CDS: %d" %count_file_with_CDS\n- print "\\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species)\n- print "\\t\\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M\n- print "\\t\\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) \n- print "\\tFiles without CDS: %d \\n" %count_file_without_CDS\n- print ""\n+ print("*************** CDS detection ***************")\n+ print("\\nFiles processed: %d" %count_file_processed)\n+ print("\\tFiles with CDS: %d" %count_file_with_CDS)\n+ print("\\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species))\n+ print("\\t\\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M)\n+ print("\\t\\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) )\n+ print("\\tFiles without CDS: %d \\n" %count_file_without_CDS)\n+ print("")\n \n if __name__ == \'__main__\':\n main()\n' |
b |
diff -r eb95bf7f90ae -r c79bdda8abfb scripts/S02_remove_too_short_bit_or_whole_sequence.py --- a/scripts/S02_remove_too_short_bit_or_whole_sequence.py Fri Feb 01 10:26:37 2019 -0500 +++ b/scripts/S02_remove_too_short_bit_or_whole_sequence.py Thu Jun 09 12:40:00 2022 +0000 |
[ |
@@ -89,9 +89,9 @@ seq_nuc = dico_nuc[fasta_name] if "?" in seq: - seq = string.replace(seq, "?", "-") + seq = str.replace(seq, "?", "-") if "?" in seq_nuc: - seq_nuc = string.replace(seq_nuc, "?", "-") + seq_nuc = str.replace(seq_nuc, "?", "-") ## 4.1 ## [FILTER 1] : Detect and Replace short internal indel symbole (= "-" as for other longer gaps) by a "?" ## aa @@ -107,7 +107,7 @@ ## 4.2 ## [FILTER 2] : Remove short bits of sequence (<"MIN_LENGTH_BIT_OF_SEQUENCE_aa") LIST_sublist_aa=[] - S1 = string.split(seq, "-") + S1 = str.split(seq, "-") for element in S1: if len(element) > MIN_LENGTH_BIT_OF_SEQUENCE_aa: LIST_sublist_aa.append(element) @@ -126,7 +126,7 @@ for subsequence in LIST_sublist_aa: ## aa - START = string.find(seq, subsequence) + START = str.find(seq, subsequence) END = START + len(subsequence) seq_gap = seq_gap[:START] + seq[START:END] + seq_gap[END:] ## 4.4.2 ## and then replace the correponding gaps by coding subsequence in the sequence ## nuc @@ -135,11 +135,11 @@ seq_gap_nuc = seq_gap_nuc[:START_nuc] + seq_nuc[START_nuc:END_nuc] + seq_gap_nuc[END_nuc:] ## 4.5 ## Save new sequence in bash if not empty - seq_empty_test = string.replace(seq_gap, "-", "") + seq_empty_test = str.replace(seq_gap, "-", "") if seq_empty_test != "": new_bash_aa[fasta_name] = seq_gap - seq_empty_test = string.replace(seq_gap_nuc, "-", "") + seq_empty_test = str.replace(seq_gap_nuc, "-", "") if seq_empty_test != "": new_bash_nuc[fasta_name] = seq_gap_nuc @@ -180,8 +180,8 @@ if sys.argv[2] == "oui" : - print "\nIn locus with CDS considering Methionine : \n" + print("\nIn locus with CDS considering Methionine : \n") else : - print "\nIn locus with CDS regardless of the Methionine : \n" + print("\nIn locus with CDS regardless of the Methionine : \n") -print "\nTotal number of locus recorded = %d" %n0 \ No newline at end of file +print("\nTotal number of locus recorded = %d" %n0) |
b |
diff -r eb95bf7f90ae -r c79bdda8abfb scripts/S03_remove_site_with_not_enough_species_represented.py --- a/scripts/S03_remove_site_with_not_enough_species_represented.py Fri Feb 01 10:26:37 2019 -0500 +++ b/scripts/S03_remove_site_with_not_enough_species_represented.py Thu Jun 09 12:40:00 2022 +0000 |
[ |
@@ -12,7 +12,7 @@ def remove_position_with_too_much_missing_data(bash_aa, bash_nuc, MIN_SPECIES_NB): ## 1 ## Get alignment length - fasta_name0 = bash_aa.keys()[0] + fasta_name0 = list(bash_aa.keys())[0] ln_aa = len(bash_aa[fasta_name0]) ln_nuc = len(bash_nuc[fasta_name0]) @@ -23,7 +23,7 @@ i=0 while i < ln_aa: site = [] - for fasta_name in bash_aa.keys(): + for fasta_name in list(bash_aa.keys()): pos = bash_aa[fasta_name][i] if pos != "-" and pos != "?" and pos != "X": @@ -45,15 +45,15 @@ ## 4 ## Create entries for "filtered_bash" for aa & nuc filtered_bash_aa = {} filtered_bash_nuc = {} - for fasta_name in bash_aa.keys(): + for fasta_name in list(bash_aa.keys()): filtered_bash_aa[fasta_name] = "" - for fasta_name in bash_nuc.keys(): + for fasta_name in list(bash_nuc.keys()): filtered_bash_nuc[fasta_name] = "" ## 5 ## Write "filtered_bash" for aa j=0 while j < ln_aa: - for fasta_name in bash_aa.keys(): + for fasta_name in list(bash_aa.keys()): seq=filtered_bash_aa[fasta_name] pos=bash_aa[fasta_name][j] @@ -63,7 +63,7 @@ j = j + 1 ## 6 ## Remove empty sequence - for name in filtered_bash_aa.keys(): + for name in list(filtered_bash_aa.keys()): seq = filtered_bash_aa[name] if seq == '': del filtered_bash_aa[name] @@ -72,7 +72,7 @@ ## 7 ## Write "filtered_bash" for nuc j=0 while j < ln_nuc: - for fasta_name in bash_nuc.keys(): + for fasta_name in list(bash_nuc.keys()): seq=filtered_bash_nuc[fasta_name] #print seq pos=bash_nuc[fasta_name][j] @@ -83,7 +83,7 @@ j = j + 1 ## 8 ## Remove empty sequence - for name in filtered_bash_nuc.keys(): + for name in list(filtered_bash_nuc.keys()): seq = filtered_bash_nuc[name] if seq == '': del filtered_bash_nuc[name] @@ -147,7 +147,7 @@ ## 4.1 ## REMOVE POSITIONS WITH TOO MUCH MISSING DATA (i.e. not enough taxa represented at each position in the alignment) filtered_bash_aa, filtered_bash_nuc = remove_position_with_too_much_missing_data(dico_aa, dico_nuc, MIN_SPECIES_NB) ### DEF 2 ### - k = filtered_bash_nuc.keys() + k = list(filtered_bash_nuc.keys()) new_leng_nuc = 0 if k != []: k0 = k[0] @@ -158,14 +158,14 @@ n0+=1 #name_elems[1] = str(n0) name_elems[1] = file.split('_')[1] - name_elems[3] = str(len(filtered_bash_aa.keys())) + name_elems[3] = str(len(list(filtered_bash_aa.keys()))) new_name = "_".join(name_elems) ## 4.5 ## Write filtered alignment in OUTPUTs ## aa if filtered_bash_aa != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC: OUTaa=open("%s/%s" %(path_OUT1, new_name), "w") - for fasta_name in filtered_bash_aa.keys(): + for fasta_name in list(filtered_bash_aa.keys()): seq_aa = filtered_bash_aa[fasta_name] OUTaa.write("%s\n" %fasta_name) OUTaa.write("%s\n" %seq_aa) @@ -174,7 +174,7 @@ if filtered_bash_nuc != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC: good+=1 OUTnuc=open("%s/%s" %(path_OUT2, new_name), "w") - for fasta_name in filtered_bash_nuc.keys(): + for fasta_name in list(filtered_bash_nuc.keys()): seq_nuc = filtered_bash_nuc[fasta_name] OUTnuc.write("%s\n" %fasta_name) OUTnuc.write("%s\n" %seq_nuc) @@ -184,8 +184,8 @@ ## 5 ## Print -print "*************** 2nd Filter : removal of the indel ***************" -print "\nTotal number of locus recorded = %d" %n0 -print "\tTotal number of locus with no indels (SAVED) = %d" %good -print "\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad -print "" \ No newline at end of file +print("*************** 2nd Filter : removal of the indel ***************") +print("\nTotal number of locus recorded = %d" %n0) +print("\tTotal number of locus with no indels (SAVED) = %d" %good) +print("\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad) +print("") |
b |
diff -r eb95bf7f90ae -r c79bdda8abfb scripts/dico.py --- a/scripts/dico.py Fri Feb 01 10:26:37 2019 -0500 +++ b/scripts/dico.py Thu Jun 09 12:40:00 2022 +0000 |
[ |
@@ -3,10 +3,10 @@ def dico(F1): dicoco = {} with open(F1, "r") as file: - for name, query in itertools.izip_longest(*[file]*2): + for name, query in itertools.zip_longest(*[file]*2): if name[0] == ">": fasta_name_query = name[:-1] - Sn = string.split(fasta_name_query, "||") + Sn = str.split(fasta_name_query, "||") fasta_name_query = Sn[0] fasta_seq_query = query[:-1] dicoco[fasta_name_query] = fasta_seq_query |