cds_search: scripts/S01_find_orf_on_multiple

comparison scripts/S01_find_orf_on_multiple_alignment.py @ 1:c79bdda8abfb draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3a118aa934e6406cc8b0b24d006af6365c277519

author	abims-sbr
date	Thu, 09 Jun 2022 12:40:00 +0000
parents	eb95bf7f90ae
children

comparison

equal deleted inserted replaced

-:eb95bf7f90ae
+:c79bdda8abfb
 #!/usr/bin/env python
 # coding: utf8
 # Author: Eric Fontanillas
 # Modification: 03/09/14 by Julie BAFFARD
-# Last modification : 25/07/18 by Victor Mataigne
+# Last modification : 10/09/21 by Charlotte Berthelier
-# Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria
+"""
-# CRITERIA 1 - Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF
+Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria
-# CRITERIA 2 - This longest part should be > 150nc or 50aa
-# CRITERIA 3 - [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa
+- CRITERIA 1
-# OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA
-# OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA
+Get the longest part of the sequence alignemen without codon stop "*",
+and test in the 3 potential ORF and check with a Blast the best coding sequence
-import string, os, time, re, zipfile, sys, argparse
+- CRITERIA 2
+This longest part should be > 150nc or 50aa
+- CRITERIA 3 [OPTIONNAL]
+A codon start "M" should be present in this longuest part, before the last 50 aa
+OUTPUTs:
+"05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA
+"06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA
+"""
+import os
+import re
+import argparse
+from Bio.Blast import NCBIWWW
+from Bio.Blast import NCBIXML
 from dico import dico
-def code_universel(F1):
+def code_universel(file1):
 """ Creates bash for genetic code (key : codon ; value : amino-acid) """
-bash_codeUniversel = {}
+bash_code_universel = {}
-with open(F1, "r") as file:
+with open(file1, "r") as file:
 for line in file.readlines():
-L1 = string.split(line, " ")
+item = str.split(line, " ")
-length1 = len(L1)
+length1 = len(item)
 if length1 == 3:
-key = L1[0]
+key = item[0]
-value = L1[2][:-1]
+value = item[2][:-1]
-bash_codeUniversel[key] = value
+bash_code_universel[key] = value
 else:
-key = L1[0]
+key = item[0]
-value = L1[2]
+value = item[2]
-bash_codeUniversel[key] = value
+bash_code_universel[key] = value
-return(bash_codeUniversel)
+return bash_code_universel
 def multiple3(seq):
-""" Tests if the sequence is a multiple of 3, and if not removes extra-bases
+"""
-!! Possible to lost a codon, when I test ORF (as I will decay the ORF) """
+Tests if the sequence is a multiple of 3, and if not removes extra-bases
+Possible to lost a codon, when I test ORF (as I will decay the ORF)
-m = len(seq)%3
+"""
-if m != 0 :
-return seq[:-m], m
+multiple = len(seq) % 3
-else :
+if multiple != 0:
-return seq, m
+return seq[:-multiple], multiple
+return seq, multiple
-def detect_Methionine(seq_aa, Ortho, minimal_cds_length):
+def detect_methionine(seq_aa, ortho, minimal_cds_length):
 """ Detects if methionin in the aa sequence """
-ln = len(seq_aa)
+size = len(seq_aa)
-CUTOFF_Last_50aa = ln - minimal_cds_length
+cutoff_last_50aa = size - minimal_cds_length
 # Find all indices of occurances of "M" in a string of aa
 list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"]
-# If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS
+# If some "M" are present, find whether the first "M" found is not in the
+# 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS
 if list_indices != []:
-first_M = list_indices[0]
+first_m = list_indices[0]
-if first_M < CUTOFF_Last_50aa:
+if first_m < cutoff_last_50aa:
-Ortho = 1  # means orthologs found
+ortho = 1  # means orthologs found
-return(Ortho)
+return ortho
-def ReverseComplement2(seq):
+def reverse_complement2(seq):
 """ Reverse complement DNA sequence """
 seq1 = 'ATCGN-TAGCN-atcgn-tagcn-'
-seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 }
+seq_dict = {seq1[i]: seq1[i + 6]
+for i in range(24) if i < 6 or 12 <= i <= 16}
 return "".join([seq_dict[base] for base in reversed(seq)])
-def simply_get_ORF(seq_dna, gen_code):
-seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i+3] for i in range(0, len(seq_dna), 3)]
+def simply_get_orf(seq_dna, gen_code):
-seq_by_aa = [gen_code[codon] if codon in gen_code.keys() else '?' for codon in seq_by_codons]
+""" Generate the ORF sequence from DNA sequence """
+seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i + 3]
+for i in range(0, len(seq_dna), 3)]
+seq_by_aa = [gen_code[codon] if codon in gen_code.keys()
+else '?' for codon in seq_by_codons]
 return ''.join(seq_by_aa)
 def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel, minimal_cds_length, min_spec):
 # Multiple sequence based : Based on the alignment of several sequences (orthogroup)
 # Criteria 1 : Get the segment in the alignment with no codon stop
 # 1 - Get the list of aligned aa seq for the 3 ORF:
+print("1 - Get the list of aligned aa seq for the 3 ORF")
 bash_of_aligned_aa_seq_3ORF = {}
 bash_of_aligned_nuc_seq_3ORF = {}
 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = []
 for fasta_name in bash_aligned_nc_seq.keys():
 # Get sequence, chek if multiple 3, then get 6 orfs
 sequence_nc = bash_aligned_nc_seq[fasta_name]
 new_sequence_nc, modulo = multiple3(sequence_nc)
-new_sequence_rev = ReverseComplement2(new_sequence_nc)
+new_sequence_rev = reverse_complement2(new_sequence_nc)
 # For each seq of the multialignment => give the 6 ORFs (in nuc)
 bash_of_aligned_nuc_seq_3ORF[fasta_name] = [new_sequence_nc, new_sequence_nc[1:-2], new_sequence_nc[2:-1], new_sequence_rev, new_sequence_rev[1:-2], new_sequence_rev[2:-1]]
-seq_prot_ORF1 = simply_get_ORF(new_sequence_nc, bash_codeUniversel)
+seq_prot_ORF1 = simply_get_orf(new_sequence_nc, bash_codeUniversel)
-seq_prot_ORF2 = simply_get_ORF(new_sequence_nc[1:-2], bash_codeUniversel)
+seq_prot_ORF2 = simply_get_orf(new_sequence_nc[1:-2], bash_codeUniversel)
-seq_prot_ORF3 = simply_get_ORF(new_sequence_nc[2:-1], bash_codeUniversel)
+seq_prot_ORF3 = simply_get_orf(new_sequence_nc[2:-1], bash_codeUniversel)
-seq_prot_ORF4 = simply_get_ORF(new_sequence_rev, bash_codeUniversel)
+seq_prot_ORF4 = simply_get_orf(new_sequence_rev, bash_codeUniversel)
-seq_prot_ORF5 = simply_get_ORF(new_sequence_rev[1:-2], bash_codeUniversel)
+seq_prot_ORF5 = simply_get_orf(new_sequence_rev[1:-2], bash_codeUniversel)
-seq_prot_ORF6 = simply_get_ORF(new_sequence_rev[2:-1], bash_codeUniversel)
+seq_prot_ORF6 = simply_get_orf(new_sequence_rev[2:-1], bash_codeUniversel)
 # For each seq of the multialignment => give the 6 ORFs (in aa)
 bash_of_aligned_aa_seq_3ORF[fasta_name] = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3, seq_prot_ORF4, seq_prot_ORF5, seq_prot_ORF6]
 # 2 - Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF)
+print("2 - Test for the best ORF")
 BEST_MAX = 0
 for i in [0,1,2,3,4,5]: # Test the 6 ORFs
 ORF_Aligned_aa = []
 ORF_Aligned_nuc = []
 # 3 - ONCE we have this better segment (BEST CODING SEGMENT)
 # ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position)
 # And get the INDEX of the best ORF [0, 1, or 2]
+print("3 - ONCE we have this better segment")
 if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []:
 pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0]
 pos_MIN_aa = pos_MIN_aa - 1
 pos_MAX_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[-1]
 seq_coding = seq[pos_MIN_aa:pos_MAX_aa]
 BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq
 BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding
 # 4 - Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment
+print("4 - Get the corresponding position")
 pos_MIN_nuc = pos_MIN_aa * 3
 pos_MAX_nuc = pos_MAX_aa * 3
 BESTORF_bash_aligned_nc_seq = {}
 BESTORF_bash_aligned_nc_seq_CODING = {}
 for fasta_name in bash_aligned_nc_seq.keys():
 seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF]
 seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc]
 BESTORF_bash_aligned_nc_seq[fasta_name] = seq
 BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding
+seq_cutted = re.sub(r'^.*?[a-zA-Z]', '', seq)
+sequence_for_blast=(fasta_name+'\n'+seq_cutted+'\n')
+good_ORF_found = False
+try:
+#result_handle = ""
+#blast_records = ""
+# logger.debug("sequence_for_blast = %s ", sequence_for_blast)
+print('sequence_for_blast = %s ',sequence_for_blast, end=' ', flush=True)
+result_handle = NCBIWWW.qblast("blastn", "/db/nt/current/fasta/nt.fsa", sequence_for_blast, expect=0.001, hitlist_size=1)
+blast_records = NCBIXML.parse(result_handle)
+except:
+good_ORF_found = False
+else:
+for blast_record in blast_records:
+for alignment in blast_record.alignments:
+for hsp in alignment.hsps:
+if hsp.expect < 0.001:
+good_ORF_found = True
+print("good_ORF_found = %s" %good_ORF_found)
 else: # no CDS found
 BESTORF_bash_aligned_nc_seq = {}
 BESTORF_bash_aligned_nc_seq_CODING = {}
 BESTORF_bash_of_aligned_aa_seq = {}
 BESTORF_bash_of_aligned_aa_seq_CODING ={}
 # Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa
 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {}
 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {}
 Ortho = 0
 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys():
 seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name]
-Ortho = detect_Methionine(seq_aa, Ortho, minimal_cds_length)   ### DEF6 ###
+Ortho = detect_methionine(seq_aa, Ortho, minimal_cds_length)   ### DEF6 ###
 # CASE 1: A "M" is present and correctly localized (not in last 50 aa)
 if Ortho == 1:
 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING
 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING
 out1.close()
 def main():
 parser = argparse.ArgumentParser()
 parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt")
 parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int)
 parser.add_argument("min_spec", help="Minimal number of species per alignment")
 parser.add_argument("list_files", help="File with all input files names")
 args = parser.parse_args()
 minimal_cds_length = int(args.min_cds_len)  # in aa number
 bash_codeUniversel = code_universel(args.codeUniversel)
 minimum_species = int(args.min_spec)
 # Inputs from file containing list of species
 list_files = []
 with open(args.list_files, 'r') as f:
 for line in f.readlines():
 list_files.append(line.strip('\n'))
 for file in list_files:
 #n0 += 1
 count_file_processed = count_file_processed + 1
 nb_gp = file.split('_')[1] # Keep trace of the orthogroup number
 fasta_file_path = "./%s" %file
 bash_fasta = dico(fasta_file_path)
 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M  = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species)
 name_elems[1] = nb_gp
 # Update counts and write group in corresponding output directory
 if BESTORF_nuc != {}:
 count_file_with_CDS += 1
 if len(BESTORF_nuc.keys()) >= minimum_species :
 count_file_with_cds_and_enought_species += 1
 write_output_file(BESTORF_nuc, name_elems, dirs[0]) # OUTPUT BESTORF_nuc
 write_output_file(BESTORF_aa, name_elems, dirs[1]) # The most interesting
 if len(BESTORF_nuc_CDS_with_M.keys()) >= minimum_species :
 count_file_with_cds_M_and_enought_species += 1
 write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4])
 write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5])
-print "*************** CDS detection ***************"
+print("*************** CDS detection ***************")
-print "\nFiles processed: %d" %count_file_processed
+print("\nFiles processed: %d" %count_file_processed)
-print "\tFiles with CDS: %d" %count_file_with_CDS
+print("\tFiles with CDS: %d" %count_file_with_CDS)
-print "\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species)
+print("\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species))
-print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M
+print("\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M)
-print "\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species)
+print("\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) )
-print "\tFiles without CDS: %d \n" %count_file_without_CDS
+print("\tFiles without CDS: %d \n" %count_file_without_CDS)
-print ""
+print("")
 if __name__ == '__main__':
 main()

Mercurial > repos > abims-sbr > cds_search

comparison scripts/S01_find_orf_on_multiple_alignment.py @ 1:c79bdda8abfb draft default tip