Mercurial > repos > abims-sbr > cds_search
diff scripts/S01_find_orf_on_multiple_alignment.py @ 0:eb95bf7f90ae draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:26:37 -0500 |
parents | |
children | c79bdda8abfb |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/S01_find_orf_on_multiple_alignment.py Fri Feb 01 10:26:37 2019 -0500 @@ -0,0 +1,318 @@ +#!/usr/bin/env python +# coding: utf8 +# Author: Eric Fontanillas +# Modification: 03/09/14 by Julie BAFFARD +# Last modification : 25/07/18 by Victor Mataigne + +# Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria + # CRITERIA 1 - Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF + # CRITERIA 2 - This longest part should be > 150nc or 50aa + # CRITERIA 3 - [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa + # OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA + # OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA + +import string, os, time, re, zipfile, sys, argparse +from dico import dico + +def code_universel(F1): + """ Creates bash for genetic code (key : codon ; value : amino-acid) """ + bash_codeUniversel = {} + + with open(F1, "r") as file: + for line in file.readlines(): + L1 = string.split(line, " ") + length1 = len(L1) + if length1 == 3: + key = L1[0] + value = L1[2][:-1] + bash_codeUniversel[key] = value + else: + key = L1[0] + value = L1[2] + bash_codeUniversel[key] = value + + return(bash_codeUniversel) + +def multiple3(seq): + """ Tests if the sequence is a multiple of 3, and if not removes extra-bases + !! Possible to lost a codon, when I test ORF (as I will decay the ORF) """ + + m = len(seq)%3 + if m != 0 : + return seq[:-m], m + else : + return seq, m + +def detect_Methionine(seq_aa, Ortho, minimal_cds_length): + """ Detects if methionin in the aa sequence """ + + ln = len(seq_aa) + CUTOFF_Last_50aa = ln - minimal_cds_length + + # Find all indices of occurances of "M" in a string of aa + list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"] + + # If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS + if list_indices != []: + first_M = list_indices[0] + if first_M < CUTOFF_Last_50aa: + Ortho = 1 # means orthologs found + + return(Ortho) + +def ReverseComplement2(seq): + """ Reverse complement DNA sequence """ + seq1 = 'ATCGN-TAGCN-atcgn-tagcn-' + seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 } + + return "".join([seq_dict[base] for base in reversed(seq)]) + +def simply_get_ORF(seq_dna, gen_code): + seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i+3] for i in range(0, len(seq_dna), 3)] + seq_by_aa = [gen_code[codon] if codon in gen_code.keys() else '?' for codon in seq_by_codons] + + return ''.join(seq_by_aa) + +def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel, minimal_cds_length, min_spec): + # Multiple sequence based : Based on the alignment of several sequences (orthogroup) + # Criteria 1 : Get the segment in the alignment with no codon stop + + # 1 - Get the list of aligned aa seq for the 3 ORF: + bash_of_aligned_aa_seq_3ORF = {} + bash_of_aligned_nuc_seq_3ORF = {} + BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = [] + + for fasta_name in bash_aligned_nc_seq.keys(): + # Get sequence, chek if multiple 3, then get 6 orfs + sequence_nc = bash_aligned_nc_seq[fasta_name] + new_sequence_nc, modulo = multiple3(sequence_nc) + new_sequence_rev = ReverseComplement2(new_sequence_nc) + # For each seq of the multialignment => give the 6 ORFs (in nuc) + bash_of_aligned_nuc_seq_3ORF[fasta_name] = [new_sequence_nc, new_sequence_nc[1:-2], new_sequence_nc[2:-1], new_sequence_rev, new_sequence_rev[1:-2], new_sequence_rev[2:-1]] + + seq_prot_ORF1 = simply_get_ORF(new_sequence_nc, bash_codeUniversel) + seq_prot_ORF2 = simply_get_ORF(new_sequence_nc[1:-2], bash_codeUniversel) + seq_prot_ORF3 = simply_get_ORF(new_sequence_nc[2:-1], bash_codeUniversel) + seq_prot_ORF4 = simply_get_ORF(new_sequence_rev, bash_codeUniversel) + seq_prot_ORF5 = simply_get_ORF(new_sequence_rev[1:-2], bash_codeUniversel) + seq_prot_ORF6 = simply_get_ORF(new_sequence_rev[2:-1], bash_codeUniversel) + + # For each seq of the multialignment => give the 6 ORFs (in aa) + bash_of_aligned_aa_seq_3ORF[fasta_name] = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3, seq_prot_ORF4, seq_prot_ORF5, seq_prot_ORF6] + + # 2 - Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) + BEST_MAX = 0 + + for i in [0,1,2,3,4,5]: # Test the 6 ORFs + ORF_Aligned_aa = [] + ORF_Aligned_nuc = [] + + # 2.1 - Get the alignment of sequence for a given ORF + # Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd + for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): + ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i] + aa_length = len(ORFsequence) + ORF_Aligned_aa.append(ORFsequence) ### List of all sequences in the ORF nb "i" = + + n = i+1 + + for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys(): + ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i] + nuc_length = len(ORFsequence) + ORF_Aligned_nuc.append(ORFsequence) # List of all sequences in the ORF nb "i" = + + # 2.2 - Get the list of sublist of positions whithout codon stop in the alignment + # For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF) + # Next step is to get the longuest subsequence whithout stop + # We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*" + MAX_LENGTH = 0 + LONGUEST_SEGMENT_UNSTOPPED = "" + j = 0 # Start from first position in alignment + List_of_List_subsequences = [] + List_positions_subsequence = [] + while j < aa_length: + column = [] + for seq in ORF_Aligned_aa: + column.append(seq[j]) + j = j+1 + if "*" in column: + List_of_List_subsequences.append(List_positions_subsequence) # Add previous list of positions + List_positions_subsequence = [] # Re-initialyse list of positions + else: + List_positions_subsequence.append(j) + + # 2.3 - Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF) + LONGUEST_SUBSEQUENCE_LIST_POSITION = [] + MAX=0 + for sublist in List_of_List_subsequences: + if len(sublist) > MAX and len(sublist) > minimal_cds_length: + MAX = len(sublist) + LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist + + # 2.4. - Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated) + if LONGUEST_SUBSEQUENCE_LIST_POSITION != []: + if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0: + CDS_maybe_truncated = 1 + else: + CDS_maybe_truncated = 0 + else: + CDS_maybe_truncated = 0 + + + # 2.5 - Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF) + # Test whether it is the better ORF + if MAX > BEST_MAX: + BEST_MAX = MAX + BEST_ORF = i+1 + BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = LONGUEST_SUBSEQUENCE_LIST_POSITION + + + # 3 - ONCE we have this better segment (BEST CODING SEGMENT) + # ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position) + # And get the INDEX of the best ORF [0, 1, or 2] + if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []: + pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0] + pos_MIN_aa = pos_MIN_aa - 1 + pos_MAX_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[-1] + + + BESTORF_bash_of_aligned_aa_seq = {} + BESTORF_bash_of_aligned_aa_seq_CODING = {} + for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): + index_BEST_ORF = BEST_ORF-1 # cause list going from 0 to 2 in LIST_3_ORF, while the ORF nb is indexed from 1 to 3 + seq = bash_of_aligned_aa_seq_3ORF[fasta_name][index_BEST_ORF] + seq_coding = seq[pos_MIN_aa:pos_MAX_aa] + BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq + BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding + + # 4 - Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment + pos_MIN_nuc = pos_MIN_aa * 3 + pos_MAX_nuc = pos_MAX_aa * 3 + + BESTORF_bash_aligned_nc_seq = {} + BESTORF_bash_aligned_nc_seq_CODING = {} + for fasta_name in bash_aligned_nc_seq.keys(): + seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF] + seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc] + BESTORF_bash_aligned_nc_seq[fasta_name] = seq + BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding + + else: # no CDS found + BESTORF_bash_aligned_nc_seq = {} + BESTORF_bash_aligned_nc_seq_CODING = {} + BESTORF_bash_of_aligned_aa_seq = {} + BESTORF_bash_of_aligned_aa_seq_CODING ={} + + # Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa + + BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} + BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} + + Ortho = 0 + for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): + seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] + Ortho = detect_Methionine(seq_aa, Ortho, minimal_cds_length) ### DEF6 ### + + # CASE 1: A "M" is present and correctly localized (not in last 50 aa) + if Ortho == 1: + BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING + BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING + + # CASE 2: in case the CDS is truncated, so the "M" is maybe missing: + if Ortho == 0 and CDS_maybe_truncated == 1: + BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING + BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING + + # CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa): + ## => the 2 bash "CDS_with_M" are left empty ("{}") + + return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M) + +def write_output_file(results_dict, name_elems, path_out): + if results_dict != {}: + name_elems[3] = str(len(results_dict.keys())) + new_name = "_".join(name_elems) + + out1 = open("%s/%s" %(path_out,new_name), "w") + for fasta_name in results_dict.keys(): + seq = results_dict[fasta_name] + out1.write("%s\n" %fasta_name) + out1.write("%s\n" %seq) + out1.close() + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt") + parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int) + parser.add_argument("min_spec", help="Minimal number of species per alignment") + parser.add_argument("list_files", help="File with all input files names") + args = parser.parse_args() + + minimal_cds_length = int(args.min_cds_len) # in aa number + bash_codeUniversel = code_universel(args.codeUniversel) + minimum_species = int(args.min_spec) + + # Inputs from file containing list of species + list_files = [] + with open(args.list_files, 'r') as f: + for line in f.readlines(): + list_files.append(line.strip('\n')) + + # Directories for results + dirs = ["04_BEST_ORF_nuc", "04_BEST_ORF_aa", "05_CDS_nuc", "05_CDS_aa", "06_CDS_with_M_nuc", "06_CDS_with_M_aa"] + for directory in dirs: + os.mkdir(directory) + + count_file_processed, count_file_with_CDS, count_file_without_CDS, count_file_with_CDS_plus_M = 0, 0, 0, 0 + count_file_with_cds_and_enought_species, count_file_with_cds_M_and_enought_species = 0, 0 + + # ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name), + # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified. + name_elems = ["orthogroup", "0", "with", "0", "species.fasta"] + + # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond + #n0 = 0 + + for file in list_files: + #n0 += 1 + + count_file_processed = count_file_processed + 1 + nb_gp = file.split('_')[1] # Keep trace of the orthogroup number + fasta_file_path = "./%s" %file + bash_fasta = dico(fasta_file_path) + BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species) + + name_elems[1] = nb_gp + + # Update counts and write group in corresponding output directory + if BESTORF_nuc != {}: + count_file_with_CDS += 1 + if len(BESTORF_nuc.keys()) >= minimum_species : + count_file_with_cds_and_enought_species += 1 + write_output_file(BESTORF_nuc, name_elems, dirs[0]) # OUTPUT BESTORF_nuc + write_output_file(BESTORF_aa, name_elems, dirs[1]) # The most interesting + else: + count_file_without_CDS += 1 + + if BESTORF_nuc_CODING != {} and len(BESTORF_nuc_CODING.keys()) >= minimum_species: + write_output_file(BESTORF_nuc_CODING, name_elems, dirs[2]) + write_output_file(BESTORF_aa_CODING, name_elems, dirs[3]) + + if BESTORF_nuc_CDS_with_M != {}: + count_file_with_CDS_plus_M += 1 + if len(BESTORF_nuc_CDS_with_M.keys()) >= minimum_species : + count_file_with_cds_M_and_enought_species += 1 + write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4]) + write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5]) + + print "*************** CDS detection ***************" + print "\nFiles processed: %d" %count_file_processed + print "\tFiles with CDS: %d" %count_file_with_CDS + print "\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species) + print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M + print "\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) + print "\tFiles without CDS: %d \n" %count_file_without_CDS + print "" + +if __name__ == '__main__': + main()