annotate scripts/S01_find_orf_on_multiple_alignment.py @ 0:eb95bf7f90ae draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:26:37 -0500
parents
children c79bdda8abfb
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
1 #!/usr/bin/env python
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
2 # coding: utf8
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
3 # Author: Eric Fontanillas
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
4 # Modification: 03/09/14 by Julie BAFFARD
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
5 # Last modification : 25/07/18 by Victor Mataigne
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
6
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
7 # Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
8 # CRITERIA 1 - Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
9 # CRITERIA 2 - This longest part should be > 150nc or 50aa
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
10 # CRITERIA 3 - [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
11 # OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
12 # OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
13
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
14 import string, os, time, re, zipfile, sys, argparse
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
15 from dico import dico
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
16
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
17 def code_universel(F1):
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
18 """ Creates bash for genetic code (key : codon ; value : amino-acid) """
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
19 bash_codeUniversel = {}
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
20
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
21 with open(F1, "r") as file:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
22 for line in file.readlines():
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
23 L1 = string.split(line, " ")
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
24 length1 = len(L1)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
25 if length1 == 3:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
26 key = L1[0]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
27 value = L1[2][:-1]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
28 bash_codeUniversel[key] = value
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
29 else:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
30 key = L1[0]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
31 value = L1[2]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
32 bash_codeUniversel[key] = value
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
33
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
34 return(bash_codeUniversel)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
35
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
36 def multiple3(seq):
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
37 """ Tests if the sequence is a multiple of 3, and if not removes extra-bases
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
38 !! Possible to lost a codon, when I test ORF (as I will decay the ORF) """
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
39
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
40 m = len(seq)%3
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
41 if m != 0 :
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
42 return seq[:-m], m
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
43 else :
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
44 return seq, m
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
45
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
46 def detect_Methionine(seq_aa, Ortho, minimal_cds_length):
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
47 """ Detects if methionin in the aa sequence """
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
48
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
49 ln = len(seq_aa)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
50 CUTOFF_Last_50aa = ln - minimal_cds_length
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
51
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
52 # Find all indices of occurances of "M" in a string of aa
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
53 list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
54
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
55 # If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
56 if list_indices != []:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
57 first_M = list_indices[0]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
58 if first_M < CUTOFF_Last_50aa:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
59 Ortho = 1 # means orthologs found
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
60
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
61 return(Ortho)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
62
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
63 def ReverseComplement2(seq):
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
64 """ Reverse complement DNA sequence """
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
65 seq1 = 'ATCGN-TAGCN-atcgn-tagcn-'
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
66 seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 }
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
67
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
68 return "".join([seq_dict[base] for base in reversed(seq)])
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
69
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
70 def simply_get_ORF(seq_dna, gen_code):
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
71 seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i+3] for i in range(0, len(seq_dna), 3)]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
72 seq_by_aa = [gen_code[codon] if codon in gen_code.keys() else '?' for codon in seq_by_codons]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
73
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
74 return ''.join(seq_by_aa)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
75
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
76 def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel, minimal_cds_length, min_spec):
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
77 # Multiple sequence based : Based on the alignment of several sequences (orthogroup)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
78 # Criteria 1 : Get the segment in the alignment with no codon stop
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
79
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
80 # 1 - Get the list of aligned aa seq for the 3 ORF:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
81 bash_of_aligned_aa_seq_3ORF = {}
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
82 bash_of_aligned_nuc_seq_3ORF = {}
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
83 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = []
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
84
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
85 for fasta_name in bash_aligned_nc_seq.keys():
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
86 # Get sequence, chek if multiple 3, then get 6 orfs
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
87 sequence_nc = bash_aligned_nc_seq[fasta_name]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
88 new_sequence_nc, modulo = multiple3(sequence_nc)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
89 new_sequence_rev = ReverseComplement2(new_sequence_nc)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
90 # For each seq of the multialignment => give the 6 ORFs (in nuc)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
91 bash_of_aligned_nuc_seq_3ORF[fasta_name] = [new_sequence_nc, new_sequence_nc[1:-2], new_sequence_nc[2:-1], new_sequence_rev, new_sequence_rev[1:-2], new_sequence_rev[2:-1]]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
92
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
93 seq_prot_ORF1 = simply_get_ORF(new_sequence_nc, bash_codeUniversel)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
94 seq_prot_ORF2 = simply_get_ORF(new_sequence_nc[1:-2], bash_codeUniversel)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
95 seq_prot_ORF3 = simply_get_ORF(new_sequence_nc[2:-1], bash_codeUniversel)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
96 seq_prot_ORF4 = simply_get_ORF(new_sequence_rev, bash_codeUniversel)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
97 seq_prot_ORF5 = simply_get_ORF(new_sequence_rev[1:-2], bash_codeUniversel)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
98 seq_prot_ORF6 = simply_get_ORF(new_sequence_rev[2:-1], bash_codeUniversel)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
99
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
100 # For each seq of the multialignment => give the 6 ORFs (in aa)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
101 bash_of_aligned_aa_seq_3ORF[fasta_name] = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3, seq_prot_ORF4, seq_prot_ORF5, seq_prot_ORF6]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
102
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
103 # 2 - Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
104 BEST_MAX = 0
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
105
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
106 for i in [0,1,2,3,4,5]: # Test the 6 ORFs
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
107 ORF_Aligned_aa = []
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
108 ORF_Aligned_nuc = []
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
109
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
110 # 2.1 - Get the alignment of sequence for a given ORF
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
111 # Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
112 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys():
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
113 ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
114 aa_length = len(ORFsequence)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
115 ORF_Aligned_aa.append(ORFsequence) ### List of all sequences in the ORF nb "i" =
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
116
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
117 n = i+1
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
118
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
119 for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys():
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
120 ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
121 nuc_length = len(ORFsequence)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
122 ORF_Aligned_nuc.append(ORFsequence) # List of all sequences in the ORF nb "i" =
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
123
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
124 # 2.2 - Get the list of sublist of positions whithout codon stop in the alignment
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
125 # For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
126 # Next step is to get the longuest subsequence whithout stop
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
127 # We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*"
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
128 MAX_LENGTH = 0
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
129 LONGUEST_SEGMENT_UNSTOPPED = ""
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
130 j = 0 # Start from first position in alignment
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
131 List_of_List_subsequences = []
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
132 List_positions_subsequence = []
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
133 while j < aa_length:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
134 column = []
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
135 for seq in ORF_Aligned_aa:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
136 column.append(seq[j])
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
137 j = j+1
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
138 if "*" in column:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
139 List_of_List_subsequences.append(List_positions_subsequence) # Add previous list of positions
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
140 List_positions_subsequence = [] # Re-initialyse list of positions
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
141 else:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
142 List_positions_subsequence.append(j)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
143
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
144 # 2.3 - Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
145 LONGUEST_SUBSEQUENCE_LIST_POSITION = []
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
146 MAX=0
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
147 for sublist in List_of_List_subsequences:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
148 if len(sublist) > MAX and len(sublist) > minimal_cds_length:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
149 MAX = len(sublist)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
150 LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
151
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
152 # 2.4. - Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
153 if LONGUEST_SUBSEQUENCE_LIST_POSITION != []:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
154 if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
155 CDS_maybe_truncated = 1
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
156 else:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
157 CDS_maybe_truncated = 0
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
158 else:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
159 CDS_maybe_truncated = 0
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
160
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
161
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
162 # 2.5 - Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
163 # Test whether it is the better ORF
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
164 if MAX > BEST_MAX:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
165 BEST_MAX = MAX
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
166 BEST_ORF = i+1
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
167 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = LONGUEST_SUBSEQUENCE_LIST_POSITION
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
168
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
169
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
170 # 3 - ONCE we have this better segment (BEST CODING SEGMENT)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
171 # ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
172 # And get the INDEX of the best ORF [0, 1, or 2]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
173 if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
174 pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
175 pos_MIN_aa = pos_MIN_aa - 1
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
176 pos_MAX_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[-1]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
177
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
178
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
179 BESTORF_bash_of_aligned_aa_seq = {}
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
180 BESTORF_bash_of_aligned_aa_seq_CODING = {}
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
181 for fasta_name in bash_of_aligned_aa_seq_3ORF.keys():
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
182 index_BEST_ORF = BEST_ORF-1 # cause list going from 0 to 2 in LIST_3_ORF, while the ORF nb is indexed from 1 to 3
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
183 seq = bash_of_aligned_aa_seq_3ORF[fasta_name][index_BEST_ORF]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
184 seq_coding = seq[pos_MIN_aa:pos_MAX_aa]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
185 BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
186 BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
187
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
188 # 4 - Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
189 pos_MIN_nuc = pos_MIN_aa * 3
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
190 pos_MAX_nuc = pos_MAX_aa * 3
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
191
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
192 BESTORF_bash_aligned_nc_seq = {}
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
193 BESTORF_bash_aligned_nc_seq_CODING = {}
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
194 for fasta_name in bash_aligned_nc_seq.keys():
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
195 seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
196 seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
197 BESTORF_bash_aligned_nc_seq[fasta_name] = seq
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
198 BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
199
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
200 else: # no CDS found
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
201 BESTORF_bash_aligned_nc_seq = {}
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
202 BESTORF_bash_aligned_nc_seq_CODING = {}
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
203 BESTORF_bash_of_aligned_aa_seq = {}
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
204 BESTORF_bash_of_aligned_aa_seq_CODING ={}
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
205
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
206 # Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
207
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
208 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {}
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
209 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {}
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
210
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
211 Ortho = 0
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
212 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys():
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
213 seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
214 Ortho = detect_Methionine(seq_aa, Ortho, minimal_cds_length) ### DEF6 ###
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
215
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
216 # CASE 1: A "M" is present and correctly localized (not in last 50 aa)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
217 if Ortho == 1:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
218 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
219 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
220
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
221 # CASE 2: in case the CDS is truncated, so the "M" is maybe missing:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
222 if Ortho == 0 and CDS_maybe_truncated == 1:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
223 BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
224 BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
225
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
226 # CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa):
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
227 ## => the 2 bash "CDS_with_M" are left empty ("{}")
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
228
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
229 return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
230
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
231 def write_output_file(results_dict, name_elems, path_out):
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
232 if results_dict != {}:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
233 name_elems[3] = str(len(results_dict.keys()))
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
234 new_name = "_".join(name_elems)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
235
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
236 out1 = open("%s/%s" %(path_out,new_name), "w")
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
237 for fasta_name in results_dict.keys():
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
238 seq = results_dict[fasta_name]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
239 out1.write("%s\n" %fasta_name)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
240 out1.write("%s\n" %seq)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
241 out1.close()
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
242
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
243 def main():
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
244 parser = argparse.ArgumentParser()
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
245 parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt")
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
246 parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
247 parser.add_argument("min_spec", help="Minimal number of species per alignment")
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
248 parser.add_argument("list_files", help="File with all input files names")
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
249 args = parser.parse_args()
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
250
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
251 minimal_cds_length = int(args.min_cds_len) # in aa number
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
252 bash_codeUniversel = code_universel(args.codeUniversel)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
253 minimum_species = int(args.min_spec)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
254
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
255 # Inputs from file containing list of species
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
256 list_files = []
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
257 with open(args.list_files, 'r') as f:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
258 for line in f.readlines():
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
259 list_files.append(line.strip('\n'))
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
260
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
261 # Directories for results
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
262 dirs = ["04_BEST_ORF_nuc", "04_BEST_ORF_aa", "05_CDS_nuc", "05_CDS_aa", "06_CDS_with_M_nuc", "06_CDS_with_M_aa"]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
263 for directory in dirs:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
264 os.mkdir(directory)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
265
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
266 count_file_processed, count_file_with_CDS, count_file_without_CDS, count_file_with_CDS_plus_M = 0, 0, 0, 0
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
267 count_file_with_cds_and_enought_species, count_file_with_cds_M_and_enought_species = 0, 0
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
268
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
269 # ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name),
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
270 # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified.
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
271 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
272
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
273 # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
274 #n0 = 0
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
275
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
276 for file in list_files:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
277 #n0 += 1
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
278
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
279 count_file_processed = count_file_processed + 1
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
280 nb_gp = file.split('_')[1] # Keep trace of the orthogroup number
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
281 fasta_file_path = "./%s" %file
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
282 bash_fasta = dico(fasta_file_path)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
283 BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
284
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
285 name_elems[1] = nb_gp
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
286
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
287 # Update counts and write group in corresponding output directory
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
288 if BESTORF_nuc != {}:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
289 count_file_with_CDS += 1
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
290 if len(BESTORF_nuc.keys()) >= minimum_species :
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
291 count_file_with_cds_and_enought_species += 1
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
292 write_output_file(BESTORF_nuc, name_elems, dirs[0]) # OUTPUT BESTORF_nuc
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
293 write_output_file(BESTORF_aa, name_elems, dirs[1]) # The most interesting
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
294 else:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
295 count_file_without_CDS += 1
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
296
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
297 if BESTORF_nuc_CODING != {} and len(BESTORF_nuc_CODING.keys()) >= minimum_species:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
298 write_output_file(BESTORF_nuc_CODING, name_elems, dirs[2])
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
299 write_output_file(BESTORF_aa_CODING, name_elems, dirs[3])
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
300
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
301 if BESTORF_nuc_CDS_with_M != {}:
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
302 count_file_with_CDS_plus_M += 1
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
303 if len(BESTORF_nuc_CDS_with_M.keys()) >= minimum_species :
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
304 count_file_with_cds_M_and_enought_species += 1
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
305 write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4])
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
306 write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5])
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
307
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
308 print "*************** CDS detection ***************"
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
309 print "\nFiles processed: %d" %count_file_processed
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
310 print "\tFiles with CDS: %d" %count_file_with_CDS
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
311 print "\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
312 print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
313 print "\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
314 print "\tFiles without CDS: %d \n" %count_file_without_CDS
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
315 print ""
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
316
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
317 if __name__ == '__main__':
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
318 main()