comparison trimal_repo/scripts/generateRandomAlignmentsUsingAsSeedRealAlignments.py @ 0:b15a3147e604 draft

"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author padge
date Fri, 25 Mar 2022 17:10:43 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b15a3147e604
1 #!/usr/bin/python
2 import os
3 import Bio
4 import sys
5 import random
6 import argparse
7 import numpy as np
8 from Bio import SeqIO
9
10 def splitSequence(seq, length = 80):
11 ''' Split a given sequence contained in one line into lines of size "length"
12 '''
13 return "\n".join([seq[i:i + length] for i in range(0, len(seq), length)])
14
15 if __name__ == "__main__":
16
17 parser = argparse.ArgumentParser()
18
19 parser.add_argument("-i", "--in", dest = "inFile", required = True, type = \
20 str, help = "Input Codon alignment")
21
22 parser.add_argument("-o", "--out", dest = "outFile", default = None, type = \
23 str, help = "Set output file")
24
25 parser.add_argument("-s", "--numb_sequences", dest = "numb_sequences", \
26 default = 2, type = int, help = "Set how many sequences the output "
27 + "alignment should contain")
28
29 parser.add_argument("-r", "--numb_residues", dest = "numb_residues", \
30 default = 100, type = int, help = "Set how many residues the output "
31 + "alignment should contain")
32
33 parser.add_argument("-f", "--input_format", dest = "inFormat", type = str, \
34 default = "fasta", help = "Set input alignment format")
35
36 parser.add_argument("-g", "--gap_symbol", dest = "gapSymbol", default = '-', \
37 type = str, help = "Define the gap symbol used in the input/output "
38 + "alignments")
39
40 parser.add_argument("-m", "--max_attempts", dest = "attempts", default = 10, \
41 type = int, help = "Define a maximum numnber of attempts when generating "
42 + "a random alignment before giving it up")
43
44 args = parser.parse_args()
45
46 ## Check input parameters
47 if not os.path.isfile(args.inFile):
48 sys.exit(("ERROR: Check input alignment file '%s'") % (args.inFile))
49
50 if args.numb_sequences < 2:
51 sys.exit(("ERROR: Check input sequences '%s'") % (str(args.numb_sequences)))
52
53 if args.numb_residues < 2:
54 sys.exit(("ERROR: Check input residues '%s'") % (str(args.numb_residues)))
55
56 if args.attempts < 1:
57 sys.exit(("ERROR: Check max. number of attempts '%s'") % (str(args.attempts)))
58
59 ## Read input alignment and get some basic information from it e.g.
60 ## sequences names, residues number, etc.
61 algLen = -1
62 alignment = {}
63 for record in SeqIO.parse(args.inFile, args.inFormat):
64 seq = str(record.seq)
65 alignment.setdefault(record.id, seq)
66 if algLen == -1:
67 algLen = len(seq)
68 if len(seq) != algLen:
69 print("Detected Inconsistencies at Sequence's length", file = sys.stderr)
70
71 sequences = list(alignment.keys())
72 columns = list(range(algLen))
73
74 ## Select randomly sequences and columns from the input alignment to populate
75 ## the output alignment controlling there are not sequences nor columns
76 ## composed only by gaps.
77
78 ## This is an iterative process
79 selected_seqs = []
80 discarded_seqs = set()
81 selected_cols = []
82 discarded_cols = set()
83
84 ## Set a counter to control how many attempts are done for generating the
85 ## random alignment
86 max_attempts = 0
87 while True:
88
89 while len(selected_seqs) < args.numb_sequences:
90 selected = random.choice(sequences)
91 if not selected in discarded_seqs:
92 selected_seqs.append(selected)
93
94 while len(selected_cols) < args.numb_residues:
95 selected = random.choice(columns)
96 if not selected in discarded_cols:
97 selected_cols.append(selected)
98
99 generated = {}
100 for seq in selected_seqs:
101 if seq in generated:
102 continue
103 ## We check generated sequences are not composed only by gaps
104 sequence = [alignment[seq][pos] for pos in selected_cols]
105 if set(sequence) - set([args.gapSymbol]) == set([]):
106 discarded_seqs.add(seq)
107 continue
108 generated.setdefault(seq, splitSequence("".join(sequence)))
109
110 ## We have to check there are not columns composed only by gaps
111 for column in range(len(selected_cols)):
112 individual_column = [generated[seq][column] for seq in generated]
113 if set(individual_column) - set([args.gapSymbol]) == set([]):
114 discarded_cols.add(selected_cols[column])
115
116 ## We check which sequences/residues remain after controlling by those
117 ## composed only by gaps
118 selected_seqs = [s for s in selected_seqs if not s in discarded_seqs]
119 selected_cols = [c for c in selected_cols if not c in discarded_cols]
120
121 if len(selected_seqs) == args.numb_sequences and \
122 len(selected_cols) == args.numb_residues:
123 break
124
125 max_attempts += 1
126 if max_attempts == args.attempts:
127 sys.exit(("ERROR: Impossible to generate random alignment after '%s' "
128 + "attempts. Check configuration") % (args.attempts))
129
130 ## Produce the output aligment.
131 n = 1
132 ofile = open(args.outFile, "w") if args.outFile else sys.stdout
133
134 ## How to properly name output sequences including a padding to have
135 ## homogeneuous ids
136 padding = int(np.ceil(np.log10(args.numb_sequences)))
137 if args.numb_sequences % 10 == 0:
138 padding += 1
139
140 for seq in selected_seqs:
141 print(">seq_%s\n%s" % (str(n).zfill(padding), generated[seq]), file = ofile)
142 n += 1
143 ofile.close()