Mercurial > repos > padge > trimal
comparison trimal_repo/scripts/generateRandomAlignmentsUsingAsSeedRealAlignments.py @ 0:b15a3147e604 draft
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author | padge |
---|---|
date | Fri, 25 Mar 2022 17:10:43 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b15a3147e604 |
---|---|
1 #!/usr/bin/python | |
2 import os | |
3 import Bio | |
4 import sys | |
5 import random | |
6 import argparse | |
7 import numpy as np | |
8 from Bio import SeqIO | |
9 | |
10 def splitSequence(seq, length = 80): | |
11 ''' Split a given sequence contained in one line into lines of size "length" | |
12 ''' | |
13 return "\n".join([seq[i:i + length] for i in range(0, len(seq), length)]) | |
14 | |
15 if __name__ == "__main__": | |
16 | |
17 parser = argparse.ArgumentParser() | |
18 | |
19 parser.add_argument("-i", "--in", dest = "inFile", required = True, type = \ | |
20 str, help = "Input Codon alignment") | |
21 | |
22 parser.add_argument("-o", "--out", dest = "outFile", default = None, type = \ | |
23 str, help = "Set output file") | |
24 | |
25 parser.add_argument("-s", "--numb_sequences", dest = "numb_sequences", \ | |
26 default = 2, type = int, help = "Set how many sequences the output " | |
27 + "alignment should contain") | |
28 | |
29 parser.add_argument("-r", "--numb_residues", dest = "numb_residues", \ | |
30 default = 100, type = int, help = "Set how many residues the output " | |
31 + "alignment should contain") | |
32 | |
33 parser.add_argument("-f", "--input_format", dest = "inFormat", type = str, \ | |
34 default = "fasta", help = "Set input alignment format") | |
35 | |
36 parser.add_argument("-g", "--gap_symbol", dest = "gapSymbol", default = '-', \ | |
37 type = str, help = "Define the gap symbol used in the input/output " | |
38 + "alignments") | |
39 | |
40 parser.add_argument("-m", "--max_attempts", dest = "attempts", default = 10, \ | |
41 type = int, help = "Define a maximum numnber of attempts when generating " | |
42 + "a random alignment before giving it up") | |
43 | |
44 args = parser.parse_args() | |
45 | |
46 ## Check input parameters | |
47 if not os.path.isfile(args.inFile): | |
48 sys.exit(("ERROR: Check input alignment file '%s'") % (args.inFile)) | |
49 | |
50 if args.numb_sequences < 2: | |
51 sys.exit(("ERROR: Check input sequences '%s'") % (str(args.numb_sequences))) | |
52 | |
53 if args.numb_residues < 2: | |
54 sys.exit(("ERROR: Check input residues '%s'") % (str(args.numb_residues))) | |
55 | |
56 if args.attempts < 1: | |
57 sys.exit(("ERROR: Check max. number of attempts '%s'") % (str(args.attempts))) | |
58 | |
59 ## Read input alignment and get some basic information from it e.g. | |
60 ## sequences names, residues number, etc. | |
61 algLen = -1 | |
62 alignment = {} | |
63 for record in SeqIO.parse(args.inFile, args.inFormat): | |
64 seq = str(record.seq) | |
65 alignment.setdefault(record.id, seq) | |
66 if algLen == -1: | |
67 algLen = len(seq) | |
68 if len(seq) != algLen: | |
69 print("Detected Inconsistencies at Sequence's length", file = sys.stderr) | |
70 | |
71 sequences = list(alignment.keys()) | |
72 columns = list(range(algLen)) | |
73 | |
74 ## Select randomly sequences and columns from the input alignment to populate | |
75 ## the output alignment controlling there are not sequences nor columns | |
76 ## composed only by gaps. | |
77 | |
78 ## This is an iterative process | |
79 selected_seqs = [] | |
80 discarded_seqs = set() | |
81 selected_cols = [] | |
82 discarded_cols = set() | |
83 | |
84 ## Set a counter to control how many attempts are done for generating the | |
85 ## random alignment | |
86 max_attempts = 0 | |
87 while True: | |
88 | |
89 while len(selected_seqs) < args.numb_sequences: | |
90 selected = random.choice(sequences) | |
91 if not selected in discarded_seqs: | |
92 selected_seqs.append(selected) | |
93 | |
94 while len(selected_cols) < args.numb_residues: | |
95 selected = random.choice(columns) | |
96 if not selected in discarded_cols: | |
97 selected_cols.append(selected) | |
98 | |
99 generated = {} | |
100 for seq in selected_seqs: | |
101 if seq in generated: | |
102 continue | |
103 ## We check generated sequences are not composed only by gaps | |
104 sequence = [alignment[seq][pos] for pos in selected_cols] | |
105 if set(sequence) - set([args.gapSymbol]) == set([]): | |
106 discarded_seqs.add(seq) | |
107 continue | |
108 generated.setdefault(seq, splitSequence("".join(sequence))) | |
109 | |
110 ## We have to check there are not columns composed only by gaps | |
111 for column in range(len(selected_cols)): | |
112 individual_column = [generated[seq][column] for seq in generated] | |
113 if set(individual_column) - set([args.gapSymbol]) == set([]): | |
114 discarded_cols.add(selected_cols[column]) | |
115 | |
116 ## We check which sequences/residues remain after controlling by those | |
117 ## composed only by gaps | |
118 selected_seqs = [s for s in selected_seqs if not s in discarded_seqs] | |
119 selected_cols = [c for c in selected_cols if not c in discarded_cols] | |
120 | |
121 if len(selected_seqs) == args.numb_sequences and \ | |
122 len(selected_cols) == args.numb_residues: | |
123 break | |
124 | |
125 max_attempts += 1 | |
126 if max_attempts == args.attempts: | |
127 sys.exit(("ERROR: Impossible to generate random alignment after '%s' " | |
128 + "attempts. Check configuration") % (args.attempts)) | |
129 | |
130 ## Produce the output aligment. | |
131 n = 1 | |
132 ofile = open(args.outFile, "w") if args.outFile else sys.stdout | |
133 | |
134 ## How to properly name output sequences including a padding to have | |
135 ## homogeneuous ids | |
136 padding = int(np.ceil(np.log10(args.numb_sequences))) | |
137 if args.numb_sequences % 10 == 0: | |
138 padding += 1 | |
139 | |
140 for seq in selected_seqs: | |
141 print(">seq_%s\n%s" % (str(n).zfill(padding), generated[seq]), file = ofile) | |
142 n += 1 | |
143 ofile.close() |