comparison scripts/GetConsensus.py @ 0:c6bab5103a14 draft

"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
author iss
date Mon, 21 Mar 2022 15:23:09 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c6bab5103a14
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 """
4 ############################################################################
5 # Istituto Superiore di Sanita'
6 # European Union Reference Laboratory (EU-RL) for Escherichia coli, including Verotoxigenic E. coli (VTEC)
7 # Developer: Arnold Knijn arnold.knijn@iss.it
8 ############################################################################
9 """
10
11 import argparse
12 import sys
13 import os
14 import subprocess
15 import shutil
16
17 def __main__():
18 parser = argparse.ArgumentParser()
19 parser.add_argument('-i', '--input', dest='input', help='input alignment file')
20 parser.add_argument('-o', '--output', dest='output', help='output consensus file')
21 args = parser.parse_args()
22
23 sequences = []
24 varsequences = []
25 # read input
26 with open(args.input) as alignments:
27 for alignment in alignments:
28 if alignment[0] != ">":
29 sequences.append(alignment.rstrip())
30 numsequences = len(sequences)
31 for j in range(0, numsequences + 1):
32 varsequences.append("")
33 lstnumvariants = []
34 lstnumhyphens = []
35 # loop over the columns
36 for i in range(0, len(sequences[0])):
37 variants = []
38 numhyphens = 0
39 # loop over the original rows to obtain variants
40 for j in range(0, numsequences):
41 if sequences[j][i:i+1] == "-":
42 numhyphens = numhyphens + 1
43 if sequences[j][i:i+1] not in variants and sequences[j][i:i+1] != "-":
44 variants.append(sequences[j][i:i+1])
45 lstnumvariants.append(len(variants))
46 lstnumhyphens.append(numhyphens)
47 # create varsequences with a template of the variants
48 for j in range(0, numsequences):
49 if lstnumhyphens[i] == 0:
50 varsequences[j] = varsequences[j] + "-"
51 elif lstnumvariants[i] < 2:
52 varsequences[j] = varsequences[j] + "-"
53 else:
54 varsequences[j] = varsequences[j] + sequences[j][i:i+1]
55 if lstnumvariants[i] == 1 and lstnumhyphens[i] > 0:
56 varsequences[numsequences] = varsequences[numsequences] + variants[0]
57 else:
58 varsequences[numsequences] = varsequences[numsequences] + "-"
59 # loop over the columns, apply single variant
60 for i in range(0, len(sequences[0])):
61 if lstnumvariants[i] == 1 and lstnumhyphens[i] > 0:
62 # loop over all the rows to apply single variant to "-"
63 for j in range(0, len(sequences)):
64 if sequences[j][i:i+1] == "-":
65 lstsequence = list(sequences[j])
66 lstsequence[i] = varsequences[numsequences][i:i+1]
67 sequences[j] = ''.join(lstsequence)
68 # loop over the rows of the sequences, apply multiple variants
69 for j in range(0, len(sequences)):
70 # loop over the columns
71 for i in range(0, len(sequences[0])):
72 variants = []
73 if sequences[j][i:i+1] == "-" and lstnumvariants[i] > 1:
74 # loop over the rows of the varsequences
75 for k in range(0, numsequences):
76 if varsequences[k][i:i+1] not in variants and varsequences[k][i:i+1] != "-":
77 variants.append(varsequences[k][i:i+1])
78 if len(variants) == 1:
79 lstsequence = list(sequences[j])
80 lstsequence[i] = variants[0]
81 sequences[j] = ''.join(lstsequence)
82 else:
83 lstsequence[i] = variants[len(variants) - 1]
84 sequences.append(''.join(lstsequence))
85 # eliminate duplicate sequences
86 sequences_unique = list(set(sequences))
87 # write consensus sequences to output
88 consensus = open(args.output, 'w')
89 n = 1
90 for sequence in sequences_unique:
91 consensus.write(">consensus_" + str(n) + "\n")
92 consensus.write(sequence + "\n")
93 n = n + 1
94
95
96 if __name__ == "__main__":
97 __main__()