Mercurial > repos > iss > eurl_vtec_wgs_pt
comparison scripts/GetConsensus.py @ 0:c6bab5103a14 draft
"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
author | iss |
---|---|
date | Mon, 21 Mar 2022 15:23:09 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c6bab5103a14 |
---|---|
1 #!/usr/bin/env python3 | |
2 # -*- coding: utf-8 -*- | |
3 """ | |
4 ############################################################################ | |
5 # Istituto Superiore di Sanita' | |
6 # European Union Reference Laboratory (EU-RL) for Escherichia coli, including Verotoxigenic E. coli (VTEC) | |
7 # Developer: Arnold Knijn arnold.knijn@iss.it | |
8 ############################################################################ | |
9 """ | |
10 | |
11 import argparse | |
12 import sys | |
13 import os | |
14 import subprocess | |
15 import shutil | |
16 | |
17 def __main__(): | |
18 parser = argparse.ArgumentParser() | |
19 parser.add_argument('-i', '--input', dest='input', help='input alignment file') | |
20 parser.add_argument('-o', '--output', dest='output', help='output consensus file') | |
21 args = parser.parse_args() | |
22 | |
23 sequences = [] | |
24 varsequences = [] | |
25 # read input | |
26 with open(args.input) as alignments: | |
27 for alignment in alignments: | |
28 if alignment[0] != ">": | |
29 sequences.append(alignment.rstrip()) | |
30 numsequences = len(sequences) | |
31 for j in range(0, numsequences + 1): | |
32 varsequences.append("") | |
33 lstnumvariants = [] | |
34 lstnumhyphens = [] | |
35 # loop over the columns | |
36 for i in range(0, len(sequences[0])): | |
37 variants = [] | |
38 numhyphens = 0 | |
39 # loop over the original rows to obtain variants | |
40 for j in range(0, numsequences): | |
41 if sequences[j][i:i+1] == "-": | |
42 numhyphens = numhyphens + 1 | |
43 if sequences[j][i:i+1] not in variants and sequences[j][i:i+1] != "-": | |
44 variants.append(sequences[j][i:i+1]) | |
45 lstnumvariants.append(len(variants)) | |
46 lstnumhyphens.append(numhyphens) | |
47 # create varsequences with a template of the variants | |
48 for j in range(0, numsequences): | |
49 if lstnumhyphens[i] == 0: | |
50 varsequences[j] = varsequences[j] + "-" | |
51 elif lstnumvariants[i] < 2: | |
52 varsequences[j] = varsequences[j] + "-" | |
53 else: | |
54 varsequences[j] = varsequences[j] + sequences[j][i:i+1] | |
55 if lstnumvariants[i] == 1 and lstnumhyphens[i] > 0: | |
56 varsequences[numsequences] = varsequences[numsequences] + variants[0] | |
57 else: | |
58 varsequences[numsequences] = varsequences[numsequences] + "-" | |
59 # loop over the columns, apply single variant | |
60 for i in range(0, len(sequences[0])): | |
61 if lstnumvariants[i] == 1 and lstnumhyphens[i] > 0: | |
62 # loop over all the rows to apply single variant to "-" | |
63 for j in range(0, len(sequences)): | |
64 if sequences[j][i:i+1] == "-": | |
65 lstsequence = list(sequences[j]) | |
66 lstsequence[i] = varsequences[numsequences][i:i+1] | |
67 sequences[j] = ''.join(lstsequence) | |
68 # loop over the rows of the sequences, apply multiple variants | |
69 for j in range(0, len(sequences)): | |
70 # loop over the columns | |
71 for i in range(0, len(sequences[0])): | |
72 variants = [] | |
73 if sequences[j][i:i+1] == "-" and lstnumvariants[i] > 1: | |
74 # loop over the rows of the varsequences | |
75 for k in range(0, numsequences): | |
76 if varsequences[k][i:i+1] not in variants and varsequences[k][i:i+1] != "-": | |
77 variants.append(varsequences[k][i:i+1]) | |
78 if len(variants) == 1: | |
79 lstsequence = list(sequences[j]) | |
80 lstsequence[i] = variants[0] | |
81 sequences[j] = ''.join(lstsequence) | |
82 else: | |
83 lstsequence[i] = variants[len(variants) - 1] | |
84 sequences.append(''.join(lstsequence)) | |
85 # eliminate duplicate sequences | |
86 sequences_unique = list(set(sequences)) | |
87 # write consensus sequences to output | |
88 consensus = open(args.output, 'w') | |
89 n = 1 | |
90 for sequence in sequences_unique: | |
91 consensus.write(">consensus_" + str(n) + "\n") | |
92 consensus.write(sequence + "\n") | |
93 n = n + 1 | |
94 | |
95 | |
96 if __name__ == "__main__": | |
97 __main__() |