annotate scripts/GetConsensus.py @ 0:c6bab5103a14 draft

"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
author iss
date Mon, 21 Mar 2022 15:23:09 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
1 #!/usr/bin/env python3
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
2 # -*- coding: utf-8 -*-
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
3 """
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
4 ############################################################################
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
5 # Istituto Superiore di Sanita'
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
6 # European Union Reference Laboratory (EU-RL) for Escherichia coli, including Verotoxigenic E. coli (VTEC)
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
7 # Developer: Arnold Knijn arnold.knijn@iss.it
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
8 ############################################################################
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
9 """
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
10
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
11 import argparse
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
12 import sys
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
13 import os
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
14 import subprocess
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
15 import shutil
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
16
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
17 def __main__():
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
18 parser = argparse.ArgumentParser()
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
19 parser.add_argument('-i', '--input', dest='input', help='input alignment file')
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
20 parser.add_argument('-o', '--output', dest='output', help='output consensus file')
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
21 args = parser.parse_args()
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
22
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
23 sequences = []
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
24 varsequences = []
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
25 # read input
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
26 with open(args.input) as alignments:
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
27 for alignment in alignments:
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
28 if alignment[0] != ">":
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
29 sequences.append(alignment.rstrip())
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
30 numsequences = len(sequences)
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
31 for j in range(0, numsequences + 1):
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
32 varsequences.append("")
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
33 lstnumvariants = []
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
34 lstnumhyphens = []
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
35 # loop over the columns
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
36 for i in range(0, len(sequences[0])):
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
37 variants = []
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
38 numhyphens = 0
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
39 # loop over the original rows to obtain variants
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
40 for j in range(0, numsequences):
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
41 if sequences[j][i:i+1] == "-":
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
42 numhyphens = numhyphens + 1
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
43 if sequences[j][i:i+1] not in variants and sequences[j][i:i+1] != "-":
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
44 variants.append(sequences[j][i:i+1])
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
45 lstnumvariants.append(len(variants))
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
46 lstnumhyphens.append(numhyphens)
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
47 # create varsequences with a template of the variants
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
48 for j in range(0, numsequences):
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
49 if lstnumhyphens[i] == 0:
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
50 varsequences[j] = varsequences[j] + "-"
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
51 elif lstnumvariants[i] < 2:
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
52 varsequences[j] = varsequences[j] + "-"
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
53 else:
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
54 varsequences[j] = varsequences[j] + sequences[j][i:i+1]
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
55 if lstnumvariants[i] == 1 and lstnumhyphens[i] > 0:
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
56 varsequences[numsequences] = varsequences[numsequences] + variants[0]
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
57 else:
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
58 varsequences[numsequences] = varsequences[numsequences] + "-"
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
59 # loop over the columns, apply single variant
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
60 for i in range(0, len(sequences[0])):
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
61 if lstnumvariants[i] == 1 and lstnumhyphens[i] > 0:
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
62 # loop over all the rows to apply single variant to "-"
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
63 for j in range(0, len(sequences)):
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
64 if sequences[j][i:i+1] == "-":
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
65 lstsequence = list(sequences[j])
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
66 lstsequence[i] = varsequences[numsequences][i:i+1]
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
67 sequences[j] = ''.join(lstsequence)
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
68 # loop over the rows of the sequences, apply multiple variants
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
69 for j in range(0, len(sequences)):
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
70 # loop over the columns
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
71 for i in range(0, len(sequences[0])):
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
72 variants = []
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
73 if sequences[j][i:i+1] == "-" and lstnumvariants[i] > 1:
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
74 # loop over the rows of the varsequences
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
75 for k in range(0, numsequences):
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
76 if varsequences[k][i:i+1] not in variants and varsequences[k][i:i+1] != "-":
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
77 variants.append(varsequences[k][i:i+1])
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
78 if len(variants) == 1:
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
79 lstsequence = list(sequences[j])
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
80 lstsequence[i] = variants[0]
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
81 sequences[j] = ''.join(lstsequence)
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
82 else:
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
83 lstsequence[i] = variants[len(variants) - 1]
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
84 sequences.append(''.join(lstsequence))
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
85 # eliminate duplicate sequences
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
86 sequences_unique = list(set(sequences))
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
87 # write consensus sequences to output
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
88 consensus = open(args.output, 'w')
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
89 n = 1
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
90 for sequence in sequences_unique:
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
91 consensus.write(">consensus_" + str(n) + "\n")
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
92 consensus.write(sequence + "\n")
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
93 n = n + 1
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
94
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
95
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
96 if __name__ == "__main__":
c6bab5103a14 "planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
iss
parents:
diff changeset
97 __main__()