annotate pairScan.py @ 18:d7f3eff34c27 draft

Uploaded
author petr-novak
date Fri, 14 May 2021 11:08:46 +0000
parents e320ef2d105a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
2 import sys
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
3 import os
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
4 from optparse import OptionParser
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
5 import Levenshtein
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
6
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
7
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
8 class Error(Exception):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
9 """Base class for exceptions in this module."""
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
10 pass
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
11
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
12
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
13 def readSingleSeq(file):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
14 line = file.readline()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
15 if not line:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
16 return False # end of file
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
17 if line[0] != ">":
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
18 raise Error("no header on the first line")
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
19 seqname = line[1:].strip()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
20 seq = ""
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
21 # read sequences
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
22 while True:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
23 last_pos = file.tell()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
24 line = file.readline()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
25 if not line:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
26 break
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
27 if line[0] == ">":
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
28 file.seek(last_pos)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
29 break
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
30 seq = seq + line.strip()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
31 return {'name': seqname, 'sequence': seq}
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
32
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
33
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
34 def writeSingleSeq(fileobject, seq):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
35 fileobject.write(">")
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
36 fileobject.write(seq['name'] + "\n")
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
37 fileobject.write(seq['sequence'] + "\n")
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
38
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
39
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
40 def comparePairs(seq1, seq2, max_mismatch=3, offset=5):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
41 s1 = seq1['sequence'].lower()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
42 s2 = seq2['sequence'].lower()[::-1]
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
43 m = 0
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
44 intab = "ctagn"
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
45 outtab = "gatcn"
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
46 trantab = str.maketrans(intab, outtab)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
47 s2 = s2.translate(trantab)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
48 s1 = "-" * offset + s1
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
49 s2 = s2 + "-" * offset
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
50 n1 = len(s1)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
51 n2 = len(s2)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
52 m = 0
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
53 for i in range(1, min(n1 + 1, n2 + 1)):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
54 #remove tails is any:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
55 ss1 = s1[n1 - i:n1]
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
56 ss2 = s2[0:i]
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
57 added = ss1.count("-") + ss2.count("-")
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
58 d = Levenshtein.hamming(ss1, ss2) - added
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
59 if 100.0 * d / i <= max_mismatch:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
60 m = max(m, i - d - added)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
61 return m
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
62
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
63
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
64 def split_file(filename, N, min_chunk=2):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
65 f1 = open(filename, 'r')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
66 filenames = [filename + "." + str(i) for i in range(N)]
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
67 f2 = list(map(open, filenames, 'w' * N))
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
68 while True:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
69 for i in f2:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
70 for j in range(min_chunk):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
71 line = f1.readline()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
72 if not line:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
73 [i.close() for i in f2]
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
74 f1.close()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
75 return filenames
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
76 i.write(line)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
77
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
78
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
79 def find_overlapping_sequences(seqfile,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
80 seqfile2=None,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
81 seqfile_good="",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
82 seqfile_bad="",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
83 min_overlap=30,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
84 max_mismatch=2,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
85 offset=5):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
86 ''' return id ove overlaping pairs - only first id is returned '''
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
87 # default names - if empty
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
88 if seqfile_good == "":
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
89 seqfile_good = seqfile + ".pass"
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
90 if seqfile_bad == "":
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
91 seqfile_bad = seqfile + ".bad"
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
92
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
93 minscore = min_overlap * 2
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
94
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
95 fgood = open(seqfile_good, 'w')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
96 fbad = open(seqfile_bad, 'w')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
97 f = open(seqfile, 'r')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
98 if seqfile2:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
99 f2 = open(seqfile2)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
100 else:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
101 f2 = f
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
102 while True:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
103 seq1 = readSingleSeq(f)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
104 seq2 = readSingleSeq(f2)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
105 if not seq1 or not seq2:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
106 break # end of file
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
107 score = comparePairs(seq1, seq2, max_mismatch, offset=offset)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
108 if score > min_overlap:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
109 writeSingleSeq(fbad, seq1)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
110 writeSingleSeq(fbad, seq2)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
111 else:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
112 writeSingleSeq(fgood, seq1)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
113 writeSingleSeq(fgood, seq2)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
114 f.close()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
115 if not f2.closed:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
116 f2.close
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
117 fgood.close()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
118 fbad.close()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
119
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
120
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
121 def main():
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
122 parser = OptionParser()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
123 parser.add_option("-f",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
124 "--fasta_file",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
125 dest="seqfile",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
126 help="input sequences in fasta format")
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
127 parser.add_option(
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
128 "-r",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
129 "--fasta_file2",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
130 default=None,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
131 dest="seqfile2",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
132 help=
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
133 "input sequences in fasta format, second file should be specified if pairs are not interlaced, all pairs must be complete!")
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
134 parser.add_option("-p",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
135 "--fasta_file_pass",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
136 dest="seqfile_good",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
137 help="output file with good sequences",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
138 default='')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
139 parser.add_option("-b",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
140 "--fasta_file_bad",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
141 dest="seqfile_bad",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
142 help="output file with bad sequences",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
143 default='')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
144 parser.add_option("-o",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
145 "--minimal_overlap",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
146 dest="min_overlap",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
147 help="minimal overlap between pair ends",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
148 default='30')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
149 parser.add_option(
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
150 "-m",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
151 "--max_mismatch",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
152 dest="max_mismatch",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
153 help="maximum number of mismatches in overlap per 100 nt",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
154 default='2')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
155 parser.add_option("-s",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
156 "--offset",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
157 dest="offset",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
158 help="maximum offset",
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
159 default='5')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
160 options, args = parser.parse_args()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
161 find_overlapping_sequences(seqfile=options.seqfile,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
162 seqfile2=options.seqfile2,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
163 seqfile_good=options.seqfile_good,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
164 seqfile_bad=options.seqfile_bad,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
165 min_overlap=int(options.min_overlap),
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
166 max_mismatch=int(options.max_mismatch),
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
167 offset=int(options.offset))
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
168
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
169
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
170 if __name__ == "__main__":
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
171 main()