3
|
1 #!/usr/bin/env python
|
|
2 import sys
|
|
3 import os
|
|
4 from optparse import OptionParser
|
|
5 import Levenshtein
|
|
6
|
|
7
|
|
8 class Error(Exception):
|
|
9 """Base class for exceptions in this module."""
|
|
10 pass
|
|
11
|
|
12
|
|
13 def readSingleSeq(file):
|
|
14 line = file.readline()
|
|
15 if not line:
|
|
16 return False # end of file
|
|
17 if line[0] != ">":
|
|
18 raise Error("no header on the first line")
|
|
19 seqname = line[1:].strip()
|
|
20 seq = ""
|
|
21 # read sequences
|
|
22 while True:
|
|
23 last_pos = file.tell()
|
|
24 line = file.readline()
|
|
25 if not line:
|
|
26 break
|
|
27 if line[0] == ">":
|
|
28 file.seek(last_pos)
|
|
29 break
|
|
30 seq = seq + line.strip()
|
|
31 return {'name': seqname, 'sequence': seq}
|
|
32
|
|
33
|
|
34 def writeSingleSeq(fileobject, seq):
|
|
35 fileobject.write(">")
|
|
36 fileobject.write(seq['name'] + "\n")
|
|
37 fileobject.write(seq['sequence'] + "\n")
|
|
38
|
|
39
|
|
40 def comparePairs(seq1, seq2, max_mismatch=3, offset=5):
|
|
41 s1 = seq1['sequence'].lower()
|
|
42 s2 = seq2['sequence'].lower()[::-1]
|
|
43 m = 0
|
|
44 intab = "ctagn"
|
|
45 outtab = "gatcn"
|
|
46 trantab = str.maketrans(intab, outtab)
|
|
47 s2 = s2.translate(trantab)
|
|
48 s1 = "-" * offset + s1
|
|
49 s2 = s2 + "-" * offset
|
|
50 n1 = len(s1)
|
|
51 n2 = len(s2)
|
|
52 m = 0
|
|
53 for i in range(1, min(n1 + 1, n2 + 1)):
|
|
54 #remove tails is any:
|
|
55 ss1 = s1[n1 - i:n1]
|
|
56 ss2 = s2[0:i]
|
|
57 added = ss1.count("-") + ss2.count("-")
|
|
58 d = Levenshtein.hamming(ss1, ss2) - added
|
|
59 if 100.0 * d / i <= max_mismatch:
|
|
60 m = max(m, i - d - added)
|
|
61 return m
|
|
62
|
|
63
|
|
64 def split_file(filename, N, min_chunk=2):
|
|
65 f1 = open(filename, 'r')
|
|
66 filenames = [filename + "." + str(i) for i in range(N)]
|
|
67 f2 = list(map(open, filenames, 'w' * N))
|
|
68 while True:
|
|
69 for i in f2:
|
|
70 for j in range(min_chunk):
|
|
71 line = f1.readline()
|
|
72 if not line:
|
|
73 [i.close() for i in f2]
|
|
74 f1.close()
|
|
75 return filenames
|
|
76 i.write(line)
|
|
77
|
|
78
|
|
79 def find_overlapping_sequences(seqfile,
|
|
80 seqfile2=None,
|
|
81 seqfile_good="",
|
|
82 seqfile_bad="",
|
|
83 min_overlap=30,
|
|
84 max_mismatch=2,
|
|
85 offset=5):
|
|
86 ''' return id ove overlaping pairs - only first id is returned '''
|
|
87 # default names - if empty
|
|
88 if seqfile_good == "":
|
|
89 seqfile_good = seqfile + ".pass"
|
|
90 if seqfile_bad == "":
|
|
91 seqfile_bad = seqfile + ".bad"
|
|
92
|
|
93 minscore = min_overlap * 2
|
|
94
|
|
95 fgood = open(seqfile_good, 'w')
|
|
96 fbad = open(seqfile_bad, 'w')
|
|
97 f = open(seqfile, 'r')
|
|
98 if seqfile2:
|
|
99 f2 = open(seqfile2)
|
|
100 else:
|
|
101 f2 = f
|
|
102 while True:
|
|
103 seq1 = readSingleSeq(f)
|
|
104 seq2 = readSingleSeq(f2)
|
|
105 if not seq1 or not seq2:
|
|
106 break # end of file
|
|
107 score = comparePairs(seq1, seq2, max_mismatch, offset=offset)
|
|
108 if score > min_overlap:
|
|
109 writeSingleSeq(fbad, seq1)
|
|
110 writeSingleSeq(fbad, seq2)
|
|
111 else:
|
|
112 writeSingleSeq(fgood, seq1)
|
|
113 writeSingleSeq(fgood, seq2)
|
|
114 f.close()
|
|
115 if not f2.closed:
|
|
116 f2.close
|
|
117 fgood.close()
|
|
118 fbad.close()
|
|
119
|
|
120
|
|
121 def main():
|
|
122 parser = OptionParser()
|
|
123 parser.add_option("-f",
|
|
124 "--fasta_file",
|
|
125 dest="seqfile",
|
|
126 help="input sequences in fasta format")
|
|
127 parser.add_option(
|
|
128 "-r",
|
|
129 "--fasta_file2",
|
|
130 default=None,
|
|
131 dest="seqfile2",
|
|
132 help=
|
|
133 "input sequences in fasta format, second file should be specified if pairs are not interlaced, all pairs must be complete!")
|
|
134 parser.add_option("-p",
|
|
135 "--fasta_file_pass",
|
|
136 dest="seqfile_good",
|
|
137 help="output file with good sequences",
|
|
138 default='')
|
|
139 parser.add_option("-b",
|
|
140 "--fasta_file_bad",
|
|
141 dest="seqfile_bad",
|
|
142 help="output file with bad sequences",
|
|
143 default='')
|
|
144 parser.add_option("-o",
|
|
145 "--minimal_overlap",
|
|
146 dest="min_overlap",
|
|
147 help="minimal overlap between pair ends",
|
|
148 default='30')
|
|
149 parser.add_option(
|
|
150 "-m",
|
|
151 "--max_mismatch",
|
|
152 dest="max_mismatch",
|
|
153 help="maximum number of mismatches in overlap per 100 nt",
|
|
154 default='2')
|
|
155 parser.add_option("-s",
|
|
156 "--offset",
|
|
157 dest="offset",
|
|
158 help="maximum offset",
|
|
159 default='5')
|
|
160 options, args = parser.parse_args()
|
|
161 find_overlapping_sequences(seqfile=options.seqfile,
|
|
162 seqfile2=options.seqfile2,
|
|
163 seqfile_good=options.seqfile_good,
|
|
164 seqfile_bad=options.seqfile_bad,
|
|
165 min_overlap=int(options.min_overlap),
|
|
166 max_mismatch=int(options.max_mismatch),
|
|
167 offset=int(options.offset))
|
|
168
|
|
169
|
|
170 if __name__ == "__main__":
|
|
171 main()
|