Mercurial > repos > petr-novak > re_utils
comparison pairScan.py @ 3:e320ef2d105a draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 05 Sep 2019 09:04:56 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:ff658cf87f16 | 3:e320ef2d105a |
---|---|
1 #!/usr/bin/env python | |
2 import sys | |
3 import os | |
4 from optparse import OptionParser | |
5 import Levenshtein | |
6 | |
7 | |
8 class Error(Exception): | |
9 """Base class for exceptions in this module.""" | |
10 pass | |
11 | |
12 | |
13 def readSingleSeq(file): | |
14 line = file.readline() | |
15 if not line: | |
16 return False # end of file | |
17 if line[0] != ">": | |
18 raise Error("no header on the first line") | |
19 seqname = line[1:].strip() | |
20 seq = "" | |
21 # read sequences | |
22 while True: | |
23 last_pos = file.tell() | |
24 line = file.readline() | |
25 if not line: | |
26 break | |
27 if line[0] == ">": | |
28 file.seek(last_pos) | |
29 break | |
30 seq = seq + line.strip() | |
31 return {'name': seqname, 'sequence': seq} | |
32 | |
33 | |
34 def writeSingleSeq(fileobject, seq): | |
35 fileobject.write(">") | |
36 fileobject.write(seq['name'] + "\n") | |
37 fileobject.write(seq['sequence'] + "\n") | |
38 | |
39 | |
40 def comparePairs(seq1, seq2, max_mismatch=3, offset=5): | |
41 s1 = seq1['sequence'].lower() | |
42 s2 = seq2['sequence'].lower()[::-1] | |
43 m = 0 | |
44 intab = "ctagn" | |
45 outtab = "gatcn" | |
46 trantab = str.maketrans(intab, outtab) | |
47 s2 = s2.translate(trantab) | |
48 s1 = "-" * offset + s1 | |
49 s2 = s2 + "-" * offset | |
50 n1 = len(s1) | |
51 n2 = len(s2) | |
52 m = 0 | |
53 for i in range(1, min(n1 + 1, n2 + 1)): | |
54 #remove tails is any: | |
55 ss1 = s1[n1 - i:n1] | |
56 ss2 = s2[0:i] | |
57 added = ss1.count("-") + ss2.count("-") | |
58 d = Levenshtein.hamming(ss1, ss2) - added | |
59 if 100.0 * d / i <= max_mismatch: | |
60 m = max(m, i - d - added) | |
61 return m | |
62 | |
63 | |
64 def split_file(filename, N, min_chunk=2): | |
65 f1 = open(filename, 'r') | |
66 filenames = [filename + "." + str(i) for i in range(N)] | |
67 f2 = list(map(open, filenames, 'w' * N)) | |
68 while True: | |
69 for i in f2: | |
70 for j in range(min_chunk): | |
71 line = f1.readline() | |
72 if not line: | |
73 [i.close() for i in f2] | |
74 f1.close() | |
75 return filenames | |
76 i.write(line) | |
77 | |
78 | |
79 def find_overlapping_sequences(seqfile, | |
80 seqfile2=None, | |
81 seqfile_good="", | |
82 seqfile_bad="", | |
83 min_overlap=30, | |
84 max_mismatch=2, | |
85 offset=5): | |
86 ''' return id ove overlaping pairs - only first id is returned ''' | |
87 # default names - if empty | |
88 if seqfile_good == "": | |
89 seqfile_good = seqfile + ".pass" | |
90 if seqfile_bad == "": | |
91 seqfile_bad = seqfile + ".bad" | |
92 | |
93 minscore = min_overlap * 2 | |
94 | |
95 fgood = open(seqfile_good, 'w') | |
96 fbad = open(seqfile_bad, 'w') | |
97 f = open(seqfile, 'r') | |
98 if seqfile2: | |
99 f2 = open(seqfile2) | |
100 else: | |
101 f2 = f | |
102 while True: | |
103 seq1 = readSingleSeq(f) | |
104 seq2 = readSingleSeq(f2) | |
105 if not seq1 or not seq2: | |
106 break # end of file | |
107 score = comparePairs(seq1, seq2, max_mismatch, offset=offset) | |
108 if score > min_overlap: | |
109 writeSingleSeq(fbad, seq1) | |
110 writeSingleSeq(fbad, seq2) | |
111 else: | |
112 writeSingleSeq(fgood, seq1) | |
113 writeSingleSeq(fgood, seq2) | |
114 f.close() | |
115 if not f2.closed: | |
116 f2.close | |
117 fgood.close() | |
118 fbad.close() | |
119 | |
120 | |
121 def main(): | |
122 parser = OptionParser() | |
123 parser.add_option("-f", | |
124 "--fasta_file", | |
125 dest="seqfile", | |
126 help="input sequences in fasta format") | |
127 parser.add_option( | |
128 "-r", | |
129 "--fasta_file2", | |
130 default=None, | |
131 dest="seqfile2", | |
132 help= | |
133 "input sequences in fasta format, second file should be specified if pairs are not interlaced, all pairs must be complete!") | |
134 parser.add_option("-p", | |
135 "--fasta_file_pass", | |
136 dest="seqfile_good", | |
137 help="output file with good sequences", | |
138 default='') | |
139 parser.add_option("-b", | |
140 "--fasta_file_bad", | |
141 dest="seqfile_bad", | |
142 help="output file with bad sequences", | |
143 default='') | |
144 parser.add_option("-o", | |
145 "--minimal_overlap", | |
146 dest="min_overlap", | |
147 help="minimal overlap between pair ends", | |
148 default='30') | |
149 parser.add_option( | |
150 "-m", | |
151 "--max_mismatch", | |
152 dest="max_mismatch", | |
153 help="maximum number of mismatches in overlap per 100 nt", | |
154 default='2') | |
155 parser.add_option("-s", | |
156 "--offset", | |
157 dest="offset", | |
158 help="maximum offset", | |
159 default='5') | |
160 options, args = parser.parse_args() | |
161 find_overlapping_sequences(seqfile=options.seqfile, | |
162 seqfile2=options.seqfile2, | |
163 seqfile_good=options.seqfile_good, | |
164 seqfile_bad=options.seqfile_bad, | |
165 min_overlap=int(options.min_overlap), | |
166 max_mismatch=int(options.max_mismatch), | |
167 offset=int(options.offset)) | |
168 | |
169 | |
170 if __name__ == "__main__": | |
171 main() |