comparison pairScan.py @ 3:e320ef2d105a draft

Uploaded
author petr-novak
date Thu, 05 Sep 2019 09:04:56 -0400
parents
children
comparison
equal deleted inserted replaced
2:ff658cf87f16 3:e320ef2d105a
1 #!/usr/bin/env python
2 import sys
3 import os
4 from optparse import OptionParser
5 import Levenshtein
6
7
8 class Error(Exception):
9 """Base class for exceptions in this module."""
10 pass
11
12
13 def readSingleSeq(file):
14 line = file.readline()
15 if not line:
16 return False # end of file
17 if line[0] != ">":
18 raise Error("no header on the first line")
19 seqname = line[1:].strip()
20 seq = ""
21 # read sequences
22 while True:
23 last_pos = file.tell()
24 line = file.readline()
25 if not line:
26 break
27 if line[0] == ">":
28 file.seek(last_pos)
29 break
30 seq = seq + line.strip()
31 return {'name': seqname, 'sequence': seq}
32
33
34 def writeSingleSeq(fileobject, seq):
35 fileobject.write(">")
36 fileobject.write(seq['name'] + "\n")
37 fileobject.write(seq['sequence'] + "\n")
38
39
40 def comparePairs(seq1, seq2, max_mismatch=3, offset=5):
41 s1 = seq1['sequence'].lower()
42 s2 = seq2['sequence'].lower()[::-1]
43 m = 0
44 intab = "ctagn"
45 outtab = "gatcn"
46 trantab = str.maketrans(intab, outtab)
47 s2 = s2.translate(trantab)
48 s1 = "-" * offset + s1
49 s2 = s2 + "-" * offset
50 n1 = len(s1)
51 n2 = len(s2)
52 m = 0
53 for i in range(1, min(n1 + 1, n2 + 1)):
54 #remove tails is any:
55 ss1 = s1[n1 - i:n1]
56 ss2 = s2[0:i]
57 added = ss1.count("-") + ss2.count("-")
58 d = Levenshtein.hamming(ss1, ss2) - added
59 if 100.0 * d / i <= max_mismatch:
60 m = max(m, i - d - added)
61 return m
62
63
64 def split_file(filename, N, min_chunk=2):
65 f1 = open(filename, 'r')
66 filenames = [filename + "." + str(i) for i in range(N)]
67 f2 = list(map(open, filenames, 'w' * N))
68 while True:
69 for i in f2:
70 for j in range(min_chunk):
71 line = f1.readline()
72 if not line:
73 [i.close() for i in f2]
74 f1.close()
75 return filenames
76 i.write(line)
77
78
79 def find_overlapping_sequences(seqfile,
80 seqfile2=None,
81 seqfile_good="",
82 seqfile_bad="",
83 min_overlap=30,
84 max_mismatch=2,
85 offset=5):
86 ''' return id ove overlaping pairs - only first id is returned '''
87 # default names - if empty
88 if seqfile_good == "":
89 seqfile_good = seqfile + ".pass"
90 if seqfile_bad == "":
91 seqfile_bad = seqfile + ".bad"
92
93 minscore = min_overlap * 2
94
95 fgood = open(seqfile_good, 'w')
96 fbad = open(seqfile_bad, 'w')
97 f = open(seqfile, 'r')
98 if seqfile2:
99 f2 = open(seqfile2)
100 else:
101 f2 = f
102 while True:
103 seq1 = readSingleSeq(f)
104 seq2 = readSingleSeq(f2)
105 if not seq1 or not seq2:
106 break # end of file
107 score = comparePairs(seq1, seq2, max_mismatch, offset=offset)
108 if score > min_overlap:
109 writeSingleSeq(fbad, seq1)
110 writeSingleSeq(fbad, seq2)
111 else:
112 writeSingleSeq(fgood, seq1)
113 writeSingleSeq(fgood, seq2)
114 f.close()
115 if not f2.closed:
116 f2.close
117 fgood.close()
118 fbad.close()
119
120
121 def main():
122 parser = OptionParser()
123 parser.add_option("-f",
124 "--fasta_file",
125 dest="seqfile",
126 help="input sequences in fasta format")
127 parser.add_option(
128 "-r",
129 "--fasta_file2",
130 default=None,
131 dest="seqfile2",
132 help=
133 "input sequences in fasta format, second file should be specified if pairs are not interlaced, all pairs must be complete!")
134 parser.add_option("-p",
135 "--fasta_file_pass",
136 dest="seqfile_good",
137 help="output file with good sequences",
138 default='')
139 parser.add_option("-b",
140 "--fasta_file_bad",
141 dest="seqfile_bad",
142 help="output file with bad sequences",
143 default='')
144 parser.add_option("-o",
145 "--minimal_overlap",
146 dest="min_overlap",
147 help="minimal overlap between pair ends",
148 default='30')
149 parser.add_option(
150 "-m",
151 "--max_mismatch",
152 dest="max_mismatch",
153 help="maximum number of mismatches in overlap per 100 nt",
154 default='2')
155 parser.add_option("-s",
156 "--offset",
157 dest="offset",
158 help="maximum offset",
159 default='5')
160 options, args = parser.parse_args()
161 find_overlapping_sequences(seqfile=options.seqfile,
162 seqfile2=options.seqfile2,
163 seqfile_good=options.seqfile_good,
164 seqfile_bad=options.seqfile_bad,
165 min_overlap=int(options.min_overlap),
166 max_mismatch=int(options.max_mismatch),
167 offset=int(options.offset))
168
169
170 if __name__ == "__main__":
171 main()