Mercurial > repos > petr-novak > re_utils
comparison pairScan.py @ 3:e320ef2d105a draft
Uploaded
| author | petr-novak |
|---|---|
| date | Thu, 05 Sep 2019 09:04:56 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:ff658cf87f16 | 3:e320ef2d105a |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import sys | |
| 3 import os | |
| 4 from optparse import OptionParser | |
| 5 import Levenshtein | |
| 6 | |
| 7 | |
| 8 class Error(Exception): | |
| 9 """Base class for exceptions in this module.""" | |
| 10 pass | |
| 11 | |
| 12 | |
| 13 def readSingleSeq(file): | |
| 14 line = file.readline() | |
| 15 if not line: | |
| 16 return False # end of file | |
| 17 if line[0] != ">": | |
| 18 raise Error("no header on the first line") | |
| 19 seqname = line[1:].strip() | |
| 20 seq = "" | |
| 21 # read sequences | |
| 22 while True: | |
| 23 last_pos = file.tell() | |
| 24 line = file.readline() | |
| 25 if not line: | |
| 26 break | |
| 27 if line[0] == ">": | |
| 28 file.seek(last_pos) | |
| 29 break | |
| 30 seq = seq + line.strip() | |
| 31 return {'name': seqname, 'sequence': seq} | |
| 32 | |
| 33 | |
| 34 def writeSingleSeq(fileobject, seq): | |
| 35 fileobject.write(">") | |
| 36 fileobject.write(seq['name'] + "\n") | |
| 37 fileobject.write(seq['sequence'] + "\n") | |
| 38 | |
| 39 | |
| 40 def comparePairs(seq1, seq2, max_mismatch=3, offset=5): | |
| 41 s1 = seq1['sequence'].lower() | |
| 42 s2 = seq2['sequence'].lower()[::-1] | |
| 43 m = 0 | |
| 44 intab = "ctagn" | |
| 45 outtab = "gatcn" | |
| 46 trantab = str.maketrans(intab, outtab) | |
| 47 s2 = s2.translate(trantab) | |
| 48 s1 = "-" * offset + s1 | |
| 49 s2 = s2 + "-" * offset | |
| 50 n1 = len(s1) | |
| 51 n2 = len(s2) | |
| 52 m = 0 | |
| 53 for i in range(1, min(n1 + 1, n2 + 1)): | |
| 54 #remove tails is any: | |
| 55 ss1 = s1[n1 - i:n1] | |
| 56 ss2 = s2[0:i] | |
| 57 added = ss1.count("-") + ss2.count("-") | |
| 58 d = Levenshtein.hamming(ss1, ss2) - added | |
| 59 if 100.0 * d / i <= max_mismatch: | |
| 60 m = max(m, i - d - added) | |
| 61 return m | |
| 62 | |
| 63 | |
| 64 def split_file(filename, N, min_chunk=2): | |
| 65 f1 = open(filename, 'r') | |
| 66 filenames = [filename + "." + str(i) for i in range(N)] | |
| 67 f2 = list(map(open, filenames, 'w' * N)) | |
| 68 while True: | |
| 69 for i in f2: | |
| 70 for j in range(min_chunk): | |
| 71 line = f1.readline() | |
| 72 if not line: | |
| 73 [i.close() for i in f2] | |
| 74 f1.close() | |
| 75 return filenames | |
| 76 i.write(line) | |
| 77 | |
| 78 | |
| 79 def find_overlapping_sequences(seqfile, | |
| 80 seqfile2=None, | |
| 81 seqfile_good="", | |
| 82 seqfile_bad="", | |
| 83 min_overlap=30, | |
| 84 max_mismatch=2, | |
| 85 offset=5): | |
| 86 ''' return id ove overlaping pairs - only first id is returned ''' | |
| 87 # default names - if empty | |
| 88 if seqfile_good == "": | |
| 89 seqfile_good = seqfile + ".pass" | |
| 90 if seqfile_bad == "": | |
| 91 seqfile_bad = seqfile + ".bad" | |
| 92 | |
| 93 minscore = min_overlap * 2 | |
| 94 | |
| 95 fgood = open(seqfile_good, 'w') | |
| 96 fbad = open(seqfile_bad, 'w') | |
| 97 f = open(seqfile, 'r') | |
| 98 if seqfile2: | |
| 99 f2 = open(seqfile2) | |
| 100 else: | |
| 101 f2 = f | |
| 102 while True: | |
| 103 seq1 = readSingleSeq(f) | |
| 104 seq2 = readSingleSeq(f2) | |
| 105 if not seq1 or not seq2: | |
| 106 break # end of file | |
| 107 score = comparePairs(seq1, seq2, max_mismatch, offset=offset) | |
| 108 if score > min_overlap: | |
| 109 writeSingleSeq(fbad, seq1) | |
| 110 writeSingleSeq(fbad, seq2) | |
| 111 else: | |
| 112 writeSingleSeq(fgood, seq1) | |
| 113 writeSingleSeq(fgood, seq2) | |
| 114 f.close() | |
| 115 if not f2.closed: | |
| 116 f2.close | |
| 117 fgood.close() | |
| 118 fbad.close() | |
| 119 | |
| 120 | |
| 121 def main(): | |
| 122 parser = OptionParser() | |
| 123 parser.add_option("-f", | |
| 124 "--fasta_file", | |
| 125 dest="seqfile", | |
| 126 help="input sequences in fasta format") | |
| 127 parser.add_option( | |
| 128 "-r", | |
| 129 "--fasta_file2", | |
| 130 default=None, | |
| 131 dest="seqfile2", | |
| 132 help= | |
| 133 "input sequences in fasta format, second file should be specified if pairs are not interlaced, all pairs must be complete!") | |
| 134 parser.add_option("-p", | |
| 135 "--fasta_file_pass", | |
| 136 dest="seqfile_good", | |
| 137 help="output file with good sequences", | |
| 138 default='') | |
| 139 parser.add_option("-b", | |
| 140 "--fasta_file_bad", | |
| 141 dest="seqfile_bad", | |
| 142 help="output file with bad sequences", | |
| 143 default='') | |
| 144 parser.add_option("-o", | |
| 145 "--minimal_overlap", | |
| 146 dest="min_overlap", | |
| 147 help="minimal overlap between pair ends", | |
| 148 default='30') | |
| 149 parser.add_option( | |
| 150 "-m", | |
| 151 "--max_mismatch", | |
| 152 dest="max_mismatch", | |
| 153 help="maximum number of mismatches in overlap per 100 nt", | |
| 154 default='2') | |
| 155 parser.add_option("-s", | |
| 156 "--offset", | |
| 157 dest="offset", | |
| 158 help="maximum offset", | |
| 159 default='5') | |
| 160 options, args = parser.parse_args() | |
| 161 find_overlapping_sequences(seqfile=options.seqfile, | |
| 162 seqfile2=options.seqfile2, | |
| 163 seqfile_good=options.seqfile_good, | |
| 164 seqfile_bad=options.seqfile_bad, | |
| 165 min_overlap=int(options.min_overlap), | |
| 166 max_mismatch=int(options.max_mismatch), | |
| 167 offset=int(options.offset)) | |
| 168 | |
| 169 | |
| 170 if __name__ == "__main__": | |
| 171 main() |
