Mercurial > repos > petr-novak > re_utils
diff pairScan.py @ 3:e320ef2d105a draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 05 Sep 2019 09:04:56 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pairScan.py Thu Sep 05 09:04:56 2019 -0400 @@ -0,0 +1,171 @@ +#!/usr/bin/env python +import sys +import os +from optparse import OptionParser +import Levenshtein + + +class Error(Exception): + """Base class for exceptions in this module.""" + pass + + +def readSingleSeq(file): + line = file.readline() + if not line: + return False # end of file + if line[0] != ">": + raise Error("no header on the first line") + seqname = line[1:].strip() + seq = "" + # read sequences + while True: + last_pos = file.tell() + line = file.readline() + if not line: + break + if line[0] == ">": + file.seek(last_pos) + break + seq = seq + line.strip() + return {'name': seqname, 'sequence': seq} + + +def writeSingleSeq(fileobject, seq): + fileobject.write(">") + fileobject.write(seq['name'] + "\n") + fileobject.write(seq['sequence'] + "\n") + + +def comparePairs(seq1, seq2, max_mismatch=3, offset=5): + s1 = seq1['sequence'].lower() + s2 = seq2['sequence'].lower()[::-1] + m = 0 + intab = "ctagn" + outtab = "gatcn" + trantab = str.maketrans(intab, outtab) + s2 = s2.translate(trantab) + s1 = "-" * offset + s1 + s2 = s2 + "-" * offset + n1 = len(s1) + n2 = len(s2) + m = 0 + for i in range(1, min(n1 + 1, n2 + 1)): + #remove tails is any: + ss1 = s1[n1 - i:n1] + ss2 = s2[0:i] + added = ss1.count("-") + ss2.count("-") + d = Levenshtein.hamming(ss1, ss2) - added + if 100.0 * d / i <= max_mismatch: + m = max(m, i - d - added) + return m + + +def split_file(filename, N, min_chunk=2): + f1 = open(filename, 'r') + filenames = [filename + "." + str(i) for i in range(N)] + f2 = list(map(open, filenames, 'w' * N)) + while True: + for i in f2: + for j in range(min_chunk): + line = f1.readline() + if not line: + [i.close() for i in f2] + f1.close() + return filenames + i.write(line) + + +def find_overlapping_sequences(seqfile, + seqfile2=None, + seqfile_good="", + seqfile_bad="", + min_overlap=30, + max_mismatch=2, + offset=5): + ''' return id ove overlaping pairs - only first id is returned ''' + # default names - if empty + if seqfile_good == "": + seqfile_good = seqfile + ".pass" + if seqfile_bad == "": + seqfile_bad = seqfile + ".bad" + + minscore = min_overlap * 2 + + fgood = open(seqfile_good, 'w') + fbad = open(seqfile_bad, 'w') + f = open(seqfile, 'r') + if seqfile2: + f2 = open(seqfile2) + else: + f2 = f + while True: + seq1 = readSingleSeq(f) + seq2 = readSingleSeq(f2) + if not seq1 or not seq2: + break # end of file + score = comparePairs(seq1, seq2, max_mismatch, offset=offset) + if score > min_overlap: + writeSingleSeq(fbad, seq1) + writeSingleSeq(fbad, seq2) + else: + writeSingleSeq(fgood, seq1) + writeSingleSeq(fgood, seq2) + f.close() + if not f2.closed: + f2.close + fgood.close() + fbad.close() + + +def main(): + parser = OptionParser() + parser.add_option("-f", + "--fasta_file", + dest="seqfile", + help="input sequences in fasta format") + parser.add_option( + "-r", + "--fasta_file2", + default=None, + dest="seqfile2", + help= + "input sequences in fasta format, second file should be specified if pairs are not interlaced, all pairs must be complete!") + parser.add_option("-p", + "--fasta_file_pass", + dest="seqfile_good", + help="output file with good sequences", + default='') + parser.add_option("-b", + "--fasta_file_bad", + dest="seqfile_bad", + help="output file with bad sequences", + default='') + parser.add_option("-o", + "--minimal_overlap", + dest="min_overlap", + help="minimal overlap between pair ends", + default='30') + parser.add_option( + "-m", + "--max_mismatch", + dest="max_mismatch", + help="maximum number of mismatches in overlap per 100 nt", + default='2') + parser.add_option("-s", + "--offset", + dest="offset", + help="maximum offset", + default='5') + options, args = parser.parse_args() + find_overlapping_sequences(seqfile=options.seqfile, + seqfile2=options.seqfile2, + seqfile_good=options.seqfile_good, + seqfile_bad=options.seqfile_bad, + min_overlap=int(options.min_overlap), + max_mismatch=int(options.max_mismatch), + offset=int(options.offset)) + + +if __name__ == "__main__": + main()