Mercurial > repos > petr-novak > re_utils
diff fasta_interlacer.py @ 0:a4cd8608ef6b draft
Uploaded
author | petr-novak |
---|---|
date | Mon, 01 Apr 2019 07:56:36 -0400 |
parents | |
children | d397f5a85464 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_interlacer.py Mon Apr 01 07:56:36 2019 -0400 @@ -0,0 +1,200 @@ +#!/usr/bin/env python +''' interlacing two fastq sequences''' +import sys + +def readSingleSeq(file): + ''' read single seq from fasta file''' + line = file.readline() + if not line: + return False # end of file + if line[0] != ">": + raise Exception("no header on the first line") + seqname = line[1:].strip() + seq = "" + # read sequences + while True: + last_pos = file.tell() + line = file.readline() + if not line: + break + if line[0] == ">": + file.seek(last_pos) + break + seq = seq + line.strip() + return {'name': seqname, 'sequence': seq} + + +def writeSingleSeq(fileobject, seq): + ''' write single sequence to fasta file''' + fileobject.write(">") + fileobject.write(seq['name'] + "\n") + fileobject.write(seq['sequence'] + "\n") + + +def main(): + from optparse import OptionParser + parser = OptionParser() + parser.add_option("-a", + "--fasta_file_A", + dest="seqfileA", + help="input sequences in fasta format") + parser.add_option("-b", + "--fasta_file_B", + dest="seqfileB", + help="input sequences in fasta format") + parser.add_option("-p", + "--fasta_file_pairs", + dest="seqfile_pairs", + help="output file with paired sequences") + parser.add_option("-x", + "--fasta_file_singles", + dest="seqfile_singles", + help="output file with single sequences") + options, args = parser.parse_args() + + # Input files + fA = open(options.seqfileA, 'r') + fB = open(options.seqfileB, 'r') + # Output files + if options.seqfile_pairs: + fPairs = open(options.seqfile_pairs, 'w') + else: + fPairs = open(options.seqfileA + ".pairs", 'w') + + if options.seqfile_singles: + single = open(options.seqfile_singles, "w") + else: + single = open(options.seqfileA + ".single", "w") + + + sA1 = readSingleSeq(fA) + sB1 = readSingleSeq(fB) + if not sA1 or not sB1: + raise Exception("\nEmpty sequence on input, nothing to interlace!\n") + charA = sA1['name'][-1] + charB = sB1['name'][-1] + # validate sequence names + if charA == charB: + sys.stderr.write( + "last character of sequence id must be used for distinguishing pairs!") + exit(1) + # check first thousand! + for i in range(1000): + seqA = readSingleSeq(fA) + seqB = readSingleSeq(fB) + if (not seqA) or (not seqB): + # end of file: + if i == 0: + sys.stderr.write("input file is empty") + exit(1) + else: + break + if seqA['name'][-1] == charA and seqB['name'][-1] == charB: + continue + else: + sys.stderr.write( + "last character of sequence id must be used for distinguishing pairs!") + exit(1) + + fA.seek(0) + fB.seek(0) + + buffA = {} + buffB = {} + buffA_names = [] + buffB_names = [] + + while True: + + seqA = readSingleSeq(fA) + seqB = readSingleSeq(fB) + + if not seqA and not seqB: + break # end of file + + ## validation and direct checking only if not end of files + if seqA and seqB: + #validate: + if not (seqA['name'][-1] == charA and seqB['name'][-1] == charB): + sys.stderr.write( + "last character of sequence id must be used for distinguishing pairs!") + exit(1) + + # check if current seqs are pairs + if seqA['name'][:-1] == seqB['name'][:-1]: + writeSingleSeq(fPairs, seqA) + writeSingleSeq(fPairs, seqB) + continue + + ### compare whith buffers + ### seqA vs buffB + if seqA: + if seqA["name"][:-1] in buffB: + writeSingleSeq(fPairs, seqA) + seqtmp = {"name": seqA["name"][:-1] + charB, + "sequence": buffB[seqA["name"][:-1]]} + writeSingleSeq(fPairs, seqtmp) + # can I empty buffA ??? + for i in buffA_names: + seqtmp = {"name": i + charA, "sequence": buffA[i]} + writeSingleSeq(single, seqtmp) + buffA = {} + buffA_names = [] + + j = 0 + for i in buffB_names: + seqtmp = {"name": i + charB, "sequence": buffB[i]} + del buffB[i] + j += 1 + if i == seqA["name"][:-1]: + del buffB_names[0:j] + break + else: + writeSingleSeq(single, seqtmp) + else: + buffA[seqA["name"][:-1]] = seqA['sequence'] + buffA_names.append(seqA["name"][:-1]) + + ### seqA vs buffB + if seqB: + if seqB["name"][:-1] in buffA: + seqtmp = {"name": seqB["name"][:-1] + charA, + "sequence": buffA[seqB["name"][:-1]]} + writeSingleSeq(fPairs, seqtmp) + writeSingleSeq(fPairs, seqB) + # can I empty buffB ??? + for i in buffB_names: + seqtmp = {"name": i + charB, "sequence": buffB[i]} + writeSingleSeq(single, seqtmp) + buffB = {} + buffB_names = [] + + j = 0 + for i in buffA_names: + seqtmp = {"name": i + charA, "sequence": buffA[i]} + del buffA[i] + j += 1 + if i == seqB["name"][:-1]: + del buffA_names[0:j] + break + else: + writeSingleSeq(single, seqtmp) + + else: + buffB[seqB["name"][:-1]] = seqB['sequence'] + buffB_names.append(seqB["name"][:-1]) + fA.close() + fB.close() + fPairs.close() + # write rest of singles: + for i in buffA: + seqtmp = {"name": i + charA, "sequence": buffA[i]} + writeSingleSeq(single, seqtmp) + for i in buffB: + seqtmp = {"name": i + charB, "sequence": buffB[i]} + writeSingleSeq(single, seqtmp) + single.close() + + +if __name__ == "__main__": + main()