# HG changeset patch
# User brenninc
# Date 1462785301 14400
# Node ID 79682a423af732b87d2bef89bbfa994793b1a8f8
# Uploaded
#
# diff -r 000000000000 -r 79682a423af7 syncpairs.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/syncpairs.py Mon May 09 05:15:01 2016 -0400
"""
Source: https://raw.githubusercontent.com/mmendez12/sync_paired_end_reads/master/sync_paired_end_reads/syncpairs.py

Synchronise two paired-end FASTQ files: keep only the reads of the second
file whose partner is present in the first file, copy the partner's header
onto them, and replace spaces in every header by three underscores.
"""
__author__ = 'mickael'

import argparse

try:
    # Python 2: the lazy variant of zip lives in itertools.
    from itertools import izip
except ImportError:
    # Python 3: the built-in zip is already lazy and izip no longer exists.
    izip = zip


def adjust_name(reads1, reads2):
    """Yield the reads of ``reads2`` relabelled with the headers of ``reads1``.

    The two iterables are consumed in lockstep and iteration stops as soon
    as either one is exhausted.  Each record coming from ``reads2`` is
    modified in place: its ``name``, ``description`` and ``id`` are all set
    to the ``description`` of the record at the same position in ``reads1``.

    Args:
        reads1: An iterable of SeqRecord-like objects providing the headers.

        reads2: An iterable of SeqRecord-like objects to relabel.

    Yields:
        The (mutated) records of ``reads2``, in order.
    """
    for r1, r2 in izip(reads1, reads2):
        header = r1.description
        r2.name = header
        r2.description = header
        r2.id = header
        yield r2


def remove_space_from_sequence_header(read):
    """Replace every space in a read's header by three underscores.

    The record is modified in place and also returned: ``description`` is
    rewritten first, then ``name`` and ``id`` are set to the new description.

    Args:
        read: A SeqRecord object (see Biopython)

    Returns:
        The same ``read`` object.

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord

    >>> read = SeqRecord(Seq("AAAAA"), id="read A", name="read A",
    ...                  description="read A")

    >>> print(remove_space_from_sequence_header(read).name)
    read___A
    """
    read.description = read.description.replace(' ', '___')
    read.name = read.description
    read.id = read.description
    return read


def next_matching_read(reads1, reads2):
    """Yield, for each read of ``reads1``, the next read of ``reads2`` that
    has the same ``name``.

    Note that the result depends on what kind of iterable ``reads2`` is.  A
    one-shot iterator (e.g. a generator) is never rewound, so the search for
    the next match resumes where the previous one stopped; a read of
    ``reads1`` that has no partner then consumes the rest of ``reads2`` and
    no later read can be matched.  A re-iterable such as a list is scanned
    from its beginning for every read of ``reads1``.

    Args:
        reads1: An iterable of SeqRecord objects (see Biopython)

        reads2: An iterable of SeqRecord objects (see Biopython)

    Yields:
        The records of ``reads2`` whose name matches a record of ``reads1``.

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord

    >>> reads1 = []
    >>> reads2 = []

    >>> reads1.append(SeqRecord(Seq("AAAAA"), id="read A", name="read A",
    ...                         description="read A"))
    >>> reads2.append(SeqRecord(Seq("TTTTT"), id="read A", name="read A",
    ...                         description="read A"))

    >>> reads1.append(SeqRecord(Seq("AAAAA"), id="read B", name="read B",
    ...                         description="read B"))

    >>> reads1.append(SeqRecord(Seq("AAAAA"), id="read C", name="read C",
    ...                         description="read C"))
    >>> reads2.append(SeqRecord(Seq("TTTTT"), id="read C", name="read C",
    ...                         description="read C"))

    >>> match = [read2 for read2 in next_matching_read(reads1, reads2)]
    >>> print(match[0].name)
    read A
    >>> print(match[1].name)
    read C
    """
    for read1 in reads1:
        for read2 in reads2:
            if read1.name == read2.name:
                yield read2
                break


def main(argv=None):
    """Command-line entry point: write the two synchronised FASTQ files.

    Args:
        argv: Optional list of command-line arguments; when ``None``
            (the default) argparse reads ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("reads1", help='modified reads')
    parser.add_argument("reads2", help='reads to adjust')

    parser.add_argument('reads1_output', help='output folder and filename. Note that the folder should already exist')
    parser.add_argument('reads2_output', help='output folder and filename. Note that the folder should already exist')

    args = parser.parse_args(argv)

    # Imported here rather than at module level so that the helper
    # generators above can be imported (and --help can be shown) without
    # Biopython being installed.
    from Bio import SeqIO

    def parse_reads1():
        # reads1 is walked three times below and can be a large file, so it
        # is re-parsed lazily on each pass instead of being held in memory.
        return SeqIO.parse(args.reads1, 'fastq')

    # reads2 is deliberately a single one-shot iterator: the search for each
    # partner resumes where the previous one stopped.
    matching_reads2 = next_matching_read(parse_reads1(), SeqIO.parse(args.reads2, 'fastq'))
    synced_reads2 = adjust_name(parse_reads1(), matching_reads2)

    final_reads1 = (remove_space_from_sequence_header(r1) for r1 in parse_reads1())
    final_reads2 = (remove_space_from_sequence_header(r2) for r2 in synced_reads2)

    SeqIO.write(final_reads1, args.reads1_output, "fastq")
    SeqIO.write(final_reads2, args.reads2_output, "fastq")


if __name__ == '__main__':
    main()

# Next file of the changeset (its content continues on the following line):
# diff -r 000000000000 -r 79682a423af7 syncpairs.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/syncpairs.xml Mon May 09 05:15:01 2016 -0400
# @@ -0,0 +1,65 @@
# + Runs sync pairs on two fastq
files. + + biopython + + + + + + + + syncpairs.py $reads1 $reads2 $reads1_output $reads2_output + + + + + + + + + + + + + + + + + + + + + + + + + @misc{ + sync_paired_end_reads, + author = {Mickaël Mendez}, + title = {sync_paired_end_reads on Github}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/mmendez12/sync_paired_end_reads}, + commit = {afd9bf092ed55a65f90f28f40d6230c5fe849223} + } + + + diff -r 000000000000 -r 79682a423af7 test-data/read1.fq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/read1.fq Mon May 09 05:15:01 2016 -0400 @@ -0,0 +1,328 @@ +@NB500968:8:H5CFGAFXX:1:11101:15939:1071 1:N:0:TCCTGAGC+TACTCCTT;FP:23586;RQ:34.63 +GGCACGTCAGATGAGTATAAGAGACAGCAAATAGGTATAGGGGGCACGTCAGATGAGTAA ++ +EAEEEEEEA///A/EAEEE/EAEEEEEEEEEEAEEE//E/AA/E6AE/EE/ + + + + +