view syncpairs.py @ 1:622f0b43ce57 draft

Uploaded
author brenninc
date Mon, 09 May 2016 08:31:12 -0400
parents 79682a423af7
children
line wrap: on
line source

"""
Source: https://raw.githubusercontent.com/mmendez12/sync_paired_end_reads/master/sync_paired_end_reads/syncpairs.py
"""
# Original author of the upstream script (the assignment was previously duplicated).
__author__ = 'mickael'

from Bio import SeqIO
from itertools import izip
import argparse


def adjust_name(reads1, reads2):
    """Rename each read of ``reads2`` after its positional partner in ``reads1``.

    Both inputs are consumed lazily and in lockstep: the first item of
    ``reads1`` is paired with the first item of ``reads2``, and so on.
    Iteration stops as soon as either input runs out.

    Args:
        reads1: Iterable of records exposing a ``description`` attribute.
        reads2: Iterable of records whose ``name``, ``description`` and
            ``id`` attributes are overwritten in place.

    Yields:
        The records of ``reads2`` (the same objects, mutated), each carrying
        the description of the corresponding ``reads1`` record in all three
        header attributes.
    """
    # Manual lockstep iteration instead of itertools.izip: izip only exists
    # on Python 2, while this form stays lazy on both Python 2 and Python 3.
    partners = iter(reads2)
    for r1 in reads1:
        try:
            r2 = next(partners)
        except StopIteration:
            # reads2 is exhausted: nothing left to rename.
            return
        r2.name = r1.description
        r2.description = r1.description
        r2.id = r1.description
        yield r2


def remove_space_from_sequence_header(read):
    """Make a read's header free of spaces.

    Every space in ``read.description`` is replaced by three underscores and
    the resulting string is stored in ``description``, ``name`` and ``id``.

    Args:
        read: A record with ``description``, ``name`` and ``id`` attributes,
            e.g. a Biopython SeqRecord. It is modified in place.

    Returns:
        The same ``read`` object, after modification.

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord

    >>> read = SeqRecord(Seq("AAAAA"), id="read A", name="read A",
    ...                  description="read A")

    >>> print(remove_space_from_sequence_header(read).name)
    read___A
    """
    header = '___'.join(read.description.split(' '))
    # Same assignment order as before: description first, then name, then id.
    for attribute in ('description', 'name', 'id'):
        setattr(read, attribute, header)
    return read


def next_matching_read(reads1, reads2):
    """Yield, for each read of ``reads1``, the next read of ``reads2`` with the same name.

    For every record of ``reads1`` the function scans ``reads2`` until it
    finds a record whose ``name`` is equal, yields that record and moves on
    to the next record of ``reads1``. A ``reads1`` record without a match
    yields nothing.

    How ``reads2`` is scanned depends on what is passed in:
      * a re-iterable container (such as a list) is scanned from its start
        for every ``reads1`` record;
      * a one-shot iterator (such as a generator) is scanned from where the
        previous search stopped, so records skipped during a search are not
        seen again, and an unmatched ``reads1`` record exhausts it.

    Args:
        reads1: Iterable of records with a ``name`` attribute
            (e.g. Biopython SeqRecord objects).
        reads2: Iterable of records with a ``name`` attribute
            (e.g. Biopython SeqRecord objects).

    Yields:
        Records taken from ``reads2``.

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord

    >>> reads1 = [SeqRecord(Seq("AAAAA"), id=n, name=n, description=n)
    ...           for n in ("read A", "read B", "read C")]
    >>> reads2 = [SeqRecord(Seq("TTTTT"), id=n, name=n, description=n)
    ...           for n in ("read A", "read C")]

    >>> match = list(next_matching_read(reads1, reads2))
    >>> print(match[0].name)
    read A
    >>> print(match[1].name)
    read C
    """
    for wanted in reads1:
        # Pull candidates from reads2 until the first one with the wanted name;
        # None signals that the scan ended without finding one.
        mate = next(
            (candidate for candidate in reads2
             if wanted.name == candidate.name),
            None,
        )
        if mate is not None:
            yield mate


def main():
    """Command-line entry point: synchronise a mate file with a set of reads.

    Positional arguments (parsed with argparse):
        reads1: FASTQ file of the reference reads.
        reads2: FASTQ file of the reads to adjust.
        reads1_output: destination for the rewritten reads1 records.
        reads2_output: destination for the matched, renamed reads2 records.

    For each reads1 record the matching reads2 record is looked up by name
    (next_matching_read), given the reads1 header (adjust_name), and both
    sets are written out with spaces in the headers replaced
    (remove_space_from_sequence_header).

    NOTE(review): adjust_name pairs reads1 with the matched mates purely by
    position, so this appears to assume that every reads1 record has a mate
    in reads2 and that both files share the same order -- confirm.
    """
    cli = argparse.ArgumentParser()

    cli.add_argument("reads1", help='modified reads')
    cli.add_argument("reads2", help='reads to adjust')

    output_help = 'output folder and filename. Note that the folder should already exist'
    for destination in ('reads1_output', 'reads2_output'):
        cli.add_argument(destination, help=output_help)

    options = cli.parse_args()

    def fresh_reads1():
        # reads1 is walked three times below and may be a large file, so it
        # is re-parsed on demand rather than being held in memory.
        return SeqIO.parse(options.reads1, 'fastq')

    reads2 = SeqIO.parse(options.reads2, 'fastq')

    # Lazy pipeline: find the mates, give them the reads1 headers, then
    # strip spaces from the headers of both sets.
    mates = next_matching_read(fresh_reads1(), reads2)
    renamed_mates = adjust_name(fresh_reads1(), mates)

    cleaned_reads1 = (remove_space_from_sequence_header(record)
                      for record in fresh_reads1())
    cleaned_reads2 = (remove_space_from_sequence_header(record)
                      for record in renamed_mates)

    SeqIO.write(cleaned_reads1, options.reads1_output, "fastq")
    SeqIO.write(cleaned_reads2, options.reads2_output, "fastq")

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()