Mercurial > repos > brenninc > sync_paired_end_reads
comparison syncpairs.py @ 0:79682a423af7 draft
Uploaded
| author | brenninc |
|---|---|
| date | Mon, 09 May 2016 05:15:01 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:79682a423af7 |
|---|---|
| 1 """ | |
| 2 Source: https://raw.githubusercontent.com/mmendez12/sync_paired_end_reads/master/sync_paired_end_reads/syncpairs.py | |
| 3 """ | |
| 4 __author__ = 'mickael' | |
| 5 __author__ = 'mickael' | |
| 6 | |
| 7 from Bio import SeqIO | |
| 8 from itertools import izip | |
| 9 import argparse | |
| 10 | |
| 11 | |
| 12 def adjust_name(reads1, reads2): | |
| 13 for r1, r2 in izip(reads1, reads2): | |
| 14 r2.name = r1.description | |
| 15 r2.description = r1.description | |
| 16 r2.id = r1.description | |
| 17 yield r2 | |
| 18 | |
| 19 | |
| 20 def remove_space_from_sequence_header(read): | |
| 21 """ replaces spaces in a read's name by three underscores. | |
| 22 Args: | |
| 23 read: A SeqRecord object (see Biopython) | |
| 24 | |
| 25 >>> from Bio.Seq import Seq | |
| 26 >>> from Bio.SeqRecord import SeqRecord | |
| 27 >>> from Bio.Alphabet import SingleLetterAlphabet | |
| 28 | |
| 29 >>> read = SeqRecord(Seq("AAAAA",SingleLetterAlphabet),\ | |
| 30 id="read A", name="read A", description="read A") | |
| 31 | |
| 32 >>> print remove_space_from_sequence_header(read).name | |
| 33 read___A | |
| 34 """ | |
| 35 | |
| 36 read.description = read.description.replace(' ', '___') | |
| 37 read.name = read.description | |
| 38 read.id = read.description | |
| 39 return read | |
| 40 | |
| 41 | |
| 42 def next_matching_read(reads1, reads2): | |
| 43 """ return next read2 that matches read2 | |
| 44 Args: | |
| 45 reads1: A generator that contains a SeqRecord (see Biopython) | |
| 46 | |
| 47 reads2: A generator that contains a SeqRecord (see Biopython) | |
| 48 | |
| 49 | |
| 50 >>> from Bio.Seq import Seq | |
| 51 >>> from Bio.SeqRecord import SeqRecord | |
| 52 >>> from Bio.Alphabet import SingleLetterAlphabet | |
| 53 | |
| 54 >>> reads1 = [] | |
| 55 >>> reads2 = [] | |
| 56 | |
| 57 >>> reads1.append(SeqRecord(Seq("AAAAA",SingleLetterAlphabet),\ | |
| 58 id="read A", name="read A", description="read A")) | |
| 59 >>> reads2.append(SeqRecord(Seq("TTTTT",SingleLetterAlphabet),\ | |
| 60 id="read A", name="read A", description="read A")) | |
| 61 | |
| 62 >>> reads1.append(SeqRecord(Seq("AAAAA",SingleLetterAlphabet),\ | |
| 63 id="read B", name="read B", description="read B")) | |
| 64 | |
| 65 >>> reads1.append(SeqRecord(Seq("AAAAA",SingleLetterAlphabet),\ | |
| 66 id="read C", name="read C", description="read C")) | |
| 67 >>> reads2.append(SeqRecord(Seq("TTTTT",SingleLetterAlphabet),\ | |
| 68 id="read C", name="read C", description="read C")) | |
| 69 | |
| 70 >>> match = [read2 for read2 in next_matching_read(reads1, reads2)] | |
| 71 >>> print match[0].name | |
| 72 read A | |
| 73 >>> print match[1].name | |
| 74 read C | |
| 75 """ | |
| 76 | |
| 77 for read1 in reads1: | |
| 78 for read2 in reads2: | |
| 79 if read1.name == read2.name: | |
| 80 yield read2 | |
| 81 break | |
| 82 | |
| 83 | |
| 84 def main(): | |
| 85 | |
| 86 parser = argparse.ArgumentParser() | |
| 87 | |
| 88 parser.add_argument("reads1",help='modified reads') | |
| 89 parser.add_argument("reads2", help='reads to adjust') | |
| 90 | |
| 91 parser.add_argument('reads1_output', help='output folder and filename. Note that the folder should already exist') | |
| 92 parser.add_argument('reads2_output', help='output folder and filename. Note that the folder should already exist') | |
| 93 | |
| 94 args = parser.parse_args() | |
| 95 | |
| 96 #we'll need to go through the reads1 multiple time and it can be a large file | |
| 97 #so it's better to use inline func that return a generator | |
| 98 _reads1 = lambda: (rec for rec in SeqIO.parse(args.reads1, 'fastq')) | |
| 99 _reads2 = (rec for rec in SeqIO.parse(args.reads2, 'fastq')) | |
| 100 | |
| 101 matching_reads2 = (read2 for read2 in next_matching_read(_reads1(), _reads2)) | |
| 102 synced_reads2_names = (read2 for read2 in adjust_name(_reads1(), matching_reads2)) | |
| 103 | |
| 104 final_reads1 = (remove_space_from_sequence_header(r1) for r1 in _reads1()) | |
| 105 final_reads2 = (remove_space_from_sequence_header(r2) for r2 in synced_reads2_names) | |
| 106 | |
| 107 SeqIO.write(final_reads1, args.reads1_output, "fastq") | |
| 108 SeqIO.write(final_reads2, args.reads2_output, "fastq") | |
| 109 | |
| 110 if __name__ == '__main__': | |
| 111 main() |
