0
|
1 """
|
|
2 Source: https://raw.githubusercontent.com/mmendez12/sync_paired_end_reads/master/sync_paired_end_reads/syncpairs.py
|
|
3 """
|
|
4 __author__ = 'mickael'
|
|
5 __author__ = 'mickael'
|
|
6
|
|
7 from Bio import SeqIO
|
|
8 from itertools import izip
|
|
9 import argparse
|
|
10
|
|
11
|
|
12 def adjust_name(reads1, reads2):
|
|
13 for r1, r2 in izip(reads1, reads2):
|
|
14 r2.name = r1.description
|
|
15 r2.description = r1.description
|
|
16 r2.id = r1.description
|
|
17 yield r2
|
|
18
|
|
19
|
|
20 def remove_space_from_sequence_header(read):
|
|
21 """ replaces spaces in a read's name by three underscores.
|
|
22 Args:
|
|
23 read: A SeqRecord object (see Biopython)
|
|
24
|
|
25 >>> from Bio.Seq import Seq
|
|
26 >>> from Bio.SeqRecord import SeqRecord
|
|
27 >>> from Bio.Alphabet import SingleLetterAlphabet
|
|
28
|
|
29 >>> read = SeqRecord(Seq("AAAAA",SingleLetterAlphabet),\
|
|
30 id="read A", name="read A", description="read A")
|
|
31
|
|
32 >>> print remove_space_from_sequence_header(read).name
|
|
33 read___A
|
|
34 """
|
|
35
|
|
36 read.description = read.description.replace(' ', '___')
|
|
37 read.name = read.description
|
|
38 read.id = read.description
|
|
39 return read
|
|
40
|
|
41
|
|
42 def next_matching_read(reads1, reads2):
|
|
43 """ return next read2 that matches read2
|
|
44 Args:
|
|
45 reads1: A generator that contains a SeqRecord (see Biopython)
|
|
46
|
|
47 reads2: A generator that contains a SeqRecord (see Biopython)
|
|
48
|
|
49
|
|
50 >>> from Bio.Seq import Seq
|
|
51 >>> from Bio.SeqRecord import SeqRecord
|
|
52 >>> from Bio.Alphabet import SingleLetterAlphabet
|
|
53
|
|
54 >>> reads1 = []
|
|
55 >>> reads2 = []
|
|
56
|
|
57 >>> reads1.append(SeqRecord(Seq("AAAAA",SingleLetterAlphabet),\
|
|
58 id="read A", name="read A", description="read A"))
|
|
59 >>> reads2.append(SeqRecord(Seq("TTTTT",SingleLetterAlphabet),\
|
|
60 id="read A", name="read A", description="read A"))
|
|
61
|
|
62 >>> reads1.append(SeqRecord(Seq("AAAAA",SingleLetterAlphabet),\
|
|
63 id="read B", name="read B", description="read B"))
|
|
64
|
|
65 >>> reads1.append(SeqRecord(Seq("AAAAA",SingleLetterAlphabet),\
|
|
66 id="read C", name="read C", description="read C"))
|
|
67 >>> reads2.append(SeqRecord(Seq("TTTTT",SingleLetterAlphabet),\
|
|
68 id="read C", name="read C", description="read C"))
|
|
69
|
|
70 >>> match = [read2 for read2 in next_matching_read(reads1, reads2)]
|
|
71 >>> print match[0].name
|
|
72 read A
|
|
73 >>> print match[1].name
|
|
74 read C
|
|
75 """
|
|
76
|
|
77 for read1 in reads1:
|
|
78 for read2 in reads2:
|
|
79 if read1.name == read2.name:
|
|
80 yield read2
|
|
81 break
|
|
82
|
|
83
|
|
84 def main():
|
|
85
|
|
86 parser = argparse.ArgumentParser()
|
|
87
|
|
88 parser.add_argument("reads1",help='modified reads')
|
|
89 parser.add_argument("reads2", help='reads to adjust')
|
|
90
|
|
91 parser.add_argument('reads1_output', help='output folder and filename. Note that the folder should already exist')
|
|
92 parser.add_argument('reads2_output', help='output folder and filename. Note that the folder should already exist')
|
|
93
|
|
94 args = parser.parse_args()
|
|
95
|
|
96 #we'll need to go through the reads1 multiple time and it can be a large file
|
|
97 #so it's better to use inline func that return a generator
|
|
98 _reads1 = lambda: (rec for rec in SeqIO.parse(args.reads1, 'fastq'))
|
|
99 _reads2 = (rec for rec in SeqIO.parse(args.reads2, 'fastq'))
|
|
100
|
|
101 matching_reads2 = (read2 for read2 in next_matching_read(_reads1(), _reads2))
|
|
102 synced_reads2_names = (read2 for read2 in adjust_name(_reads1(), matching_reads2))
|
|
103
|
|
104 final_reads1 = (remove_space_from_sequence_header(r1) for r1 in _reads1())
|
|
105 final_reads2 = (remove_space_from_sequence_header(r2) for r2 in synced_reads2_names)
|
|
106
|
|
107 SeqIO.write(final_reads1, args.reads1_output, "fastq")
|
|
108 SeqIO.write(final_reads2, args.reads2_output, "fastq")
|
|
109
|
|
110 if __name__ == '__main__':
|
|
111 main()
|