Mercurial > repos > peterjc > fastq_paired_unpaired
annotate tools/fastq_paired_unpaired/fastq_paired_unpaired.py @ 6:f396701fbf32 draft
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
author | peterjc |
---|---|
date | Wed, 10 May 2017 13:28:59 -0400 |
parents | 09f9f0e29e47 |
children | 8cbc866b72ce |
rev | line source |
---|---|
4
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
1 #!/usr/bin/env python |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
2 """Divides a FASTQ into paired and single (orphan reads) as separate files. |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
3 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
4 The input file should be a valid FASTQ file which has been sorted so that |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
5 any partner forward+reverse reads are consecutive. The output files all |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
6 preserve this sort order. Pairings are recognised based on standard name |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
7 suffixes. See below or run the tool with no arguments for more details. |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
8 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
9 Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
10 Color Space should all work equally well). |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
11 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
12 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
13 (formerly SCRI), Scotland, UK. All rights reserved. |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
14 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
15 See accompanying text file for licence details (MIT license). |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
16 """ |
6
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
17 |
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
18 import re |
4
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
19 import sys |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
20 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
21 if "-v" in sys.argv or "--version" in sys.argv: |
6
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
22 print("Version 0.1.3") |
4
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
23 sys.exit(0) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
24 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
25 try: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
26 from Bio.SeqIO.QualityIO import FastqGeneralIterator |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
27 except ImportError: |
6
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
28 sys.exit("Biopython missing") |
4
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
29 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
30 msg = """Expect either 3 or 4 arguments, all FASTQ filenames. |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
31 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
32 If you want two output files, use four arguments: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
33 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
34 - Sorted input FASTQ filename, |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
35 - Output paired FASTQ filename (forward then reverse interleaved), |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
36 - Output singles FASTQ filename (orphan reads) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
37 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
38 If you want three output files, use five arguments: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
39 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
40 - Sorted input FASTQ filename, |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
41 - Output forward paired FASTQ filename, |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
42 - Output reverse paired FASTQ filename, |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
43 - Output singles FASTQ filename (orphan reads) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
44 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
45 The input file should be a valid FASTQ file which has been sorted so that |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
46 any partner forward+reverse reads are consecutive. The output files all |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
47 preserve this sort order. |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
48 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
49 Any reads where the forward/reverse naming suffix used is not recognised |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
50 are treated as orphan reads. The tool supports the /1 and /2 convention |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
51 originally used by Illumina, the .f and .r convention, and the Sanger |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
52 convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
53 for details), and the new Illumina convention where the reads have the |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
54 same identifier with the fragment at the start of the description, e.g. |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
55 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
56 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA |
6
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
57 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA |
4
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
58 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
59 Note that this does support multiple forward and reverse reads per template |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
60 (which is quite common with Sanger sequencing), e.g. this which is sorted |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
61 alphabetically: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
62 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
63 WTSI_1055_4p17.p1kapIBF |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
64 WTSI_1055_4p17.p1kpIBF |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
65 WTSI_1055_4p17.q1kapIBR |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
66 WTSI_1055_4p17.q1kpIBR |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
67 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
68 or this where the reads already come in pairs: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
69 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
70 WTSI_1055_4p17.p1kapIBF |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
71 WTSI_1055_4p17.q1kapIBR |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
72 WTSI_1055_4p17.p1kpIBF |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
73 WTSI_1055_4p17.q1kpIBR |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
74 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
75 both become: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
76 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
77 WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
78 WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
79 """ |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
80 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
81 if len(sys.argv) == 5: |
6
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
82 seq_format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:] |
4
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
83 elif len(sys.argv) == 6: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
84 pairs_fastq = None |
6
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
85 seq_format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:] |
4
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
86 else: |
6
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
87 sys.exit(msg) |
4
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
88 |
6
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
89 seq_format = seq_format.replace("fastq", "").lower() |
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
90 if not seq_format: |
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
91 seq_format = "sanger" # safe default |
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
92 elif seq_format not in ["sanger", "solexa", "illumina", "cssanger"]: |
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
93 sys.exit("Unrecognised format %s" % seq_format) |
4
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
94 |
6
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
95 # Cope with three widely used suffix naming conventions, |
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
96 # Illumina: /1 or /2 |
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
97 # Forward/reverse: .f or .r |
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
98 # Sanger, e.g. .p1k and .q1k |
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
99 # See http://staden.sourceforge.net/manual/pregap4_unix_50.html |
4
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
100 re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
101 re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
102 |
6
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
103 # assert re_f.match("demo/1") |
4
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
104 assert re_f.search("demo.f") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
105 assert re_f.search("demo.s1") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
106 assert re_f.search("demo.f1k") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
107 assert re_f.search("demo.p1") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
108 assert re_f.search("demo.p1k") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
109 assert re_f.search("demo.p1lk") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
110 assert re_r.search("demo/2") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
111 assert re_r.search("demo.r") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
112 assert re_r.search("demo.q1") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
113 assert re_r.search("demo.q1lk") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
114 assert not re_r.search("demo/1") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
115 assert not re_r.search("demo.f") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
116 assert not re_r.search("demo.p") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
117 assert not re_f.search("demo/2") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
118 assert not re_f.search("demo.r") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
119 assert not re_f.search("demo.q") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
120 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
121 re_illumina_f = re.compile(r"^[a-zA-Z0-9_:-]+ 1:.*$") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
122 re_illumina_r = re.compile(r"^[a-zA-Z0-9_:-]+ 2:.*$") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
123 assert re_illumina_f.match("HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
124 assert re_illumina_r.match("HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
125 assert not re_illumina_f.match("HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
126 assert not re_illumina_r.match("HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
127 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
128 FASTQ_TEMPLATE = "@%s\n%s\n+\n%s\n" |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
129 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
130 count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
131 in_handle = open(input_fastq) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
132 if pairs_fastq: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
133 pairs_f_handle = open(pairs_fastq, "w") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
134 pairs_r_handle = pairs_f_handle |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
135 else: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
136 pairs_f_handle = open(pairs_f_fastq, "w") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
137 pairs_r_handle = open(pairs_r_fastq, "w") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
138 singles_handle = open(singles_fastq, "w") |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
139 last_template, buffered_reads = None, [] |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
140 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
141 for title, seq, qual in FastqGeneralIterator(in_handle): |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
142 count += 1 |
6
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
143 name = title.split(None, 1)[0] |
4
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
144 is_forward = False |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
145 suffix = re_f.search(name) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
146 if suffix: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
147 # ============ |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
148 # Forward read |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
149 # ============ |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
150 template = name[:suffix.start()] |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
151 is_forward = True |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
152 elif re_illumina_f.match(title): |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
153 template = name # No suffix |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
154 is_forward = True |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
155 if is_forward: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
156 # print(name, "forward", template) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
157 forward += 1 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
158 if last_template == template: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
159 buffered_reads.append((title, seq, qual)) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
160 else: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
161 # Any old buffered reads are orphans |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
162 for old in buffered_reads: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
163 singles_handle.write(FASTQ_TEMPLATE % old) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
164 singles += 1 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
165 # Save this read in buffer |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
166 buffered_reads = [(title, seq, qual)] |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
167 last_template = template |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
168 else: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
169 is_reverse = False |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
170 suffix = re_r.search(name) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
171 if suffix: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
172 # ============ |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
173 # Reverse read |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
174 # ============ |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
175 template = name[:suffix.start()] |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
176 is_reverse = True |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
177 elif re_illumina_r.match(title): |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
178 template = name # No suffix |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
179 is_reverse = True |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
180 if is_reverse: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
181 # print(name, "reverse", template) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
182 reverse += 1 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
183 if last_template == template and buffered_reads: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
184 # We have a pair! |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
185 # If there are multiple buffered forward reads, want to pick |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
186 # the first one (although we could try and do something more |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
187 # clever looking at the suffix to match them up...) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
188 old = buffered_reads.pop(0) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
189 pairs_f_handle.write(FASTQ_TEMPLATE % old) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
190 pairs_r_handle.write(FASTQ_TEMPLATE % (title, seq, qual)) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
191 pairs += 2 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
192 else: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
193 # As this is a reverse read, this and any buffered read(s) are |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
194 # all orphans |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
195 for old in buffered_reads: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
196 singles_handle.write(FASTQ_TEMPLATE % old) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
197 singles += 1 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
198 buffered_reads = [] |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
199 singles_handle.write(FASTQ_TEMPLATE % (title, seq, qual)) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
200 singles += 1 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
201 last_template = None |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
202 else: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
203 # =========================== |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
204 # Neither forward nor reverse |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
205 # =========================== |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
206 singles_handle.write(FASTQ_TEMPLATE % (title, seq, qual)) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
207 singles += 1 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
208 neither += 1 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
209 for old in buffered_reads: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
210 singles_handle.write(FASTQ_TEMPLATE % old) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
211 singles += 1 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
212 buffered_reads = [] |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
213 last_template = None |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
214 if last_template: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
215 # Left over singles... |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
216 for old in buffered_reads: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
217 singles_handle.write(FASTQ_TEMPLATE % old) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
218 singles += 1 |
6
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
219 in_handle.close() |
4
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
220 singles_handle.close() |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
221 if pairs_fastq: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
222 pairs_f_handle.close() |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
223 assert pairs_r_handle.closed |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
224 else: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
225 pairs_f_handle.close() |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
226 pairs_r_handle.close() |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
227 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
228 if neither: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
229 print("%i reads (%i forward, %i reverse, %i neither), %i in pairs, %i as singles" |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
230 % (count, forward, reverse, neither, pairs, singles)) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
231 else: |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
232 print("%i reads (%i forward, %i reverse), %i in pairs, %i as singles" |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
233 % (count, forward, reverse, pairs, singles)) |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
234 |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
235 assert count == pairs + singles == forward + reverse + neither, \ |
09f9f0e29e47
v0.0.6 use format_source; v0.0.5 error handling & citation
peterjc
parents:
diff
changeset
|
236 "%i vs %i+%i=%i vs %i+%i+%i=%i" \ |
6
f396701fbf32
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
4
diff
changeset
|
237 % (count, pairs, singles, pairs + singles, forward, reverse, neither, forward + reverse + neither) |