Mercurial > repos > peterjc > fastq_paired_unpaired
comparison tools/fastq_paired_unpaired/fastq_paired_unpaired.py @ 6:f396701fbf32 draft
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
author | peterjc |
---|---|
date | Wed, 10 May 2017 13:28:59 -0400 |
parents | 09f9f0e29e47 |
children | 8cbc866b72ce |
comparison
equal
deleted
inserted
replaced
5:b38bbcbd458d | 6:f396701fbf32 |
---|---|
12 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute | 12 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute |
13 (formerly SCRI), Scotland, UK. All rights reserved. | 13 (formerly SCRI), Scotland, UK. All rights reserved. |
14 | 14 |
15 See accompanying text file for licence details (MIT license). | 15 See accompanying text file for licence details (MIT license). |
16 """ | 16 """ |
17 import os | 17 |
18 import re | |
18 import sys | 19 import sys |
19 import re | |
20 | 20 |
21 if "-v" in sys.argv or "--version" in sys.argv: | 21 if "-v" in sys.argv or "--version" in sys.argv: |
22 print("Version 0.1.0") | 22 print("Version 0.1.3") |
23 sys.exit(0) | 23 sys.exit(0) |
24 | |
25 def sys_exit(msg, err=1): | |
26 sys.stderr.write(msg.rstrip() + "\n") | |
27 sys.exit(err) | |
28 | 24 |
29 try: | 25 try: |
30 from Bio.SeqIO.QualityIO import FastqGeneralIterator | 26 from Bio.SeqIO.QualityIO import FastqGeneralIterator |
31 except ImportError: | 27 except ImportError: |
32 sys_exit("Biopython missing") | 28 sys.exit("Biopython missing") |
33 | 29 |
34 msg = """Expect either 3 or 4 arguments, all FASTQ filenames. | 30 msg = """Expect either 3 or 4 arguments, all FASTQ filenames. |
35 | 31 |
36 If you want two output files, use four arguments: | 32 If you want two output files, use four arguments: |
37 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) | 33 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) |
56 convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html | 52 convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html |
57 for details), and the new Illumina convention where the reads have the | 53 for details), and the new Illumina convention where the reads have the |
58 same identifier with the fragment at the start of the description, e.g. | 54 same identifier with the fragment at the start of the description, e.g. |
59 | 55 |
60 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA | 56 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA |
61 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA | 57 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA |
62 | 58 |
63 Note that this does support multiple forward and reverse reads per template | 59 Note that this does support multiple forward and reverse reads per template |
64 (which is quite common with Sanger sequencing), e.g. this which is sorted | 60 (which is quite common with Sanger sequencing), e.g. this which is sorted |
65 alphabetically: | 61 alphabetically: |
66 | 62 |
81 WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR | 77 WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR |
82 WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR | 78 WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR |
83 """ | 79 """ |
84 | 80 |
85 if len(sys.argv) == 5: | 81 if len(sys.argv) == 5: |
86 format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:] | 82 seq_format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:] |
87 elif len(sys.argv) == 6: | 83 elif len(sys.argv) == 6: |
88 pairs_fastq = None | 84 pairs_fastq = None |
89 format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:] | 85 seq_format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:] |
90 else: | 86 else: |
91 sys_exit(msg) | 87 sys.exit(msg) |
92 | 88 |
93 format = format.replace("fastq", "").lower() | 89 seq_format = seq_format.replace("fastq", "").lower() |
94 if not format: | 90 if not seq_format: |
95 format="sanger" #safe default | 91 seq_format = "sanger" # safe default |
96 elif format not in ["sanger","solexa","illumina","cssanger"]: | 92 elif seq_format not in ["sanger", "solexa", "illumina", "cssanger"]: |
97 sys_exit("Unrecognised format %s" % format) | 93 sys.exit("Unrecognised format %s" % seq_format) |
98 | 94 |
99 #Cope with three widely used suffix naming convensions, | 95 # Cope with three widely used suffix naming convensions, |
100 #Illumina: /1 or /2 | 96 # Illumina: /1 or /2 |
101 #Forward/revered: .f or .r | 97 # Forward/revered: .f or .r |
102 #Sanger, e.g. .p1k and .q1k | 98 # Sanger, e.g. .p1k and .q1k |
103 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html | 99 # See http://staden.sourceforge.net/manual/pregap4_unix_50.html |
104 re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") | 100 re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") |
105 re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") | 101 re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") |
106 | 102 |
107 #assert re_f.match("demo/1") | 103 # assert re_f.match("demo/1") |
108 assert re_f.search("demo.f") | 104 assert re_f.search("demo.f") |
109 assert re_f.search("demo.s1") | 105 assert re_f.search("demo.s1") |
110 assert re_f.search("demo.f1k") | 106 assert re_f.search("demo.f1k") |
111 assert re_f.search("demo.p1") | 107 assert re_f.search("demo.p1") |
112 assert re_f.search("demo.p1k") | 108 assert re_f.search("demo.p1k") |
142 singles_handle = open(singles_fastq, "w") | 138 singles_handle = open(singles_fastq, "w") |
143 last_template, buffered_reads = None, [] | 139 last_template, buffered_reads = None, [] |
144 | 140 |
145 for title, seq, qual in FastqGeneralIterator(in_handle): | 141 for title, seq, qual in FastqGeneralIterator(in_handle): |
146 count += 1 | 142 count += 1 |
147 name = title.split(None,1)[0] | 143 name = title.split(None, 1)[0] |
148 is_forward = False | 144 is_forward = False |
149 suffix = re_f.search(name) | 145 suffix = re_f.search(name) |
150 if suffix: | 146 if suffix: |
151 # ============ | 147 # ============ |
152 # Forward read | 148 # Forward read |
218 if last_template: | 214 if last_template: |
219 # Left over singles... | 215 # Left over singles... |
220 for old in buffered_reads: | 216 for old in buffered_reads: |
221 singles_handle.write(FASTQ_TEMPLATE % old) | 217 singles_handle.write(FASTQ_TEMPLATE % old) |
222 singles += 1 | 218 singles += 1 |
223 in_handle.close | 219 in_handle.close() |
224 singles_handle.close() | 220 singles_handle.close() |
225 if pairs_fastq: | 221 if pairs_fastq: |
226 pairs_f_handle.close() | 222 pairs_f_handle.close() |
227 assert pairs_r_handle.closed | 223 assert pairs_r_handle.closed |
228 else: | 224 else: |
236 print("%i reads (%i forward, %i reverse), %i in pairs, %i as singles" | 232 print("%i reads (%i forward, %i reverse), %i in pairs, %i as singles" |
237 % (count, forward, reverse, pairs, singles)) | 233 % (count, forward, reverse, pairs, singles)) |
238 | 234 |
239 assert count == pairs + singles == forward + reverse + neither, \ | 235 assert count == pairs + singles == forward + reverse + neither, \ |
240 "%i vs %i+%i=%i vs %i+%i+%i=%i" \ | 236 "%i vs %i+%i=%i vs %i+%i+%i=%i" \ |
241 % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither) | 237 % (count, pairs, singles, pairs + singles, forward, reverse, neither, forward + reverse + neither) |