comparison tools/fastq_paired_unpaired/fastq_paired_unpaired.py @ 6:f396701fbf32 draft

v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
author peterjc
date Wed, 10 May 2017 13:28:59 -0400
parents 09f9f0e29e47
children 8cbc866b72ce
comparison
equal deleted inserted replaced
5:b38bbcbd458d 6:f396701fbf32
12 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute 12 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute
13 (formerly SCRI), Scotland, UK. All rights reserved. 13 (formerly SCRI), Scotland, UK. All rights reserved.
14 14
15 See accompanying text file for licence details (MIT license). 15 See accompanying text file for licence details (MIT license).
16 """ 16 """
17 import os 17
18 import re
18 import sys 19 import sys
19 import re
20 20
21 if "-v" in sys.argv or "--version" in sys.argv: 21 if "-v" in sys.argv or "--version" in sys.argv:
22 print("Version 0.1.0") 22 print("Version 0.1.3")
23 sys.exit(0) 23 sys.exit(0)
24
25 def sys_exit(msg, err=1):
26 sys.stderr.write(msg.rstrip() + "\n")
27 sys.exit(err)
28 24
29 try: 25 try:
30 from Bio.SeqIO.QualityIO import FastqGeneralIterator 26 from Bio.SeqIO.QualityIO import FastqGeneralIterator
31 except ImportError: 27 except ImportError:
32 sys_exit("Biopython missing") 28 sys.exit("Biopython missing")
33 29
34 msg = """Expect either 3 or 4 arguments, all FASTQ filenames. 30 msg = """Expect either 3 or 4 arguments, all FASTQ filenames.
35 31
36 If you want two output files, use four arguments: 32 If you want two output files, use four arguments:
37 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) 33 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger)
56 convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html 52 convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html
57 for details), and the new Illumina convention where the reads have the 53 for details), and the new Illumina convention where the reads have the
58 same identifier with the fragment at the start of the description, e.g. 54 same identifier with the fragment at the start of the description, e.g.
59 55
60 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA 56 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA
61 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA 57 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA
62 58
63 Note that this does support multiple forward and reverse reads per template 59 Note that this does support multiple forward and reverse reads per template
64 (which is quite common with Sanger sequencing), e.g. this which is sorted 60 (which is quite common with Sanger sequencing), e.g. this which is sorted
65 alphabetically: 61 alphabetically:
66 62
81 WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR 77 WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR
82 WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR 78 WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR
83 """ 79 """
84 80
85 if len(sys.argv) == 5: 81 if len(sys.argv) == 5:
86 format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:] 82 seq_format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:]
87 elif len(sys.argv) == 6: 83 elif len(sys.argv) == 6:
88 pairs_fastq = None 84 pairs_fastq = None
89 format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:] 85 seq_format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:]
90 else: 86 else:
91 sys_exit(msg) 87 sys.exit(msg)
92 88
93 format = format.replace("fastq", "").lower() 89 seq_format = seq_format.replace("fastq", "").lower()
94 if not format: 90 if not seq_format:
95 format="sanger" #safe default 91 seq_format = "sanger" # safe default
96 elif format not in ["sanger","solexa","illumina","cssanger"]: 92 elif seq_format not in ["sanger", "solexa", "illumina", "cssanger"]:
97 sys_exit("Unrecognised format %s" % format) 93 sys.exit("Unrecognised format %s" % seq_format)
98 94
99 #Cope with three widely used suffix naming convensions, 95 # Cope with three widely used suffix naming convensions,
100 #Illumina: /1 or /2 96 # Illumina: /1 or /2
101 #Forward/revered: .f or .r 97 # Forward/revered: .f or .r
102 #Sanger, e.g. .p1k and .q1k 98 # Sanger, e.g. .p1k and .q1k
103 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html 99 # See http://staden.sourceforge.net/manual/pregap4_unix_50.html
104 re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") 100 re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
105 re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") 101 re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
106 102
107 #assert re_f.match("demo/1") 103 # assert re_f.match("demo/1")
108 assert re_f.search("demo.f") 104 assert re_f.search("demo.f")
109 assert re_f.search("demo.s1") 105 assert re_f.search("demo.s1")
110 assert re_f.search("demo.f1k") 106 assert re_f.search("demo.f1k")
111 assert re_f.search("demo.p1") 107 assert re_f.search("demo.p1")
112 assert re_f.search("demo.p1k") 108 assert re_f.search("demo.p1k")
142 singles_handle = open(singles_fastq, "w") 138 singles_handle = open(singles_fastq, "w")
143 last_template, buffered_reads = None, [] 139 last_template, buffered_reads = None, []
144 140
145 for title, seq, qual in FastqGeneralIterator(in_handle): 141 for title, seq, qual in FastqGeneralIterator(in_handle):
146 count += 1 142 count += 1
147 name = title.split(None,1)[0] 143 name = title.split(None, 1)[0]
148 is_forward = False 144 is_forward = False
149 suffix = re_f.search(name) 145 suffix = re_f.search(name)
150 if suffix: 146 if suffix:
151 # ============ 147 # ============
152 # Forward read 148 # Forward read
218 if last_template: 214 if last_template:
219 # Left over singles... 215 # Left over singles...
220 for old in buffered_reads: 216 for old in buffered_reads:
221 singles_handle.write(FASTQ_TEMPLATE % old) 217 singles_handle.write(FASTQ_TEMPLATE % old)
222 singles += 1 218 singles += 1
223 in_handle.close 219 in_handle.close()
224 singles_handle.close() 220 singles_handle.close()
225 if pairs_fastq: 221 if pairs_fastq:
226 pairs_f_handle.close() 222 pairs_f_handle.close()
227 assert pairs_r_handle.closed 223 assert pairs_r_handle.closed
228 else: 224 else:
236 print("%i reads (%i forward, %i reverse), %i in pairs, %i as singles" 232 print("%i reads (%i forward, %i reverse), %i in pairs, %i as singles"
237 % (count, forward, reverse, pairs, singles)) 233 % (count, forward, reverse, pairs, singles))
238 234
239 assert count == pairs + singles == forward + reverse + neither, \ 235 assert count == pairs + singles == forward + reverse + neither, \
240 "%i vs %i+%i=%i vs %i+%i+%i=%i" \ 236 "%i vs %i+%i=%i vs %i+%i+%i=%i" \
241 % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither) 237 % (count, pairs, singles, pairs + singles, forward, reverse, neither, forward + reverse + neither)