Mercurial > repos > peterjc > fastq_paired_unpaired
diff tools/fastq_paired_unpaired/fastq_paired_unpaired.py @ 6:f396701fbf32 draft
v0.1.3 Depends on Biopython 1.67 via Tool Shed package or bioconda.
author | peterjc |
---|---|
date | Wed, 10 May 2017 13:28:59 -0400 |
parents | 09f9f0e29e47 |
children | 8cbc866b72ce |
line wrap: on
line diff
--- a/tools/fastq_paired_unpaired/fastq_paired_unpaired.py Wed Aug 05 11:17:49 2015 -0400 +++ b/tools/fastq_paired_unpaired/fastq_paired_unpaired.py Wed May 10 13:28:59 2017 -0400 @@ -14,22 +14,18 @@ See accompanying text file for licence details (MIT license). """ -import os + +import re import sys -import re if "-v" in sys.argv or "--version" in sys.argv: - print("Version 0.1.0") + print("Version 0.1.3") sys.exit(0) -def sys_exit(msg, err=1): - sys.stderr.write(msg.rstrip() + "\n") - sys.exit(err) - try: from Bio.SeqIO.QualityIO import FastqGeneralIterator except ImportError: - sys_exit("Biopython missing") + sys.exit("Biopython missing") msg = """Expect either 3 or 4 arguments, all FASTQ filenames. @@ -58,7 +54,7 @@ same identifier with the fragment at the start of the description, e.g. @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA -@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA +@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA Note that this does support multiple forward and reverse reads per template (which is quite common with Sanger sequencing), e.g. this which is sorted @@ -83,28 +79,28 @@ """ if len(sys.argv) == 5: - format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:] + seq_format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:] elif len(sys.argv) == 6: pairs_fastq = None - format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:] + seq_format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:] else: - sys_exit(msg) + sys.exit(msg) -format = format.replace("fastq", "").lower() -if not format: - format="sanger" #safe default -elif format not in ["sanger","solexa","illumina","cssanger"]: - sys_exit("Unrecognised format %s" % format) +seq_format = seq_format.replace("fastq", "").lower() +if not seq_format: + seq_format = "sanger" # safe default +elif seq_format not in ["sanger", "solexa", "illumina", "cssanger"]: + sys.exit("Unrecognised format %s" % seq_format) -#Cope with three widely used suffix naming convensions, -#Illumina: /1 or /2 -#Forward/revered: .f or .r -#Sanger, e.g. .p1k and .q1k -#See http://staden.sourceforge.net/manual/pregap4_unix_50.html +# Cope with three widely used suffix naming convensions, +# Illumina: /1 or /2 +# Forward/revered: .f or .r +# Sanger, e.g. .p1k and .q1k +# See http://staden.sourceforge.net/manual/pregap4_unix_50.html re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") -#assert re_f.match("demo/1") +# assert re_f.match("demo/1") assert re_f.search("demo.f") assert re_f.search("demo.s1") assert re_f.search("demo.f1k") @@ -144,7 +140,7 @@ for title, seq, qual in FastqGeneralIterator(in_handle): count += 1 - name = title.split(None,1)[0] + name = title.split(None, 1)[0] is_forward = False suffix = re_f.search(name) if suffix: @@ -220,7 +216,7 @@ for old in buffered_reads: singles_handle.write(FASTQ_TEMPLATE % old) singles += 1 -in_handle.close +in_handle.close() singles_handle.close() if pairs_fastq: pairs_f_handle.close() @@ -238,4 +234,4 @@ assert count == pairs + singles == forward + reverse + neither, \ "%i vs %i+%i=%i vs %i+%i+%i=%i" \ - % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither) + % (count, pairs, singles, pairs + singles, forward, reverse, neither, forward + reverse + neither)