Mercurial > repos > peterjc > fastq_paired_unpaired
changeset 1:7ed81e36fc1c
Uploaded v0.0.5, which handles Illumina 1.8 style pair naming.
author | peterjc |
---|---|
date | Mon, 12 Dec 2011 11:33:10 -0500 |
parents | 72e9fcaec61f |
children | 95a632a71951 |
files | tools/fastq/fastq_paired_unpaired.py tools/fastq/fastq_paired_unpaired.txt tools/fastq/fastq_paired_unpaired.xml |
diffstat | 3 files changed, 55 insertions(+), 20 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/fastq/fastq_paired_unpaired.py Tue Jun 07 17:21:17 2011 -0400 +++ b/tools/fastq/fastq_paired_unpaired.py Mon Dec 12 11:33:10 2011 -0500 @@ -9,16 +9,20 @@ Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even Color Space should all work equally well). -This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved. +This script is copyright 2010-2011 by Peter Cock, The James Hutton Institute +(formerly SCRI), Scotland, UK. All rights reserved. + See accompanying text file for licence details (MIT/BSD style). - -This is version 0.0.4 of the script. """ import os import sys import re from galaxy_utils.sequence.fastq import fastqReader, fastqWriter +if "--version" in sys.argv[1:]: + print "Version 0.0.5" + sys.exit(0) + def stop_err(msg, err=1): sys.stderr.write(msg.rstrip() + "\n") sys.exit(err) @@ -44,8 +48,13 @@ Any reads where the forward/reverse naming suffix used is not recognised are treated as orphan reads. The tool supports the /1 and /2 convention -used by Illumina, the .f and .r convention, and the Sanger convention -(see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details). +originally used by Illumina, the .f and .r convention, and the Sanger +convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html +for details), and the new Illumina convention where the reads have the +same identifier with the fragment at the start of the description, e.g. + +@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA +@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA Note that this does support multiple forward and reverse reads per template (which is quite common with Sanger sequencing), e.g. 
this which is sorted @@ -113,6 +122,14 @@ assert not re_f.search("demo.r") assert not re_f.search("demo.q") +re_illumina_f = re.compile(r"^@[a-zA-Z0-9_:-]+ 1:.*$") +re_illumina_r = re.compile(r"^@[a-zA-Z0-9_:-]+ 2:.*$") +assert re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") +assert re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") +assert not re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") +assert not re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") + + count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 in_handle = open(input_fastq) if pairs_fastq: @@ -128,12 +145,18 @@ count += 1 name = record.identifier.split(None,1)[0] assert name[0]=="@", record.identifier #Quirk of the Galaxy parser + is_forward = False suffix = re_f.search(name) if suffix: #============ #Forward read #============ template = name[:suffix.start()] + is_forward = True + elif re_illumina_f.match(record.identifier): + template = name #No suffix + is_forward = True + if is_forward: #print name, "forward", template forward += 1 if last_template == template: @@ -145,14 +168,20 @@ singles += 1 #Save this read in buffer buffered_reads = [record] - last_template = template + last_template = template else: + is_reverse = False suffix = re_r.search(name) if suffix: #============ #Reverse read #============ template = name[:suffix.start()] + is_reverse = True + elif re_illumina_r.match(record.identifier): + template = name #No suffix + is_reverse = True + if is_reverse: #print name, "reverse", template reverse += 1 if last_template == template and buffered_reads: @@ -208,5 +237,5 @@ % (count, forward, reverse, pairs, singles) assert count == pairs + singles == forward + reverse + neither, \ - "%i vs %i+%i=%i vs %i+%i=%i" \ + "%i vs %i+%i=%i vs %i+%i+%i=%i" \ % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither)
--- a/tools/fastq/fastq_paired_unpaired.txt Tue Jun 07 17:21:17 2011 -0400 +++ b/tools/fastq/fastq_paired_unpaired.txt Mon Dec 12 11:33:10 2011 -0500 @@ -36,6 +36,7 @@ v0.0.2 - Help text; cope with multiple pairs per template v0.0.3 - Galaxy XML wrappers added v0.0.4 - Use Galaxy library to handle FASTQ files (avoid Biopython dependency) +v0.0.5 - Handle Illumina 1.8 style pair names Developers
--- a/tools/fastq/fastq_paired_unpaired.xml Tue Jun 07 17:21:17 2011 -0400 +++ b/tools/fastq/fastq_paired_unpaired.xml Mon Dec 12 11:33:10 2011 -0500 @@ -1,4 +1,4 @@ -<tool id="fastq_paired_unpaired" name="Divide FASTQ file into paired and unpaired reads" version="0.0.4"> +<tool id="fastq_paired_unpaired" name="Divide FASTQ file into paired and unpaired reads" version="0.0.5"> <description>using the read name suffices</description> <command interpreter="python"> fastq_paired_unpaired.py $input_fastq.extension $input_fastq @@ -52,29 +52,34 @@ Any reads where the forward/reverse naming suffix used is not recognised are treated as orphan reads. The tool supports the /1 and /2 convention -used by Illumina, the .f and .r convention, and the Sanger convention -(see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details). +originally used by Illumina, .f and .r convention, the Sanger convention +(see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details), +and the current Illumina convention where the reads get the same identifier +with the fragment number in the description, for example: + + * @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA + * @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA Note that this does support multiple forward and reverse reads per template (which is quite common with Sanger sequencing), e.g. 
this which is sorted alphabetically: -WTSI_1055_4p17.p1kapIBF -WTSI_1055_4p17.p1kpIBF -WTSI_1055_4p17.q1kapIBR -WTSI_1055_4p17.q1kpIBR + * WTSI_1055_4p17.p1kapIBF + * WTSI_1055_4p17.p1kpIBF + * WTSI_1055_4p17.q1kapIBR + * WTSI_1055_4p17.q1kpIBR or this where the reads already come in pairs: -WTSI_1055_4p17.p1kapIBF -WTSI_1055_4p17.q1kapIBR -WTSI_1055_4p17.p1kpIBF -WTSI_1055_4p17.q1kpIBR + * WTSI_1055_4p17.p1kapIBF + * WTSI_1055_4p17.q1kapIBR + * WTSI_1055_4p17.p1kpIBF + * WTSI_1055_4p17.q1kpIBR both become: -WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR -WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR + * WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR + * WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR </help> </tool>