changeset 1:7ed81e36fc1c

Uploaded v0.0.5 which handles Illumina 1.8 style pair naming.
author peterjc
date Mon, 12 Dec 2011 11:33:10 -0500
parents 72e9fcaec61f
children 95a632a71951
files tools/fastq/fastq_paired_unpaired.py tools/fastq/fastq_paired_unpaired.txt tools/fastq/fastq_paired_unpaired.xml
diffstat 3 files changed, 55 insertions(+), 20 deletions(-) [+]
line wrap: on
line diff
--- a/tools/fastq/fastq_paired_unpaired.py	Tue Jun 07 17:21:17 2011 -0400
+++ b/tools/fastq/fastq_paired_unpaired.py	Mon Dec 12 11:33:10 2011 -0500
@@ -9,16 +9,20 @@
 Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even
 Color Space should all work equally well).
 
-This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved.
+This script is copyright 2010-2011 by Peter Cock, The James Hutton Institute
+(formerly SCRI), Scotland, UK. All rights reserved.
+
 See accompanying text file for licence details (MIT/BSD style).
-
-This is version 0.0.4 of the script.
 """
 import os
 import sys
 import re
 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
 
+if "--version" in sys.argv[1:]:
+    print "Version 0.0.5"
+    sys.exit(0)
+
 def stop_err(msg, err=1):
    sys.stderr.write(msg.rstrip() + "\n")
    sys.exit(err)
@@ -44,8 +48,13 @@
 
 Any reads where the forward/reverse naming suffix used is not recognised
 are treated as orphan reads. The tool supports the /1 and /2 convention
-used by Illumina, the .f and .r convention, and the Sanger convention
-(see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details).
+originally used by Illumina, the .f and .r convention, the Sanger
+convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html
+for details), and the new Illumina convention where the reads have the
+same identifier with the fragment at the start of the description, e.g.
+
+@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA
+@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA 
 
 Note that this does support multiple forward and reverse reads per template
 (which is quite common with Sanger sequencing), e.g. this which is sorted
@@ -113,6 +122,14 @@
 assert not re_f.search("demo.r")
 assert not re_f.search("demo.q")
 
+re_illumina_f = re.compile(r"^@[a-zA-Z0-9_:-]+ 1:.*$")
+re_illumina_r = re.compile(r"^@[a-zA-Z0-9_:-]+ 2:.*$")
+assert re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA")
+assert re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA")
+assert not re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA")
+assert not re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA")
+
+
 count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0
 in_handle = open(input_fastq)
 if pairs_fastq:
@@ -128,12 +145,18 @@
     count += 1
     name = record.identifier.split(None,1)[0]
     assert name[0]=="@", record.identifier #Quirk of the Galaxy parser
+    is_forward = False
     suffix = re_f.search(name)
     if suffix:
         #============
         #Forward read
         #============
         template = name[:suffix.start()]
+        is_forward = True
+    elif re_illumina_f.match(record.identifier):
+        template = name #No suffix
+        is_forward = True
+    if is_forward:
         #print name, "forward", template
         forward += 1
         if last_template == template:
@@ -145,14 +168,20 @@
                 singles += 1
             #Save this read in buffer
             buffered_reads = [record]
-            last_template = template  
+            last_template = template
     else:
+        is_reverse = False
         suffix = re_r.search(name)
         if suffix:
             #============
             #Reverse read
             #============
             template = name[:suffix.start()]
+            is_reverse = True
+        elif re_illumina_r.match(record.identifier):
+            template = name #No suffix
+            is_reverse = True
+        if is_reverse:
             #print name, "reverse", template
             reverse += 1
             if last_template == template and buffered_reads:
@@ -208,5 +237,5 @@
            % (count, forward, reverse, pairs, singles)
 
 assert count == pairs + singles == forward + reverse + neither, \
-       "%i vs %i+%i=%i vs %i+%i=%i" \
+       "%i vs %i+%i=%i vs %i+%i+%i=%i" \
        % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither)
--- a/tools/fastq/fastq_paired_unpaired.txt	Tue Jun 07 17:21:17 2011 -0400
+++ b/tools/fastq/fastq_paired_unpaired.txt	Mon Dec 12 11:33:10 2011 -0500
@@ -36,6 +36,7 @@
 v0.0.2 - Help text; cope with multiple pairs per template
 v0.0.3 - Galaxy XML wrappers added
 v0.0.4 - Use Galaxy library to handle FASTQ files (avoid Biopython dependency)
+v0.0.5 - Handle Illumina 1.8 style pair names
 
 
 Developers
--- a/tools/fastq/fastq_paired_unpaired.xml	Tue Jun 07 17:21:17 2011 -0400
+++ b/tools/fastq/fastq_paired_unpaired.xml	Mon Dec 12 11:33:10 2011 -0500
@@ -1,4 +1,4 @@
-<tool id="fastq_paired_unpaired" name="Divide FASTQ file into paired and unpaired reads" version="0.0.4">
+<tool id="fastq_paired_unpaired" name="Divide FASTQ file into paired and unpaired reads" version="0.0.5">
 	<description>using the read name suffixes</description>
 	<command interpreter="python">
 fastq_paired_unpaired.py $input_fastq.extension $input_fastq
@@ -52,29 +52,34 @@
 
 Any reads where the forward/reverse naming suffix used is not recognised
 are treated as orphan reads. The tool supports the /1 and /2 convention
-used by Illumina, the .f and .r convention, and the Sanger convention
-(see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details).
+originally used by Illumina, the .f and .r convention, the Sanger convention
+(see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details),
+and the current Illumina convention where the reads get the same identifier
+with the fragment number in the description, for example:
+
+ * @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA
+ * @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA 
 
 Note that this does support multiple forward and reverse reads per template
 (which is quite common with Sanger sequencing), e.g. this which is sorted
 alphabetically:
 
-WTSI_1055_4p17.p1kapIBF
-WTSI_1055_4p17.p1kpIBF
-WTSI_1055_4p17.q1kapIBR
-WTSI_1055_4p17.q1kpIBR
+ * WTSI_1055_4p17.p1kapIBF
+ * WTSI_1055_4p17.p1kpIBF
+ * WTSI_1055_4p17.q1kapIBR
+ * WTSI_1055_4p17.q1kpIBR
 
 or this where the reads already come in pairs:
 
-WTSI_1055_4p17.p1kapIBF
-WTSI_1055_4p17.q1kapIBR
-WTSI_1055_4p17.p1kpIBF
-WTSI_1055_4p17.q1kpIBR
+ * WTSI_1055_4p17.p1kapIBF
+ * WTSI_1055_4p17.q1kapIBR
+ * WTSI_1055_4p17.p1kpIBF
+ * WTSI_1055_4p17.q1kpIBR
 
 both become:
 
-WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR
-WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR
+ * WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR
+ * WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR
 
 	</help>
 </tool>