Mercurial > repos > peterjc > fastq_paired_unpaired
diff tools/fastq/fastq_paired_unpaired.py @ 0:72e9fcaec61f
Migrated tool version 0.0.4 from old tool shed archive to new tool shed repository
author | peterjc |
---|---|
date | Tue, 07 Jun 2011 17:21:17 -0400 |
parents | |
children | 7ed81e36fc1c |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/fastq/fastq_paired_unpaired.py Tue Jun 07 17:21:17 2011 -0400 @@ -0,0 +1,212 @@ +#!/usr/bin/env python +"""Divides a FASTQ into paired and single (orphan reads) as separate files. + +The input file should be a valid FASTQ file which has been sorted so that +any partner forward+reverse reads are consecutive. The output files all +preserve this sort order. Pairing are recognised based on standard name +suffices. See below or run the tool with no arguments for more details. + +Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even +Color Space should all work equally well). + +This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved. +See accompanying text file for licence details (MIT/BSD style). + +This is version 0.0.4 of the script. +""" +import os +import sys +import re +from galaxy_utils.sequence.fastq import fastqReader, fastqWriter + +def stop_err(msg, err=1): + sys.stderr.write(msg.rstrip() + "\n") + sys.exit(err) + +msg = """Expect either 3 or 4 arguments, all FASTQ filenames. + +If you want two output files, use four arguments: + - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) + - Sorted input FASTQ filename, + - Output paired FASTQ filename (forward then reverse interleaved), + - Output singles FASTQ filename (orphan reads) + +If you want three output files, use five arguments: + - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) + - Sorted input FASTQ filename, + - Output forward paired FASTQ filename, + - Output reverse paired FASTQ filename, + - Output singles FASTQ filename (orphan reads) + +The input file should be a valid FASTQ file which has been sorted so that +any partner forward+reverse reads are consecutive. The output files all +preserve this sort order. + +Any reads where the forward/reverse naming suffix used is not recognised +are treated as orphan reads. The tool supports the /1 and /2 convention +used by Illumina, the .f and .r convention, and the Sanger convention +(see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details). + +Note that this does support multiple forward and reverse reads per template +(which is quite common with Sanger sequencing), e.g. this which is sorted +alphabetically: + +WTSI_1055_4p17.p1kapIBF +WTSI_1055_4p17.p1kpIBF +WTSI_1055_4p17.q1kapIBR +WTSI_1055_4p17.q1kpIBR + +or this where the reads already come in pairs: + +WTSI_1055_4p17.p1kapIBF +WTSI_1055_4p17.q1kapIBR +WTSI_1055_4p17.p1kpIBF +WTSI_1055_4p17.q1kpIBR + +both become: + +WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR +WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR +""" + +if len(sys.argv) == 5: + format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:] +elif len(sys.argv) == 6: + pairs_fastq = None + format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:] +else: + stop_err(msg) + +format = format.replace("fastq", "").lower() +if not format: + format="sanger" #safe default +elif format not in ["sanger","solexa","illumina","cssanger"]: + stop_err("Unrecognised format %s" % format) + +def f_match(name): + if name.endswith("/1") or name.endswith(".f"): + return True + +#Cope with three widely used suffix naming convensions, +#Illumina: /1 or /2 +#Forward/revered: .f or .r +#Sanger, e.g. .p1k and .q1k +#See http://staden.sourceforge.net/manual/pregap4_unix_50.html +re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") +re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") + +#assert re_f.match("demo/1") +assert re_f.search("demo.f") +assert re_f.search("demo.s1") +assert re_f.search("demo.f1k") +assert re_f.search("demo.p1") +assert re_f.search("demo.p1k") +assert re_f.search("demo.p1lk") +assert re_r.search("demo/2") +assert re_r.search("demo.r") +assert re_r.search("demo.q1") +assert re_r.search("demo.q1lk") +assert not re_r.search("demo/1") +assert not re_r.search("demo.f") +assert not re_r.search("demo.p") +assert not re_f.search("demo/2") +assert not re_f.search("demo.r") +assert not re_f.search("demo.q") + +count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 +in_handle = open(input_fastq) +if pairs_fastq: + pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format) + pairs_r_writer = pairs_f_writer +else: + pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format) + pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format) +singles_writer = fastqWriter(open(singles_fastq, "w"), format) +last_template, buffered_reads = None, [] + +for record in fastqReader(in_handle, format): + count += 1 + name = record.identifier.split(None,1)[0] + assert name[0]=="@", record.identifier #Quirk of the Galaxy parser + suffix = re_f.search(name) + if suffix: + #============ + #Forward read + #============ + template = name[:suffix.start()] + #print name, "forward", template + forward += 1 + if last_template == template: + buffered_reads.append(record) + else: + #Any old buffered reads are orphans + for old in buffered_reads: + singles_writer.write(old) + singles += 1 + #Save this read in buffer + buffered_reads = [record] + last_template = template + else: + suffix = re_r.search(name) + if suffix: + #============ + #Reverse read + #============ + template = name[:suffix.start()] + #print name, "reverse", template + reverse += 1 + if last_template == template and buffered_reads: + #We have a pair! + #If there are multiple buffered forward reads, want to pick + #the first one (although we could try and do something more + #clever looking at the suffix to match them up...) + old = buffered_reads.pop(0) + pairs_f_writer.write(old) + pairs_r_writer.write(record) + pairs += 2 + else: + #As this is a reverse read, this and any buffered read(s) are + #all orphans + for old in buffered_reads: + singles_writer.write(old) + singles += 1 + buffered_reads = [] + singles_writer.write(record) + singles += 1 + last_template = None + else: + #=========================== + #Neither forward nor reverse + #=========================== + singles_writer.write(record) + singles += 1 + neither += 1 + for old in buffered_reads: + singles_writer.write(old) + singles += 1 + buffered_reads = [] + last_template = None +if last_template: + #Left over singles... + for old in buffered_reads: + singles_writer.write(old) + singles += 1 +in_handle.close +singles_writer.close() +if pairs_fastq: + pairs_f_writer.close() + assert pairs_r_writer.file.closed +else: + pairs_f_writer.close() + pairs_r_writer.close() + +if neither: + print "%i reads (%i forward, %i reverse, %i neither), %i in pairs, %i as singles" \ + % (count, forward, reverse, neither, pairs, singles) +else: + print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \ + % (count, forward, reverse, pairs, singles) + +assert count == pairs + singles == forward + reverse + neither, \ + "%i vs %i+%i=%i vs %i+%i=%i" \ + % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither)