Mercurial > repos > peterjc > sample_seqs
diff tools/sample_seqs/sample_seqs.py @ 0:3a807e5ea6c8 draft
Uploaded v0.0.1
author | peterjc |
---|---|
date | Thu, 27 Mar 2014 09:40:53 -0400 |
parents | |
children | da64f6a9e32b |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/sample_seqs/sample_seqs.py Thu Mar 27 09:40:53 2014 -0400 @@ -0,0 +1,183 @@ +#!/usr/bin/env python +"""Sub-sample sequence from a FASTA, FASTQ or SFF file. + +This tool is a short Python script which requires Biopython 1.62 or later +for SFF file support. If you use this tool in scientific work leading to a +publication, please cite the Biopython application note: + +Cock et al 2009. Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + +This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute +(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. +See accompanying text file for licence details (MIT license). + +This is version 0.1.0 of the script, use -v or --version to get the version. +""" +import os +import sys + +def stop_err(msg, err=1): + sys.stderr.write(msg.rstrip() + "\n") + sys.exit(err) + +if "-v" in sys.argv or "--version" in sys.argv: + print("v0.1.0") + sys.exit(0) + +#Parse Command Line +if len(sys.argv) < 5: + stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...") +seq_format, in_file, out_file, mode = sys.argv[1:5] +if in_file != "/dev/stdin" and not os.path.isfile(in_file): + stop_err("Missing input file %r" % in_file) + +if mode == "everyNth": + if len(sys.argv) != 6: + stop_err("If using everyNth, just need argument N (integer, at least 2)") + try: + N = int(sys.argv[5]) + except: + stop_err("Bad N argument %r" % sys.argv[5]) + if N < 2: + stop_err("Bad N argument %r" % sys.argv[5]) + if (N % 10) == 1: + sys.stderr.write("Sampling every %ist sequence\n" % N) + elif (N % 10) == 2: + sys.stderr.write("Sampling every %ind sequence\n" % N) + elif (N % 10) == 3: + sys.stderr.write("Sampling every %ird sequence\n" % N) + else: + sys.stderr.write("Sampling every %ith sequence\n" % N) + def sampler(iterator): + global N + count = 0 + for record in iterator: + count += 1 + if count % N == 1: + yield record +elif mode == "percentage": + if len(sys.argv) != 6: + stop_err("If using percentage, just need percentage argument (float, range 0 to 100)") + try: + percent = float(sys.argv[5]) / 100.0 + except: + stop_err("Bad percent argument %r" % sys.argv[5]) + if percent <= 0.0 or 1.0 <= percent: + stop_err("Bad percent argument %r" % sys.argv[5]) + sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) + def sampler(iterator): + global percent + count = 0 + taken = 0 + for record in iterator: + count += 1 + if percent * count > taken: + taken += 1 + yield record +else: + stop_err("Unsupported mode %r" % mode) + +def raw_fasta_iterator(handle): + """Yields raw FASTA records as multi-line strings.""" + while True: + line = handle.readline() + if line == "": + return # Premature end of file, or just empty? + if line[0] == ">": + break + + no_id_warned = False + while True: + if line[0] != ">": + raise ValueError( + "Records in Fasta files should start with '>' character") + try: + id = line[1:].split(None, 1)[0] + except IndexError: + if not no_id_warned: + sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") + no_id_warned = True + id = None + lines = [line] + line = handle.readline() + while True: + if not line: + break + if line[0] == ">": + break + lines.append(line) + line = handle.readline() + yield "".join(lines) + if not line: + return # StopIteration + +def fasta_filter(in_file, out_file, iterator_filter): + count = 0 + #Galaxy now requires Python 2.5+ so can use with statements, + with open(in_file) as in_handle: + with open(out_file, "w") as pos_handle: + for record in iterator_filter(raw_fasta_iterator(in_handle)): + count += 1 + pos_handle.write(record) + return count + +try: + from galaxy_utils.sequence.fastq import fastqReader, fastqWriter + def fastq_filter(in_file, out_file, iterator_filter): + count = 0 + #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter + reader = fastqReader(open(in_file, "rU")) + writer = fastqWriter(open(out_file, "w")) + for record in iterator_filter(reader): + count += 1 + writer.write(record) + writer.close() + reader.close() + return count +except ImportError: + from Bio.SeqIO.QualityIO import FastqGeneralIterator + def fastq_filter(in_file, out_file, iterator_filter): + count = 0 + with open(in_file) as in_handle: + with open(out_file, "w") as pos_handle: + for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)): + count += 1 + pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + return count + +def sff_filter(in_file, out_file, iterator_filter): + count = 0 + try: + from Bio.SeqIO.SffIO import SffIterator, SffWriter + except ImportError: + stop_err("SFF filtering requires Biopython 1.54 or later") + try: + from Bio.SeqIO.SffIO import ReadRocheXmlManifest + except ImportError: + #Prior to Biopython 1.56 this was a private function + from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest + with open(in_file, "rb") as in_handle: + try: + manifest = ReadRocheXmlManifest(in_handle) + except ValueError: + manifest = None + in_handle.seek(0) + with open(out_file, "wb") as out_handle: + writer = SffWriter(out_handle, xml=manifest) + in_handle.seek(0) #start again after getting manifest + count = writer.write_file(iterator_filter(SffIterator(in_handle))) + #count = writer.write_file(SffIterator(in_handle)) + return count + +if seq_format.lower()=="sff": + count = sff_filter(in_file, out_file, sampler) +elif seq_format.lower()=="fasta": + count = fasta_filter(in_file, out_file, sampler) +elif seq_format.lower().startswith("fastq"): + count = fastq_filter(in_file, out_file, sampler) +else: + stop_err("Unsupported file type %r" % seq_format) + +sys.stderr.write("Sampled %i records\n" % count)