Mercurial > repos > peterjc > sample_seqs
comparison tools/sample_seqs/sample_seqs.py @ 0:3a807e5ea6c8 draft
Uploaded v0.0.1
| author | peterjc |
|---|---|
| date | Thu, 27 Mar 2014 09:40:53 -0400 |
| parents | |
| children | da64f6a9e32b |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:3a807e5ea6c8 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """Sub-sample sequence from a FASTA, FASTQ or SFF file. | |
| 3 | |
| 4 This tool is a short Python script which requires Biopython 1.62 or later | |
| 5 for SFF file support. If you use this tool in scientific work leading to a | |
| 6 publication, please cite the Biopython application note: | |
| 7 | |
| 8 Cock et al 2009. Biopython: freely available Python tools for computational | |
| 9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
| 10 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
| 11 | |
| 12 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute | |
| 13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. | |
| 14 See accompanying text file for licence details (MIT license). | |
| 15 | |
| 16 This is version 0.1.0 of the script, use -v or --version to get the version. | |
| 17 """ | |
| 18 import os | |
| 19 import sys | |
| 20 | |
| 21 def stop_err(msg, err=1): | |
| 22 sys.stderr.write(msg.rstrip() + "\n") | |
| 23 sys.exit(err) | |
| 24 | |
| 25 if "-v" in sys.argv or "--version" in sys.argv: | |
| 26 print("v0.1.0") | |
| 27 sys.exit(0) | |
| 28 | |
| 29 #Parse Command Line | |
| 30 if len(sys.argv) < 5: | |
| 31 stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...") | |
| 32 seq_format, in_file, out_file, mode = sys.argv[1:5] | |
| 33 if in_file != "/dev/stdin" and not os.path.isfile(in_file): | |
| 34 stop_err("Missing input file %r" % in_file) | |
| 35 | |
| 36 if mode == "everyNth": | |
| 37 if len(sys.argv) != 6: | |
| 38 stop_err("If using everyNth, just need argument N (integer, at least 2)") | |
| 39 try: | |
| 40 N = int(sys.argv[5]) | |
| 41 except: | |
| 42 stop_err("Bad N argument %r" % sys.argv[5]) | |
| 43 if N < 2: | |
| 44 stop_err("Bad N argument %r" % sys.argv[5]) | |
| 45 if (N % 10) == 1: | |
| 46 sys.stderr.write("Sampling every %ist sequence\n" % N) | |
| 47 elif (N % 10) == 2: | |
| 48 sys.stderr.write("Sampling every %ind sequence\n" % N) | |
| 49 elif (N % 10) == 3: | |
| 50 sys.stderr.write("Sampling every %ird sequence\n" % N) | |
| 51 else: | |
| 52 sys.stderr.write("Sampling every %ith sequence\n" % N) | |
| 53 def sampler(iterator): | |
| 54 global N | |
| 55 count = 0 | |
| 56 for record in iterator: | |
| 57 count += 1 | |
| 58 if count % N == 1: | |
| 59 yield record | |
| 60 elif mode == "percentage": | |
| 61 if len(sys.argv) != 6: | |
| 62 stop_err("If using percentage, just need percentage argument (float, range 0 to 100)") | |
| 63 try: | |
| 64 percent = float(sys.argv[5]) / 100.0 | |
| 65 except: | |
| 66 stop_err("Bad percent argument %r" % sys.argv[5]) | |
| 67 if percent <= 0.0 or 1.0 <= percent: | |
| 68 stop_err("Bad percent argument %r" % sys.argv[5]) | |
| 69 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) | |
| 70 def sampler(iterator): | |
| 71 global percent | |
| 72 count = 0 | |
| 73 taken = 0 | |
| 74 for record in iterator: | |
| 75 count += 1 | |
| 76 if percent * count > taken: | |
| 77 taken += 1 | |
| 78 yield record | |
| 79 else: | |
| 80 stop_err("Unsupported mode %r" % mode) | |
| 81 | |
| 82 def raw_fasta_iterator(handle): | |
| 83 """Yields raw FASTA records as multi-line strings.""" | |
| 84 while True: | |
| 85 line = handle.readline() | |
| 86 if line == "": | |
| 87 return # Premature end of file, or just empty? | |
| 88 if line[0] == ">": | |
| 89 break | |
| 90 | |
| 91 no_id_warned = False | |
| 92 while True: | |
| 93 if line[0] != ">": | |
| 94 raise ValueError( | |
| 95 "Records in Fasta files should start with '>' character") | |
| 96 try: | |
| 97 id = line[1:].split(None, 1)[0] | |
| 98 except IndexError: | |
| 99 if not no_id_warned: | |
| 100 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") | |
| 101 no_id_warned = True | |
| 102 id = None | |
| 103 lines = [line] | |
| 104 line = handle.readline() | |
| 105 while True: | |
| 106 if not line: | |
| 107 break | |
| 108 if line[0] == ">": | |
| 109 break | |
| 110 lines.append(line) | |
| 111 line = handle.readline() | |
| 112 yield "".join(lines) | |
| 113 if not line: | |
| 114 return # StopIteration | |
| 115 | |
| 116 def fasta_filter(in_file, out_file, iterator_filter): | |
| 117 count = 0 | |
| 118 #Galaxy now requires Python 2.5+ so can use with statements, | |
| 119 with open(in_file) as in_handle: | |
| 120 with open(out_file, "w") as pos_handle: | |
| 121 for record in iterator_filter(raw_fasta_iterator(in_handle)): | |
| 122 count += 1 | |
| 123 pos_handle.write(record) | |
| 124 return count | |
| 125 | |
| 126 try: | |
| 127 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | |
| 128 def fastq_filter(in_file, out_file, iterator_filter): | |
| 129 count = 0 | |
| 130 #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | |
| 131 reader = fastqReader(open(in_file, "rU")) | |
| 132 writer = fastqWriter(open(out_file, "w")) | |
| 133 for record in iterator_filter(reader): | |
| 134 count += 1 | |
| 135 writer.write(record) | |
| 136 writer.close() | |
| 137 reader.close() | |
| 138 return count | |
| 139 except ImportError: | |
| 140 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
| 141 def fastq_filter(in_file, out_file, iterator_filter): | |
| 142 count = 0 | |
| 143 with open(in_file) as in_handle: | |
| 144 with open(out_file, "w") as pos_handle: | |
| 145 for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)): | |
| 146 count += 1 | |
| 147 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | |
| 148 return count | |
| 149 | |
| 150 def sff_filter(in_file, out_file, iterator_filter): | |
| 151 count = 0 | |
| 152 try: | |
| 153 from Bio.SeqIO.SffIO import SffIterator, SffWriter | |
| 154 except ImportError: | |
| 155 stop_err("SFF filtering requires Biopython 1.54 or later") | |
| 156 try: | |
| 157 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | |
| 158 except ImportError: | |
| 159 #Prior to Biopython 1.56 this was a private function | |
| 160 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | |
| 161 with open(in_file, "rb") as in_handle: | |
| 162 try: | |
| 163 manifest = ReadRocheXmlManifest(in_handle) | |
| 164 except ValueError: | |
| 165 manifest = None | |
| 166 in_handle.seek(0) | |
| 167 with open(out_file, "wb") as out_handle: | |
| 168 writer = SffWriter(out_handle, xml=manifest) | |
| 169 in_handle.seek(0) #start again after getting manifest | |
| 170 count = writer.write_file(iterator_filter(SffIterator(in_handle))) | |
| 171 #count = writer.write_file(SffIterator(in_handle)) | |
| 172 return count | |
| 173 | |
| 174 if seq_format.lower()=="sff": | |
| 175 count = sff_filter(in_file, out_file, sampler) | |
| 176 elif seq_format.lower()=="fasta": | |
| 177 count = fasta_filter(in_file, out_file, sampler) | |
| 178 elif seq_format.lower().startswith("fastq"): | |
| 179 count = fastq_filter(in_file, out_file, sampler) | |
| 180 else: | |
| 181 stop_err("Unsupported file type %r" % seq_format) | |
| 182 | |
| 183 sys.stderr.write("Sampled %i records\n" % count) |
