annotate tools/sample_seqs/sample_seqs.py @ 3:02c13ef1a669 draft

Uploaded v0.2.1, fixed missing test file, more tests.
author peterjc
date Fri, 27 Mar 2015 09:34:27 -0400
parents da64f6a9e32b
children 6b71ad5d43fb
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
1 #!/usr/bin/env python
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
2 """Sub-sample sequence from a FASTA, FASTQ or SFF file.
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
3
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
4 This tool is a short Python script which requires Biopython 1.62 or later
2
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
5 for sequence parsing. If you use this tool in scientific work leading to a
0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
6 publication, please cite the Biopython application note:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
7
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
8 Cock et al 2009. Biopython: freely available Python tools for computational
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
10 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
11
2
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
12 This script is copyright 2014-2015 by Peter Cock, The James Hutton Institute
0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
14 See accompanying text file for licence details (MIT license).
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
15
2
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
16 Use -v or --version to get the version, -h or --help for help.
0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
17 """
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
18 import os
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
19 import sys
2
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
20 from optparse import OptionParser
0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
21
2
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
22
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
def sys_exit(msg, err=1):
    """Report a fatal error on stderr and terminate the script.

    Trailing whitespace is stripped from ``msg`` and a single newline is
    appended before the text is written to standard error. The interpreter
    is then stopped via ``sys.exit`` using ``err`` as the exit status
    (1 unless the caller says otherwise).
    """
    message = "%s\n" % msg.rstrip()
    sys.stderr.write(message)
    sys.exit(err)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
26
2
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
# Command line handling: build the option parser, read the arguments and
# check the mandatory ones before any sequence data is touched.
usage = """Use as follows:

$ python sample_seqs.py [options]

e.g. Sample 20% of the reads:

$ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq

This samples uniformly though the file, rather than at random, and therefore
should be reproducible.
"""
parser = OptionParser(usage=usage)

# Input / output files and the sequence format.
parser.add_option(
    "-i", "--input",
    dest="input", default=None, metavar="FILE",
    help="Input sequences filename")
parser.add_option(
    "-f", "--format",
    dest="format", default=None,
    help="Input sequence format (e.g. fasta, fastq, sff)")
parser.add_option(
    "-o", "--output",
    dest="output", default=None, metavar="FILE",
    help="Output sampled sequenced filename")

# The three sampling modes; their values stay as strings here and are
# converted (and checked for conflicts) later in the script.
parser.add_option(
    "-p", "--percent",
    dest="percent", default=None,
    help="Take this percent of the reads")
parser.add_option(
    "-n", "--everyn",
    dest="everyn", default=None,
    help="Take every N-th read")
parser.add_option(
    "-c", "--count",
    dest="count", default=None,
    help="Take exactly N reads")

# Boolean switches.
parser.add_option(
    "--interleaved",
    dest="interleaved", action="store_true", default=False,
    help="Input is interleaved reads, preserve the pairings")
parser.add_option(
    "-v", "--version",
    dest="version", action="store_true", default=False,
    help="Show version and quit")

options, args = parser.parse_args()

# -v / --version short-circuits everything else.
if options.version:
    print("v0.2.1")
    sys.exit(0)

in_file, out_file, interleaved = (
    options.input, options.output, options.interleaved)

# Mandatory arguments, checked in a fixed order so the first problem found
# is the one reported.
if not in_file:
    sys_exit("Require an input filename")
# /dev/stdin is accepted without an on-disk check.
if not (in_file == "/dev/stdin" or os.path.isfile(in_file)):
    sys_exit("Missing input file %r" % in_file)
if not out_file:
    sys_exit("Require an output filename")
if not options.format:
    sys_exit("Require the sequence format")
# Format names are compared in lower case from here on.
seq_format = options.format.lower()
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
83
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
84
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
def count_fasta(filename):
    """Return how many records the FASTA file ``filename`` contains.

    The file is opened in text mode and walked once with Biopython's
    ``SimpleFastaParser``; nothing is kept in memory beyond the running
    tally.
    """
    from Bio.SeqIO.FastaIO import SimpleFastaParser
    with open(filename) as handle:
        return sum(1 for _ in SimpleFastaParser(handle))
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
92
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
93
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
def count_fastq(filename):
    """Return how many records the FASTQ file ``filename`` contains.

    The file is opened in text mode and walked once with Biopython's
    ``FastqGeneralIterator``; only the running tally is retained.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    with open(filename) as handle:
        return sum(1 for _ in FastqGeneralIterator(handle))
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
101
0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
102
2
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
def count_sff(filename):
    """Return how many reads the SFF file ``filename`` contains.

    The count is taken as the length of the object returned by
    ``Bio.SeqIO.index``. SFF files normally carry a built-in index, which
    is parsed in preference to scanning the whole file, so this is quick.
    """
    from Bio import SeqIO
    sff_index = SeqIO.index(filename, "sff")
    return len(sff_index)
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
109
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
110
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
def count_sequences(filename, format):
    """Return the number of sequences in ``filename``.

    Dispatches on the ``format`` argument to the matching counter:
    ``"sff"`` -> ``count_sff``, ``"fasta"`` -> ``count_fasta`` and any name
    starting with ``"fastq"`` -> ``count_fastq``. Any other value is
    reported through ``sys_exit``, which terminates the script.

    The decision is based on the ``format`` parameter itself rather than on
    a module-level variable, so the function gives the right answer for
    whatever format the caller passes in.
    """
    if format == "sff":
        return count_sff(filename)
    elif format == "fasta":
        return count_fasta(filename)
    elif format.startswith("fastq"):
        return count_fastq(filename)
    else:
        sys_exit("Unsupported file type %r" % format)
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
120
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
121
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
# Select the sampling mode. Exactly one of -n (every N-th), -p (percentage)
# or -c (exact count) must be given. Each branch validates its argument,
# reports what it is about to do on stderr, and defines ``sampler``: a
# generator function which filters an iterator of records (or record pairs).
if options.percent and options.everyn:
    sys_exit("Cannot combine -p and -n options")
elif options.everyn and options.count:
    # The message names the two flags that were actually combined.
    sys_exit("Cannot combine -n and -c options")
elif options.percent and options.count:
    sys_exit("Cannot combine -p and -c options")
elif options.everyn:
    try:
        N = int(options.everyn)
    except ValueError:
        sys_exit("Bad -n argument %r" % options.everyn)
    if N < 2:
        sys_exit("Bad -n argument %r" % options.everyn)
    # English ordinals: numbers ending 11, 12 or 13 take "th" (11th, 112th),
    # otherwise the last digit decides between st / nd / rd / th.
    if 11 <= (N % 100) <= 13:
        sys.stderr.write("Sampling every %ith sequence\n" % N)
    elif (N % 10) == 1:
        sys.stderr.write("Sampling every %ist sequence\n" % N)
    elif (N % 10) == 2:
        sys.stderr.write("Sampling every %ind sequence\n" % N)
    elif (N % 10) == 3:
        sys.stderr.write("Sampling every %ird sequence\n" % N)
    else:
        sys.stderr.write("Sampling every %ith sequence\n" % N)

    def sampler(iterator):
        """Yield the 1st, (N+1)-th, (2N+1)-th, ... record of ``iterator``."""
        count = 0
        for record in iterator:
            count += 1
            if count % N == 1:
                yield record
elif options.percent:
    try:
        percent = float(options.percent) / 100.0
    except ValueError:
        sys_exit("Bad -p percent argument %r" % options.percent)
    # Must be strictly between 0% and 100%; written as a chained comparison
    # so that NaN (for which every comparison is false) is rejected too.
    if not 0.0 < percent < 1.0:
        sys_exit("Bad -p percent argument %r" % options.percent)
    sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))

    def sampler(iterator):
        """Yield records spread evenly so ``percent`` of those seen are kept."""
        count = 0
        taken = 0
        for record in iterator:
            count += 1
            # Take a record whenever we have fallen behind the target share.
            if percent * count > taken:
                taken += 1
                yield record
elif options.count:
    try:
        N = int(options.count)
    except ValueError:
        sys_exit("Bad -c count argument %r" % options.count)
    if N < 1:
        sys_exit("Bad -c count argument %r" % options.count)
    # Count mode needs the input size up front, so the file is read twice.
    total = count_sequences(in_file, seq_format)
    sys.stderr.write("Input file has %i sequences\n" % total)
    if interleaved:
        # Paired
        if total % 2:
            sys_exit("Paired mode, but input file has an odd number of sequences: %i"
                     % total)
        elif N > total // 2:
            sys_exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)."
                     % (N, total // 2, total))
        # From here on ``total`` counts pairs, matching the units of N.
        total = total // 2
        if N == 1:
            sys.stderr.write("Sampling just first sequence pair!\n")
        elif N == total:
            sys.stderr.write("Taking all the sequence pairs\n")
        else:
            sys.stderr.write("Sampling %i sequence pairs\n" % N)
    else:
        # Not paired
        if total < N:
            sys_exit("Requested %i sequences, but file only has %i." % (N, total))
        if N == 1:
            sys.stderr.write("Sampling just first sequence!\n")
        elif N == total:
            sys.stderr.write("Taking all the sequences\n")
        else:
            sys.stderr.write("Sampling %i sequences\n" % N)
    if N == total:
        def sampler(iterator):
            """Dummy filter to filter nothing, taking everything."""
            taken = 0
            for record in iterator:
                taken += 1
                yield record
            # Sanity check: the second pass must agree with the first count.
            assert taken == N, "Picked %i, wanted %i" % (taken, N)
    else:
        def sampler(iterator):
            """Yield exactly N records (or pairs) spread through ``iterator``."""
            # Mimic the percentage sampler, with double check on final count
            # Do we need a floating point fudge factor epsilon?
            # i.e. What if percentage comes out slightly too low, and
            # we could end up missing last few desired sequences?
            percentage = float(N) / float(total)
            count = 0
            taken = 0
            for record in iterator:
                count += 1
                # Do we need the extra upper bound?
                if percentage * count > taken and taken < N:
                    taken += 1
                    yield record
                elif total - count + 1 <= N - taken:
                    # remaining records (including this one) <= what we still
                    # need. This is a safety check for floating point edge
                    # cases where we need to take all remaining sequences to
                    # meet target
                    taken += 1
                    yield record
            assert taken == N, "Picked %i, wanted %i" % (taken, N)
else:
    sys_exit("Must use either -n, -p or -c")
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
237
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
238
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
def pair(iterator):
    """Yield consecutive records from ``iterator`` as ``(a, b)`` tuples.

    Records are consumed two at a time, so the first and second become one
    pair, the third and fourth the next, and so on. The generator finishes
    cleanly when the input is exhausted on a pair boundary.

    End of input is detected with a private sentinel instead of letting
    ``next()`` raise ``StopIteration`` inside the generator body: since
    PEP 479 (Python 3.7+) such a leaked ``StopIteration`` is converted into
    ``RuntimeError``, which made every run fail at the end of the data.

    Raises:
        ValueError: if the input holds an odd number of records, i.e. the
            final record has no partner.
    """
    # Accept any iterable, not just an iterator.
    iterator = iter(iterator)
    missing = object()
    while True:
        a = next(iterator, missing)
        if a is missing:
            # Ran out exactly between pairs - normal termination.
            return
        b = next(iterator, missing)
        if b is missing:
            raise ValueError("Odd number of records?")
        yield (a, b)
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
248
0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
249
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
def raw_fasta_iterator(handle):
    """Generate each FASTA record in ``handle`` as one raw multi-line string.

    Lines are pulled with ``handle.readline()``. Anything before the first
    line starting with ">" is discarded; if no such line exists nothing is
    yielded. Each yielded string is the ">" header line followed by every
    line up to (but excluding) the next header, with line endings untouched.

    A header with no identifier after the ">" triggers a single warning on
    stderr (only for the first such header); the record is still yielded.
    """
    # Skip forward to the first header line, or give up at end of file.
    header = handle.readline()
    while header and header[0] != ">":
        header = handle.readline()
    if not header:
        return  # Premature end of file, or just empty?

    warned = False
    # Invariant: ``header`` is non-empty and starts with ">" on every pass.
    while header:
        if not header[1:].split(None, 1):
            # Nothing but whitespace follows the ">".
            if not warned:
                sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n")
            warned = True
        chunk = [header]
        following = handle.readline()
        # Collect lines until the next header or end of file.
        while following and following[0] != ">":
            chunk.append(following)
            following = handle.readline()
        yield "".join(chunk)
        # Either the next record's header, or "" which ends the loop.
        header = following
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
283
2
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
def fasta_filter(in_file, out_file, iterator_filter, inter):
    """Copy the FASTA records selected by ``iterator_filter`` to ``out_file``.

    ``in_file`` is read as raw FASTA records via ``raw_fasta_iterator``.
    When ``inter`` is true the records are first grouped two at a time with
    ``pair`` so the filter picks whole pairs and both members are written;
    otherwise the filter sees individual records.

    Returns the number of items written, counting pairs (not single
    records) when ``inter`` is true.
    """
    written = 0
    # Nested (rather than combined) with-statements, as in the rest of the
    # script.
    with open(in_file) as source:
        with open(out_file, "w") as sink:
            records = raw_fasta_iterator(source)
            if inter:
                for first, second in iterator_filter(pair(records)):
                    sink.write(first)
                    sink.write(second)
                    written += 1
            else:
                for entry in iterator_filter(records):
                    sink.write(entry)
                    written += 1
    return written
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
299
2
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
300
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
301 from Bio.SeqIO.QualityIO import FastqGeneralIterator
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
def fastq_filter(in_file, out_file, iterator_filter, inter):
    """Write the FASTQ records selected by iterator_filter to out_file.

    Records are parsed with Biopython's FastqGeneralIterator, which is
    consumed here as (title, sequence, quality) string triples. Each
    selected record is written as four lines with a bare "+" separator
    line (the title is not repeated on it).

    Arguments:
     - in_file - path of the FASTQ file to read.
     - out_file - path of the FASTQ file to (over)write.
     - iterator_filter - callable taking an iterator and returning an
       iterator over the selected items.
     - inter - if true, treat the input as interleaved: records are first
       grouped with pair(...) and each selected item is unpacked as two
       records, both of which are written.

    Returns the number of items selected (pairs when inter is true,
    otherwise single records).
    """
    fastq_template = "@%s\n%s\n+\n%s\n"
    selected = 0
    with open(in_file) as in_handle:
        with open(out_file, "w") as pos_handle:
            reads = FastqGeneralIterator(in_handle)
            if inter:
                for first, second in iterator_filter(pair(reads)):
                    selected += 1
                    pos_handle.write(fastq_template % first)
                    pos_handle.write(fastq_template % second)
            else:
                for title, seq, qual in iterator_filter(reads):
                    selected += 1
                    pos_handle.write(fastq_template % (title, seq, qual))
    return selected
0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
316
2
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
317
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
def sff_filter(in_file, out_file, iterator_filter, inter):
    """Write the SFF reads selected by iterator_filter to out_file.

    The Roche XML manifest of the input file (if it has one) is copied
    to the output file.

    Arguments:
     - in_file - path of the SFF file to read (opened in binary mode).
     - out_file - path of the SFF file to (over)write.
     - iterator_filter - callable taking an iterator and returning an
       iterator over the selected items.
     - inter - if true, treat the input as interleaved: reads are first
       grouped with pair(...), the selected pairs are flattened back
       into a single stream of reads, and the pair count is returned.

    Returns the number of items selected as an integer (pairs when inter
    is true, otherwise single reads), based on the count returned by
    SffWriter.write_file.

    Calls sys_exit if the Biopython SFF support cannot be imported.
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        sys_exit("SFF filtering requires Biopython 1.54 or later")
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
    with open(in_file, "rb") as in_handle:
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            # No (usable) manifest in this file; write the output without one.
            manifest = None
        with open(out_file, "wb") as out_handle:
            writer = SffWriter(out_handle, xml=manifest)
            in_handle.seek(0)  # start again after getting manifest
            if inter:
                from itertools import chain
                selected_pairs = iterator_filter(pair(SffIterator(in_handle)))
                count = writer.write_file(chain.from_iterable(selected_pairs))
                # Internal invariant: pairs are always written two reads
                # at a time, so the total must be even.
                assert count % 2 == 0, "Odd number of records? %i" % count
                # Floor division keeps the pair count an integer under
                # Python 3 as well as Python 2.
                count //= 2
            else:
                count = writer.write_file(iterator_filter(SffIterator(in_handle)))
    return count
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
347
2
da64f6a9e32b Uploaded v0.2.0, adds desired count mode
peterjc
parents: 0
diff changeset
# Pick the format-specific filter function. All three share the same
# signature and return how many records (or pairs) were selected.
if seq_format == "sff":
    run_filter = sff_filter
elif seq_format == "fasta":
    run_filter = fasta_filter
elif seq_format.startswith("fastq"):
    # Any format name starting with "fastq" is handled by the same filter.
    run_filter = fastq_filter
else:
    sys_exit("Unsupported file type %r" % seq_format)
count = run_filter(in_file, out_file, sampler, interleaved)

# Report the outcome on stderr, in pairs when the input was interleaved.
if interleaved:
    summary = "Selected %i pairs\n"
else:
    summary = "Selected %i records\n"
sys.stderr.write(summary % count)