Mercurial > repos > peterjc > sample_seqs
comparison tools/sample_seqs/sample_seqs.py @ 0:3a807e5ea6c8 draft
Uploaded v0.0.1
author | peterjc |
---|---|
date | Thu, 27 Mar 2014 09:40:53 -0400 |
parents | |
children | da64f6a9e32b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3a807e5ea6c8 |
---|---|
1 #!/usr/bin/env python | |
2 """Sub-sample sequence from a FASTA, FASTQ or SFF file. | |
3 | |
4 This tool is a short Python script which requires Biopython 1.62 or later | |
5 for SFF file support. If you use this tool in scientific work leading to a | |
6 publication, please cite the Biopython application note: | |
7 | |
8 Cock et al 2009. Biopython: freely available Python tools for computational | |
9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
10 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
11 | |
12 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute | |
13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. | |
14 See accompanying text file for licence details (MIT license). | |
15 | |
16 This is version 0.1.0 of the script, use -v or --version to get the version. | |
17 """ | |
18 import os | |
19 import sys | |
20 | |
21 def stop_err(msg, err=1): | |
22 sys.stderr.write(msg.rstrip() + "\n") | |
23 sys.exit(err) | |
24 | |
25 if "-v" in sys.argv or "--version" in sys.argv: | |
26 print("v0.1.0") | |
27 sys.exit(0) | |
28 | |
29 #Parse Command Line | |
30 if len(sys.argv) < 5: | |
31 stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...") | |
32 seq_format, in_file, out_file, mode = sys.argv[1:5] | |
33 if in_file != "/dev/stdin" and not os.path.isfile(in_file): | |
34 stop_err("Missing input file %r" % in_file) | |
35 | |
36 if mode == "everyNth": | |
37 if len(sys.argv) != 6: | |
38 stop_err("If using everyNth, just need argument N (integer, at least 2)") | |
39 try: | |
40 N = int(sys.argv[5]) | |
41 except: | |
42 stop_err("Bad N argument %r" % sys.argv[5]) | |
43 if N < 2: | |
44 stop_err("Bad N argument %r" % sys.argv[5]) | |
45 if (N % 10) == 1: | |
46 sys.stderr.write("Sampling every %ist sequence\n" % N) | |
47 elif (N % 10) == 2: | |
48 sys.stderr.write("Sampling every %ind sequence\n" % N) | |
49 elif (N % 10) == 3: | |
50 sys.stderr.write("Sampling every %ird sequence\n" % N) | |
51 else: | |
52 sys.stderr.write("Sampling every %ith sequence\n" % N) | |
53 def sampler(iterator): | |
54 global N | |
55 count = 0 | |
56 for record in iterator: | |
57 count += 1 | |
58 if count % N == 1: | |
59 yield record | |
60 elif mode == "percentage": | |
61 if len(sys.argv) != 6: | |
62 stop_err("If using percentage, just need percentage argument (float, range 0 to 100)") | |
63 try: | |
64 percent = float(sys.argv[5]) / 100.0 | |
65 except: | |
66 stop_err("Bad percent argument %r" % sys.argv[5]) | |
67 if percent <= 0.0 or 1.0 <= percent: | |
68 stop_err("Bad percent argument %r" % sys.argv[5]) | |
69 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) | |
70 def sampler(iterator): | |
71 global percent | |
72 count = 0 | |
73 taken = 0 | |
74 for record in iterator: | |
75 count += 1 | |
76 if percent * count > taken: | |
77 taken += 1 | |
78 yield record | |
79 else: | |
80 stop_err("Unsupported mode %r" % mode) | |
81 | |
82 def raw_fasta_iterator(handle): | |
83 """Yields raw FASTA records as multi-line strings.""" | |
84 while True: | |
85 line = handle.readline() | |
86 if line == "": | |
87 return # Premature end of file, or just empty? | |
88 if line[0] == ">": | |
89 break | |
90 | |
91 no_id_warned = False | |
92 while True: | |
93 if line[0] != ">": | |
94 raise ValueError( | |
95 "Records in Fasta files should start with '>' character") | |
96 try: | |
97 id = line[1:].split(None, 1)[0] | |
98 except IndexError: | |
99 if not no_id_warned: | |
100 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") | |
101 no_id_warned = True | |
102 id = None | |
103 lines = [line] | |
104 line = handle.readline() | |
105 while True: | |
106 if not line: | |
107 break | |
108 if line[0] == ">": | |
109 break | |
110 lines.append(line) | |
111 line = handle.readline() | |
112 yield "".join(lines) | |
113 if not line: | |
114 return # StopIteration | |
115 | |
116 def fasta_filter(in_file, out_file, iterator_filter): | |
117 count = 0 | |
118 #Galaxy now requires Python 2.5+ so can use with statements, | |
119 with open(in_file) as in_handle: | |
120 with open(out_file, "w") as pos_handle: | |
121 for record in iterator_filter(raw_fasta_iterator(in_handle)): | |
122 count += 1 | |
123 pos_handle.write(record) | |
124 return count | |
125 | |
126 try: | |
127 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | |
128 def fastq_filter(in_file, out_file, iterator_filter): | |
129 count = 0 | |
130 #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | |
131 reader = fastqReader(open(in_file, "rU")) | |
132 writer = fastqWriter(open(out_file, "w")) | |
133 for record in iterator_filter(reader): | |
134 count += 1 | |
135 writer.write(record) | |
136 writer.close() | |
137 reader.close() | |
138 return count | |
139 except ImportError: | |
140 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
141 def fastq_filter(in_file, out_file, iterator_filter): | |
142 count = 0 | |
143 with open(in_file) as in_handle: | |
144 with open(out_file, "w") as pos_handle: | |
145 for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)): | |
146 count += 1 | |
147 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | |
148 return count | |
149 | |
150 def sff_filter(in_file, out_file, iterator_filter): | |
151 count = 0 | |
152 try: | |
153 from Bio.SeqIO.SffIO import SffIterator, SffWriter | |
154 except ImportError: | |
155 stop_err("SFF filtering requires Biopython 1.54 or later") | |
156 try: | |
157 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | |
158 except ImportError: | |
159 #Prior to Biopython 1.56 this was a private function | |
160 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | |
161 with open(in_file, "rb") as in_handle: | |
162 try: | |
163 manifest = ReadRocheXmlManifest(in_handle) | |
164 except ValueError: | |
165 manifest = None | |
166 in_handle.seek(0) | |
167 with open(out_file, "wb") as out_handle: | |
168 writer = SffWriter(out_handle, xml=manifest) | |
169 in_handle.seek(0) #start again after getting manifest | |
170 count = writer.write_file(iterator_filter(SffIterator(in_handle))) | |
171 #count = writer.write_file(SffIterator(in_handle)) | |
172 return count | |
173 | |
174 if seq_format.lower()=="sff": | |
175 count = sff_filter(in_file, out_file, sampler) | |
176 elif seq_format.lower()=="fasta": | |
177 count = fasta_filter(in_file, out_file, sampler) | |
178 elif seq_format.lower().startswith("fastq"): | |
179 count = fastq_filter(in_file, out_file, sampler) | |
180 else: | |
181 stop_err("Unsupported file type %r" % seq_format) | |
182 | |
183 sys.stderr.write("Sampled %i records\n" % count) |