comparison tools/sample_seqs/sample_seqs.py @ 0:3a807e5ea6c8 draft

Uploaded v0.0.1
author peterjc
date Thu, 27 Mar 2014 09:40:53 -0400
parents
children da64f6a9e32b
comparison
equal deleted inserted replaced
-1:000000000000 0:3a807e5ea6c8
1 #!/usr/bin/env python
2 """Sub-sample sequence from a FASTA, FASTQ or SFF file.
3
4 This tool is a short Python script which requires Biopython 1.62 or later
5 for SFF file support. If you use this tool in scientific work leading to a
6 publication, please cite the Biopython application note:
7
8 Cock et al 2009. Biopython: freely available Python tools for computational
9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
10 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
11
12 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute
13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
14 See accompanying text file for licence details (MIT license).
15
16 This is version 0.1.0 of the script, use -v or --version to get the version.
17 """
18 import os
19 import sys
20
21 def stop_err(msg, err=1):
22 sys.stderr.write(msg.rstrip() + "\n")
23 sys.exit(err)
24
25 if "-v" in sys.argv or "--version" in sys.argv:
26 print("v0.1.0")
27 sys.exit(0)
28
29 #Parse Command Line
30 if len(sys.argv) < 5:
31 stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...")
32 seq_format, in_file, out_file, mode = sys.argv[1:5]
33 if in_file != "/dev/stdin" and not os.path.isfile(in_file):
34 stop_err("Missing input file %r" % in_file)
35
36 if mode == "everyNth":
37 if len(sys.argv) != 6:
38 stop_err("If using everyNth, just need argument N (integer, at least 2)")
39 try:
40 N = int(sys.argv[5])
41 except:
42 stop_err("Bad N argument %r" % sys.argv[5])
43 if N < 2:
44 stop_err("Bad N argument %r" % sys.argv[5])
45 if (N % 10) == 1:
46 sys.stderr.write("Sampling every %ist sequence\n" % N)
47 elif (N % 10) == 2:
48 sys.stderr.write("Sampling every %ind sequence\n" % N)
49 elif (N % 10) == 3:
50 sys.stderr.write("Sampling every %ird sequence\n" % N)
51 else:
52 sys.stderr.write("Sampling every %ith sequence\n" % N)
53 def sampler(iterator):
54 global N
55 count = 0
56 for record in iterator:
57 count += 1
58 if count % N == 1:
59 yield record
60 elif mode == "percentage":
61 if len(sys.argv) != 6:
62 stop_err("If using percentage, just need percentage argument (float, range 0 to 100)")
63 try:
64 percent = float(sys.argv[5]) / 100.0
65 except:
66 stop_err("Bad percent argument %r" % sys.argv[5])
67 if percent <= 0.0 or 1.0 <= percent:
68 stop_err("Bad percent argument %r" % sys.argv[5])
69 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
70 def sampler(iterator):
71 global percent
72 count = 0
73 taken = 0
74 for record in iterator:
75 count += 1
76 if percent * count > taken:
77 taken += 1
78 yield record
79 else:
80 stop_err("Unsupported mode %r" % mode)
81
82 def raw_fasta_iterator(handle):
83 """Yields raw FASTA records as multi-line strings."""
84 while True:
85 line = handle.readline()
86 if line == "":
87 return # Premature end of file, or just empty?
88 if line[0] == ">":
89 break
90
91 no_id_warned = False
92 while True:
93 if line[0] != ">":
94 raise ValueError(
95 "Records in Fasta files should start with '>' character")
96 try:
97 id = line[1:].split(None, 1)[0]
98 except IndexError:
99 if not no_id_warned:
100 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n")
101 no_id_warned = True
102 id = None
103 lines = [line]
104 line = handle.readline()
105 while True:
106 if not line:
107 break
108 if line[0] == ">":
109 break
110 lines.append(line)
111 line = handle.readline()
112 yield "".join(lines)
113 if not line:
114 return # StopIteration
115
116 def fasta_filter(in_file, out_file, iterator_filter):
117 count = 0
118 #Galaxy now requires Python 2.5+ so can use with statements,
119 with open(in_file) as in_handle:
120 with open(out_file, "w") as pos_handle:
121 for record in iterator_filter(raw_fasta_iterator(in_handle)):
122 count += 1
123 pos_handle.write(record)
124 return count
125
126 try:
127 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
128 def fastq_filter(in_file, out_file, iterator_filter):
129 count = 0
130 #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
131 reader = fastqReader(open(in_file, "rU"))
132 writer = fastqWriter(open(out_file, "w"))
133 for record in iterator_filter(reader):
134 count += 1
135 writer.write(record)
136 writer.close()
137 reader.close()
138 return count
139 except ImportError:
140 from Bio.SeqIO.QualityIO import FastqGeneralIterator
141 def fastq_filter(in_file, out_file, iterator_filter):
142 count = 0
143 with open(in_file) as in_handle:
144 with open(out_file, "w") as pos_handle:
145 for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)):
146 count += 1
147 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
148 return count
149
150 def sff_filter(in_file, out_file, iterator_filter):
151 count = 0
152 try:
153 from Bio.SeqIO.SffIO import SffIterator, SffWriter
154 except ImportError:
155 stop_err("SFF filtering requires Biopython 1.54 or later")
156 try:
157 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
158 except ImportError:
159 #Prior to Biopython 1.56 this was a private function
160 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
161 with open(in_file, "rb") as in_handle:
162 try:
163 manifest = ReadRocheXmlManifest(in_handle)
164 except ValueError:
165 manifest = None
166 in_handle.seek(0)
167 with open(out_file, "wb") as out_handle:
168 writer = SffWriter(out_handle, xml=manifest)
169 in_handle.seek(0) #start again after getting manifest
170 count = writer.write_file(iterator_filter(SffIterator(in_handle)))
171 #count = writer.write_file(SffIterator(in_handle))
172 return count
173
174 if seq_format.lower()=="sff":
175 count = sff_filter(in_file, out_file, sampler)
176 elif seq_format.lower()=="fasta":
177 count = fasta_filter(in_file, out_file, sampler)
178 elif seq_format.lower().startswith("fastq"):
179 count = fastq_filter(in_file, out_file, sampler)
180 else:
181 stop_err("Unsupported file type %r" % seq_format)
182
183 sys.stderr.write("Sampled %i records\n" % count)