Mercurial > repos > peterjc > seq_filter_by_id
comparison tools/filters/seq_filter_by_id.py @ 1:262f08104540 draft
Uploaded v0.0.4 which includes a unit test and is faster at filtering FASTA files with large records (e.g. whole chromosomes)
author | peterjc |
---|---|
date | Mon, 15 Apr 2013 12:27:30 -0400 |
parents | 5844f6a450ed |
children | abdd608c869b |
comparison
equal
deleted
inserted
replaced
0:5844f6a450ed | 1:262f08104540 |
---|---|
23 | 23 |
24 Cock et al 2009. Biopython: freely available Python tools for computational | 24 Cock et al 2009. Biopython: freely available Python tools for computational |
25 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | 25 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. |
26 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | 26 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. |
27 | 27 |
28 This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved. | 28 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute |
29 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. | |
29 See accompanying text file for licence details (MIT/BSD style). | 30 See accompanying text file for licence details (MIT/BSD style). |
30 | 31 |
31 This is version 0.0.1 of the script. | 32 This is version 0.0.4 of the script, use -v or --version to get the version. |
32 """ | 33 """ |
33 import sys | 34 import sys |
34 | 35 |
35 def stop_err(msg, err=1): | 36 def stop_err(msg, err=1): |
36 sys.stderr.write(msg.rstrip() + "\n") | 37 sys.stderr.write(msg.rstrip() + "\n") |
37 sys.exit(err) | 38 sys.exit(err) |
39 | |
40 if "-v" in sys.argv or "--version" in sys.argv: | |
41 print "v0.0.4" | |
42 sys.exit(0) | |
38 | 43 |
39 #Parse Command Line | 44 #Parse Command Line |
40 try: | 45 try: |
41 tabular_file, cols_arg, in_file, seq_format, out_positive_file, out_negative_file = sys.argv[1:] | 46 tabular_file, cols_arg, in_file, seq_format, out_positive_file, out_negative_file = sys.argv[1:] |
42 except ValueError: | 47 except ValueError: |
43 stop_err("Expected six arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) | 48 stop_err("Expected six arguments (tab file, columns, in seq, seq format, out pos, out neg), got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) |
44 try: | 49 try: |
45 columns = [int(arg)-1 for arg in cols_arg.split(",")] | 50 columns = [int(arg)-1 for arg in cols_arg.split(",")] |
46 except ValueError: | 51 except ValueError: |
47 stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg) | 52 stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg) |
53 if min(columns) < 0: | |
54 stop_err("Expect one-based column numbers (not zero-based counting), got %s" % cols_arg) | |
48 | 55 |
49 if out_positive_file == "-" and out_negative_file == "-": | 56 if out_positive_file == "-" and out_negative_file == "-": |
50 stop_err("Neither output file requested") | 57 stop_err("Neither output file requested") |
51 | 58 |
52 | 59 |
71 ids.add(line.rstrip("\n").split("\t")[col]) | 78 ids.add(line.rstrip("\n").split("\t")[col]) |
72 print "Using %i IDs from tabular file" % (len(ids)) | 79 print "Using %i IDs from tabular file" % (len(ids)) |
73 handle.close() | 80 handle.close() |
74 | 81 |
75 | 82 |
83 def crude_fasta_iterator(handle): | |
84 """Yields tuples, record ID and the full record as a string.""" | |
85 while True: | |
86 line = handle.readline() | |
87 if line == "": | |
88 return # Premature end of file, or just empty? | |
89 if line[0] == ">": | |
90 break | |
91 | |
92 while True: | |
93 if line[0] != ">": | |
94 raise ValueError( | |
95 "Records in Fasta files should start with '>' character") | |
96 id = line[1:].split(None, 1)[0] | |
97 lines = [line] | |
98 line = handle.readline() | |
99 while True: | |
100 if not line: | |
101 break | |
102 if line[0] == ">": | |
103 break | |
104 lines.append(line) | |
105 line = handle.readline() | |
106 yield id, "".join(lines) | |
107 if not line: | |
108 return # StopIteration | |
109 | |
110 | |
111 def fasta_filter(in_file, pos_file, neg_file, wanted): | |
112 """FASTA filter producing 60 character line wrapped outout.""" | |
113 pos_count = neg_count = 0 | |
114 #Galaxy now requires Python 2.5+ so can use with statements, | |
115 with open(in_file) as in_handle: | |
116 #Doing the if statement outside the loop for speed | |
117 #(with the downside of three very similar loops). | |
118 if pos_file != "-" and neg_file != "-": | |
119 print "Generating two FASTA files" | |
120 with open(pos_file, "w") as pos_handle: | |
121 with open(neg_file, "w") as neg_handle: | |
122 for identifier, record in crude_fasta_iterator(in_handle): | |
123 if identifier in wanted: | |
124 pos_handle.write(record) | |
125 pos_count += 1 | |
126 else: | |
127 neg_handle.write(record) | |
128 neg_count += 1 | |
129 elif pos_file != "-": | |
130 print "Generating matching FASTA file" | |
131 with open(pos_file, "w") as pos_handle: | |
132 for identifier, record in crude_fasta_iterator(in_handle): | |
133 if identifier in wanted: | |
134 pos_handle.write(record) | |
135 pos_count += 1 | |
136 else: | |
137 neg_count += 1 | |
138 else: | |
139 print "Generating non-matching FASTA file" | |
140 assert neg_file != "-" | |
141 with open(neg_file, "w") as neg_handle: | |
142 for identifier, record in crude_fasta_iterator(in_handle): | |
143 if identifier in wanted: | |
144 pos_count += 1 | |
145 else: | |
146 neg_handle.write(record) | |
147 neg_count += 1 | |
148 return pos_count, neg_count | |
149 | |
150 | |
76 if seq_format.lower()=="sff": | 151 if seq_format.lower()=="sff": |
77 #Now write filtered SFF file based on IDs from BLAST file | 152 #Now write filtered SFF file based on IDs from BLAST file |
78 try: | 153 try: |
79 from Bio.SeqIO.SffIO import SffIterator, SffWriter | 154 from Bio.SeqIO.SffIO import SffIterator, SffWriter |
80 except ImportError: | 155 except ImportError: |
81 stop_err("Requires Biopython 1.54 or later") | 156 stop_err("SFF filtering requires Biopython 1.54 or later") |
82 | 157 |
83 try: | 158 try: |
84 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | 159 from Bio.SeqIO.SffIO import ReadRocheXmlManifest |
85 except ImportError: | 160 except ImportError: |
86 #Prior to Biopython 1.56 this was a private function | 161 #Prior to Biopython 1.56 this was a private function |
90 manifest = ReadRocheXmlManifest(in_handle) | 165 manifest = ReadRocheXmlManifest(in_handle) |
91 except ValueError: | 166 except ValueError: |
92 manifest = None | 167 manifest = None |
93 #This makes two passes though the SFF file with isn't so efficient, | 168 #This makes two passes though the SFF file with isn't so efficient, |
94 #but this makes the code simple. | 169 #but this makes the code simple. |
170 pos_count = neg_count = 0 | |
95 if out_positive_file != "-": | 171 if out_positive_file != "-": |
96 out_handle = open(out_positive_file, "wb") | 172 out_handle = open(out_positive_file, "wb") |
97 writer = SffWriter(out_handle, xml=manifest) | 173 writer = SffWriter(out_handle, xml=manifest) |
98 in_handle.seek(0) #start again after getting manifest | 174 in_handle.seek(0) #start again after getting manifest |
99 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids) | 175 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids) |
106 out_handle.close() | 182 out_handle.close() |
107 #And we're done | 183 #And we're done |
108 in_handle.close() | 184 in_handle.close() |
109 #At the time of writing, Galaxy doesn't show SFF file read counts, | 185 #At the time of writing, Galaxy doesn't show SFF file read counts, |
110 #so it is useful to put them in stdout and thus shown in job info. | 186 #so it is useful to put them in stdout and thus shown in job info. |
111 if out_positive_file != "-" and out_negative_file != "-": | 187 print "%i with and %i without specified IDs" % (pos_count, neg_count) |
112 print "%i with and %i without specified IDs" % (pos_count, neg_count) | |
113 elif out_positive_file != "-": | |
114 print "%i with specified IDs" % pos_count | |
115 elif out_negative_file != "-": | |
116 print "%i without specified IDs" % neg_count | |
117 elif seq_format.lower()=="fasta": | 188 elif seq_format.lower()=="fasta": |
118 #Write filtered FASTA file based on IDs from tabular file | 189 #Write filtered FASTA file based on IDs from tabular file |
119 from galaxy_utils.sequence.fasta import fastaReader, fastaWriter | 190 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) |
120 reader = fastaReader(open(in_file, "rU")) | 191 print "%i with and %i without specified IDs" % (pos_count, neg_count) |
121 if out_positive_file != "-" and out_negative_file != "-": | |
122 print "Generating two FASTA files" | |
123 positive_writer = fastaWriter(open(out_positive_file, "w")) | |
124 negative_writer = fastaWriter(open(out_negative_file, "w")) | |
125 for record in reader: | |
126 #The [1:] is because the fastaReader leaves the > on the identifer. | |
127 if record.identifier and record.identifier.split()[0][1:] in ids: | |
128 positive_writer.write(record) | |
129 else: | |
130 negative_writer.write(record) | |
131 positive_writer.close() | |
132 negative_writer.close() | |
133 elif out_positive_file != "-": | |
134 print "Generating matching FASTA file" | |
135 positive_writer = fastaWriter(open(out_positive_file, "w")) | |
136 for record in reader: | |
137 #The [1:] is because the fastaReader leaves the > on the identifer. | |
138 if record.identifier and record.identifier.split()[0][1:] in ids: | |
139 positive_writer.write(record) | |
140 positive_writer.close() | |
141 elif out_negative_file != "-": | |
142 print "Generating non-matching FASTA file" | |
143 negative_writer = fastaWriter(open(out_negative_file, "w")) | |
144 for record in reader: | |
145 #The [1:] is because the fastaReader leaves the > on the identifer. | |
146 if not record.identifier or record.identifier.split()[0][1:] not in ids: | |
147 negative_writer.write(record) | |
148 negative_writer.close() | |
149 elif seq_format.lower().startswith("fastq"): | 192 elif seq_format.lower().startswith("fastq"): |
150 #Write filtered FASTQ file based on IDs from tabular file | 193 #Write filtered FASTQ file based on IDs from tabular file |
151 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | 194 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter |
152 reader = fastqReader(open(in_file, "rU")) | 195 reader = fastqReader(open(in_file, "rU")) |
153 if out_positive_file != "-" and out_negative_file != "-": | 196 if out_positive_file != "-" and out_negative_file != "-": |
154 print "Generating two FASTQ files" | 197 print "Generating two FASTQ files" |
155 positive_writer = fastqWriter(open(out_positive_file, "w")) | 198 positive_writer = fastqWriter(open(out_positive_file, "w")) |
156 negative_writer = fastqWriter(open(out_negative_file, "w")) | 199 negative_writer = fastqWriter(open(out_negative_file, "w")) |
157 for record in reader: | 200 for record in reader: |
158 #The [1:] is because the fastaReader leaves the @ on the identifer. | 201 #The [1:] is because the fastaReader leaves the > on the identifier. |
159 if record.identifier and record.identifier.split()[0][1:] in ids: | 202 if record.identifier and record.identifier.split()[0][1:] in ids: |
160 positive_writer.write(record) | 203 positive_writer.write(record) |
161 else: | 204 else: |
162 negative_writer.write(record) | 205 negative_writer.write(record) |
163 positive_writer.close() | 206 positive_writer.close() |
164 negative_writer.close() | 207 negative_writer.close() |
165 elif out_positive_file != "-": | 208 elif out_positive_file != "-": |
166 print "Generating matching FASTQ file" | 209 print "Generating matching FASTQ file" |
167 positive_writer = fastqWriter(open(out_positive_file, "w")) | 210 positive_writer = fastqWriter(open(out_positive_file, "w")) |
168 for record in reader: | 211 for record in reader: |
169 #The [1:] is because the fastaReader leaves the @ on the identifer. | 212 #The [1:] is because the fastaReader leaves the > on the identifier. |
170 if record.identifier and record.identifier.split()[0][1:] in ids: | 213 if record.identifier and record.identifier.split()[0][1:] in ids: |
171 positive_writer.write(record) | 214 positive_writer.write(record) |
172 positive_writer.close() | 215 positive_writer.close() |
173 elif out_negative_file != "-": | 216 elif out_negative_file != "-": |
174 print "Generating non-matching FASTQ file" | 217 print "Generating non-matching FASTQ file" |
175 negative_writer = fastqWriter(open(out_negative_file, "w")) | 218 negative_writer = fastqWriter(open(out_negative_file, "w")) |
176 for record in reader: | 219 for record in reader: |
177 #The [1:] is because the fastaReader leaves the @ on the identifer. | 220 #The [1:] is because the fastaReader leaves the > on the identifier. |
178 if not record.identifier or record.identifier.split()[0][1:] not in ids: | 221 if not record.identifier or record.identifier.split()[0][1:] not in ids: |
179 negative_writer.write(record) | 222 negative_writer.write(record) |
180 negative_writer.close() | 223 negative_writer.close() |
224 reader.close() | |
181 else: | 225 else: |
182 stop_err("Unsupported file type %r" % seq_format) | 226 stop_err("Unsupported file type %r" % seq_format) |