comparison tools/filters/seq_filter_by_id.py @ 1:262f08104540 draft

Uploaded v0.0.4 which includes a unit test and is faster at filtering FASTA files with large records (e.g. whole chromosomes)
author peterjc
date Mon, 15 Apr 2013 12:27:30 -0400
parents 5844f6a450ed
children abdd608c869b
comparison
equal deleted inserted replaced
0:5844f6a450ed 1:262f08104540
23 23
24 Cock et al 2009. Biopython: freely available Python tools for computational 24 Cock et al 2009. Biopython: freely available Python tools for computational
25 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. 25 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
26 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. 26 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
27 27
28 This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved. 28 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute
29 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
29 See accompanying text file for licence details (MIT/BSD style). 30 See accompanying text file for licence details (MIT/BSD style).
30 31
31 This is version 0.0.1 of the script. 32 This is version 0.0.4 of the script, use -v or --version to get the version.
32 """ 33 """
33 import sys 34 import sys
34 35
35 def stop_err(msg, err=1): 36 def stop_err(msg, err=1):
36 sys.stderr.write(msg.rstrip() + "\n") 37 sys.stderr.write(msg.rstrip() + "\n")
37 sys.exit(err) 38 sys.exit(err)
39
40 if "-v" in sys.argv or "--version" in sys.argv:
41 print "v0.0.4"
42 sys.exit(0)
38 43
39 #Parse Command Line 44 #Parse Command Line
40 try: 45 try:
41 tabular_file, cols_arg, in_file, seq_format, out_positive_file, out_negative_file = sys.argv[1:] 46 tabular_file, cols_arg, in_file, seq_format, out_positive_file, out_negative_file = sys.argv[1:]
42 except ValueError: 47 except ValueError:
43 stop_err("Expected six arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) 48 stop_err("Expected six arguments (tab file, columns, in seq, seq format, out pos, out neg), got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
44 try: 49 try:
45 columns = [int(arg)-1 for arg in cols_arg.split(",")] 50 columns = [int(arg)-1 for arg in cols_arg.split(",")]
46 except ValueError: 51 except ValueError:
47 stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg) 52 stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg)
53 if min(columns) < 0:
54 stop_err("Expect one-based column numbers (not zero-based counting), got %s" % cols_arg)
48 55
49 if out_positive_file == "-" and out_negative_file == "-": 56 if out_positive_file == "-" and out_negative_file == "-":
50 stop_err("Neither output file requested") 57 stop_err("Neither output file requested")
51 58
52 59
71 ids.add(line.rstrip("\n").split("\t")[col]) 78 ids.add(line.rstrip("\n").split("\t")[col])
72 print "Using %i IDs from tabular file" % (len(ids)) 79 print "Using %i IDs from tabular file" % (len(ids))
73 handle.close() 80 handle.close()
74 81
75 82
83 def crude_fasta_iterator(handle):
84 """Yields tuples, record ID and the full record as a string."""
85 while True:
86 line = handle.readline()
87 if line == "":
88 return # Premature end of file, or just empty?
89 if line[0] == ">":
90 break
91
92 while True:
93 if line[0] != ">":
94 raise ValueError(
95 "Records in Fasta files should start with '>' character")
96 id = line[1:].split(None, 1)[0]
97 lines = [line]
98 line = handle.readline()
99 while True:
100 if not line:
101 break
102 if line[0] == ">":
103 break
104 lines.append(line)
105 line = handle.readline()
106 yield id, "".join(lines)
107 if not line:
108 return # StopIteration
109
110
111 def fasta_filter(in_file, pos_file, neg_file, wanted):
112 """FASTA filter producing 60 character line wrapped outout."""
113 pos_count = neg_count = 0
114 #Galaxy now requires Python 2.5+ so can use with statements,
115 with open(in_file) as in_handle:
116 #Doing the if statement outside the loop for speed
117 #(with the downside of three very similar loops).
118 if pos_file != "-" and neg_file != "-":
119 print "Generating two FASTA files"
120 with open(pos_file, "w") as pos_handle:
121 with open(neg_file, "w") as neg_handle:
122 for identifier, record in crude_fasta_iterator(in_handle):
123 if identifier in wanted:
124 pos_handle.write(record)
125 pos_count += 1
126 else:
127 neg_handle.write(record)
128 neg_count += 1
129 elif pos_file != "-":
130 print "Generating matching FASTA file"
131 with open(pos_file, "w") as pos_handle:
132 for identifier, record in crude_fasta_iterator(in_handle):
133 if identifier in wanted:
134 pos_handle.write(record)
135 pos_count += 1
136 else:
137 neg_count += 1
138 else:
139 print "Generating non-matching FASTA file"
140 assert neg_file != "-"
141 with open(neg_file, "w") as neg_handle:
142 for identifier, record in crude_fasta_iterator(in_handle):
143 if identifier in wanted:
144 pos_count += 1
145 else:
146 neg_handle.write(record)
147 neg_count += 1
148 return pos_count, neg_count
149
150
76 if seq_format.lower()=="sff": 151 if seq_format.lower()=="sff":
77 #Now write filtered SFF file based on IDs from BLAST file 152 #Now write filtered SFF file based on IDs from BLAST file
78 try: 153 try:
79 from Bio.SeqIO.SffIO import SffIterator, SffWriter 154 from Bio.SeqIO.SffIO import SffIterator, SffWriter
80 except ImportError: 155 except ImportError:
81 stop_err("Requires Biopython 1.54 or later") 156 stop_err("SFF filtering requires Biopython 1.54 or later")
82 157
83 try: 158 try:
84 from Bio.SeqIO.SffIO import ReadRocheXmlManifest 159 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
85 except ImportError: 160 except ImportError:
86 #Prior to Biopython 1.56 this was a private function 161 #Prior to Biopython 1.56 this was a private function
90 manifest = ReadRocheXmlManifest(in_handle) 165 manifest = ReadRocheXmlManifest(in_handle)
91 except ValueError: 166 except ValueError:
92 manifest = None 167 manifest = None
93 #This makes two passes though the SFF file with isn't so efficient, 168 #This makes two passes though the SFF file with isn't so efficient,
94 #but this makes the code simple. 169 #but this makes the code simple.
170 pos_count = neg_count = 0
95 if out_positive_file != "-": 171 if out_positive_file != "-":
96 out_handle = open(out_positive_file, "wb") 172 out_handle = open(out_positive_file, "wb")
97 writer = SffWriter(out_handle, xml=manifest) 173 writer = SffWriter(out_handle, xml=manifest)
98 in_handle.seek(0) #start again after getting manifest 174 in_handle.seek(0) #start again after getting manifest
99 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids) 175 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids)
106 out_handle.close() 182 out_handle.close()
107 #And we're done 183 #And we're done
108 in_handle.close() 184 in_handle.close()
109 #At the time of writing, Galaxy doesn't show SFF file read counts, 185 #At the time of writing, Galaxy doesn't show SFF file read counts,
110 #so it is useful to put them in stdout and thus shown in job info. 186 #so it is useful to put them in stdout and thus shown in job info.
111 if out_positive_file != "-" and out_negative_file != "-": 187 print "%i with and %i without specified IDs" % (pos_count, neg_count)
112 print "%i with and %i without specified IDs" % (pos_count, neg_count)
113 elif out_positive_file != "-":
114 print "%i with specified IDs" % pos_count
115 elif out_negative_file != "-":
116 print "%i without specified IDs" % neg_count
117 elif seq_format.lower()=="fasta": 188 elif seq_format.lower()=="fasta":
118 #Write filtered FASTA file based on IDs from tabular file 189 #Write filtered FASTA file based on IDs from tabular file
119 from galaxy_utils.sequence.fasta import fastaReader, fastaWriter 190 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
120 reader = fastaReader(open(in_file, "rU")) 191 print "%i with and %i without specified IDs" % (pos_count, neg_count)
121 if out_positive_file != "-" and out_negative_file != "-":
122 print "Generating two FASTA files"
123 positive_writer = fastaWriter(open(out_positive_file, "w"))
124 negative_writer = fastaWriter(open(out_negative_file, "w"))
125 for record in reader:
126 #The [1:] is because the fastaReader leaves the > on the identifer.
127 if record.identifier and record.identifier.split()[0][1:] in ids:
128 positive_writer.write(record)
129 else:
130 negative_writer.write(record)
131 positive_writer.close()
132 negative_writer.close()
133 elif out_positive_file != "-":
134 print "Generating matching FASTA file"
135 positive_writer = fastaWriter(open(out_positive_file, "w"))
136 for record in reader:
137 #The [1:] is because the fastaReader leaves the > on the identifer.
138 if record.identifier and record.identifier.split()[0][1:] in ids:
139 positive_writer.write(record)
140 positive_writer.close()
141 elif out_negative_file != "-":
142 print "Generating non-matching FASTA file"
143 negative_writer = fastaWriter(open(out_negative_file, "w"))
144 for record in reader:
145 #The [1:] is because the fastaReader leaves the > on the identifer.
146 if not record.identifier or record.identifier.split()[0][1:] not in ids:
147 negative_writer.write(record)
148 negative_writer.close()
149 elif seq_format.lower().startswith("fastq"): 192 elif seq_format.lower().startswith("fastq"):
150 #Write filtered FASTQ file based on IDs from tabular file 193 #Write filtered FASTQ file based on IDs from tabular file
151 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter 194 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
152 reader = fastqReader(open(in_file, "rU")) 195 reader = fastqReader(open(in_file, "rU"))
153 if out_positive_file != "-" and out_negative_file != "-": 196 if out_positive_file != "-" and out_negative_file != "-":
154 print "Generating two FASTQ files" 197 print "Generating two FASTQ files"
155 positive_writer = fastqWriter(open(out_positive_file, "w")) 198 positive_writer = fastqWriter(open(out_positive_file, "w"))
156 negative_writer = fastqWriter(open(out_negative_file, "w")) 199 negative_writer = fastqWriter(open(out_negative_file, "w"))
157 for record in reader: 200 for record in reader:
158 #The [1:] is because the fastaReader leaves the @ on the identifer. 201 #The [1:] is because the fastaReader leaves the > on the identifier.
159 if record.identifier and record.identifier.split()[0][1:] in ids: 202 if record.identifier and record.identifier.split()[0][1:] in ids:
160 positive_writer.write(record) 203 positive_writer.write(record)
161 else: 204 else:
162 negative_writer.write(record) 205 negative_writer.write(record)
163 positive_writer.close() 206 positive_writer.close()
164 negative_writer.close() 207 negative_writer.close()
165 elif out_positive_file != "-": 208 elif out_positive_file != "-":
166 print "Generating matching FASTQ file" 209 print "Generating matching FASTQ file"
167 positive_writer = fastqWriter(open(out_positive_file, "w")) 210 positive_writer = fastqWriter(open(out_positive_file, "w"))
168 for record in reader: 211 for record in reader:
169 #The [1:] is because the fastaReader leaves the @ on the identifer. 212 #The [1:] is because the fastaReader leaves the > on the identifier.
170 if record.identifier and record.identifier.split()[0][1:] in ids: 213 if record.identifier and record.identifier.split()[0][1:] in ids:
171 positive_writer.write(record) 214 positive_writer.write(record)
172 positive_writer.close() 215 positive_writer.close()
173 elif out_negative_file != "-": 216 elif out_negative_file != "-":
174 print "Generating non-matching FASTQ file" 217 print "Generating non-matching FASTQ file"
175 negative_writer = fastqWriter(open(out_negative_file, "w")) 218 negative_writer = fastqWriter(open(out_negative_file, "w"))
176 for record in reader: 219 for record in reader:
177 #The [1:] is because the fastaReader leaves the @ on the identifer. 220 #The [1:] is because the fastaReader leaves the > on the identifier.
178 if not record.identifier or record.identifier.split()[0][1:] not in ids: 221 if not record.identifier or record.identifier.split()[0][1:] not in ids:
179 negative_writer.write(record) 222 negative_writer.write(record)
180 negative_writer.close() 223 negative_writer.close()
224 reader.close()
181 else: 225 else:
182 stop_err("Unsupported file type %r" % seq_format) 226 stop_err("Unsupported file type %r" % seq_format)