comparison tools/filters/seq_filter_by_id.py @ 0:5844f6a450ed

Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 17:24:30 -0400
parents
children 262f08104540
comparison
equal deleted inserted replaced
-1:000000000000 0:5844f6a450ed
1 #!/usr/bin/env python
2 """Filter a FASTA, FASTQ or SSF file with IDs from a tabular file.
3
4 Takes six command line options, tabular filename, ID column numbers (comma
5 separated list using one based counting), input filename, input type (e.g.
6 FASTA or SFF) and two output filenames (for records with and without the
7 given IDs, same format as input sequence file).
8
9 If either output filename is just a minus sign, that file is not created.
10 This is intended to allow output for just the matched (or just the non-matched)
11 records.
12
13 When filtering an SFF file, any Roche XML manifest in the input file is
14 preserved in both output files.
15
16 Note in the default NCBI BLAST+ tabular output, the query sequence ID is
17 in column one, and the ID of the match from the database is in column two.
18 Here sensible values for the column numbers would therefore be "1" or "2".
19
20 This tool is a short Python script which requires Biopython 1.54 or later
21 for SFF file support. If you use this tool in scientific work leading to a
22 publication, please cite the Biopython application note:
23
24 Cock et al 2009. Biopython: freely available Python tools for computational
25 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
26 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
27
28 This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved.
29 See accompanying text file for licence details (MIT/BSD style).
30
31 This is version 0.0.1 of the script.
32 """
33 import sys
34
35 def stop_err(msg, err=1):
36 sys.stderr.write(msg.rstrip() + "\n")
37 sys.exit(err)
38
39 #Parse Command Line
40 try:
41 tabular_file, cols_arg, in_file, seq_format, out_positive_file, out_negative_file = sys.argv[1:]
42 except ValueError:
43 stop_err("Expected six arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
44 try:
45 columns = [int(arg)-1 for arg in cols_arg.split(",")]
46 except ValueError:
47 stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg)
48
49 if out_positive_file == "-" and out_negative_file == "-":
50 stop_err("Neither output file requested")
51
52
53 #Read tabular file and record all specified identifiers
54 ids = set()
55 handle = open(tabular_file, "rU")
56 if len(columns)>1:
57 #General case of many columns
58 for line in handle:
59 if line.startswith("#"):
60 #Ignore comments
61 continue
62 parts = line.rstrip("\n").split("\t")
63 for col in columns:
64 ids.add(parts[col])
65 print "Using %i IDs from %i columns of tabular file" % (len(ids), len(columns))
66 else:
67 #Single column, special case speed up
68 col = columns[0]
69 for line in handle:
70 if not line.startswith("#"):
71 ids.add(line.rstrip("\n").split("\t")[col])
72 print "Using %i IDs from tabular file" % (len(ids))
73 handle.close()
74
75
76 if seq_format.lower()=="sff":
77 #Now write filtered SFF file based on IDs from BLAST file
78 try:
79 from Bio.SeqIO.SffIO import SffIterator, SffWriter
80 except ImportError:
81 stop_err("Requires Biopython 1.54 or later")
82
83 try:
84 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
85 except ImportError:
86 #Prior to Biopython 1.56 this was a private function
87 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
88 in_handle = open(in_file, "rb") #must be binary mode!
89 try:
90 manifest = ReadRocheXmlManifest(in_handle)
91 except ValueError:
92 manifest = None
93 #This makes two passes though the SFF file with isn't so efficient,
94 #but this makes the code simple.
95 if out_positive_file != "-":
96 out_handle = open(out_positive_file, "wb")
97 writer = SffWriter(out_handle, xml=manifest)
98 in_handle.seek(0) #start again after getting manifest
99 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids)
100 out_handle.close()
101 if out_negative_file != "-":
102 out_handle = open(out_negative_file, "wb")
103 writer = SffWriter(out_handle, xml=manifest)
104 in_handle.seek(0) #start again
105 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id not in ids)
106 out_handle.close()
107 #And we're done
108 in_handle.close()
109 #At the time of writing, Galaxy doesn't show SFF file read counts,
110 #so it is useful to put them in stdout and thus shown in job info.
111 if out_positive_file != "-" and out_negative_file != "-":
112 print "%i with and %i without specified IDs" % (pos_count, neg_count)
113 elif out_positive_file != "-":
114 print "%i with specified IDs" % pos_count
115 elif out_negative_file != "-":
116 print "%i without specified IDs" % neg_count
117 elif seq_format.lower()=="fasta":
118 #Write filtered FASTA file based on IDs from tabular file
119 from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
120 reader = fastaReader(open(in_file, "rU"))
121 if out_positive_file != "-" and out_negative_file != "-":
122 print "Generating two FASTA files"
123 positive_writer = fastaWriter(open(out_positive_file, "w"))
124 negative_writer = fastaWriter(open(out_negative_file, "w"))
125 for record in reader:
126 #The [1:] is because the fastaReader leaves the > on the identifer.
127 if record.identifier and record.identifier.split()[0][1:] in ids:
128 positive_writer.write(record)
129 else:
130 negative_writer.write(record)
131 positive_writer.close()
132 negative_writer.close()
133 elif out_positive_file != "-":
134 print "Generating matching FASTA file"
135 positive_writer = fastaWriter(open(out_positive_file, "w"))
136 for record in reader:
137 #The [1:] is because the fastaReader leaves the > on the identifer.
138 if record.identifier and record.identifier.split()[0][1:] in ids:
139 positive_writer.write(record)
140 positive_writer.close()
141 elif out_negative_file != "-":
142 print "Generating non-matching FASTA file"
143 negative_writer = fastaWriter(open(out_negative_file, "w"))
144 for record in reader:
145 #The [1:] is because the fastaReader leaves the > on the identifer.
146 if not record.identifier or record.identifier.split()[0][1:] not in ids:
147 negative_writer.write(record)
148 negative_writer.close()
149 elif seq_format.lower().startswith("fastq"):
150 #Write filtered FASTQ file based on IDs from tabular file
151 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
152 reader = fastqReader(open(in_file, "rU"))
153 if out_positive_file != "-" and out_negative_file != "-":
154 print "Generating two FASTQ files"
155 positive_writer = fastqWriter(open(out_positive_file, "w"))
156 negative_writer = fastqWriter(open(out_negative_file, "w"))
157 for record in reader:
158 #The [1:] is because the fastaReader leaves the @ on the identifer.
159 if record.identifier and record.identifier.split()[0][1:] in ids:
160 positive_writer.write(record)
161 else:
162 negative_writer.write(record)
163 positive_writer.close()
164 negative_writer.close()
165 elif out_positive_file != "-":
166 print "Generating matching FASTQ file"
167 positive_writer = fastqWriter(open(out_positive_file, "w"))
168 for record in reader:
169 #The [1:] is because the fastaReader leaves the @ on the identifer.
170 if record.identifier and record.identifier.split()[0][1:] in ids:
171 positive_writer.write(record)
172 positive_writer.close()
173 elif out_negative_file != "-":
174 print "Generating non-matching FASTQ file"
175 negative_writer = fastqWriter(open(out_negative_file, "w"))
176 for record in reader:
177 #The [1:] is because the fastaReader leaves the @ on the identifer.
178 if not record.identifier or record.identifier.split()[0][1:] not in ids:
179 negative_writer.write(record)
180 negative_writer.close()
181 else:
182 stop_err("Unsupported file type %r" % seq_format)