Mercurial > repos > peterjc > seq_filter_by_id
comparison tools/filters/seq_filter_by_id.py @ 0:5844f6a450ed
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
author | peterjc |
---|---|
date | Tue, 07 Jun 2011 17:24:30 -0400 |
parents | |
children | 262f08104540 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5844f6a450ed |
---|---|
1 #!/usr/bin/env python | |
2 """Filter a FASTA, FASTQ or SSF file with IDs from a tabular file. | |
3 | |
4 Takes six command line options, tabular filename, ID column numbers (comma | |
5 separated list using one based counting), input filename, input type (e.g. | |
6 FASTA or SFF) and two output filenames (for records with and without the | |
7 given IDs, same format as input sequence file). | |
8 | |
9 If either output filename is just a minus sign, that file is not created. | |
10 This is intended to allow output for just the matched (or just the non-matched) | |
11 records. | |
12 | |
13 When filtering an SFF file, any Roche XML manifest in the input file is | |
14 preserved in both output files. | |
15 | |
16 Note in the default NCBI BLAST+ tabular output, the query sequence ID is | |
17 in column one, and the ID of the match from the database is in column two. | |
18 Here sensible values for the column numbers would therefore be "1" or "2". | |
19 | |
20 This tool is a short Python script which requires Biopython 1.54 or later | |
21 for SFF file support. If you use this tool in scientific work leading to a | |
22 publication, please cite the Biopython application note: | |
23 | |
24 Cock et al 2009. Biopython: freely available Python tools for computational | |
25 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
26 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
27 | |
28 This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved. | |
29 See accompanying text file for licence details (MIT/BSD style). | |
30 | |
31 This is version 0.0.1 of the script. | |
32 """ | |
33 import sys | |
34 | |
35 def stop_err(msg, err=1): | |
36 sys.stderr.write(msg.rstrip() + "\n") | |
37 sys.exit(err) | |
38 | |
39 #Parse Command Line | |
40 try: | |
41 tabular_file, cols_arg, in_file, seq_format, out_positive_file, out_negative_file = sys.argv[1:] | |
42 except ValueError: | |
43 stop_err("Expected six arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) | |
44 try: | |
45 columns = [int(arg)-1 for arg in cols_arg.split(",")] | |
46 except ValueError: | |
47 stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg) | |
48 | |
49 if out_positive_file == "-" and out_negative_file == "-": | |
50 stop_err("Neither output file requested") | |
51 | |
52 | |
53 #Read tabular file and record all specified identifiers | |
54 ids = set() | |
55 handle = open(tabular_file, "rU") | |
56 if len(columns)>1: | |
57 #General case of many columns | |
58 for line in handle: | |
59 if line.startswith("#"): | |
60 #Ignore comments | |
61 continue | |
62 parts = line.rstrip("\n").split("\t") | |
63 for col in columns: | |
64 ids.add(parts[col]) | |
65 print "Using %i IDs from %i columns of tabular file" % (len(ids), len(columns)) | |
66 else: | |
67 #Single column, special case speed up | |
68 col = columns[0] | |
69 for line in handle: | |
70 if not line.startswith("#"): | |
71 ids.add(line.rstrip("\n").split("\t")[col]) | |
72 print "Using %i IDs from tabular file" % (len(ids)) | |
73 handle.close() | |
74 | |
75 | |
76 if seq_format.lower()=="sff": | |
77 #Now write filtered SFF file based on IDs from BLAST file | |
78 try: | |
79 from Bio.SeqIO.SffIO import SffIterator, SffWriter | |
80 except ImportError: | |
81 stop_err("Requires Biopython 1.54 or later") | |
82 | |
83 try: | |
84 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | |
85 except ImportError: | |
86 #Prior to Biopython 1.56 this was a private function | |
87 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | |
88 in_handle = open(in_file, "rb") #must be binary mode! | |
89 try: | |
90 manifest = ReadRocheXmlManifest(in_handle) | |
91 except ValueError: | |
92 manifest = None | |
93 #This makes two passes though the SFF file with isn't so efficient, | |
94 #but this makes the code simple. | |
95 if out_positive_file != "-": | |
96 out_handle = open(out_positive_file, "wb") | |
97 writer = SffWriter(out_handle, xml=manifest) | |
98 in_handle.seek(0) #start again after getting manifest | |
99 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids) | |
100 out_handle.close() | |
101 if out_negative_file != "-": | |
102 out_handle = open(out_negative_file, "wb") | |
103 writer = SffWriter(out_handle, xml=manifest) | |
104 in_handle.seek(0) #start again | |
105 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id not in ids) | |
106 out_handle.close() | |
107 #And we're done | |
108 in_handle.close() | |
109 #At the time of writing, Galaxy doesn't show SFF file read counts, | |
110 #so it is useful to put them in stdout and thus shown in job info. | |
111 if out_positive_file != "-" and out_negative_file != "-": | |
112 print "%i with and %i without specified IDs" % (pos_count, neg_count) | |
113 elif out_positive_file != "-": | |
114 print "%i with specified IDs" % pos_count | |
115 elif out_negative_file != "-": | |
116 print "%i without specified IDs" % neg_count | |
117 elif seq_format.lower()=="fasta": | |
118 #Write filtered FASTA file based on IDs from tabular file | |
119 from galaxy_utils.sequence.fasta import fastaReader, fastaWriter | |
120 reader = fastaReader(open(in_file, "rU")) | |
121 if out_positive_file != "-" and out_negative_file != "-": | |
122 print "Generating two FASTA files" | |
123 positive_writer = fastaWriter(open(out_positive_file, "w")) | |
124 negative_writer = fastaWriter(open(out_negative_file, "w")) | |
125 for record in reader: | |
126 #The [1:] is because the fastaReader leaves the > on the identifer. | |
127 if record.identifier and record.identifier.split()[0][1:] in ids: | |
128 positive_writer.write(record) | |
129 else: | |
130 negative_writer.write(record) | |
131 positive_writer.close() | |
132 negative_writer.close() | |
133 elif out_positive_file != "-": | |
134 print "Generating matching FASTA file" | |
135 positive_writer = fastaWriter(open(out_positive_file, "w")) | |
136 for record in reader: | |
137 #The [1:] is because the fastaReader leaves the > on the identifer. | |
138 if record.identifier and record.identifier.split()[0][1:] in ids: | |
139 positive_writer.write(record) | |
140 positive_writer.close() | |
141 elif out_negative_file != "-": | |
142 print "Generating non-matching FASTA file" | |
143 negative_writer = fastaWriter(open(out_negative_file, "w")) | |
144 for record in reader: | |
145 #The [1:] is because the fastaReader leaves the > on the identifer. | |
146 if not record.identifier or record.identifier.split()[0][1:] not in ids: | |
147 negative_writer.write(record) | |
148 negative_writer.close() | |
149 elif seq_format.lower().startswith("fastq"): | |
150 #Write filtered FASTQ file based on IDs from tabular file | |
151 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | |
152 reader = fastqReader(open(in_file, "rU")) | |
153 if out_positive_file != "-" and out_negative_file != "-": | |
154 print "Generating two FASTQ files" | |
155 positive_writer = fastqWriter(open(out_positive_file, "w")) | |
156 negative_writer = fastqWriter(open(out_negative_file, "w")) | |
157 for record in reader: | |
158 #The [1:] is because the fastaReader leaves the @ on the identifer. | |
159 if record.identifier and record.identifier.split()[0][1:] in ids: | |
160 positive_writer.write(record) | |
161 else: | |
162 negative_writer.write(record) | |
163 positive_writer.close() | |
164 negative_writer.close() | |
165 elif out_positive_file != "-": | |
166 print "Generating matching FASTQ file" | |
167 positive_writer = fastqWriter(open(out_positive_file, "w")) | |
168 for record in reader: | |
169 #The [1:] is because the fastaReader leaves the @ on the identifer. | |
170 if record.identifier and record.identifier.split()[0][1:] in ids: | |
171 positive_writer.write(record) | |
172 positive_writer.close() | |
173 elif out_negative_file != "-": | |
174 print "Generating non-matching FASTQ file" | |
175 negative_writer = fastqWriter(open(out_negative_file, "w")) | |
176 for record in reader: | |
177 #The [1:] is because the fastaReader leaves the @ on the identifer. | |
178 if not record.identifier or record.identifier.split()[0][1:] not in ids: | |
179 negative_writer.write(record) | |
180 negative_writer.close() | |
181 else: | |
182 stop_err("Unsupported file type %r" % seq_format) |