Mercurial > repos > peterjc > fasta_filter_by_id
comparison tools/fasta_tools/fasta_filter_by_id.py @ 3:812383b5d3b8 draft default tip
v0.0.5 - galaxy_sequence_utils dependency and other cleanups inc using MIT license
author | peterjc |
---|---|
date | Fri, 03 Feb 2017 05:32:34 -0500 |
parents | 5b552b3005f2 |
children |
comparison
equal
deleted
inserted
replaced
2:5b552b3005f2 | 3:812383b5d3b8 |
---|---|
1 #!/usr/bin/env python | |
2 """Filter a FASTA file with IDs from a tabular file, e.g. from BLAST. | |
3 | |
4 Takes five command line options, tabular filename, ID column numbers | |
5 (comma separated list using one based counting), input FASTA filename, and | |
6 two output FASTA filenames (for records with and without the given IDs). | |
7 | |
8 If either output filename is just a minus sign, that file is not created. | |
9 This is intended to allow output for just the matched (or just the non-matched) | |
10 records. | |
11 | |
12 Note in the default NCBI BLAST+ tabular output, the query sequence ID is | |
13 in column one, and the ID of the match from the database is in column two. | |
14 Here sensible values for the column numbers would therefore be "1" or "2". | |
15 | |
16 This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved. | |
17 See accompanying text file for licence details (MIT/BSD style). | |
18 | |
19 This is version 0.0.3 of the script. | |
20 """ | |
21 import sys | |
22 from galaxy_utils.sequence.fasta import fastaReader, fastaWriter | |
23 | |
24 def stop_err( msg ): | |
25 sys.stderr.write( msg ) | |
26 sys.exit() | |
27 | |
28 #Parse Command Line | |
29 try: | |
30 tabular_file, cols_arg, in_file, out_positive_file, out_negative_file = sys.argv[1:] | |
31 except ValueError: | |
32 stop_err("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) | |
33 try: | |
34 columns = [int(arg)-1 for arg in cols_arg.split(",")] | |
35 except ValueError: | |
36 stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg) | |
37 | |
38 #Read tabular file and record all specified identifiers | |
39 ids = set() | |
40 handle = open(tabular_file, "rU") | |
41 if len(columns)>1: | |
42 #General case of many columns | |
43 for line in handle: | |
44 if line.startswith("#"): | |
45 #Ignore comments | |
46 continue | |
47 parts = line.rstrip("\n").split("\t") | |
48 for col in columns: | |
49 ids.add(parts[col]) | |
50 print "Using %i IDs from %i columns of tabular file" % (len(ids), len(columns)) | |
51 else: | |
52 #Single column, special case speed up | |
53 col = columns[0] | |
54 for line in handle: | |
55 if not line.startswith("#"): | |
56 ids.add(line.rstrip("\n").split("\t")[col]) | |
57 print "Using %i IDs from tabular file" % (len(ids)) | |
58 handle.close() | |
59 | |
60 #Write filtered FASTA file based on IDs from tabular file | |
61 reader = fastaReader(open(in_file, "rU")) | |
62 if out_positive_file != "-" and out_negative_file != "-": | |
63 print "Generating two FASTA files" | |
64 positive_writer = fastaWriter(open(out_positive_file, "w")) | |
65 negative_writer = fastaWriter(open(out_negative_file, "w")) | |
66 for record in reader: | |
67 #The [1:] is because the fastaReader leaves the > on the identifer. | |
68 if record.identifier and record.identifier.split()[0][1:] in ids: | |
69 positive_writer.write(record) | |
70 else: | |
71 negative_writer.write(record) | |
72 positive_writer.close() | |
73 negative_writer.close() | |
74 elif out_positive_file != "-": | |
75 print "Generating matching FASTA file" | |
76 positive_writer = fastaWriter(open(out_positive_file, "w")) | |
77 for record in reader: | |
78 #The [1:] is because the fastaReader leaves the > on the identifer. | |
79 if record.identifier and record.identifier.split()[0][1:] in ids: | |
80 positive_writer.write(record) | |
81 positive_writer.close() | |
82 elif out_negative_file != "-": | |
83 print "Generating non-matching FASTA file" | |
84 negative_writer = fastaWriter(open(out_negative_file, "w")) | |
85 for record in reader: | |
86 #The [1:] is because the fastaReader leaves the > on the identifer. | |
87 if not record.identifier or record.identifier.split()[0][1:] not in ids: | |
88 negative_writer.write(record) | |
89 negative_writer.close() | |
90 else: | |
91 stop_err("Neither output file requested") | |
92 reader.close() |