comparison tools/filters/seq_select_by_id.py @ 0:838b9bebfa3c

Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 17:43:38 -0400
parents
children 50a8a6917a9c
comparison
equal deleted inserted replaced
-1:000000000000 0:838b9bebfa3c
1 #!/usr/bin/env python
2 """Select FASTA, QUAL, FASTQ or SSF sequences by IDs from a tabular file.
3
4 Takes five command line options, tabular filename, ID column number (using
5 one based counting), input filename, input type (e.g. FASTA or SFF) and the
6 output filename (same format as input sequence file).
7
8 When selecting from an SFF file, any Roche XML manifest in the input file is
9 preserved in both output files.
10
11 This tool is a short Python script which requires Biopython 1.54 or later
12 for SFF file support. If you use this tool in scientific work leading to a
13 publication, please cite the Biopython application note:
14
15 Cock et al 2009. Biopython: freely available Python tools for computational
16 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
18
19 This script is copyright 2011 by Peter Cock, The James Hutton Institute UK.
20 All rights reserved. See accompanying text file for licence details (MIT/BSD
21 style).
22
23 This is version 0.0.1 of the script.
24 """
25 import sys
26
27 def stop_err(msg, err=1):
28 sys.stderr.write(msg.rstrip() + "\n")
29 sys.exit(err)
30
31 #Parse Command Line
32 try:
33 tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:]
34 except ValueError:
35 stop_err("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
36 try:
37 if col_arg.startswith("c"):
38 column = int(col_arg[1:])-1
39 else:
40 column = int(col_arg)-1
41 except ValueError:
42 stop_err("Expected column number, got %s" % cols_arg)
43
44 if seq_format == "fastqcssanger":
45 stop_err("Colorspace FASTQ not supported.")
46 elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]:
47 seq_format = seq_format.lower()
48 elif seq_format.lower().startswith("fastq"):
49 #We don't care how the qualities are encoded
50 seq_format = "fastq"
51 elif seq_format.lower().startswith("qual"):
52 #We don't care what the scores are
53 seq_format = "qual"
54 else:
55 stop_err("Unrecognised file format %r" % seq_format)
56
57
58 try:
59 from Bio import SeqIO
60 except ImportError:
61 stop_err("Biopython 1.54 or later is required")
62
63
64 def parse_ids(tabular_file, col):
65 """Read tabular file and record all specified identifiers."""
66 handle = open(tabular_file, "rU")
67 for line in handle:
68 if not line.startswith("#"):
69 yield line.rstrip("\n").split("\t")[col].strip()
70 handle.close()
71
72 #Index the sequence file.
73 #If very big, could use SeqIO.index_db() to avoid memory bottleneck...
74 records = SeqIO.index(in_file, seq_format)
75 print "Indexed %i sequences" % len(records)
76
77 if seq_format.lower()=="sff":
78 #Special case to try to preserve the XML manifest
79 try:
80 from Bio.SeqIO.SffIO import SffIterator, SffWriter
81 except ImportError:
82 stop_err("Requires Biopython 1.54 or later")
83
84 try:
85 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
86 except ImportError:
87 #Prior to Biopython 1.56 this was a private function
88 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
89
90 in_handle = open(in_file, "rb") #must be binary mode!
91 try:
92 manifest = ReadRocheXmlManifest(in_handle)
93 except ValueError:
94 manifest = None
95 in_handle.close()
96
97 out_handle = open(out_file, "wb")
98 writer = SffWriter(out_handle, xml=manifest)
99 count = 0
100 #This does have the overhead of parsing into SeqRecord objects,
101 #but doing the header and index at the low level is too fidly.
102 iterator = (records[name] for name in parse_ids(tabular_file, column))
103 try:
104 count = writer.write_file(iterator)
105 except KeyError, err:
106 out_handle.close()
107 if name not in records:
108 stop_err("Identifier %s not found in sequence file" % name)
109 else:
110 raise err
111 out_handle.close()
112 else:
113 #Avoid overhead of parsing into SeqRecord objects,
114 #just re-use the original formatting from the input file.
115 out_handle = open(out_file, "w")
116 count = 0
117 for name in parse_ids(tabular_file, column):
118 try:
119 out_handle.write(records.get_raw(name))
120 except KeyError:
121 out_handle.close()
122 stop_err("Identifier %s not found in sequence file" % name)
123 count += 1
124 out_handle.close()
125
126 print "Selected %i sequences by ID" % count