comparison tools/seq_select_by_id/seq_select_by_id.py @ 4:6842c0c7bc70 draft

Uploaded v0.0.7, depend on Biopython 1.62, tabs to spaces in XML
author peterjc
date Mon, 28 Oct 2013 05:21:45 -0400
parents
children 91f55ee8fea5
comparison
equal deleted inserted replaced
3:19e26966ed3e 4:6842c0c7bc70
1 #!/usr/bin/env python
2 """Select FASTA, QUAL, FASTQ or SSF sequences by IDs from a tabular file.
3
4 Takes five command line options, tabular filename, ID column number (using
5 one based counting), input filename, input type (e.g. FASTA or SFF) and the
6 output filename (same format as input sequence file).
7
8 When selecting from an SFF file, any Roche XML manifest in the input file is
9 preserved in both output files.
10
11 This tool is a short Python script which requires Biopython 1.54 or later
12 for SFF file support. If you use this tool in scientific work leading to a
13 publication, please cite the Biopython application note:
14
15 Cock et al 2009. Biopython: freely available Python tools for computational
16 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
18
19 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK.
20 All rights reserved. See accompanying text file for licence details (MIT
21 license).
22
23 This is version 0.0.6 of the script.
24 """
25 import sys
26
27 def stop_err(msg, err=1):
28 sys.stderr.write(msg.rstrip() + "\n")
29 sys.exit(err)
30
31 if "-v" in sys.argv or "--version" in sys.argv:
32 print "v0.0.6"
33 sys.exit(0)
34
35 #Parse Command Line
36 try:
37 tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:]
38 except ValueError:
39 stop_err("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
40 try:
41 if col_arg.startswith("c"):
42 column = int(col_arg[1:])-1
43 else:
44 column = int(col_arg)-1
45 except ValueError:
46 stop_err("Expected column number, got %s" % col_arg)
47
48 if seq_format == "fastqcssanger":
49 stop_err("Colorspace FASTQ not supported.")
50 elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]:
51 seq_format = seq_format.lower()
52 elif seq_format.lower().startswith("fastq"):
53 #We don't care how the qualities are encoded
54 seq_format = "fastq"
55 elif seq_format.lower().startswith("qual"):
56 #We don't care what the scores are
57 seq_format = "qual"
58 else:
59 stop_err("Unrecognised file format %r" % seq_format)
60
61
62 try:
63 from Bio import SeqIO
64 except ImportError:
65 stop_err("Biopython 1.54 or later is required")
66
67
68 def parse_ids(tabular_file, col):
69 """Read tabular file and record all specified identifiers."""
70 handle = open(tabular_file, "rU")
71 for line in handle:
72 if line.strip() and not line.startswith("#"):
73 yield line.rstrip("\n").split("\t")[col].strip()
74 handle.close()
75
76 #Index the sequence file.
77 #If very big, could use SeqIO.index_db() to avoid memory bottleneck...
78 records = SeqIO.index(in_file, seq_format)
79 print "Indexed %i sequences" % len(records)
80
81 if seq_format.lower()=="sff":
82 #Special case to try to preserve the XML manifest
83 try:
84 from Bio.SeqIO.SffIO import SffIterator, SffWriter
85 except ImportError:
86 stop_err("Requires Biopython 1.54 or later")
87
88 try:
89 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
90 except ImportError:
91 #Prior to Biopython 1.56 this was a private function
92 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
93
94 in_handle = open(in_file, "rb") #must be binary mode!
95 try:
96 manifest = ReadRocheXmlManifest(in_handle)
97 except ValueError:
98 manifest = None
99 in_handle.close()
100
101 out_handle = open(out_file, "wb")
102 writer = SffWriter(out_handle, xml=manifest)
103 count = 0
104 #This does have the overhead of parsing into SeqRecord objects,
105 #but doing the header and index at the low level is too fidly.
106 iterator = (records[name] for name in parse_ids(tabular_file, column))
107 try:
108 count = writer.write_file(iterator)
109 except KeyError, err:
110 out_handle.close()
111 if name not in records:
112 stop_err("Identifier %r not found in sequence file" % name)
113 else:
114 raise err
115 out_handle.close()
116 else:
117 #Avoid overhead of parsing into SeqRecord objects,
118 #just re-use the original formatting from the input file.
119 out_handle = open(out_file, "w")
120 count = 0
121 for name in parse_ids(tabular_file, column):
122 try:
123 out_handle.write(records.get_raw(name))
124 except KeyError:
125 out_handle.close()
126 stop_err("Identifier %r not found in sequence file" % name)
127 count += 1
128 out_handle.close()
129
130 print "Selected %i sequences by ID" % count