Mercurial > repos > peterjc > seq_select_by_id
comparison tools/seq_select_by_id/seq_select_by_id.py @ 4:6842c0c7bc70 draft
Uploaded v0.0.7, depend on Biopython 1.62, tabs to spaces in XML
author | peterjc |
---|---|
date | Mon, 28 Oct 2013 05:21:45 -0400 |
parents | |
children | 91f55ee8fea5 |
comparison
equal
deleted
inserted
replaced
3:19e26966ed3e | 4:6842c0c7bc70 |
---|---|
1 #!/usr/bin/env python | |
2 """Select FASTA, QUAL, FASTQ or SSF sequences by IDs from a tabular file. | |
3 | |
4 Takes five command line options, tabular filename, ID column number (using | |
5 one based counting), input filename, input type (e.g. FASTA or SFF) and the | |
6 output filename (same format as input sequence file). | |
7 | |
8 When selecting from an SFF file, any Roche XML manifest in the input file is | |
9 preserved in both output files. | |
10 | |
11 This tool is a short Python script which requires Biopython 1.54 or later | |
12 for SFF file support. If you use this tool in scientific work leading to a | |
13 publication, please cite the Biopython application note: | |
14 | |
15 Cock et al 2009. Biopython: freely available Python tools for computational | |
16 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
18 | |
19 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK. | |
20 All rights reserved. See accompanying text file for licence details (MIT | |
21 license). | |
22 | |
23 This is version 0.0.6 of the script. | |
24 """ | |
25 import sys | |
26 | |
27 def stop_err(msg, err=1): | |
28 sys.stderr.write(msg.rstrip() + "\n") | |
29 sys.exit(err) | |
30 | |
31 if "-v" in sys.argv or "--version" in sys.argv: | |
32 print "v0.0.6" | |
33 sys.exit(0) | |
34 | |
35 #Parse Command Line | |
36 try: | |
37 tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:] | |
38 except ValueError: | |
39 stop_err("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) | |
40 try: | |
41 if col_arg.startswith("c"): | |
42 column = int(col_arg[1:])-1 | |
43 else: | |
44 column = int(col_arg)-1 | |
45 except ValueError: | |
46 stop_err("Expected column number, got %s" % col_arg) | |
47 | |
48 if seq_format == "fastqcssanger": | |
49 stop_err("Colorspace FASTQ not supported.") | |
50 elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]: | |
51 seq_format = seq_format.lower() | |
52 elif seq_format.lower().startswith("fastq"): | |
53 #We don't care how the qualities are encoded | |
54 seq_format = "fastq" | |
55 elif seq_format.lower().startswith("qual"): | |
56 #We don't care what the scores are | |
57 seq_format = "qual" | |
58 else: | |
59 stop_err("Unrecognised file format %r" % seq_format) | |
60 | |
61 | |
62 try: | |
63 from Bio import SeqIO | |
64 except ImportError: | |
65 stop_err("Biopython 1.54 or later is required") | |
66 | |
67 | |
68 def parse_ids(tabular_file, col): | |
69 """Read tabular file and record all specified identifiers.""" | |
70 handle = open(tabular_file, "rU") | |
71 for line in handle: | |
72 if line.strip() and not line.startswith("#"): | |
73 yield line.rstrip("\n").split("\t")[col].strip() | |
74 handle.close() | |
75 | |
76 #Index the sequence file. | |
77 #If very big, could use SeqIO.index_db() to avoid memory bottleneck... | |
78 records = SeqIO.index(in_file, seq_format) | |
79 print "Indexed %i sequences" % len(records) | |
80 | |
81 if seq_format.lower()=="sff": | |
82 #Special case to try to preserve the XML manifest | |
83 try: | |
84 from Bio.SeqIO.SffIO import SffIterator, SffWriter | |
85 except ImportError: | |
86 stop_err("Requires Biopython 1.54 or later") | |
87 | |
88 try: | |
89 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | |
90 except ImportError: | |
91 #Prior to Biopython 1.56 this was a private function | |
92 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | |
93 | |
94 in_handle = open(in_file, "rb") #must be binary mode! | |
95 try: | |
96 manifest = ReadRocheXmlManifest(in_handle) | |
97 except ValueError: | |
98 manifest = None | |
99 in_handle.close() | |
100 | |
101 out_handle = open(out_file, "wb") | |
102 writer = SffWriter(out_handle, xml=manifest) | |
103 count = 0 | |
104 #This does have the overhead of parsing into SeqRecord objects, | |
105 #but doing the header and index at the low level is too fidly. | |
106 iterator = (records[name] for name in parse_ids(tabular_file, column)) | |
107 try: | |
108 count = writer.write_file(iterator) | |
109 except KeyError, err: | |
110 out_handle.close() | |
111 if name not in records: | |
112 stop_err("Identifier %r not found in sequence file" % name) | |
113 else: | |
114 raise err | |
115 out_handle.close() | |
116 else: | |
117 #Avoid overhead of parsing into SeqRecord objects, | |
118 #just re-use the original formatting from the input file. | |
119 out_handle = open(out_file, "w") | |
120 count = 0 | |
121 for name in parse_ids(tabular_file, column): | |
122 try: | |
123 out_handle.write(records.get_raw(name)) | |
124 except KeyError: | |
125 out_handle.close() | |
126 stop_err("Identifier %r not found in sequence file" % name) | |
127 count += 1 | |
128 out_handle.close() | |
129 | |
130 print "Selected %i sequences by ID" % count |