comparison tools/seq_select_by_id/seq_select_by_id.py @ 7:a5602454b0ad draft

v0.0.12 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda; Python 3 compatible print function
author peterjc
date Thu, 11 May 2017 06:26:05 -0400
parents 91f55ee8fea5
children 8e1a90917fa7
comparison
equal deleted inserted replaced
6:91f55ee8fea5 7:a5602454b0ad
14 14
15 Cock et al 2009. Biopython: freely available Python tools for computational 15 Cock et al 2009. Biopython: freely available Python tools for computational
16 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. 16 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. 17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
18 18
19 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK. 19 This script is copyright 2011-2017 by Peter Cock, The James Hutton Institute UK.
20 All rights reserved. See accompanying text file for licence details (MIT 20 All rights reserved. See accompanying text file for licence details (MIT
21 license). 21 license).
22 """ 22 """
23
24 from __future__ import print_function
25
23 import sys 26 import sys
24 27
25 def sys_exit(msg, err=1):
26 sys.stderr.write(msg.rstrip() + "\n")
27 sys.exit(err)
28
29 if "-v" in sys.argv or "--version" in sys.argv: 28 if "-v" in sys.argv or "--version" in sys.argv:
30 print "v0.0.9" 29 print("v0.0.12")
31 sys.exit(0) 30 sys.exit(0)
32 31
33 #Parse Command Line 32 # Parse Command Line
34 try: 33 try:
35 tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:] 34 tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:]
36 except ValueError: 35 except ValueError:
37 sys_exit("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) 36 sys.exit("Expected five arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
38 try: 37 try:
39 if col_arg.startswith("c"): 38 if col_arg.startswith("c"):
40 column = int(col_arg[1:])-1 39 column = int(col_arg[1:]) - 1
41 else: 40 else:
42 column = int(col_arg)-1 41 column = int(col_arg) - 1
43 except ValueError: 42 except ValueError:
44 sys_exit("Expected column number, got %s" % col_arg) 43 sys.exit("Expected column number, got %s" % col_arg)
45 44
46 if seq_format == "fastqcssanger": 45 if seq_format == "fastqcssanger":
47 sys_exit("Colorspace FASTQ not supported.") 46 sys.exit("Colorspace FASTQ not supported.")
48 elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]: 47 elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]:
49 seq_format = seq_format.lower() 48 seq_format = seq_format.lower()
50 elif seq_format.lower().startswith("fastq"): 49 elif seq_format.lower().startswith("fastq"):
51 #We don't care how the qualities are encoded 50 # We don't care how the qualities are encoded
52 seq_format = "fastq" 51 seq_format = "fastq"
53 elif seq_format.lower().startswith("qual"): 52 elif seq_format.lower().startswith("qual"):
54 #We don't care what the scores are 53 # We don't care what the scores are
55 seq_format = "qual" 54 seq_format = "qual"
56 else: 55 else:
57 sys_exit("Unrecognised file format %r" % seq_format) 56 sys.exit("Unrecognised file format %r" % seq_format)
58 57
59 58
60 try: 59 try:
61 from Bio import SeqIO 60 from Bio import SeqIO
62 except ImportError: 61 except ImportError:
63 sys_exit("Biopython 1.54 or later is required") 62 sys.exit("Biopython 1.54 or later is required")
64 63
65 64
66 def parse_ids(tabular_file, col): 65 def parse_ids(tabular_file, col):
67 """Read tabular file and record all specified identifiers. 66 """Read tabular file and record all specified identifiers.
68 67
82 yield parts[0] 81 yield parts[0]
83 handle.close() 82 handle.close()
84 if warn: 83 if warn:
85 sys.stderr.write(warn) 84 sys.stderr.write(warn)
86 85
87 #Index the sequence file. 86
88 #If very big, could use SeqIO.index_db() to avoid memory bottleneck... 87 # Index the sequence file.
88 # If very big, could use SeqIO.index_db() to avoid memory bottleneck...
89 records = SeqIO.index(in_file, seq_format) 89 records = SeqIO.index(in_file, seq_format)
90 print "Indexed %i sequences" % len(records) 90 print("Indexed %i sequences" % len(records))
91 91
92 if seq_format.lower()=="sff": 92 if seq_format.lower() == "sff":
93 #Special case to try to preserve the XML manifest 93 # Special case to try to preserve the XML manifest
94 try: 94 try:
95 from Bio.SeqIO.SffIO import SffIterator, SffWriter 95 from Bio.SeqIO.SffIO import SffWriter
96 except ImportError: 96 except ImportError:
97 sys_exit("Requires Biopython 1.54 or later") 97 sys.exit("Requires Biopython 1.54 or later")
98 98
99 try: 99 try:
100 from Bio.SeqIO.SffIO import ReadRocheXmlManifest 100 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
101 except ImportError: 101 except ImportError:
102 #Prior to Biopython 1.56 this was a private function 102 # Prior to Biopython 1.56 this was a private function
103 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest 103 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
104 104
105 in_handle = open(in_file, "rb") #must be binary mode! 105 in_handle = open(in_file, "rb") # must be binary mode!
106 try: 106 try:
107 manifest = ReadRocheXmlManifest(in_handle) 107 manifest = ReadRocheXmlManifest(in_handle)
108 except ValueError: 108 except ValueError:
109 manifest = None 109 manifest = None
110 in_handle.close() 110 in_handle.close()
111 111
112 out_handle = open(out_file, "wb") 112 out_handle = open(out_file, "wb")
113 writer = SffWriter(out_handle, xml=manifest) 113 writer = SffWriter(out_handle, xml=manifest)
114 count = 0 114 count = 0
115 #This does have the overhead of parsing into SeqRecord objects, 115 # This does have the overhead of parsing into SeqRecord objects,
116 #but doing the header and index at the low level is too fidly. 116 # but doing the header and index at the low level is too fidly.
117 name = None # We want the variable to leak from the iterator's scope...
117 iterator = (records[name] for name in parse_ids(tabular_file, column)) 118 iterator = (records[name] for name in parse_ids(tabular_file, column))
118 try: 119 try:
119 count = writer.write_file(iterator) 120 count = writer.write_file(iterator)
120 except KeyError, err: 121 except KeyError, err:
121 out_handle.close() 122 out_handle.close()
122 if name not in records: 123 if name not in records:
123 sys_exit("Identifier %r not found in sequence file" % name) 124 sys.exit("Identifier %r not found in sequence file" % name)
124 else: 125 else:
125 raise err 126 raise err
126 out_handle.close() 127 out_handle.close()
127 else: 128 else:
128 #Avoid overhead of parsing into SeqRecord objects, 129 # Avoid overhead of parsing into SeqRecord objects,
129 #just re-use the original formatting from the input file. 130 # just re-use the original formatting from the input file.
130 out_handle = open(out_file, "w") 131 out_handle = open(out_file, "w")
131 count = 0 132 count = 0
132 for name in parse_ids(tabular_file, column): 133 for name in parse_ids(tabular_file, column):
133 try: 134 try:
134 out_handle.write(records.get_raw(name)) 135 out_handle.write(records.get_raw(name))
135 except KeyError: 136 except KeyError:
136 out_handle.close() 137 out_handle.close()
137 sys_exit("Identifier %r not found in sequence file" % name) 138 sys.exit("Identifier %r not found in sequence file" % name)
138 count += 1 139 count += 1
139 out_handle.close() 140 out_handle.close()
140 141
141 print "Selected %i sequences by ID" % count 142 print("Selected %i sequences by ID" % count)