Mercurial > repos > peterjc > seq_select_by_id
comparison tools/seq_select_by_id/seq_select_by_id.py @ 7:a5602454b0ad draft
v0.0.12 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda; Python 3 compatible print function
author | peterjc |
---|---|
date | Thu, 11 May 2017 06:26:05 -0400 |
parents | 91f55ee8fea5 |
children | 8e1a90917fa7 |
comparison
equal
deleted
inserted
replaced
6:91f55ee8fea5 | 7:a5602454b0ad |
---|---|
14 | 14 |
15 Cock et al 2009. Biopython: freely available Python tools for computational | 15 Cock et al 2009. Biopython: freely available Python tools for computational |
16 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | 16 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. |
17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | 17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. |
18 | 18 |
19 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK. | 19 This script is copyright 2011-2017 by Peter Cock, The James Hutton Institute UK. |
20 All rights reserved. See accompanying text file for licence details (MIT | 20 All rights reserved. See accompanying text file for licence details (MIT |
21 license). | 21 license). |
22 """ | 22 """ |
23 | |
24 from __future__ import print_function | |
25 | |
23 import sys | 26 import sys |
24 | 27 |
25 def sys_exit(msg, err=1): | |
26 sys.stderr.write(msg.rstrip() + "\n") | |
27 sys.exit(err) | |
28 | |
29 if "-v" in sys.argv or "--version" in sys.argv: | 28 if "-v" in sys.argv or "--version" in sys.argv: |
30 print "v0.0.9" | 29 print("v0.0.12") |
31 sys.exit(0) | 30 sys.exit(0) |
32 | 31 |
33 #Parse Command Line | 32 # Parse Command Line |
34 try: | 33 try: |
35 tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:] | 34 tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:] |
36 except ValueError: | 35 except ValueError: |
37 sys_exit("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) | 36 sys.exit("Expected five arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv))) |
38 try: | 37 try: |
39 if col_arg.startswith("c"): | 38 if col_arg.startswith("c"): |
40 column = int(col_arg[1:])-1 | 39 column = int(col_arg[1:]) - 1 |
41 else: | 40 else: |
42 column = int(col_arg)-1 | 41 column = int(col_arg) - 1 |
43 except ValueError: | 42 except ValueError: |
44 sys_exit("Expected column number, got %s" % col_arg) | 43 sys.exit("Expected column number, got %s" % col_arg) |
45 | 44 |
46 if seq_format == "fastqcssanger": | 45 if seq_format == "fastqcssanger": |
47 sys_exit("Colorspace FASTQ not supported.") | 46 sys.exit("Colorspace FASTQ not supported.") |
48 elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]: | 47 elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]: |
49 seq_format = seq_format.lower() | 48 seq_format = seq_format.lower() |
50 elif seq_format.lower().startswith("fastq"): | 49 elif seq_format.lower().startswith("fastq"): |
51 #We don't care how the qualities are encoded | 50 # We don't care how the qualities are encoded |
52 seq_format = "fastq" | 51 seq_format = "fastq" |
53 elif seq_format.lower().startswith("qual"): | 52 elif seq_format.lower().startswith("qual"): |
54 #We don't care what the scores are | 53 # We don't care what the scores are |
55 seq_format = "qual" | 54 seq_format = "qual" |
56 else: | 55 else: |
57 sys_exit("Unrecognised file format %r" % seq_format) | 56 sys.exit("Unrecognised file format %r" % seq_format) |
58 | 57 |
59 | 58 |
60 try: | 59 try: |
61 from Bio import SeqIO | 60 from Bio import SeqIO |
62 except ImportError: | 61 except ImportError: |
63 sys_exit("Biopython 1.54 or later is required") | 62 sys.exit("Biopython 1.54 or later is required") |
64 | 63 |
65 | 64 |
66 def parse_ids(tabular_file, col): | 65 def parse_ids(tabular_file, col): |
67 """Read tabular file and record all specified identifiers. | 66 """Read tabular file and record all specified identifiers. |
68 | 67 |
82 yield parts[0] | 81 yield parts[0] |
83 handle.close() | 82 handle.close() |
84 if warn: | 83 if warn: |
85 sys.stderr.write(warn) | 84 sys.stderr.write(warn) |
86 | 85 |
87 #Index the sequence file. | 86 |
88 #If very big, could use SeqIO.index_db() to avoid memory bottleneck... | 87 # Index the sequence file. |
88 # If very big, could use SeqIO.index_db() to avoid memory bottleneck... | |
89 records = SeqIO.index(in_file, seq_format) | 89 records = SeqIO.index(in_file, seq_format) |
90 print "Indexed %i sequences" % len(records) | 90 print("Indexed %i sequences" % len(records)) |
91 | 91 |
92 if seq_format.lower()=="sff": | 92 if seq_format.lower() == "sff": |
93 #Special case to try to preserve the XML manifest | 93 # Special case to try to preserve the XML manifest |
94 try: | 94 try: |
95 from Bio.SeqIO.SffIO import SffIterator, SffWriter | 95 from Bio.SeqIO.SffIO import SffWriter |
96 except ImportError: | 96 except ImportError: |
97 sys_exit("Requires Biopython 1.54 or later") | 97 sys.exit("Requires Biopython 1.54 or later") |
98 | 98 |
99 try: | 99 try: |
100 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | 100 from Bio.SeqIO.SffIO import ReadRocheXmlManifest |
101 except ImportError: | 101 except ImportError: |
102 #Prior to Biopython 1.56 this was a private function | 102 # Prior to Biopython 1.56 this was a private function |
103 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | 103 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest |
104 | 104 |
105 in_handle = open(in_file, "rb") #must be binary mode! | 105 in_handle = open(in_file, "rb") # must be binary mode! |
106 try: | 106 try: |
107 manifest = ReadRocheXmlManifest(in_handle) | 107 manifest = ReadRocheXmlManifest(in_handle) |
108 except ValueError: | 108 except ValueError: |
109 manifest = None | 109 manifest = None |
110 in_handle.close() | 110 in_handle.close() |
111 | 111 |
112 out_handle = open(out_file, "wb") | 112 out_handle = open(out_file, "wb") |
113 writer = SffWriter(out_handle, xml=manifest) | 113 writer = SffWriter(out_handle, xml=manifest) |
114 count = 0 | 114 count = 0 |
115 #This does have the overhead of parsing into SeqRecord objects, | 115 # This does have the overhead of parsing into SeqRecord objects, |
116 #but doing the header and index at the low level is too fidly. | 116 # but doing the header and index at the low level is too fidly. |
117 name = None # We want the variable to leak from the iterator's scope... | |
117 iterator = (records[name] for name in parse_ids(tabular_file, column)) | 118 iterator = (records[name] for name in parse_ids(tabular_file, column)) |
118 try: | 119 try: |
119 count = writer.write_file(iterator) | 120 count = writer.write_file(iterator) |
120 except KeyError, err: | 121 except KeyError, err: |
121 out_handle.close() | 122 out_handle.close() |
122 if name not in records: | 123 if name not in records: |
123 sys_exit("Identifier %r not found in sequence file" % name) | 124 sys.exit("Identifier %r not found in sequence file" % name) |
124 else: | 125 else: |
125 raise err | 126 raise err |
126 out_handle.close() | 127 out_handle.close() |
127 else: | 128 else: |
128 #Avoid overhead of parsing into SeqRecord objects, | 129 # Avoid overhead of parsing into SeqRecord objects, |
129 #just re-use the original formatting from the input file. | 130 # just re-use the original formatting from the input file. |
130 out_handle = open(out_file, "w") | 131 out_handle = open(out_file, "w") |
131 count = 0 | 132 count = 0 |
132 for name in parse_ids(tabular_file, column): | 133 for name in parse_ids(tabular_file, column): |
133 try: | 134 try: |
134 out_handle.write(records.get_raw(name)) | 135 out_handle.write(records.get_raw(name)) |
135 except KeyError: | 136 except KeyError: |
136 out_handle.close() | 137 out_handle.close() |
137 sys_exit("Identifier %r not found in sequence file" % name) | 138 sys.exit("Identifier %r not found in sequence file" % name) |
138 count += 1 | 139 count += 1 |
139 out_handle.close() | 140 out_handle.close() |
140 | 141 |
141 print "Selected %i sequences by ID" % count | 142 print("Selected %i sequences by ID" % count) |