Mercurial > repos > peterjc > seq_select_by_id
comparison tools/seq_select_by_id/seq_select_by_id.py @ 6:91f55ee8fea5 draft
v0.0.11; more tests and assorting minor changes
author | peterjc |
---|---|
date | Wed, 13 May 2015 10:56:29 -0400 |
parents | 6842c0c7bc70 |
children | a5602454b0ad |
comparison
equal
deleted
inserted
replaced
5:1a83f5ab9e95 | 6:91f55ee8fea5 |
---|---|
17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | 17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. |
18 | 18 |
19 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK. | 19 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK. |
20 All rights reserved. See accompanying text file for licence details (MIT | 20 All rights reserved. See accompanying text file for licence details (MIT |
21 license). | 21 license). |
22 | |
23 This is version 0.0.6 of the script. | |
24 """ | 22 """ |
25 import sys | 23 import sys |
26 | 24 |
27 def stop_err(msg, err=1): | 25 def sys_exit(msg, err=1): |
28 sys.stderr.write(msg.rstrip() + "\n") | 26 sys.stderr.write(msg.rstrip() + "\n") |
29 sys.exit(err) | 27 sys.exit(err) |
30 | 28 |
31 if "-v" in sys.argv or "--version" in sys.argv: | 29 if "-v" in sys.argv or "--version" in sys.argv: |
32 print "v0.0.6" | 30 print "v0.0.9" |
33 sys.exit(0) | 31 sys.exit(0) |
34 | 32 |
35 #Parse Command Line | 33 #Parse Command Line |
36 try: | 34 try: |
37 tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:] | 35 tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:] |
38 except ValueError: | 36 except ValueError: |
39 stop_err("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) | 37 sys_exit("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) |
40 try: | 38 try: |
41 if col_arg.startswith("c"): | 39 if col_arg.startswith("c"): |
42 column = int(col_arg[1:])-1 | 40 column = int(col_arg[1:])-1 |
43 else: | 41 else: |
44 column = int(col_arg)-1 | 42 column = int(col_arg)-1 |
45 except ValueError: | 43 except ValueError: |
46 stop_err("Expected column number, got %s" % col_arg) | 44 sys_exit("Expected column number, got %s" % col_arg) |
47 | 45 |
48 if seq_format == "fastqcssanger": | 46 if seq_format == "fastqcssanger": |
49 stop_err("Colorspace FASTQ not supported.") | 47 sys_exit("Colorspace FASTQ not supported.") |
50 elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]: | 48 elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]: |
51 seq_format = seq_format.lower() | 49 seq_format = seq_format.lower() |
52 elif seq_format.lower().startswith("fastq"): | 50 elif seq_format.lower().startswith("fastq"): |
53 #We don't care how the qualities are encoded | 51 #We don't care how the qualities are encoded |
54 seq_format = "fastq" | 52 seq_format = "fastq" |
55 elif seq_format.lower().startswith("qual"): | 53 elif seq_format.lower().startswith("qual"): |
56 #We don't care what the scores are | 54 #We don't care what the scores are |
57 seq_format = "qual" | 55 seq_format = "qual" |
58 else: | 56 else: |
59 stop_err("Unrecognised file format %r" % seq_format) | 57 sys_exit("Unrecognised file format %r" % seq_format) |
60 | 58 |
61 | 59 |
62 try: | 60 try: |
63 from Bio import SeqIO | 61 from Bio import SeqIO |
64 except ImportError: | 62 except ImportError: |
65 stop_err("Biopython 1.54 or later is required") | 63 sys_exit("Biopython 1.54 or later is required") |
66 | 64 |
67 | 65 |
68 def parse_ids(tabular_file, col): | 66 def parse_ids(tabular_file, col): |
69 """Read tabular file and record all specified identifiers.""" | 67 """Read tabular file and record all specified identifiers. |
68 | |
69 Will print a single warning to stderr if any of the fields have | |
70 non-trailing white space (only the first word will be used as | |
71 the identifier). | |
72 """ | |
70 handle = open(tabular_file, "rU") | 73 handle = open(tabular_file, "rU") |
74 warn = False | |
71 for line in handle: | 75 for line in handle: |
72 if line.strip() and not line.startswith("#"): | 76 if line.strip() and not line.startswith("#"): |
73 yield line.rstrip("\n").split("\t")[col].strip() | 77 field = line.rstrip("\n").split("\t")[col].strip() |
78 parts = field.split(None, 1) | |
79 if len(parts) > 1 and not warn: | |
80 warn = "WARNING: Some of your identifiers had white space in them, " + \ | |
81 "using first word only. e.g.:\n%s\n" % field | |
82 yield parts[0] | |
74 handle.close() | 83 handle.close() |
84 if warn: | |
85 sys.stderr.write(warn) | |
75 | 86 |
76 #Index the sequence file. | 87 #Index the sequence file. |
77 #If very big, could use SeqIO.index_db() to avoid memory bottleneck... | 88 #If very big, could use SeqIO.index_db() to avoid memory bottleneck... |
78 records = SeqIO.index(in_file, seq_format) | 89 records = SeqIO.index(in_file, seq_format) |
79 print "Indexed %i sequences" % len(records) | 90 print "Indexed %i sequences" % len(records) |
81 if seq_format.lower()=="sff": | 92 if seq_format.lower()=="sff": |
82 #Special case to try to preserve the XML manifest | 93 #Special case to try to preserve the XML manifest |
83 try: | 94 try: |
84 from Bio.SeqIO.SffIO import SffIterator, SffWriter | 95 from Bio.SeqIO.SffIO import SffIterator, SffWriter |
85 except ImportError: | 96 except ImportError: |
86 stop_err("Requires Biopython 1.54 or later") | 97 sys_exit("Requires Biopython 1.54 or later") |
87 | 98 |
88 try: | 99 try: |
89 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | 100 from Bio.SeqIO.SffIO import ReadRocheXmlManifest |
90 except ImportError: | 101 except ImportError: |
91 #Prior to Biopython 1.56 this was a private function | 102 #Prior to Biopython 1.56 this was a private function |
107 try: | 118 try: |
108 count = writer.write_file(iterator) | 119 count = writer.write_file(iterator) |
109 except KeyError, err: | 120 except KeyError, err: |
110 out_handle.close() | 121 out_handle.close() |
111 if name not in records: | 122 if name not in records: |
112 stop_err("Identifier %r not found in sequence file" % name) | 123 sys_exit("Identifier %r not found in sequence file" % name) |
113 else: | 124 else: |
114 raise err | 125 raise err |
115 out_handle.close() | 126 out_handle.close() |
116 else: | 127 else: |
117 #Avoid overhead of parsing into SeqRecord objects, | 128 #Avoid overhead of parsing into SeqRecord objects, |
121 for name in parse_ids(tabular_file, column): | 132 for name in parse_ids(tabular_file, column): |
122 try: | 133 try: |
123 out_handle.write(records.get_raw(name)) | 134 out_handle.write(records.get_raw(name)) |
124 except KeyError: | 135 except KeyError: |
125 out_handle.close() | 136 out_handle.close() |
126 stop_err("Identifier %r not found in sequence file" % name) | 137 sys_exit("Identifier %r not found in sequence file" % name) |
127 count += 1 | 138 count += 1 |
128 out_handle.close() | 139 out_handle.close() |
129 | 140 |
130 print "Selected %i sequences by ID" % count | 141 print "Selected %i sequences by ID" % count |