comparison tools/seq_select_by_id/seq_select_by_id.py @ 6:91f55ee8fea5 draft

v0.0.11; more tests and assorting minor changes
author peterjc
date Wed, 13 May 2015 10:56:29 -0400
parents 6842c0c7bc70
children a5602454b0ad
comparison
equal deleted inserted replaced
5:1a83f5ab9e95 6:91f55ee8fea5
17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. 17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
18 18
19 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK. 19 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK.
20 All rights reserved. See accompanying text file for licence details (MIT 20 All rights reserved. See accompanying text file for licence details (MIT
21 license). 21 license).
22
23 This is version 0.0.6 of the script.
24 """ 22 """
25 import sys 23 import sys
26 24
27 def stop_err(msg, err=1): 25 def sys_exit(msg, err=1):
28 sys.stderr.write(msg.rstrip() + "\n") 26 sys.stderr.write(msg.rstrip() + "\n")
29 sys.exit(err) 27 sys.exit(err)
30 28
31 if "-v" in sys.argv or "--version" in sys.argv: 29 if "-v" in sys.argv or "--version" in sys.argv:
32 print "v0.0.6" 30 print "v0.0.9"
33 sys.exit(0) 31 sys.exit(0)
34 32
35 #Parse Command Line 33 #Parse Command Line
36 try: 34 try:
37 tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:] 35 tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:]
38 except ValueError: 36 except ValueError:
39 stop_err("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) 37 sys_exit("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
40 try: 38 try:
41 if col_arg.startswith("c"): 39 if col_arg.startswith("c"):
42 column = int(col_arg[1:])-1 40 column = int(col_arg[1:])-1
43 else: 41 else:
44 column = int(col_arg)-1 42 column = int(col_arg)-1
45 except ValueError: 43 except ValueError:
46 stop_err("Expected column number, got %s" % col_arg) 44 sys_exit("Expected column number, got %s" % col_arg)
47 45
48 if seq_format == "fastqcssanger": 46 if seq_format == "fastqcssanger":
49 stop_err("Colorspace FASTQ not supported.") 47 sys_exit("Colorspace FASTQ not supported.")
50 elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]: 48 elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]:
51 seq_format = seq_format.lower() 49 seq_format = seq_format.lower()
52 elif seq_format.lower().startswith("fastq"): 50 elif seq_format.lower().startswith("fastq"):
53 #We don't care how the qualities are encoded 51 #We don't care how the qualities are encoded
54 seq_format = "fastq" 52 seq_format = "fastq"
55 elif seq_format.lower().startswith("qual"): 53 elif seq_format.lower().startswith("qual"):
56 #We don't care what the scores are 54 #We don't care what the scores are
57 seq_format = "qual" 55 seq_format = "qual"
58 else: 56 else:
59 stop_err("Unrecognised file format %r" % seq_format) 57 sys_exit("Unrecognised file format %r" % seq_format)
60 58
61 59
62 try: 60 try:
63 from Bio import SeqIO 61 from Bio import SeqIO
64 except ImportError: 62 except ImportError:
65 stop_err("Biopython 1.54 or later is required") 63 sys_exit("Biopython 1.54 or later is required")
66 64
67 65
68 def parse_ids(tabular_file, col): 66 def parse_ids(tabular_file, col):
69 """Read tabular file and record all specified identifiers.""" 67 """Read tabular file and record all specified identifiers.
68
69 Will print a single warning to stderr if any of the fields have
70 non-trailing white space (only the first word will be used as
71 the identifier).
72 """
70 handle = open(tabular_file, "rU") 73 handle = open(tabular_file, "rU")
74 warn = False
71 for line in handle: 75 for line in handle:
72 if line.strip() and not line.startswith("#"): 76 if line.strip() and not line.startswith("#"):
73 yield line.rstrip("\n").split("\t")[col].strip() 77 field = line.rstrip("\n").split("\t")[col].strip()
78 parts = field.split(None, 1)
79 if len(parts) > 1 and not warn:
80 warn = "WARNING: Some of your identifiers had white space in them, " + \
81 "using first word only. e.g.:\n%s\n" % field
82 yield parts[0]
74 handle.close() 83 handle.close()
84 if warn:
85 sys.stderr.write(warn)
75 86
76 #Index the sequence file. 87 #Index the sequence file.
77 #If very big, could use SeqIO.index_db() to avoid memory bottleneck... 88 #If very big, could use SeqIO.index_db() to avoid memory bottleneck...
78 records = SeqIO.index(in_file, seq_format) 89 records = SeqIO.index(in_file, seq_format)
79 print "Indexed %i sequences" % len(records) 90 print "Indexed %i sequences" % len(records)
81 if seq_format.lower()=="sff": 92 if seq_format.lower()=="sff":
82 #Special case to try to preserve the XML manifest 93 #Special case to try to preserve the XML manifest
83 try: 94 try:
84 from Bio.SeqIO.SffIO import SffIterator, SffWriter 95 from Bio.SeqIO.SffIO import SffIterator, SffWriter
85 except ImportError: 96 except ImportError:
86 stop_err("Requires Biopython 1.54 or later") 97 sys_exit("Requires Biopython 1.54 or later")
87 98
88 try: 99 try:
89 from Bio.SeqIO.SffIO import ReadRocheXmlManifest 100 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
90 except ImportError: 101 except ImportError:
91 #Prior to Biopython 1.56 this was a private function 102 #Prior to Biopython 1.56 this was a private function
107 try: 118 try:
108 count = writer.write_file(iterator) 119 count = writer.write_file(iterator)
109 except KeyError, err: 120 except KeyError, err:
110 out_handle.close() 121 out_handle.close()
111 if name not in records: 122 if name not in records:
112 stop_err("Identifier %r not found in sequence file" % name) 123 sys_exit("Identifier %r not found in sequence file" % name)
113 else: 124 else:
114 raise err 125 raise err
115 out_handle.close() 126 out_handle.close()
116 else: 127 else:
117 #Avoid overhead of parsing into SeqRecord objects, 128 #Avoid overhead of parsing into SeqRecord objects,
121 for name in parse_ids(tabular_file, column): 132 for name in parse_ids(tabular_file, column):
122 try: 133 try:
123 out_handle.write(records.get_raw(name)) 134 out_handle.write(records.get_raw(name))
124 except KeyError: 135 except KeyError:
125 out_handle.close() 136 out_handle.close()
126 stop_err("Identifier %r not found in sequence file" % name) 137 sys_exit("Identifier %r not found in sequence file" % name)
127 count += 1 138 count += 1
128 out_handle.close() 139 out_handle.close()
129 140
130 print "Selected %i sequences by ID" % count 141 print "Selected %i sequences by ID" % count