Mercurial > repos > peterjc > seq_filter_by_id
diff tools/seq_filter_by_id/seq_filter_by_id.py @ 5:832c1fd57852 draft
v0.2.2; New options for IDs via text parameter, ignore paired read suffix; misc changes
author | peterjc |
---|---|
date | Wed, 13 May 2015 11:03:57 -0400 |
parents | 44ab4c0f7683 |
children | 03e134cae41a |
line wrap: on
line diff
--- a/tools/seq_filter_by_id/seq_filter_by_id.py Wed Jul 30 06:39:53 2014 -0400 +++ b/tools/seq_filter_by_id/seq_filter_by_id.py Wed May 13 11:03:57 2015 -0400 @@ -3,12 +3,8 @@ Takes six command line options, tabular filename, ID column numbers (comma separated list using one based counting), input filename, input type (e.g. -FASTA or SFF) and two output filenames (for records with and without the -given IDs, same format as input sequence file). - -If either output filename is just a minus sign, that file is not created. -This is intended to allow output for just the matched (or just the non-matched) -records. +FASTA or SFF) and up to two output filenames (for records with and without +the given IDs, same format as input sequence file). When filtering an SFF file, any Roche XML manifest in the input file is preserved in both output files. @@ -17,9 +13,9 @@ in column one, and the ID of the match from the database is in column two. Here sensible values for the column numbers would therefore be "1" or "2". -This tool is a short Python script which requires Biopython 1.54 or later -for SFF file support. If you use this tool in scientific work leading to a -publication, please cite the Biopython application note: +This tool is a short Python script which requires Biopython 1.54 or later. +If you use this tool in scientific work leading to a publication, please +cite the Biopython application note: Cock et al 2009. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. @@ -29,51 +25,177 @@ (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. See accompanying text file for licence details (MIT license). -This is version 0.1.0 of the script, use -v or --version to get the version. +Use -v or --version to get the version, -h or --help for help. """ import os import sys +import re +from optparse import OptionParser -def stop_err(msg, err=1): +def sys_exit(msg, err=1): sys.stderr.write(msg.rstrip() + "\n") sys.exit(err) -if "-v" in sys.argv or "--version" in sys.argv: - print "v0.1.0" +#Parse Command Line +usage = """Use as follows: + +$ python seq_filter_by_id.py [options] tab1 cols1 [, tab2 cols2, ...] + +e.g. Positive matches using column one from tabular file: + +$ seq_filter_by_id.py -i my_seqs.fastq -f fastq -p matches.fastq ids.tabular 1 + +Multiple tabular files and column numbers may be given, or replaced with +the -t or --text option. +""" +parser = OptionParser(usage=usage) +parser.add_option('-i', '--input', dest='input', + default=None, help='Input sequences filename', + metavar="FILE") +parser.add_option('-f', '--format', dest='format', + default=None, + help='Input sequence format (e.g. fasta, fastq, sff)') +parser.add_option('-t', '--text', dest='id_list', + default=None, help="Lists of white space separated IDs (instead of a tabular file)") +parser.add_option('-p', '--positive', dest='output_positive', + default=None, + help='Output filename for matches', + metavar="FILE") +parser.add_option('-n', '--negative', dest='output_negative', + default=None, + help='Output filename for non-matches', + metavar="FILE") +parser.add_option("-l", "--logic", dest="logic", + default="UNION", + help="How to combined multiple ID columns (UNION or INTERSECTION)") +parser.add_option("-s", "--suffix", dest="suffix", + action="store_true", + help="Ignore pair-read suffices for matching names") +parser.add_option("-v", "--version", dest="version", + default=False, action="store_true", + help="Show version and quit") + +options, args = parser.parse_args() + +if options.version: + print "v0.2.1" sys.exit(0) -#Parse Command Line -if len(sys.argv) - 1 < 7 or len(sys.argv) % 2 == 1: - stop_err("Expected 7 or more arguments, 5 required " - "(in seq, seq format, out pos, out neg, logic) " - "then one or more pairs (tab file, columns), " - "got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) +in_file = options.input +seq_format = options.format +out_positive_file = options.output_positive +out_negative_file = options.output_negative +logic = options.logic +drop_suffices = bool(options.suffix) -in_file, seq_format, out_positive_file, out_negative_file, logic = sys.argv[1:6] +if in_file is None or not os.path.isfile(in_file): + sys_exit("Missing input file: %r" % in_file) +if out_positive_file is None and out_negative_file is None: + sys_exit("Neither output file requested") +if seq_format is None: + sys_exit("Missing sequence format") +if logic not in ["UNION", "INTERSECTION"]: + sys_exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic) +if options.id_list and args: + sys_exit("Cannot accepted IDs via both -t and as tabular files") +elif not options.id_list and not args: + sys_exit("Expected matched pairs of tabular files and columns (or -t given)") +if len(args) % 2: + sys_exit("Expected matched pairs of tabular files and columns, not: %r" % args) + -if not os.path.isfile(in_file): - stop_err("Missing input file %r" % in_file) -if out_positive_file == "-" and out_negative_file == "-": - stop_err("Neither output file requested") -if logic not in ["UNION", "INTERSECTION"]: - stop_err("Fifth agrument should be 'UNION' or 'INTERSECTION', not %r" % logic) +#Cope with three widely used suffix naming convensions, +#Illumina: /1 or /2 +#Forward/revered: .f or .r +#Sanger, e.g. .p1k and .q1k +#See http://staden.sourceforge.net/manual/pregap4_unix_50.html +#re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") +#re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") +re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$") +assert re_suffix.search("demo.f") +assert re_suffix.search("demo.s1") +assert re_suffix.search("demo.f1k") +assert re_suffix.search("demo.p1") +assert re_suffix.search("demo.p1k") +assert re_suffix.search("demo.p1lk") +assert re_suffix.search("demo/2") +assert re_suffix.search("demo.r") +assert re_suffix.search("demo.q1") +assert re_suffix.search("demo.q1lk") identifiers = [] -for i in range((len(sys.argv) - 6) // 2): - tabular_file = sys.argv[6+2*i] - cols_arg = sys.argv[7+2*i] +for i in range(len(args) // 2): + tabular_file = args[2*i] + cols_arg = args[2*i + 1] if not os.path.isfile(tabular_file): - stop_err("Missing tabular identifier file %r" % tabular_file) + sys_exit("Missing tabular identifier file %r" % tabular_file) try: columns = [int(arg)-1 for arg in cols_arg.split(",")] except ValueError: - stop_err("Expected list of columns (comma separated integers), got %r" % cols_arg) + sys_exit("Expected list of columns (comma separated integers), got %r" % cols_arg) if min(columns) < 0: - stop_err("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg) + sys_exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg) identifiers.append((tabular_file, columns)) +name_warn = False +def check_white_space(name): + parts = name.split(None, 1) + global name_warn + if not name_warn and len(parts) > 1: + name_warn = "WARNING: Some of your identifiers had white space in them, " + \ + "using first word only. e.g.:\n%s\n" % name + return parts[0] + +if drop_suffices: + def clean_name(name): + """Remove suffix.""" + name = check_white_space(name) + match = re_suffix.search(name) + if match: + # Use the fact this is a suffix, and regular expression will be + # anchored to the end of the name: + return name[:match.start()] + else: + # Nothing to do + return name + assert clean_name("foo/1") == "foo" + assert clean_name("foo/2") == "foo" + assert clean_name("bar.f") == "bar" + assert clean_name("bar.r") == "bar" + assert clean_name("baz.p1") == "baz" + assert clean_name("baz.q2") == "baz" +else: + # Just check the white space + clean_name = check_white_space + + +mapped_chars = { '>' :'__gt__', + '<' :'__lt__', + "'" :'__sq__', + '"' :'__dq__', + '[' :'__ob__', + ']' :'__cb__', + '{' :'__oc__', + '}' :'__cc__', + '@' : '__at__', + '\n' : '__cn__', + '\r' : '__cr__', + '\t' : '__tc__', + '#' : '__pd__' + } + #Read tabular file(s) and record all specified identifiers ids = None #Will be a set +if options.id_list: + assert not identifiers + ids = set() + id_list = options.id_list + #Galaxy turns \r into __cr__ (CR) etc + for k in mapped_chars: + id_list = id_list.replace(mapped_chars[k], k) + for x in options.id_list.split(): + ids.add(clean_name(x.strip())) + print("Have %i unique identifiers from list" % len(ids)) for tabular_file, columns in identifiers: file_ids = set() handle = open(tabular_file, "rU") @@ -85,13 +207,13 @@ continue parts = line.rstrip("\n").split("\t") for col in columns: - file_ids.add(parts[col]) + file_ids.add(clean_name(parts[col])) else: #Single column, special case speed up col = columns[0] for line in handle: if not line.startswith("#"): - file_ids.add(line.rstrip("\n").split("\t")[col]) + file_ids.add(clean_name(line.rstrip("\n").split("\t")[col])) print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col+1) for col in columns)) if ids is None: ids = file_ids @@ -105,7 +227,8 @@ print "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers)) else: print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)) - +if name_warn: + sys.stderr.write(name_warn) def crude_fasta_iterator(handle): """Yields tuples, record ID and the full record as a string.""" @@ -149,32 +272,32 @@ with open(in_file) as in_handle: #Doing the if statement outside the loop for speed #(with the downside of three very similar loops). - if pos_file != "-" and neg_file != "-": + if pos_file is not None and neg_file is not None: print "Generating two FASTA files" with open(pos_file, "w") as pos_handle: with open(neg_file, "w") as neg_handle: for identifier, record in crude_fasta_iterator(in_handle): - if identifier in wanted: + if clean_name(identifier) in wanted: pos_handle.write(record) pos_count += 1 else: neg_handle.write(record) neg_count += 1 - elif pos_file != "-": + elif pos_file is not None: print "Generating matching FASTA file" with open(pos_file, "w") as pos_handle: for identifier, record in crude_fasta_iterator(in_handle): - if identifier in wanted: + if clean_name(identifier) in wanted: pos_handle.write(record) pos_count += 1 else: neg_count += 1 else: print "Generating non-matching FASTA file" - assert neg_file != "-" + assert neg_file is not None with open(neg_file, "w") as neg_handle: for identifier, record in crude_fasta_iterator(in_handle): - if identifier in wanted: + if clean_name(identifier) in wanted: pos_count += 1 else: neg_handle.write(record) @@ -182,79 +305,94 @@ return pos_count, neg_count -if seq_format.lower()=="sff": - #Now write filtered SFF file based on IDs from BLAST file +def fastq_filter(in_file, pos_file, neg_file, wanted): + """FASTQ filter.""" + from Bio.SeqIO.QualityIO import FastqGeneralIterator + handle = open(in_file, "r") + if out_positive_file is not None and out_negative_file is not None: + print "Generating two FASTQ files" + positive_handle = open(out_positive_file, "w") + negative_handle = open(out_negative_file, "w") + print in_file + for title, seq, qual in FastqGeneralIterator(handle): + print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) + if clean_name(title.split(None, 1)[0]) in ids: + positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + else: + negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + positive_handle.close() + negative_handle.close() + elif out_positive_file is not None: + print "Generating matching FASTQ file" + positive_handle = open(out_positive_file, "w") + for title, seq, qual in FastqGeneralIterator(handle): + if clean_name(title.split(None, 1)[0]) in ids: + positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + positive_handle.close() + elif out_negative_file is not None: + print "Generating non-matching FASTQ file" + negative_handle = open(out_negative_file, "w") + for title, seq, qual in FastqGeneralIterator(handle): + if clean_name(title.split(None, 1)[0]) not in ids: + negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + negative_handle.close() + handle.close() + # This does not currently bother to record record counts (faster) + + +def sff_filter(in_file, pos_file, neg_file, wanted): + """SFF filter.""" try: from Bio.SeqIO.SffIO import SffIterator, SffWriter except ImportError: - stop_err("SFF filtering requires Biopython 1.54 or later") + sys_exit("SFF filtering requires Biopython 1.54 or later") try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: #Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest + in_handle = open(in_file, "rb") #must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None + #This makes two passes though the SFF file with isn't so efficient, #but this makes the code simple. pos_count = neg_count = 0 - if out_positive_file != "-": + if out_positive_file is not None: out_handle = open(out_positive_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) #start again after getting manifest - pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids) + pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids) out_handle.close() - if out_negative_file != "-": + if out_negative_file is not None: out_handle = open(out_negative_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) #start again - neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id not in ids) + neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids) out_handle.close() #And we're done in_handle.close() #At the time of writing, Galaxy doesn't show SFF file read counts, #so it is useful to put them in stdout and thus shown in job info. - print "%i with and %i without specified IDs" % (pos_count, neg_count) + return pos_count, neg_count + + +if seq_format.lower()=="sff": + # Now write filtered SFF file based on IDs wanted + pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids) + # At the time of writing, Galaxy doesn't show SFF file read counts, + # so it is useful to put them in stdout and thus shown in job info. elif seq_format.lower()=="fasta": - #Write filtered FASTA file based on IDs from tabular file + # Write filtered FASTA file based on IDs from tabular file pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) print "%i with and %i without specified IDs" % (pos_count, neg_count) elif seq_format.lower().startswith("fastq"): - #Write filtered FASTQ file based on IDs from tabular file - from galaxy_utils.sequence.fastq import fastqReader, fastqWriter - reader = fastqReader(open(in_file, "rU")) - if out_positive_file != "-" and out_negative_file != "-": - print "Generating two FASTQ files" - positive_writer = fastqWriter(open(out_positive_file, "w")) - negative_writer = fastqWriter(open(out_negative_file, "w")) - for record in reader: - #The [1:] is because the fastaReader leaves the > on the identifier. - if record.identifier and record.identifier.split()[0][1:] in ids: - positive_writer.write(record) - else: - negative_writer.write(record) - positive_writer.close() - negative_writer.close() - elif out_positive_file != "-": - print "Generating matching FASTQ file" - positive_writer = fastqWriter(open(out_positive_file, "w")) - for record in reader: - #The [1:] is because the fastaReader leaves the > on the identifier. - if record.identifier and record.identifier.split()[0][1:] in ids: - positive_writer.write(record) - positive_writer.close() - elif out_negative_file != "-": - print "Generating non-matching FASTQ file" - negative_writer = fastqWriter(open(out_negative_file, "w")) - for record in reader: - #The [1:] is because the fastaReader leaves the > on the identifier. - if not record.identifier or record.identifier.split()[0][1:] not in ids: - negative_writer.write(record) - negative_writer.close() - reader.close() + # Write filtered FASTQ file based on IDs from tabular file + fastq_filter(in_file, out_positive_file, out_negative_file, ids) + # This does not currently track the counts else: - stop_err("Unsupported file type %r" % seq_format) + sys_exit("Unsupported file type %r" % seq_format)