Mercurial > repos > peterjc > seq_filter_by_id
changeset 6:03e134cae41a draft
v0.2.3, ignore blank lines in ID file (contributed by Gildas Le Corguille)
author | peterjc |
---|---|
date | Tue, 17 May 2016 05:59:24 -0400 |
parents | 832c1fd57852 |
children | fb1313d79396 |
files | tools/seq_filter_by_id/README.rst tools/seq_filter_by_id/seq_filter_by_id.py tools/seq_filter_by_id/seq_filter_by_id.xml tools/seq_filter_by_id/tool_dependencies.xml |
diffstat | 4 files changed, 100 insertions(+), 95 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/seq_filter_by_id/README.rst Wed May 13 11:03:57 2015 -0400 +++ b/tools/seq_filter_by_id/README.rst Tue May 17 05:59:24 2016 -0400 @@ -86,6 +86,9 @@ v0.2.2 - Use the ``format_source=...`` tag. - Reorder XML elements (internal change only). - Planemo for Tool Shed upload (``.shed.yml``, internal change only). +v0.2.3 - Ignore blank lines in ID file (contributed by Gildas Le Corguillé). + - Defensive quoting of filenames etc in the command definition + (internal change only). ======= ====================================================================== @@ -103,17 +106,17 @@ Planemo commands (which requires you have set your Tool Shed access details in ``~/.planemo.yml`` and that you have access rights on the Tool Shed):: - $ planemo shed_upload --shed_target testtoolshed --check_diff ~/repositories/pico_galaxy/tools/venn_list/ + $ planemo shed_update -t testtoolshed --check_diff tools/seq_filter_by_id/ -m "New release" ... or:: - $ planemo shed_upload --shed_target toolshed --check_diff ~/repositories/pico_galaxy/tools/venn_list/ + $ planemo shed_update -t toolshed --check_diff tools/seq_filter_by_id/ -m "New release" ... To just build and check the tar ball, use:: - $ planemo shed_upload --tar_only ~/repositories/pico_galaxy/tools/venn_list/ + $ planemo shed_upload --tar_only tools/seq_filter_by_id/ ... $ tar -tzf shed_upload.tar.gz test-data/empty_file.dat
--- a/tools/seq_filter_by_id/seq_filter_by_id.py Wed May 13 11:03:57 2015 -0400 +++ b/tools/seq_filter_by_id/seq_filter_by_id.py Tue May 17 05:59:24 2016 -0400 @@ -32,11 +32,7 @@ import re from optparse import OptionParser -def sys_exit(msg, err=1): - sys.stderr.write(msg.rstrip() + "\n") - sys.exit(err) - -#Parse Command Line +# Parse Command Line usage = """Use as follows: $ python seq_filter_by_id.py [options] tab1 cols1 [, tab2 cols2, ...] @@ -78,7 +74,7 @@ options, args = parser.parse_args() if options.version: - print "v0.2.1" + print "v0.2.3" sys.exit(0) in_file = options.input @@ -89,28 +85,28 @@ drop_suffices = bool(options.suffix) if in_file is None or not os.path.isfile(in_file): - sys_exit("Missing input file: %r" % in_file) + sys.exit("Missing input file: %r" % in_file) if out_positive_file is None and out_negative_file is None: - sys_exit("Neither output file requested") + sys.exit("Neither output file requested") if seq_format is None: - sys_exit("Missing sequence format") + sys.exit("Missing sequence format") if logic not in ["UNION", "INTERSECTION"]: - sys_exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic) + sys.exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic) if options.id_list and args: - sys_exit("Cannot accepted IDs via both -t and as tabular files") + sys.exit("Cannot accepted IDs via both -t and as tabular files") elif not options.id_list and not args: - sys_exit("Expected matched pairs of tabular files and columns (or -t given)") + sys.exit("Expected matched pairs of tabular files and columns (or -t given)") if len(args) % 2: - sys_exit("Expected matched pairs of tabular files and columns, not: %r" % args) + sys.exit("Expected matched pairs of tabular files and columns, not: %r" % args) -#Cope with three widely used suffix naming convensions, -#Illumina: /1 or /2 -#Forward/revered: .f or .r -#Sanger, e.g. .p1k and .q1k -#See http://staden.sourceforge.net/manual/pregap4_unix_50.html -#re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") -#re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") +# Cope with three widely used suffix naming convensions, +# Illumina: /1 or /2 +# Forward/revered: .f or .r +# Sanger, e.g. .p1k and .q1k +# See http://staden.sourceforge.net/manual/pregap4_unix_50.html +# re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") +# re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$") assert re_suffix.search("demo.f") assert re_suffix.search("demo.s1") @@ -125,19 +121,21 @@ identifiers = [] for i in range(len(args) // 2): - tabular_file = args[2*i] - cols_arg = args[2*i + 1] + tabular_file = args[2 * i] + cols_arg = args[2 * i + 1] if not os.path.isfile(tabular_file): - sys_exit("Missing tabular identifier file %r" % tabular_file) + sys.exit("Missing tabular identifier file %r" % tabular_file) try: - columns = [int(arg)-1 for arg in cols_arg.split(",")] + columns = [int(arg) - 1 for arg in cols_arg.split(",")] except ValueError: - sys_exit("Expected list of columns (comma separated integers), got %r" % cols_arg) + sys.exit("Expected list of columns (comma separated integers), got %r" % cols_arg) if min(columns) < 0: - sys_exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg) + sys.exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg) identifiers.append((tabular_file, columns)) name_warn = False + + def check_white_space(name): parts = name.split(None, 1) global name_warn @@ -169,28 +167,29 @@ clean_name = check_white_space -mapped_chars = { '>' :'__gt__', - '<' :'__lt__', - "'" :'__sq__', - '"' :'__dq__', - '[' :'__ob__', - ']' :'__cb__', - '{' :'__oc__', - '}' :'__cc__', - '@' : '__at__', - '\n' : '__cn__', - '\r' : '__cr__', - '\t' : '__tc__', - '#' : '__pd__' - } +mapped_chars = { + '>': '__gt__', + '<': '__lt__', + "'": '__sq__', + '"': '__dq__', + '[': '__ob__', + ']': '__cb__', + '{': '__oc__', + '}': '__cc__', + '@': '__at__', + '\n': '__cn__', + '\r': '__cr__', + '\t': '__tc__', + '#': '__pd__', + } -#Read tabular file(s) and record all specified identifiers -ids = None #Will be a set +# Read tabular file(s) and record all specified identifiers +ids = None # Will be a set if options.id_list: assert not identifiers ids = set() id_list = options.id_list - #Galaxy turns \r into __cr__ (CR) etc + # Galaxy turns \r into __cr__ (CR) etc for k in mapped_chars: id_list = id_list.replace(mapped_chars[k], k) for x in options.id_list.split(): @@ -199,22 +198,24 @@ for tabular_file, columns in identifiers: file_ids = set() handle = open(tabular_file, "rU") - if len(columns)>1: - #General case of many columns + if len(columns) > 1: + # General case of many columns for line in handle: if line.startswith("#"): - #Ignore comments + # Ignore comments continue parts = line.rstrip("\n").split("\t") for col in columns: file_ids.add(clean_name(parts[col])) else: - #Single column, special case speed up + # Single column, special case speed up col = columns[0] for line in handle: + if not line.strip(): #skip empty lines + continue if not line.startswith("#"): file_ids.add(clean_name(line.rstrip("\n").split("\t")[col])) - print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col+1) for col in columns)) + print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns)) if ids is None: ids = file_ids if logic == "UNION": @@ -230,12 +231,13 @@ if name_warn: sys.stderr.write(name_warn) + def crude_fasta_iterator(handle): """Yields tuples, record ID and the full record as a string.""" while True: line = handle.readline() if line == "": - return # Premature end of file, or just empty? + return # Premature end of file, or just empty? if line[0] == ">": break @@ -262,16 +264,16 @@ line = handle.readline() yield id, "".join(lines) if not line: - return # StopIteration + return # StopIteration def fasta_filter(in_file, pos_file, neg_file, wanted): """FASTA filter producing 60 character line wrapped outout.""" pos_count = neg_count = 0 - #Galaxy now requires Python 2.5+ so can use with statements, + # Galaxy now requires Python 2.5+ so can use with statements, with open(in_file) as in_handle: - #Doing the if statement outside the loop for speed - #(with the downside of three very similar loops). + # Doing the if statement outside the loop for speed + # (with the downside of three very similar loops). if pos_file is not None and neg_file is not None: print "Generating two FASTA files" with open(pos_file, "w") as pos_handle: @@ -309,31 +311,31 @@ """FASTQ filter.""" from Bio.SeqIO.QualityIO import FastqGeneralIterator handle = open(in_file, "r") - if out_positive_file is not None and out_negative_file is not None: + if pos_file is not None and neg_file is not None: print "Generating two FASTQ files" - positive_handle = open(out_positive_file, "w") - negative_handle = open(out_negative_file, "w") + positive_handle = open(pos_file, "w") + negative_handle = open(neg_file, "w") print in_file for title, seq, qual in FastqGeneralIterator(handle): print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) - if clean_name(title.split(None, 1)[0]) in ids: + if clean_name(title.split(None, 1)[0]) in wanted: positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) else: negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) positive_handle.close() negative_handle.close() - elif out_positive_file is not None: + elif pos_file is not None: print "Generating matching FASTQ file" - positive_handle = open(out_positive_file, "w") + positive_handle = open(pos_file, "w") for title, seq, qual in FastqGeneralIterator(handle): - if clean_name(title.split(None, 1)[0]) in ids: + if clean_name(title.split(None, 1)[0]) in wanted: positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) positive_handle.close() - elif out_negative_file is not None: + elif neg_file is not None: print "Generating non-matching FASTQ file" - negative_handle = open(out_negative_file, "w") + negative_handle = open(neg_file, "w") for title, seq, qual in FastqGeneralIterator(handle): - if clean_name(title.split(None, 1)[0]) not in ids: + if clean_name(title.split(None, 1)[0]) not in wanted: negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) negative_handle.close() handle.close() @@ -345,48 +347,48 @@ try: from Bio.SeqIO.SffIO import SffIterator, SffWriter except ImportError: - sys_exit("SFF filtering requires Biopython 1.54 or later") + sys.exit("SFF filtering requires Biopython 1.54 or later") try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: - #Prior to Biopython 1.56 this was a private function + # Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest - in_handle = open(in_file, "rb") #must be binary mode! + in_handle = open(in_file, "rb") # must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None - #This makes two passes though the SFF file with isn't so efficient, - #but this makes the code simple. + # This makes two passes though the SFF file with isn't so efficient, + # but this makes the code simple. pos_count = neg_count = 0 - if out_positive_file is not None: - out_handle = open(out_positive_file, "wb") + if pos_file is not None: + out_handle = open(pos_file, "wb") writer = SffWriter(out_handle, xml=manifest) - in_handle.seek(0) #start again after getting manifest - pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids) + in_handle.seek(0) # start again after getting manifest + pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted) out_handle.close() - if out_negative_file is not None: - out_handle = open(out_negative_file, "wb") + if neg_file is not None: + out_handle = open(neg_file, "wb") writer = SffWriter(out_handle, xml=manifest) - in_handle.seek(0) #start again - neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids) + in_handle.seek(0) # start again + neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted) out_handle.close() - #And we're done + # And we're done in_handle.close() - #At the time of writing, Galaxy doesn't show SFF file read counts, - #so it is useful to put them in stdout and thus shown in job info. + # At the time of writing, Galaxy doesn't show SFF file read counts, + # so it is useful to put them in stdout and thus shown in job info. return pos_count, neg_count -if seq_format.lower()=="sff": +if seq_format.lower() == "sff": # Now write filtered SFF file based on IDs wanted pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids) # At the time of writing, Galaxy doesn't show SFF file read counts, # so it is useful to put them in stdout and thus shown in job info. -elif seq_format.lower()=="fasta": +elif seq_format.lower() == "fasta": # Write filtered FASTA file based on IDs from tabular file pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) print "%i with and %i without specified IDs" % (pos_count, neg_count) @@ -395,4 +397,4 @@ fastq_filter(in_file, out_positive_file, out_negative_file, ids) # This does not currently track the counts else: - sys_exit("Unsupported file type %r" % seq_format) + sys.exit("Unsupported file type %r" % seq_format)
--- a/tools/seq_filter_by_id/seq_filter_by_id.xml Wed May 13 11:03:57 2015 -0400 +++ b/tools/seq_filter_by_id/seq_filter_by_id.xml Tue May 17 05:59:24 2016 -0400 @@ -1,4 +1,4 @@ -<tool id="seq_filter_by_id" name="Filter sequences by ID" version="0.2.2"> +<tool id="seq_filter_by_id" name="Filter sequences by ID" version="0.2.3"> <description>from a tabular file</description> <requirements> <requirement type="package" version="1.64">biopython</requirement> @@ -12,17 +12,17 @@ <version_command interpreter="python">seq_filter_by_id.py --version</version_command> <command interpreter="python"> seq_filter_by_id.py -i "$input_file" -f "$input_file.ext" -#if $output_choice_cond.output_choice=="both" - -p $output_pos -n $output_neg -#elif $output_choice_cond.output_choice=="pos" - -p $output_pos -#elif $output_choice_cond.output_choice=="neg" - -n $output_neg +#if str($output_choice_cond.output_choice)=="both" + -p "$output_pos" -n "$output_neg" +#elif str($output_choice_cond.output_choice)=="pos" + -p "$output_pos" +#elif str($output_choice_cond.output_choice)=="neg" + -n "$output_neg" #end if -#if $adv_opts.adv_opts_selector=="advanced" and $adv_opts.strip_suffix +#if str($adv_opts.adv_opts_selector)=="advanced" and $adv_opts.strip_suffix -s #end if -#if $id_opts.id_opts_selector=="tabular": +#if str($id_opts.id_opts_selector)=="tabular": ## TODO - Decide on best way to expose multiple ID files via the XML wrapper. ## Single tabular file, can call the Python script with either UNION or INTERSECTION -l UNION "$id_opts.input_tabular" "$id_opts.columns"
--- a/tools/seq_filter_by_id/tool_dependencies.xml Wed May 13 11:03:57 2015 -0400 +++ b/tools/seq_filter_by_id/tool_dependencies.xml Tue May 17 05:59:24 2016 -0400 @@ -1,6 +1,6 @@ <?xml version="1.0"?> <tool_dependency> <package name="biopython" version="1.64"> - <repository changeset_revision="5477a05cc158" name="package_biopython_1_64" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="b64c8edb7e45" name="package_biopython_1_64" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> </tool_dependency>