Mercurial > repos > peterjc > seq_filter_by_mapping
changeset 4:f82868a026ea draft
"v0.0.7 - long overdue upload to main Tool Shed"
| author | peterjc | 
|---|---|
| date | Fri, 16 Apr 2021 22:22:47 +0000 | 
| parents | 481b0a925e66 | 
| children | 1d6c149ca211 | 
| files | tools/seq_filter_by_mapping/README.rst tools/seq_filter_by_mapping/seq_filter_by_mapping.py tools/seq_filter_by_mapping/seq_filter_by_mapping.xml tools/seq_filter_by_mapping/tool_dependencies.xml | 
| diffstat | 4 files changed, 107 insertions(+), 67 deletions(-) [+] | 
line wrap: on
 line diff
--- a/tools/seq_filter_by_mapping/README.rst Wed May 17 09:24:01 2017 -0400 +++ b/tools/seq_filter_by_mapping/README.rst Fri Apr 16 22:22:47 2021 +0000 @@ -71,6 +71,7 @@ - Use ``<command detect_errors="aggressive">`` (internal change only). - Single quote command line arguments (internal change only). v0.0.6 - Python 3 compatible print function. +v0.0.7 - Script works on Python 2 and 3 (fixed input file mode) ======= ======================================================================
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.py Wed May 17 09:24:01 2017 -0400 +++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.py Fri Apr 16 22:22:47 2021 +0000 @@ -10,7 +10,7 @@ Cock et al 2009. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. +https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878. This script is copyright 2014 by Peter Cock, The James Hutton Institute (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. @@ -40,31 +40,57 @@ Multiple SAM/BAM mapping files may be given. """ parser = OptionParser(usage=usage) -parser.add_option('-i', '--input', dest='input', - default=None, help='Input sequences filename', - metavar="FILE") -parser.add_option('-f', '--format', dest='format', - default=None, - help='Input sequence format (e.g. fasta, fastq, sff)') -parser.add_option('-p', '--positive', dest='output_positive', - default=None, - help='Output filename for mapping reads', - metavar="FILE") -parser.add_option('-n', '--negative', dest='output_negative', - default=None, - help='Output filename for non-mapping reads', - metavar="FILE") -parser.add_option("-m", "--pair-mode", dest="pair_mode", - default="lax", - help="How to treat paired reads (lax or strict, default lax)") -parser.add_option("-v", "--version", dest="version", - default=False, action="store_true", - help="Show version and quit") +parser.add_option( + "-i", + "--input", + dest="input", + default=None, + help="Input sequences filename", + metavar="FILE", +) +parser.add_option( + "-f", + "--format", + dest="format", + default=None, + help="Input sequence format (e.g. fasta, fastq, sff)", +) +parser.add_option( + "-p", + "--positive", + dest="output_positive", + default=None, + help="Output filename for mapping reads", + metavar="FILE", +) +parser.add_option( + "-n", + "--negative", + dest="output_negative", + default=None, + help="Output filename for non-mapping reads", + metavar="FILE", +) +parser.add_option( + "-m", + "--pair-mode", + dest="pair_mode", + default="lax", + help="How to treat paired reads (lax or strict, default lax)", +) +parser.add_option( + "-v", + "--version", + dest="version", + default=False, + action="store_true", + help="Show version and quit", +) options, args = parser.parse_args() if options.version: - print("v0.0.6") + print("v0.0.7") sys.exit(0) in_file = options.input @@ -114,7 +140,7 @@ if match: # Use the fact this is a suffix, and regular expression will be # anchored to the end of the name: - return name[:match.start()] + return name[: match.start()] else: # Nothing to do return name @@ -128,19 +154,19 @@ assert clean_name("baz.q2") == "baz" mapped_chars = { - '>': '__gt__', - '<': '__lt__', - "'": '__sq__', - '"': '__dq__', - '[': '__ob__', - ']': '__cb__', - '{': '__oc__', - '}': '__cc__', - '@': '__at__', - '\n': '__cn__', - '\r': '__cr__', - '\t': '__tc__', - '#': '__pd__', + ">": "__gt__", + "<": "__lt__", + "'": "__sq__", + '"': "__dq__", + "[": "__ob__", + "]": "__cb__", + "{": "__oc__", + "}": "__cc__", + "@": "__at__", + "\n": "__cn__", + "\r": "__cr__", + "\t": "__tc__", + "#": "__pd__", } @@ -149,21 +175,25 @@ Parses BAM files via call out to samtools view command. """ - handle = open(filename, "rb") - magic = handle.read(4) + with open(filename, "rb") as handle: + magic = handle.read(4) + if magic == b"\x1f\x8b\x08\x04": # Presumably a BAM file then... - handle.close() # Call samtools view, don't need header so no -h added: - child = subprocess.Popen(["samtools", "view", filename], - stdin=None, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + child = subprocess.Popen( + ["samtools", "view", filename], + universal_newlines=True, + stdin=None, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) handle = child.stdout else: # Presumably a SAM file... child = None - handle.seek(0) + # Open in text mode + handle = open(filename) # Handle should now contain SAM records for line in handle: # Ignore header lines @@ -178,7 +208,7 @@ ids.add(qname) elif pair_mode == "strict": # For paired reads, require BOTH be mapped. - if (flag & 0x4): + if flag & 0x4: # This is unmapped, ignore it pass elif not (flag & 0x1): @@ -192,8 +222,11 @@ stdout, stderr = child.communicate() assert child.returncode is not None if child.returncode: - msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode, - filename, stderr) + msg = "Error %i from 'samtools view %s'\n%s" % ( + child.returncode, + filename, + stderr, + ) sys.exit(msg.strip(), child.returncode) else: handle.close() @@ -211,7 +244,7 @@ def crude_fasta_iterator(handle): - """Yields tuples, record ID and the full record as a string.""" + """Yield tuples, record ID and the full record as a string.""" while True: line = handle.readline() if line == "": @@ -222,8 +255,7 @@ no_id_warned = False while True: if line[0] != ">": - raise ValueError( - "Records in Fasta files should start with '>' character") + raise ValueError("Records in Fasta files should start with '>' character") try: id = line[1:].split(None, 1)[0] except IndexError: @@ -288,6 +320,7 @@ def fastq_filter(in_file, pos_file, neg_file, wanted): """FASTQ filter.""" from Bio.SeqIO.QualityIO import FastqGeneralIterator + pos_count = neg_count = 0 handle = open(in_file, "r") if pos_file is not None and neg_file is not None: @@ -355,13 +388,17 @@ out_handle = open(pos_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) # start again after getting manifest - pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted) + pos_count = writer.write_file( + rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted + ) out_handle.close() if neg_file is not None: out_handle = open(neg_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) # start again - neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted) + neg_count = writer.write_file( + rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted + ) out_handle.close() # And we're done in_handle.close() @@ -377,7 +414,9 @@ else: sys.exit("Unsupported file type %r" % seq_format) -pos_count, neg_count = sequence_filter(in_file, out_positive_file, out_negative_file, ids) +pos_count, neg_count = sequence_filter( + in_file, out_positive_file, out_negative_file, ids +) print("%i mapped and %i unmapped reads." % (pos_count, neg_count)) fraction = float(pos_count) * 100.0 / float(pos_count + neg_count) print("In total %i reads, of which %0.1f%% mapped." % (pos_count + neg_count, fraction))
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml Wed May 17 09:24:01 2017 -0400 +++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml Fri Apr 16 22:22:47 2021 +0000 @@ -1,4 +1,4 @@ -<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.6"> +<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.7"> <description>from SAM/BAM file</description> <requirements> <requirement type="package" version="1.67">biopython</requirement> @@ -21,7 +21,7 @@ </command> <inputs> <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to be filtered" help="FASTA, FASTQ, or SFF format." /> - <param name="mapping_file" type="data" format="sam,bam" multiple="true" label="SAM/BAM mapping of those sequences" help="SAM or BAM format." /> + <param name="mapping_file" type="data" format="sam,bam" multiple="true" label="SAM/BAM mapping of those sequences" help="SAM or BAM format." /> <conditional name="output_choice_cond"> <param name="output_choice" type="select" label="Output mapped reads, unmapped reads, or both?"> <option value="both">Both mapped and unmapped reads, as two files</option> @@ -33,12 +33,12 @@ <when value="pos" /> <when value="neg" /> </conditional> - <param name="pair_mode" type="select" label="Paired read treatment"> + <param name="pair_mode" type="select" label="Paired read treatment"> <option value="lax" selected="true">Treat as a pair, allow either read to be mapped</option> - <option value="strict">Treat as a pair, require both reads to be mapped</option> - <!-- The following would actually be more work as have to store qname/1 and qname/2 separately for filter... + <option value="strict">Treat as a pair, require both reads to be mapped</option> + <!-- The following would actually be more work as have to store qname/1 and qname/2 separately for filter... <option value="solo">Treat independently (will split partners when only one maps)</option> - --> + --> </param> </inputs> <outputs> @@ -50,19 +50,19 @@ </data> </outputs> <tests> - <test> + <test> <param name="input_file" value="SRR639755_mito_pairs.fastq.gz" ftype="fastqsanger" /> <param name="mapping_file" value="SRR639755_sample_by_coord.sam" ftype="sam" /> <param name="pair_mode" value="lax" /> <param name="output_choice" value="pos" /> <output name="output_pos" file="SRR639755_sample_lax.fastq" ftype="fastqsanger" /> </test> - <test> + <test> <param name="input_file" value="SRR639755_mito_pairs.fastq.gz" ftype="fastqsanger" /> <param name="mapping_file" value="SRR639755_sample_by_coord.sam" ftype="sam" /> <param name="pair_mode" value="strict" /> <param name="output_choice" value="pos" /> - <output name="output_pos" file="SRR639755_sample_strict.fastq" ftype="fastqsanger" /> + <output name="output_pos" file="SRR639755_sample_strict.fastq" ftype="fastqsanger" /> </test> </tests> <help> @@ -97,7 +97,7 @@ Cock et al (2009). Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. +https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878. This tool is available to install into other Galaxy Instances via the Galaxy Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping
--- a/tools/seq_filter_by_mapping/tool_dependencies.xml Wed May 17 09:24:01 2017 -0400 +++ b/tools/seq_filter_by_mapping/tool_dependencies.xml Fri Apr 16 22:22:47 2021 +0000 @@ -1,9 +1,9 @@ -<?xml version="1.0"?> +<?xml version="1.0" ?> <tool_dependency> <package name="biopython" version="1.67"> - <repository changeset_revision="a12f73c3b116" name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="a12f73c3b116"/> </package> <package name="samtools" version="0.1.19"> - <repository changeset_revision="c9bd782f5342" name="package_samtools_0_1_19" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository name="package_samtools_0_1_19" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="c9bd782f5342"/> </package> -</tool_dependency> +</tool_dependency> \ No newline at end of file
