Mercurial > repos > peterjc > seq_filter_by_mapping

Binary file test-data/SRR639755_mito_pairs.fastq.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/SRR639755_sample_by_coord.sam	Tue Jan 27 08:31:13 2015 -0500
@@ -0,0 +1,14 @@
+@HD	VN:1.0	SO:coordinate
+@SQ	SN:gi|187250362|ref|NC_010642.1|	LN:16990
+@PG	ID:clc_mapper	VN:4.10.86742	CL:clc_mapper -d -z xxx -p fb ss 1 1000 -q yyy -o temp_job.cas --cpus 8
+@RG	ID:RG_0	DS:yyy	PG:clc_mapper
+SRR639755.6451003	99	gi|187250362|ref|NC_010642.1|	1352	255	7S94M	=	1604	353	ATATCTGCAGTTAACATAAAAATATAGCACGAAAGTAACTTTAATATCTCCGACCACACGATAGCTAAGACCCAAACTGGGATTAGATACCCCGCTATGCT	<?7A4ADDFHHDHIIBH<HGDHEIG>HBHGGEFH@?D<GDGGHDGGG>DDDFGFHGHGIGB;CEH>A>DEEC?B;;=@CC9;;?CCCCCCC@<9>5<<@A4	RG:Z:RG_0	MD:Z:8G8C22T2T42A7	NM:i:11
+SRR639755.6451003	147	gi|187250362|ref|NC_010642.1|	1604	255	101M	=	1352	353	TTCAGCCTACATACCGCCATCTTCAGCAAACCCTAAAAAGGAAGAAAAGTAAGCACAAGTGCCTTAACACAAAAAAGTTAGGTCAAGGTGTAGCCCATGAG	>?BA::3B:C?98B@@CC>CA>EEEFFFBCEE>FJJIHC@DDGIIIHF@HFHCFBHD@GGHBFD@HHBDGAFCCEC8HEIHDH>HEHBHHFGHDDD=F@@@	RG:Z:RG_0	MD:Z:9T50AT7T31	NM:i:4
+SRR639755.6035319	137	gi|187250362|ref|NC_010642.1|	9094	255	101M	*	0	0	TACTTATGTCTCTCATTTTATTTATTGGGCCAACAAATCTGCTAGGTCTACTACCTCACTCATTTACTCCAACTACGCAATTATCAATAAAACTAGGCATA	?@@FD;:+AHB4<C9F4HEFH@EAA?EE++<<A@FHADDB<?:4D)0*0?D?BFFGIIIDBE:=BDE)=FHGED>;('(5,;;;6;6;>;>(5((5?C(,:	RG:Z:RG_0	MD:Z:29T46C14C9	NM:i:3
+SRR639755.2301126	137	gi|187250362|ref|NC_010642.1|	9117	255	101M	*	0	0	ATTGGGTCAACAAATCTGCTAGGTCTACTACCTCACTCATTTACTCCAACTACCCAATTATCAATAAACCTAGGCATAGCCATCCCCTTGTGAGCCGGCAC	?@BDB;=AAAAFFDHBGAAHI>;<AFCAF@?GEDEEFHAHFH@<F<D@GGEHIEF@HIE>GH=B@CCGAHC@@DEHHFEHDFFFFDDE>?CCDACCBDB@B	RG:Z:RG_0	MD:Z:101	NM:i:0
+SRR639755.6955680	89	gi|187250362|ref|NC_010642.1|	9285	255	97M4S	*	0	0	GTCCCCTTAATCCCTATGCTCGTAATTATCGAAACTATCAGCCTTTTTATCCAGCCCGTAGCCCTAGCCGTACGACTCACAGCTAATATTACTGCAGATAT	9<>8CAACC?CBC@DDD@?<8A@>(BBC?DDAB;?C@BFFEEAIIHD@EGF6GGGGAFD?B8GCEJJIIHEIIHEGIHHIJJIEGIGGDHFFC=2FFFBB?	RG:Z:RG_0	MD:Z:97	NM:i:0
+SRR639755.7076397	89	gi|187250362|ref|NC_010642.1|	9285	255	97M4S	*	0	0	GTCCCCTTAATCCCTATGCTCGTAATTATCGAAACTATCAGCCTTTTTATCCAGCCCGTAGCCCTAGCCGTACGACTCACAGCTAATATTACTGCAGATAT	BBDBCDDCDDCDDCDCBDDD@CACC@DFBDDEB>DFFFFFHHHJJJIGGDJIIGJJIIHGIJJJIJIJIIHGIJIJIJJJJJJJJJJJHHHFC=2FFD?@=	RG:Z:RG_0	MD:Z:97	NM:i:0
+SRR639755.2301126	69	*	0	255	*	gi|187250362|ref|NC_010642.1|	9117	0	ATATCTGCAGTGAGCCTGACTTGGGGCTCGAACTCATGAACCATGAGATCATGACCTGAGCTGAAGTCGTACGGCTAGGGCTACGGGCTGGATAAAAAGGC	+1=BDDDDHHHFBF?GDECFGHDEFH@?C:?F@;DH@4B?9)?<BF?=)8C@G=88CC@EHICHH>7=;?;?><A>=A;?(<?9'2-<>B91(>@@CC5?<	RG:Z:RG_0
+SRR639755.6035319	69	*	0	255	*	gi|187250362|ref|NC_010642.1|	9094	0	ATATCTGCAGTAATATTAGCTGTGAGTCGTACGGCTAGGGCCACGGGCTGGGTAGCAACGATGCTGGGGTGCAGAATTAGGCGGCTGGGCATTAAAGCGAC	1:;44222222<ABDGG@I?4CFD<ACBG?8ACF@0???B9)0)7A0<@8'().()).(,(.5,((-,',,(((,,(((++(0&&0&&(((+((((+()(&	RG:Z:RG_0
+SRR639755.6955680	165	*	0	255	*	gi|187250362|ref|NC_010642.1|	9285	0	CCGACTGAGCCACCCAGGCGCCCCTAAATTTGCTTTTAATTTTAATGGTGGTTATATTTATTGGGGCAACAAATCTGCTAGGTCTACTACCTCACTCATTT	@@BFFADDF?FHHJAE:A@GEHHGHGGIGGE9DCHHHIJIIIG@GGBGHID';=7C==@:=.=CH64<?(;?B>35>@:@>9,::@>;5><A>:<:>9444	RG:Z:RG_0
+SRR639755.7076397	165	*	0	255	*	gi|187250362|ref|NC_010642.1|	9285	0	TGACAGCACATTTATTTACAAACTGGTTTGCTGAATATTTTAATCCCACTGTTGAGGCCTACTACCTCACTCATTTACTCCAACTACCCAATTATCAATAA	CC@FFFFFHFHHHJJJJJJIJJJJJIFHGIIIJIIGEGIJJJJIJIHIJHIJIIJJIIIJJJJJJJIJJJJIJJJJGHHGHGFFFFFEDCDDDDDD;>;AC	RG:Z:RG_0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/SRR639755_sample_lax.fastq	Tue Jan 27 08:31:13 2015 -0500
@@ -0,0 +1,40 @@
+@SRR639755.2301126/1
+ATATCTGCAGTGAGCCTGACTTGGGGCTCGAACTCATGAACCATGAGATCATGACCTGAGCTGAAGTCGTACGGCTAGGGCTACGGGCTGGATAAAAAGGC
++
++1=BDDDDHHHFBF?GDECFGHDEFH@?C:?F@;DH@4B?9)?<BF?=)8C@G=88CC@EHICHH>7=;?;?><A>=A;?(<?9'2-<>B91(>@@CC5?<
+@SRR639755.2301126/2
+ATTGGGTCAACAAATCTGCTAGGTCTACTACCTCACTCATTTACTCCAACTACCCAATTATCAATAAACCTAGGCATAGCCATCCCCTTGTGAGCCGGCAC
++
+?@BDB;=AAAAFFDHBGAAHI>;<AFCAF@?GEDEEFHAHFH@<F<D@GGEHIEF@HIE>GH=B@CCGAHC@@DEHHFEHDFFFFDDE>?CCDACCBDB@B
+@SRR639755.6035319/1
+ATATCTGCAGTAATATTAGCTGTGAGTCGTACGGCTAGGGCCACGGGCTGGGTAGCAACGATGCTGGGGTGCAGAATTAGGCGGCTGGGCATTAAAGCGAC
++
+1:;44222222<ABDGG@I?4CFD<ACBG?8ACF@0???B9)0)7A0<@8'().()).(,(.5,((-,',,(((,,(((++(0&&0&&(((+((((+()(&
+@SRR639755.6035319/2
+TACTTATGTCTCTCATTTTATTTATTGGGCCAACAAATCTGCTAGGTCTACTACCTCACTCATTTACTCCAACTACGCAATTATCAATAAAACTAGGCATA
++
+?@@FD;:+AHB4<C9F4HEFH@EAA?EE++<<A@FHADDB<?:4D)0*0?D?BFFGIIIDBE:=BDE)=FHGED>;('(5,;;;6;6;>;>(5((5?C(,:
+@SRR639755.6451003/1
+ATATCTGCAGTTAACATAAAAATATAGCACGAAAGTAACTTTAATATCTCCGACCACACGATAGCTAAGACCCAAACTGGGATTAGATACCCCGCTATGCT
++
+<?7A4ADDFHHDHIIBH<HGDHEIG>HBHGGEFH@?D<GDGGHDGGG>DDDFGFHGHGIGB;CEH>A>DEEC?B;;=@CC9;;?CCCCCCC@<9>5<<@A4
+@SRR639755.6451003/2
+CTCATGGGCTACACCTTGACCTAACTTTTTTGTGTTAAGGCACTTGTGCTTACTTTTCTTCCTTTTTAGGGTTTGCTGAAGATGGCGGTATGTAGGCTGAA
++
+@@@F=DDDHGFHHBHEH>HDHIEH8CECCFAGDBHH@DFBHGG@DHBFCHFH@FHIIIGDD@CHIJJF>EECBFFFEEE>AC>CC@@B89?C:B3::AB?>
+@SRR639755.6955680/1
+ATATCTGCAGTAATATTAGCTGTGAGTCGTACGGCTAGGGCTACGGGCTGGATAAAAAGGCTGATAGTTTCGATAATTACGAGCATAGGGATTAAGGGGAC
++
+?BBFFF2=CFFHDGGIGEIJJIHHIGEHIIEHIIJJECG8B?DFAGGGG6FGE@DHIIAEEFFB@C?;BADD?CBB(>@A8<?@DDD@CBC?CCAAC8><9
+@SRR639755.6955680/2
+CCGACTGAGCCACCCAGGCGCCCCTAAATTTGCTTTTAATTTTAATGGTGGTTATATTTATTGGGGCAACAAATCTGCTAGGTCTACTACCTCACTCATTT
++
+@@BFFADDF?FHHJAE:A@GEHHGHGGIGGE9DCHHHIJIIIG@GGBGHID';=7C==@:=.=CH64<?(;?B>35>@:@>9,::@>;5><A>:<:>9444
+@SRR639755.7076397/1
+ATATCTGCAGTAATATTAGCTGTGAGTCGTACGGCTAGGGCTACGGGCTGGATAAAAAGGCTGATAGTTTCGATAATTACGAGCATAGGGATTAAGGGGAC
++
+=@?DFF2=CFHHHJJJJJJJJJJJIJIJIGHIIJIJIJJJIGHIIJJGIIJDGGIJJJHHHFFFFFD>BEDDBFD@CCAC@DDDBCDCDDCDDCDDCBDBB
+@SRR639755.7076397/2
+TGACAGCACATTTATTTACAAACTGGTTTGCTGAATATTTTAATCCCACTGTTGAGGCCTACTACCTCACTCATTTACTCCAACTACCCAATTATCAATAA
++
+CC@FFFFFHFHHHJJJJJJIJJJJJIFHGIIIJIIGEGIJJJJIJIHIJHIJIIJJIIIJJJJJJJIJJJJIJJJJGHHGHGFFFFFEDCDDDDDD;>;AC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/SRR639755_sample_strict.fastq	Tue Jan 27 08:31:13 2015 -0500
@@ -0,0 +1,8 @@
+@SRR639755.6451003/1
+ATATCTGCAGTTAACATAAAAATATAGCACGAAAGTAACTTTAATATCTCCGACCACACGATAGCTAAGACCCAAACTGGGATTAGATACCCCGCTATGCT
++
+<?7A4ADDFHHDHIIBH<HGDHEIG>HBHGGEFH@?D<GDGGHDGGG>DDDFGFHGHGIGB;CEH>A>DEEC?B;;=@CC9;;?CCCCCCC@<9>5<<@A4
+@SRR639755.6451003/2
+CTCATGGGCTACACCTTGACCTAACTTTTTTGTGTTAAGGCACTTGTGCTTACTTTTCTTCCTTTTTAGGGTTTGCTGAAGATGGCGGTATGTAGGCTGAA
++
+@@@F=DDDHGFHHBHEH>HDHIEH8CECCFAGDBHH@DFBHGG@DHBFCHFH@FHIIIGDD@CHIJJF>EECBFFFEEE>AC>CC@@B89?C:B3::AB?>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_filter_by_mapping/README.rst	Tue Jan 27 08:31:13 2015 -0500
@@ -0,0 +1,114 @@
+Galaxy tool to filter FASTA, FASTQ or SFF sequences by SAM/BAM mapping
+======================================================================
+
+This tool is copyright 2014 by Peter Cock, The James Hutton Institute
+(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
+See the licence text below.
+
+This tool is a short Python script (using Biopython library functions) which
+divides a FASTA, FASTQ, or SFF file in two, those sequences which do or do
+not map according to given SAM/BAM file(s).
+
+Example uses include mapping of FASTQ reads against a known contaminant
+in order to remove reads prior to a de novo assembly.
+
+This tool is available from the Galaxy Tool Shed at:
+
+* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping
+
+See also related tools:
+
+* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_id
+* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_select_by_id
+* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_rename
+
+
+Automated Installation
+======================
+
+This should be straightforward using the Galaxy Tool Shed, which should be
+able to automatically install the dependency on Biopython and samtools
+and then install this tool and run its unit tests.
+
+
+Manual Installation
+===================
+
+There are just two files to install to use this tool from within Galaxy:
+
+* ``seq_filter_by_mapping.py`` (the Python script)
+* ``seq_filter_by_mapping.xml`` (the Galaxy tool definition)
+
+The suggested location is a dedicated ``tools/seq_filter_by_mapping/`` folder.
+
+You will also need to modify the ``tools_conf.xml`` file to tell Galaxy to offer the
+tool. One suggested location is in the filters section. Simply add the line::
+
+    <tool file="seq_filter_by_mapping/seq_filter_by_mapping.xml" />
+
+If you wish to run the unit tests, also move/copy the ``test-data/`` files
+under Galaxy's ``test-data/`` folder. Then::
+
+    $ ./run_tests.sh -id seq_filter_by_mapping
+
+You will also need to install Biopython 1.54 or later. That's it.
+
+
+History
+=======
+
+======= ======================================================================
+Version Changes
+------- ----------------------------------------------------------------------
+v0.0.1  - Initial version.
+v0.0.2  - Fixed some error messages.
+======= ======================================================================
+
+
+Developers
+==========
+
+Development is on this GitHub repository:
+https://github.com/peterjc/pico_galaxy/tree/master/tools/seq_filter_by_mapping
+
+Much of the code was copied from my older tool:
+https://github.com/peterjc/pico_galaxy/tree/master/tools/seq_filter_by_id
+
+For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
+the following command from the Galaxy root folder::
+
+    $ tar -czf seq_filter_by_mapping.tar.gz tools/seq_filter_by_mapping/README.rst tools/seq_filter_by_mapping/seq_filter_by_mapping.py tools/seq_filter_by_mapping/seq_filter_by_mapping.xml tools/seq_filter_by_mapping/tool_dependencies.xml test-data/SRR639755_mito_pairs.fastq.gz test-data/SRR639755_sample_by_coord.sam test-data/SRR639755_sample_strict.fastq test-data/SRR639755_sample_lax.fastq
+
+Check this worked::
+
+    $ tar -tzf seq_filter_by_mapping.tar.gz
+    tools/seq_filter_by_mapping/README.rst
+    tools/seq_filter_by_mapping/seq_filter_by_mapping.py
+    tools/seq_filter_by_mapping/seq_filter_by_mapping.xml
+    tools/seq_filter_by_mapping/tool_dependencies.xml
+    test-data/SRR639755_mito_pairs.fastq.gz
+    test-data/SRR639755_sample_by_coord.sam
+    test-data/SRR639755_sample_strict.fastq
+    test-data/SRR639755_sample_lax.fastq
+
+
+Licence (MIT)
+=============
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.py	Tue Jan 27 08:31:13 2015 -0500
@@ -0,0 +1,370 @@
+#!/usr/bin/env python
+"""Filter a FASTA, FASTQ or SSF file with SAM/BAM mapping information.
+
+When filtering an SFF file, any Roche XML manifest in the input file is
+preserved in both output files.
+
+This tool is a short Python script which requires Biopython 1.54 or later
+for SFF file support. If you use this tool in scientific work leading to a
+publication, please cite the Biopython application note:
+
+Cock et al 2009. Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+
+This script is copyright 2014 by Peter Cock, The James Hutton Institute
+(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
+See accompanying text file for licence details (MIT license).
+
+Use -v or --version to get the version, -h or --help for help.
+"""
+import os
+import sys
+import re
+import subprocess
+from optparse import OptionParser
+
+def sys_exit(msg, err=1):
+    sys.stderr.write(msg.rstrip() + "\n")
+    sys.exit(err)
+
+#Parse Command Line
+usage = """Use as follows:
+
+$ python seq_filter_by_mapping.py [options] mapping.sam/bam [more mappings]
+
+e.g. Positive matches using column one from a single BAM file:
+
+$ seq_filter_by_mapping.py -i my_seqs.fastq -f fastq -p matches.fastq mapping.bam
+
+Multiple SAM/BAM mapping files may be given.
+"""
+parser = OptionParser(usage=usage)
+parser.add_option('-i', '--input', dest='input',
+                  default=None, help='Input sequences filename',
+                  metavar="FILE")
+parser.add_option('-f', '--format', dest='format',
+                  default=None,
+                  help='Input sequence format (e.g. fasta, fastq, sff)')
+parser.add_option('-p', '--positive', dest='output_positive',
+                  default=None,
+                  help='Output filename for mapping reads',
+                  metavar="FILE")
+parser.add_option('-n', '--negative', dest='output_negative',
+                  default=None,
+                  help='Output filename for non-mapping reads',
+                  metavar="FILE")
+parser.add_option("-m", "--pair-mode", dest="pair_mode",
+                  default="lax",
+                  help="How to treat paired reads (lax or strict, default lax)")
+parser.add_option("-v", "--version", dest="version",
+                  default=False, action="store_true",
+                  help="Show version and quit")
+
+options, args = parser.parse_args()
+
+if options.version:
+    print "v0.0.2"
+    sys.exit(0)
+
+in_file = options.input
+seq_format = options.format
+out_positive_file = options.output_positive
+out_negative_file = options.output_negative
+pair_mode = options.pair_mode
+
+if in_file is None or not os.path.isfile(in_file):
+    sys_exit("Missing input file: %r" % in_file)
+if out_positive_file is None and out_negative_file is None:
+    sys_exit("Neither output file requested")
+if seq_format is None:
+    sys_exit("Missing sequence format")
+if pair_mode not in ["lax", "strict"]:
+    sys_exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode)
+for mapping in args:
+    if not os.path.isfile(mapping):
+        sys_exit("Mapping file %r not found" % mapping)
+if not args:
+    sys_exit("At least one SAM/BAM mapping file is required")
+
+
+#Cope with three widely used suffix naming convensions,
+#Illumina: /1 or /2
+#Forward/revered: .f or .r
+#Sanger, e.g. .p1k and .q1k
+#See http://staden.sourceforge.net/manual/pregap4_unix_50.html
+#re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
+#re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
+re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$")
+assert re_suffix.search("demo.f")
+assert re_suffix.search("demo.s1")
+assert re_suffix.search("demo.f1k")
+assert re_suffix.search("demo.p1")
+assert re_suffix.search("demo.p1k")
+assert re_suffix.search("demo.p1lk")
+assert re_suffix.search("demo/2")
+assert re_suffix.search("demo.r")
+assert re_suffix.search("demo.q1")
+assert re_suffix.search("demo.q1lk")
+
+def clean_name(name):
+    """Remove suffix."""
+    match = re_suffix.search(name)
+    if match:
+        # Use the fact this is a suffix, and regular expression will be
+        # anchored to the end of the name:
+        return name[:match.start()]
+    else:
+        # Nothing to do
+        return name
+assert clean_name("foo/1") == "foo"
+assert clean_name("foo/2") == "foo"
+assert clean_name("bar.f") == "bar"
+assert clean_name("bar.r") == "bar"
+assert clean_name("baz.p1") == "baz"
+assert clean_name("baz.q2") == "baz"
+
+mapped_chars = { '>' :'__gt__',
+                 '<' :'__lt__',
+                 "'" :'__sq__',
+                 '"' :'__dq__',
+                 '[' :'__ob__',
+                 ']' :'__cb__',
+                 '{' :'__oc__',
+                 '}' :'__cc__',
+                 '@' : '__at__',
+                 '\n' : '__cn__',
+                 '\r' : '__cr__',
+                 '\t' : '__tc__',
+                 '#' : '__pd__'
+                 }
+
+def load_mapping_ids(filename, pair_mode, ids):
+    """Parse SAM/BAM file, updating given set of ids.
+
+    Parses BAM files via call out to samtools view command.
+    """
+    handle = open(filename, "rb")
+    magic = handle.read(4)
+    if magic == b"\x1f\x8b\x08\x04":
+        # Presumably a BAM file then...
+        handle.close()
+        # Call samtools view, don't need header so no -h added:
+        child = subprocess.Popen(["samtools", "view", filename],
+                                 stdin=None,
+                                 stdout=subprocess.PIPE,
+                                 stderr=subprocess.PIPE)
+        handle = child.stdout
+    else:
+        # Presumably a SAM file...
+        child = None
+        handle.seek(0)
+    # Handle should now contain SAM records
+    for line in handle:
+        # Ignore header lines
+        if line[0] != "@":
+            qname, flag, rest = line.split("\t", 2)
+            flag = int(flag)
+            if pair_mode == "lax":
+                # If either read or its partner is mapped, take it!
+                # Being lazy, since we will look at both reads
+                # can just check if (either) has 0x4 clear.
+                if not (flag & 0x4):
+                    ids.add(qname)
+            elif pair_mode == "strict":
+                # For paired reads, require BOTH be mapped.
+                if (flag & 0x4):
+                    # This is unmapped, ignore it
+                    pass
+                elif not (flag & 0x1):
+                    # Unpaired (& mapped) - take it
+                    ids.add(qname)
+                elif not (flag & 0x8):
+                    # Paired and partner also mapped, good
+                    ids.add(qname)
+    if child:
+        # Check terminated normally.
+        stdout, stderr = child.communicate()
+        assert child.returncode is not None
+        if child.returncode:
+            msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode,
+                                                            filename, stderr)
+            sys_exit(msg.strip(), child.returncode)
+    else:
+        handle.close()
+
+
+# Read mapping file(s) and record all mapped identifiers
+ids = set()
+for filename in args:
+    load_mapping_ids(filename, pair_mode, ids)
+# TODO - If want to support naive paired mode, have to record
+# more than just qname (need /1 or /2 indicator)
+print("Loaded %i mapped IDs" % (len(ids)))
+if len(ids) < 10:
+    print("Looking for %s" % ", ".join(sorted(ids)))
+
+def crude_fasta_iterator(handle):
+    """Yields tuples, record ID and the full record as a string."""
+    while True:
+        line = handle.readline()
+        if line == "":
+            return # Premature end of file, or just empty?
+        if line[0] == ">":
+            break
+
+    no_id_warned = False
+    while True:
+        if line[0] != ">":
+            raise ValueError(
+                "Records in Fasta files should start with '>' character")
+        try:
+            id = line[1:].split(None, 1)[0]
+        except IndexError:
+            if not no_id_warned:
+                sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n")
+                no_id_warned = True
+            id = None
+        lines = [line]
+        line = handle.readline()
+        while True:
+            if not line:
+                break
+            if line[0] == ">":
+                break
+            lines.append(line)
+            line = handle.readline()
+        yield id, "".join(lines)
+        if not line:
+            return # StopIteration
+
+
+def fasta_filter(in_file, pos_file, neg_file, wanted):
+    """FASTA filter producing 60 character line wrapped outout."""
+    pos_count = neg_count = 0
+    #Galaxy now requires Python 2.5+ so can use with statements,
+    with open(in_file) as in_handle:
+        #Doing the if statement outside the loop for speed
+        #(with the downside of three very similar loops).
+        if pos_file is not None and neg_file is not None:
+            print "Generating two FASTA files"
+            with open(pos_file, "w") as pos_handle:
+                with open(neg_file, "w") as neg_handle:
+                    for identifier, record in crude_fasta_iterator(in_handle):
+                        if clean_name(identifier) in wanted:
+                            pos_handle.write(record)
+                            pos_count += 1
+                        else:
+                            neg_handle.write(record)
+                            neg_count += 1
+        elif pos_file is not None:
+            print "Generating matching FASTA file"
+            with open(pos_file, "w") as pos_handle:
+                for identifier, record in crude_fasta_iterator(in_handle):
+                    if clean_name(identifier) in wanted:
+                        pos_handle.write(record)
+                        pos_count += 1
+                    else:
+                        neg_count += 1
+        else:
+            print "Generating non-matching FASTA file"
+            assert neg_file is not None
+            with open(neg_file, "w") as neg_handle:
+                for identifier, record in crude_fasta_iterator(in_handle):
+                    if clean_name(identifier) in wanted:
+                        pos_count += 1
+                    else:
+                        neg_handle.write(record)
+                        neg_count += 1
+    return pos_count, neg_count
+
+
+def fastq_filter(in_file, pos_file, neg_file, wanted):
+    """FASTQ filter."""
+    from Bio.SeqIO.QualityIO import FastqGeneralIterator
+    handle = open(in_file, "r")
+    if out_positive_file is not None and out_negative_file is not None:
+        print "Generating two FASTQ files"
+        positive_handle = open(out_positive_file, "w")
+        negative_handle = open(out_negative_file, "w")
+        print in_file
+        for title, seq, qual in FastqGeneralIterator(handle):
+            # print("%s --> %s" % (title, clean_name(title.split(None, 1)[0])))
+            if clean_name(title.split(None, 1)[0]) in ids:
+                positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
+            else:
+                negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
+        positive_handle.close()
+        negative_handle.close()
+    elif out_positive_file is not None:
+        print "Generating matching FASTQ file"
+        positive_handle = open(out_positive_file, "w")
+        for title, seq, qual in FastqGeneralIterator(handle):
+            if clean_name(title.split(None, 1)[0]) in ids:
+                positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
+        positive_handle.close()
+    elif out_negative_file is not None:
+        print "Generating non-matching FASTQ file"
+        negative_handle = open(out_negative_file, "w")
+        for title, seq, qual in FastqGeneralIterator(handle):
+            if clean_name(title.split(None, 1)[0]) not in ids:
+                negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
+        negative_handle.close()
+    handle.close()
+    # This does not currently bother to record record counts (faster)
+
+def sff_filter(in_file, pos_file, neg_file, wanted):
+    """SFF filter."""
+    try:
+        from Bio.SeqIO.SffIO import SffIterator, SffWriter
+    except ImportError:
+        sys_exit("SFF filtering requires Biopython 1.54 or later")
+
+    try:
+        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
+    except ImportError:
+        #Prior to Biopython 1.56 this was a private function
+        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
+
+    in_handle = open(in_file, "rb") #must be binary mode!
+    try:
+        manifest = ReadRocheXmlManifest(in_handle)
+    except ValueError:
+        manifest = None
+
+    #This makes two passes though the SFF file with isn't so efficient,
+    #but this makes the code simple.
+    pos_count = neg_count = 0
+    if out_positive_file is not None:
+        out_handle = open(out_positive_file, "wb")
+        writer = SffWriter(out_handle, xml=manifest)
+        in_handle.seek(0) #start again after getting manifest
+        pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids)
+        out_handle.close()
+    if out_negative_file is not None:
+        out_handle = open(out_negative_file, "wb")
+        writer = SffWriter(out_handle, xml=manifest)
+        in_handle.seek(0) #start again
+        neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids)
+        out_handle.close()
+    #And we're done
+    in_handle.close()
+    return pos_count, neg_count
+
+
+if seq_format.lower()=="sff":
+    # Now write filtered SFF file based on IDs wanted
+    pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids)
+    # At the time of writing, Galaxy doesn't show SFF file read counts,
+    # so it is useful to put them in stdout and thus shown in job info.
+    print "%i with and %i without specified IDs" % (pos_count, neg_count)
+elif seq_format.lower()=="fasta":
+    # Write filtered FASTA file based on IDs from tabular file
+    pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
+    print "%i with and %i without specified IDs" % (pos_count, neg_count)
+elif seq_format.lower().startswith("fastq"):
+    #Write filtered FASTQ file based on IDs from mapping file
+    fastq_filter(in_file, out_positive_file, out_negative_file, ids)
+    # This does not currently track the counts
+else:
+    sys_exit("Unsupported file type %r" % seq_format)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml	Tue Jan 27 08:31:13 2015 -0500
@@ -0,0 +1,113 @@
+<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.2">
+    <description>from SAM/BAM file</description>
+    <requirements>
+        <requirement type="package" version="1.64">biopython</requirement>
+        <requirement type="python-module">Bio</requirement>
+        <requirement type="binary">samtools</requirement>
+        <requirement type="package" version="0.1.19">samtools</requirement>
+    </requirements>
+    <version_command interpreter="python">seq_filter_by_mapping.py --version</version_command>
+    <command interpreter="python">
+seq_filter_by_mapping.py -i "$input_file" -f "$input_file.ext" -m $pair_mode
+#if $output_choice_cond.output_choice=="both"
+ -p $output_pos -n $output_neg
+#elif $output_choice_cond.output_choice=="pos"
+ -p $output_pos
+#elif $output_choice_cond.output_choice=="neg"
+ -n $output_neg
+#end if
+## Now loop over all the mapping files
+#for i in $mapping_file#${i} #end for#
+    </command>
+    <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
+    <inputs>
+        <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to be filtered" help="FASTA, FASTQ, or SFF format." />
+	<param name="mapping_file" type="data" format="sam,bam" multiple="true" label="SAM/BAM mapping of those sequences" help="SAM or BAM format." />
+        <conditional name="output_choice_cond">
+            <param name="output_choice" type="select" label="Output mapped reads, unmapped reads, or both?">
+                <option value="both">Both mapped and unmapped reads, as two files</option>
+                <option value="pos">Just mapped reads, as a single file</option>
+                <option value="neg">Just unmapped reads, as a single file</option>
+            </param>
+            <!-- Seems need these dummy entries here, compare this to indels/indel_sam2interval.xml -->
+            <when value="both" />
+            <when value="pos" />
+            <when value="neg" />
+        </conditional>
+	<param name="pair_mode" type="select" label="Paired read treatment">
+            <option value="lax" selected="true">Treat as a pair, allow either read to be mapped</option>
+	    <option value="strict">Treat as a pair, require both reads to be mapped</option>
+	    <!-- The following would actually be more work as have to store qname/1 and qname/2 separately for filter...
+            <option value="solo">Treat independently (will split partners when only one maps)</option>
+	    -->
+        </param>
+    </inputs>
+    <outputs>
+        <data name="output_pos" format="input" metadata_source="input_file" label="$input_file.name (mapped)">
+            <filter>output_choice_cond["output_choice"] != "neg"</filter>
+        </data>
+        <data name="output_neg" format="input" metadata_source="input_file" label="$input_file.name (unmapped)">
+            <filter>output_choice_cond["output_choice"] != "pos"</filter>
+        </data>
+    </outputs>
+    <tests>
+	<test>
+            <param name="input_file" value="SRR639755_mito_pairs.fastq.gz" ftype="fastqsanger" />
+            <param name="mapping_file" value="SRR639755_sample_by_coord.sam" ftype="sam" />
+            <param name="pair_mode" value="lax" />
+            <param name="output_choice" value="pos" />
+            <output name="output_pos" file="SRR639755_sample_lax.fastq" ftype="fastqsanger" />
+        </test>
+	<test>
+            <param name="input_file" value="SRR639755_mito_pairs.fastq.gz" ftype="fastqsanger" />
+            <param name="mapping_file" value="SRR639755_sample_by_coord.sam" ftype="sam" />
+            <param name="pair_mode" value="strict" />
+            <param name="output_choice" value="pos" />
+            <output name="output_pos" file="SRR639755_sample_strict.fastq" ftype="fastqsanger" />
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+By default it divides a FASTA, FASTQ or Standard Flowgram Format (SFF) file in
+two, those sequences (or read pairs) which do or don't map in the provided
+SAM/BAM file. You can opt to have a single output file of just the mapping reads,
+or just the non-mapping ones.
+
+**Example Usage**
+
+You might wish to perform a contamination screan by mapping your reads against
+known contaminant reference sequences, then use this tool to select only the
+unmapped reads for further analysis (e.g. *de novo* assembly).
+
+Similarly you might wish to map your reads against a known bacterial reference,
+then take the non-mapping sequences forward for analysis if looking for novel
+plasmids.
+
+
+**References**
+
+If you use this Galaxy tool in work leading to a scientific publication please
+cite:
+
+Peter J.A. Cock (2014), Galaxy tool for filtering reads by mapping
+http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping
+
+This tool uses Biopython to read and write SFF files, so you may also wish to
+cite the Biopython application note (and Galaxy too of course):
+
+Cock et al (2009). Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+
+This tool is available to install into other Galaxy Instances via the Galaxy
+Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping
+    </help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btp163</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_filter_by_mapping/tool_dependencies.xml	Tue Jan 27 08:31:13 2015 -0500
@@ -0,0 +1,9 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="biopython" version="1.64">
+        <repository changeset_revision="5477a05cc158" name="package_biopython_1_64" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="samtools" version="0.1.19">
+        <repository changeset_revision="923adc89c666" name="package_samtools_0_1_19" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>