Mercurial > repos > peterjc > seq_filter_by_mapping
changeset 0:1d773da0ccf0 draft
Uploaded v0.0.2, fixed some error messages
author | peterjc |
---|---|
date | Tue, 27 Jan 2015 08:31:13 -0500 |
parents | |
children | 8ff0ac66f1a3 |
files | test-data/SRR639755_mito_pairs.fastq.gz test-data/SRR639755_sample_by_coord.sam test-data/SRR639755_sample_lax.fastq test-data/SRR639755_sample_strict.fastq tools/seq_filter_by_mapping/README.rst tools/seq_filter_by_mapping/seq_filter_by_mapping.py tools/seq_filter_by_mapping/seq_filter_by_mapping.xml tools/seq_filter_by_mapping/tool_dependencies.xml |
diffstat | 8 files changed, 668 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/SRR639755_sample_by_coord.sam Tue Jan 27 08:31:13 2015 -0500 @@ -0,0 +1,14 @@ +@HD VN:1.0 SO:coordinate +@SQ SN:gi|187250362|ref|NC_010642.1| LN:16990 +@PG ID:clc_mapper VN:4.10.86742 CL:clc_mapper -d -z xxx -p fb ss 1 1000 -q yyy -o temp_job.cas --cpus 8 +@RG ID:RG_0 DS:yyy PG:clc_mapper +SRR639755.6451003 99 gi|187250362|ref|NC_010642.1| 1352 255 7S94M = 1604 353 ATATCTGCAGTTAACATAAAAATATAGCACGAAAGTAACTTTAATATCTCCGACCACACGATAGCTAAGACCCAAACTGGGATTAGATACCCCGCTATGCT <?7A4ADDFHHDHIIBH<HGDHEIG>HBHGGEFH@?D<GDGGHDGGG>DDDFGFHGHGIGB;CEH>A>DEEC?B;;=@CC9;;?CCCCCCC@<9>5<<@A4 RG:Z:RG_0 MD:Z:8G8C22T2T42A7 NM:i:11 +SRR639755.6451003 147 gi|187250362|ref|NC_010642.1| 1604 255 101M = 1352 353 TTCAGCCTACATACCGCCATCTTCAGCAAACCCTAAAAAGGAAGAAAAGTAAGCACAAGTGCCTTAACACAAAAAAGTTAGGTCAAGGTGTAGCCCATGAG >?BA::3B:C?98B@@CC>CA>EEEFFFBCEE>FJJIHC@DDGIIIHF@HFHCFBHD@GGHBFD@HHBDGAFCCEC8HEIHDH>HEHBHHFGHDDD=F@@@ RG:Z:RG_0 MD:Z:9T50AT7T31 NM:i:4 +SRR639755.6035319 137 gi|187250362|ref|NC_010642.1| 9094 255 101M * 0 0 TACTTATGTCTCTCATTTTATTTATTGGGCCAACAAATCTGCTAGGTCTACTACCTCACTCATTTACTCCAACTACGCAATTATCAATAAAACTAGGCATA ?@@FD;:+AHB4<C9F4HEFH@EAA?EE++<<A@FHADDB<?:4D)0*0?D?BFFGIIIDBE:=BDE)=FHGED>;('(5,;;;6;6;>;>(5((5?C(,: RG:Z:RG_0 MD:Z:29T46C14C9 NM:i:3 +SRR639755.2301126 137 gi|187250362|ref|NC_010642.1| 9117 255 101M * 0 0 ATTGGGTCAACAAATCTGCTAGGTCTACTACCTCACTCATTTACTCCAACTACCCAATTATCAATAAACCTAGGCATAGCCATCCCCTTGTGAGCCGGCAC ?@BDB;=AAAAFFDHBGAAHI>;<AFCAF@?GEDEEFHAHFH@<F<D@GGEHIEF@HIE>GH=B@CCGAHC@@DEHHFEHDFFFFDDE>?CCDACCBDB@B RG:Z:RG_0 MD:Z:101 NM:i:0 +SRR639755.6955680 89 gi|187250362|ref|NC_010642.1| 9285 255 97M4S * 0 0 GTCCCCTTAATCCCTATGCTCGTAATTATCGAAACTATCAGCCTTTTTATCCAGCCCGTAGCCCTAGCCGTACGACTCACAGCTAATATTACTGCAGATAT 9<>8CAACC?CBC@DDD@?<8A@>(BBC?DDAB;?C@BFFEEAIIHD@EGF6GGGGAFD?B8GCEJJIIHEIIHEGIHHIJJIEGIGGDHFFC=2FFFBB? RG:Z:RG_0 MD:Z:97 NM:i:0 +SRR639755.7076397 89 gi|187250362|ref|NC_010642.1| 9285 255 97M4S * 0 0 GTCCCCTTAATCCCTATGCTCGTAATTATCGAAACTATCAGCCTTTTTATCCAGCCCGTAGCCCTAGCCGTACGACTCACAGCTAATATTACTGCAGATAT BBDBCDDCDDCDDCDCBDDD@CACC@DFBDDEB>DFFFFFHHHJJJIGGDJIIGJJIIHGIJJJIJIJIIHGIJIJIJJJJJJJJJJJHHHFC=2FFD?@= RG:Z:RG_0 MD:Z:97 NM:i:0 +SRR639755.2301126 69 * 0 255 * gi|187250362|ref|NC_010642.1| 9117 0 ATATCTGCAGTGAGCCTGACTTGGGGCTCGAACTCATGAACCATGAGATCATGACCTGAGCTGAAGTCGTACGGCTAGGGCTACGGGCTGGATAAAAAGGC +1=BDDDDHHHFBF?GDECFGHDEFH@?C:?F@;DH@4B?9)?<BF?=)8C@G=88CC@EHICHH>7=;?;?><A>=A;?(<?9'2-<>B91(>@@CC5?< RG:Z:RG_0 +SRR639755.6035319 69 * 0 255 * gi|187250362|ref|NC_010642.1| 9094 0 ATATCTGCAGTAATATTAGCTGTGAGTCGTACGGCTAGGGCCACGGGCTGGGTAGCAACGATGCTGGGGTGCAGAATTAGGCGGCTGGGCATTAAAGCGAC 1:;44222222<ABDGG@I?4CFD<ACBG?8ACF@0???B9)0)7A0<@8'().()).(,(.5,((-,',,(((,,(((++(0&&0&&(((+((((+()(& RG:Z:RG_0 +SRR639755.6955680 165 * 0 255 * gi|187250362|ref|NC_010642.1| 9285 0 CCGACTGAGCCACCCAGGCGCCCCTAAATTTGCTTTTAATTTTAATGGTGGTTATATTTATTGGGGCAACAAATCTGCTAGGTCTACTACCTCACTCATTT @@BFFADDF?FHHJAE:A@GEHHGHGGIGGE9DCHHHIJIIIG@GGBGHID';=7C==@:=.=CH64<?(;?B>35>@:@>9,::@>;5><A>:<:>9444 RG:Z:RG_0 +SRR639755.7076397 165 * 0 255 * gi|187250362|ref|NC_010642.1| 9285 0 TGACAGCACATTTATTTACAAACTGGTTTGCTGAATATTTTAATCCCACTGTTGAGGCCTACTACCTCACTCATTTACTCCAACTACCCAATTATCAATAA CC@FFFFFHFHHHJJJJJJIJJJJJIFHGIIIJIIGEGIJJJJIJIHIJHIJIIJJIIIJJJJJJJIJJJJIJJJJGHHGHGFFFFFEDCDDDDDD;>;AC RG:Z:RG_0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/SRR639755_sample_lax.fastq Tue Jan 27 08:31:13 2015 -0500 @@ -0,0 +1,40 @@ +@SRR639755.2301126/1 +ATATCTGCAGTGAGCCTGACTTGGGGCTCGAACTCATGAACCATGAGATCATGACCTGAGCTGAAGTCGTACGGCTAGGGCTACGGGCTGGATAAAAAGGC ++ ++1=BDDDDHHHFBF?GDECFGHDEFH@?C:?F@;DH@4B?9)?<BF?=)8C@G=88CC@EHICHH>7=;?;?><A>=A;?(<?9'2-<>B91(>@@CC5?< +@SRR639755.2301126/2 +ATTGGGTCAACAAATCTGCTAGGTCTACTACCTCACTCATTTACTCCAACTACCCAATTATCAATAAACCTAGGCATAGCCATCCCCTTGTGAGCCGGCAC ++ +?@BDB;=AAAAFFDHBGAAHI>;<AFCAF@?GEDEEFHAHFH@<F<D@GGEHIEF@HIE>GH=B@CCGAHC@@DEHHFEHDFFFFDDE>?CCDACCBDB@B +@SRR639755.6035319/1 +ATATCTGCAGTAATATTAGCTGTGAGTCGTACGGCTAGGGCCACGGGCTGGGTAGCAACGATGCTGGGGTGCAGAATTAGGCGGCTGGGCATTAAAGCGAC ++ +1:;44222222<ABDGG@I?4CFD<ACBG?8ACF@0???B9)0)7A0<@8'().()).(,(.5,((-,',,(((,,(((++(0&&0&&(((+((((+()(& +@SRR639755.6035319/2 +TACTTATGTCTCTCATTTTATTTATTGGGCCAACAAATCTGCTAGGTCTACTACCTCACTCATTTACTCCAACTACGCAATTATCAATAAAACTAGGCATA ++ +?@@FD;:+AHB4<C9F4HEFH@EAA?EE++<<A@FHADDB<?:4D)0*0?D?BFFGIIIDBE:=BDE)=FHGED>;('(5,;;;6;6;>;>(5((5?C(,: +@SRR639755.6451003/1 +ATATCTGCAGTTAACATAAAAATATAGCACGAAAGTAACTTTAATATCTCCGACCACACGATAGCTAAGACCCAAACTGGGATTAGATACCCCGCTATGCT ++ +<?7A4ADDFHHDHIIBH<HGDHEIG>HBHGGEFH@?D<GDGGHDGGG>DDDFGFHGHGIGB;CEH>A>DEEC?B;;=@CC9;;?CCCCCCC@<9>5<<@A4 +@SRR639755.6451003/2 +CTCATGGGCTACACCTTGACCTAACTTTTTTGTGTTAAGGCACTTGTGCTTACTTTTCTTCCTTTTTAGGGTTTGCTGAAGATGGCGGTATGTAGGCTGAA ++ +@@@F=DDDHGFHHBHEH>HDHIEH8CECCFAGDBHH@DFBHGG@DHBFCHFH@FHIIIGDD@CHIJJF>EECBFFFEEE>AC>CC@@B89?C:B3::AB?> +@SRR639755.6955680/1 +ATATCTGCAGTAATATTAGCTGTGAGTCGTACGGCTAGGGCTACGGGCTGGATAAAAAGGCTGATAGTTTCGATAATTACGAGCATAGGGATTAAGGGGAC ++ +?BBFFF2=CFFHDGGIGEIJJIHHIGEHIIEHIIJJECG8B?DFAGGGG6FGE@DHIIAEEFFB@C?;BADD?CBB(>@A8<?@DDD@CBC?CCAAC8><9 +@SRR639755.6955680/2 +CCGACTGAGCCACCCAGGCGCCCCTAAATTTGCTTTTAATTTTAATGGTGGTTATATTTATTGGGGCAACAAATCTGCTAGGTCTACTACCTCACTCATTT ++ +@@BFFADDF?FHHJAE:A@GEHHGHGGIGGE9DCHHHIJIIIG@GGBGHID';=7C==@:=.=CH64<?(;?B>35>@:@>9,::@>;5><A>:<:>9444 +@SRR639755.7076397/1 +ATATCTGCAGTAATATTAGCTGTGAGTCGTACGGCTAGGGCTACGGGCTGGATAAAAAGGCTGATAGTTTCGATAATTACGAGCATAGGGATTAAGGGGAC ++ +=@?DFF2=CFHHHJJJJJJJJJJJIJIJIGHIIJIJIJJJIGHIIJJGIIJDGGIJJJHHHFFFFFD>BEDDBFD@CCAC@DDDBCDCDDCDDCDDCBDBB +@SRR639755.7076397/2 +TGACAGCACATTTATTTACAAACTGGTTTGCTGAATATTTTAATCCCACTGTTGAGGCCTACTACCTCACTCATTTACTCCAACTACCCAATTATCAATAA ++ +CC@FFFFFHFHHHJJJJJJIJJJJJIFHGIIIJIIGEGIJJJJIJIHIJHIJIIJJIIIJJJJJJJIJJJJIJJJJGHHGHGFFFFFEDCDDDDDD;>;AC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/SRR639755_sample_strict.fastq Tue Jan 27 08:31:13 2015 -0500 @@ -0,0 +1,8 @@ +@SRR639755.6451003/1 +ATATCTGCAGTTAACATAAAAATATAGCACGAAAGTAACTTTAATATCTCCGACCACACGATAGCTAAGACCCAAACTGGGATTAGATACCCCGCTATGCT ++ +<?7A4ADDFHHDHIIBH<HGDHEIG>HBHGGEFH@?D<GDGGHDGGG>DDDFGFHGHGIGB;CEH>A>DEEC?B;;=@CC9;;?CCCCCCC@<9>5<<@A4 +@SRR639755.6451003/2 +CTCATGGGCTACACCTTGACCTAACTTTTTTGTGTTAAGGCACTTGTGCTTACTTTTCTTCCTTTTTAGGGTTTGCTGAAGATGGCGGTATGTAGGCTGAA ++ +@@@F=DDDHGFHHBHEH>HDHIEH8CECCFAGDBHH@DFBHGG@DHBFCHFH@FHIIIGDD@CHIJJF>EECBFFFEEE>AC>CC@@B89?C:B3::AB?>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_filter_by_mapping/README.rst Tue Jan 27 08:31:13 2015 -0500 @@ -0,0 +1,114 @@ +Galaxy tool to filter FASTA, FASTQ or SFF sequences by SAM/BAM mapping +====================================================================== + +This tool is copyright 2014 by Peter Cock, The James Hutton Institute +(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +See the licence text below. + +This tool is a short Python script (using Biopython library functions) which +divides a FASTA, FASTQ, or SFF file in two, those sequences which do or do +not map according to given SAM/BAM file(s). + +Example uses include mapping of FASTQ reads against a known contaminant +in order to remove reads prior to a de novo assembly. + +This tool is available from the Galaxy Tool Shed at: + +* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping + +See also related tools: + +* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_id +* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_select_by_id +* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_rename + + +Automated Installation +====================== + +This should be straightforward using the Galaxy Tool Shed, which should be +able to automatically install the dependency on Biopython and samtools +and then install this tool and run its unit tests. + + +Manual Installation +=================== + +There are just two files to install to use this tool from within Galaxy: + +* ``seq_filter_by_mapping.py`` (the Python script) +* ``seq_filter_by_mapping.xml`` (the Galaxy tool definition) + +The suggested location is a dedicated ``tools/seq_filter_by_mapping/`` folder. + +You will also need to modify the ``tools_conf.xml`` file to tell Galaxy to offer the +tool. One suggested location is in the filters section. Simply add the line:: + + <tool file="seq_filter_by_mapping/seq_filter_by_mapping.xml" /> + +If you wish to run the unit tests, also move/copy the ``test-data/`` files +under Galaxy's ``test-data/`` folder. Then:: + + $ ./run_tests.sh -id seq_filter_by_mapping + +You will also need to install Biopython 1.54 or later. That's it. + + +History +======= + +======= ====================================================================== +Version Changes +------- ---------------------------------------------------------------------- +v0.0.1 - Initial version. +v0.0.2 - Fixed some error messages. +======= ====================================================================== + + +Developers +========== + +Development is on this GitHub repository: +https://github.com/peterjc/pico_galaxy/tree/master/tools/seq_filter_by_mapping + +Much of the code was copied from my older tool: +https://github.com/peterjc/pico_galaxy/tree/master/tools/seq_filter_by_id + +For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use +the following command from the Galaxy root folder:: + + $ tar -czf seq_filter_by_mapping.tar.gz tools/seq_filter_by_mapping/README.rst tools/seq_filter_by_mapping/seq_filter_by_mapping.py tools/seq_filter_by_mapping/seq_filter_by_mapping.xml tools/seq_filter_by_mapping/tool_dependencies.xml test-data/SRR639755_mito_pairs.fastq.gz test-data/SRR639755_sample_by_coord.sam test-data/SRR639755_sample_strict.fastq test-data/SRR639755_sample_lax.fastq + +Check this worked:: + + $ tar -tzf seq_filter_by_mapping.tar.gz + tools/seq_filter_by_mapping/README.rst + tools/seq_filter_by_mapping/seq_filter_by_mapping.py + tools/seq_filter_by_mapping/seq_filter_by_mapping.xml + tools/seq_filter_by_mapping/tool_dependencies.xml + test-data/SRR639755_mito_pairs.fastq.gz + test-data/SRR639755_sample_by_coord.sam + test-data/SRR639755_sample_strict.fastq + test-data/SRR639755_sample_lax.fastq + + +Licence (MIT) +============= + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.py Tue Jan 27 08:31:13 2015 -0500 @@ -0,0 +1,370 @@ +#!/usr/bin/env python +"""Filter a FASTA, FASTQ or SSF file with SAM/BAM mapping information. + +When filtering an SFF file, any Roche XML manifest in the input file is +preserved in both output files. + +This tool is a short Python script which requires Biopython 1.54 or later +for SFF file support. If you use this tool in scientific work leading to a +publication, please cite the Biopython application note: + +Cock et al 2009. Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + +This script is copyright 2014 by Peter Cock, The James Hutton Institute +(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. +See accompanying text file for licence details (MIT license). + +Use -v or --version to get the version, -h or --help for help. +""" +import os +import sys +import re +import subprocess +from optparse import OptionParser + +def sys_exit(msg, err=1): + sys.stderr.write(msg.rstrip() + "\n") + sys.exit(err) + +#Parse Command Line +usage = """Use as follows: + +$ python seq_filter_by_mapping.py [options] mapping.sam/bam [more mappings] + +e.g. Positive matches using column one from a single BAM file: + +$ seq_filter_by_mapping.py -i my_seqs.fastq -f fastq -p matches.fastq mapping.bam + +Multiple SAM/BAM mapping files may be given. +""" +parser = OptionParser(usage=usage) +parser.add_option('-i', '--input', dest='input', + default=None, help='Input sequences filename', + metavar="FILE") +parser.add_option('-f', '--format', dest='format', + default=None, + help='Input sequence format (e.g. fasta, fastq, sff)') +parser.add_option('-p', '--positive', dest='output_positive', + default=None, + help='Output filename for mapping reads', + metavar="FILE") +parser.add_option('-n', '--negative', dest='output_negative', + default=None, + help='Output filename for non-mapping reads', + metavar="FILE") +parser.add_option("-m", "--pair-mode", dest="pair_mode", + default="lax", + help="How to treat paired reads (lax or strict, default lax)") +parser.add_option("-v", "--version", dest="version", + default=False, action="store_true", + help="Show version and quit") + +options, args = parser.parse_args() + +if options.version: + print "v0.0.2" + sys.exit(0) + +in_file = options.input +seq_format = options.format +out_positive_file = options.output_positive +out_negative_file = options.output_negative +pair_mode = options.pair_mode + +if in_file is None or not os.path.isfile(in_file): + sys_exit("Missing input file: %r" % in_file) +if out_positive_file is None and out_negative_file is None: + sys_exit("Neither output file requested") +if seq_format is None: + sys_exit("Missing sequence format") +if pair_mode not in ["lax", "strict"]: + sys_exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode) +for mapping in args: + if not os.path.isfile(mapping): + sys_exit("Mapping file %r not found" % mapping) +if not args: + sys_exit("At least one SAM/BAM mapping file is required") + + +#Cope with three widely used suffix naming convensions, +#Illumina: /1 or /2 +#Forward/revered: .f or .r +#Sanger, e.g. .p1k and .q1k +#See http://staden.sourceforge.net/manual/pregap4_unix_50.html +#re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") +#re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") +re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$") +assert re_suffix.search("demo.f") +assert re_suffix.search("demo.s1") +assert re_suffix.search("demo.f1k") +assert re_suffix.search("demo.p1") +assert re_suffix.search("demo.p1k") +assert re_suffix.search("demo.p1lk") +assert re_suffix.search("demo/2") +assert re_suffix.search("demo.r") +assert re_suffix.search("demo.q1") +assert re_suffix.search("demo.q1lk") + +def clean_name(name): + """Remove suffix.""" + match = re_suffix.search(name) + if match: + # Use the fact this is a suffix, and regular expression will be + # anchored to the end of the name: + return name[:match.start()] + else: + # Nothing to do + return name +assert clean_name("foo/1") == "foo" +assert clean_name("foo/2") == "foo" +assert clean_name("bar.f") == "bar" +assert clean_name("bar.r") == "bar" +assert clean_name("baz.p1") == "baz" +assert clean_name("baz.q2") == "baz" + +mapped_chars = { '>' :'__gt__', + '<' :'__lt__', + "'" :'__sq__', + '"' :'__dq__', + '[' :'__ob__', + ']' :'__cb__', + '{' :'__oc__', + '}' :'__cc__', + '@' : '__at__', + '\n' : '__cn__', + '\r' : '__cr__', + '\t' : '__tc__', + '#' : '__pd__' + } + +def load_mapping_ids(filename, pair_mode, ids): + """Parse SAM/BAM file, updating given set of ids. + + Parses BAM files via call out to samtools view command. + """ + handle = open(filename, "rb") + magic = handle.read(4) + if magic == b"\x1f\x8b\x08\x04": + # Presumably a BAM file then... + handle.close() + # Call samtools view, don't need header so no -h added: + child = subprocess.Popen(["samtools", "view", filename], + stdin=None, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + handle = child.stdout + else: + # Presumably a SAM file... + child = None + handle.seek(0) + # Handle should now contain SAM records + for line in handle: + # Ignore header lines + if line[0] != "@": + qname, flag, rest = line.split("\t", 2) + flag = int(flag) + if pair_mode == "lax": + # If either read or its partner is mapped, take it! + # Being lazy, since we will look at both reads + # can just check if (either) has 0x4 clear. + if not (flag & 0x4): + ids.add(qname) + elif pair_mode == "strict": + # For paired reads, require BOTH be mapped. + if (flag & 0x4): + # This is unmapped, ignore it + pass + elif not (flag & 0x1): + # Unpaired (& mapped) - take it + ids.add(qname) + elif not (flag & 0x8): + # Paired and partner also mapped, good + ids.add(qname) + if child: + # Check terminated normally. + stdout, stderr = child.communicate() + assert child.returncode is not None + if child.returncode: + msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode, + filename, stderr) + sys_exit(msg.strip(), child.returncode) + else: + handle.close() + + +# Read mapping file(s) and record all mapped identifiers +ids = set() +for filename in args: + load_mapping_ids(filename, pair_mode, ids) +# TODO - If want to support naive paired mode, have to record +# more than just qname (need /1 or /2 indicator) +print("Loaded %i mapped IDs" % (len(ids))) +if len(ids) < 10: + print("Looking for %s" % ", ".join(sorted(ids))) + +def crude_fasta_iterator(handle): + """Yields tuples, record ID and the full record as a string.""" + while True: + line = handle.readline() + if line == "": + return # Premature end of file, or just empty? + if line[0] == ">": + break + + no_id_warned = False + while True: + if line[0] != ">": + raise ValueError( + "Records in Fasta files should start with '>' character") + try: + id = line[1:].split(None, 1)[0] + except IndexError: + if not no_id_warned: + sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") + no_id_warned = True + id = None + lines = [line] + line = handle.readline() + while True: + if not line: + break + if line[0] == ">": + break + lines.append(line) + line = handle.readline() + yield id, "".join(lines) + if not line: + return # StopIteration + + +def fasta_filter(in_file, pos_file, neg_file, wanted): + """FASTA filter producing 60 character line wrapped outout.""" + pos_count = neg_count = 0 + #Galaxy now requires Python 2.5+ so can use with statements, + with open(in_file) as in_handle: + #Doing the if statement outside the loop for speed + #(with the downside of three very similar loops). + if pos_file is not None and neg_file is not None: + print "Generating two FASTA files" + with open(pos_file, "w") as pos_handle: + with open(neg_file, "w") as neg_handle: + for identifier, record in crude_fasta_iterator(in_handle): + if clean_name(identifier) in wanted: + pos_handle.write(record) + pos_count += 1 + else: + neg_handle.write(record) + neg_count += 1 + elif pos_file is not None: + print "Generating matching FASTA file" + with open(pos_file, "w") as pos_handle: + for identifier, record in crude_fasta_iterator(in_handle): + if clean_name(identifier) in wanted: + pos_handle.write(record) + pos_count += 1 + else: + neg_count += 1 + else: + print "Generating non-matching FASTA file" + assert neg_file is not None + with open(neg_file, "w") as neg_handle: + for identifier, record in crude_fasta_iterator(in_handle): + if clean_name(identifier) in wanted: + pos_count += 1 + else: + neg_handle.write(record) + neg_count += 1 + return pos_count, neg_count + + +def fastq_filter(in_file, pos_file, neg_file, wanted): + """FASTQ filter.""" + from Bio.SeqIO.QualityIO import FastqGeneralIterator + handle = open(in_file, "r") + if out_positive_file is not None and out_negative_file is not None: + print "Generating two FASTQ files" + positive_handle = open(out_positive_file, "w") + negative_handle = open(out_negative_file, "w") + print in_file + for title, seq, qual in FastqGeneralIterator(handle): + # print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) + if clean_name(title.split(None, 1)[0]) in ids: + positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + else: + negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + positive_handle.close() + negative_handle.close() + elif out_positive_file is not None: + print "Generating matching FASTQ file" + positive_handle = open(out_positive_file, "w") + for title, seq, qual in FastqGeneralIterator(handle): + if clean_name(title.split(None, 1)[0]) in ids: + positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + positive_handle.close() + elif out_negative_file is not None: + print "Generating non-matching FASTQ file" + negative_handle = open(out_negative_file, "w") + for title, seq, qual in FastqGeneralIterator(handle): + if clean_name(title.split(None, 1)[0]) not in ids: + negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + negative_handle.close() + handle.close() + # This does not currently bother to record record counts (faster) + +def sff_filter(in_file, pos_file, neg_file, wanted): + """SFF filter.""" + try: + from Bio.SeqIO.SffIO import SffIterator, SffWriter + except ImportError: + sys_exit("SFF filtering requires Biopython 1.54 or later") + + try: + from Bio.SeqIO.SffIO import ReadRocheXmlManifest + except ImportError: + #Prior to Biopython 1.56 this was a private function + from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest + + in_handle = open(in_file, "rb") #must be binary mode! + try: + manifest = ReadRocheXmlManifest(in_handle) + except ValueError: + manifest = None + + #This makes two passes though the SFF file with isn't so efficient, + #but this makes the code simple. + pos_count = neg_count = 0 + if out_positive_file is not None: + out_handle = open(out_positive_file, "wb") + writer = SffWriter(out_handle, xml=manifest) + in_handle.seek(0) #start again after getting manifest + pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids) + out_handle.close() + if out_negative_file is not None: + out_handle = open(out_negative_file, "wb") + writer = SffWriter(out_handle, xml=manifest) + in_handle.seek(0) #start again + neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids) + out_handle.close() + #And we're done + in_handle.close() + return pos_count, neg_count + + +if seq_format.lower()=="sff": + # Now write filtered SFF file based on IDs wanted + pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids) + # At the time of writing, Galaxy doesn't show SFF file read counts, + # so it is useful to put them in stdout and thus shown in job info. + print "%i with and %i without specified IDs" % (pos_count, neg_count) +elif seq_format.lower()=="fasta": + # Write filtered FASTA file based on IDs from tabular file + pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) + print "%i with and %i without specified IDs" % (pos_count, neg_count) +elif seq_format.lower().startswith("fastq"): + #Write filtered FASTQ file based on IDs from mapping file + fastq_filter(in_file, out_positive_file, out_negative_file, ids) + # This does not currently track the counts +else: + sys_exit("Unsupported file type %r" % seq_format)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml Tue Jan 27 08:31:13 2015 -0500 @@ -0,0 +1,113 @@ +<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.2"> + <description>from SAM/BAM file</description> + <requirements> + <requirement type="package" version="1.64">biopython</requirement> + <requirement type="python-module">Bio</requirement> + <requirement type="binary">samtools</requirement> + <requirement type="package" version="0.1.19">samtools</requirement> + </requirements> + <version_command interpreter="python">seq_filter_by_mapping.py --version</version_command> + <command interpreter="python"> +seq_filter_by_mapping.py -i "$input_file" -f "$input_file.ext" -m $pair_mode +#if $output_choice_cond.output_choice=="both" + -p $output_pos -n $output_neg +#elif $output_choice_cond.output_choice=="pos" + -p $output_pos +#elif $output_choice_cond.output_choice=="neg" + -n $output_neg +#end if +## Now loop over all the mapping files +#for i in $mapping_file#${i} #end for# + </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> + <inputs> + <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to be filtered" help="FASTA, FASTQ, or SFF format." /> + <param name="mapping_file" type="data" format="sam,bam" multiple="true" label="SAM/BAM mapping of those sequences" help="SAM or BAM format." /> + <conditional name="output_choice_cond"> + <param name="output_choice" type="select" label="Output mapped reads, unmapped reads, or both?"> + <option value="both">Both mapped and unmapped reads, as two files</option> + <option value="pos">Just mapped reads, as a single file</option> + <option value="neg">Just unmapped reads, as a single file</option> + </param> + <!-- Seems need these dummy entries here, compare this to indels/indel_sam2interval.xml --> + <when value="both" /> + <when value="pos" /> + <when value="neg" /> + </conditional> + <param name="pair_mode" type="select" label="Paired read treatment"> + <option value="lax" selected="true">Treat as a pair, allow either read to be mapped</option> + <option value="strict">Treat as a pair, require both reads to be mapped</option> + <!-- The following would actually be more work as have to store qname/1 and qname/2 separately for filter... + <option value="solo">Treat independently (will split partners when only one maps)</option> + --> + </param> + </inputs> + <outputs> + <data name="output_pos" format="input" metadata_source="input_file" label="$input_file.name (mapped)"> + <filter>output_choice_cond["output_choice"] != "neg"</filter> + </data> + <data name="output_neg" format="input" metadata_source="input_file" label="$input_file.name (unmapped)"> + <filter>output_choice_cond["output_choice"] != "pos"</filter> + </data> + </outputs> + <tests> + <test> + <param name="input_file" value="SRR639755_mito_pairs.fastq.gz" ftype="fastqsanger" /> + <param name="mapping_file" value="SRR639755_sample_by_coord.sam" ftype="sam" /> + <param name="pair_mode" value="lax" /> + <param name="output_choice" value="pos" /> + <output name="output_pos" file="SRR639755_sample_lax.fastq" ftype="fastqsanger" /> + </test> + <test> + <param name="input_file" value="SRR639755_mito_pairs.fastq.gz" ftype="fastqsanger" /> + <param name="mapping_file" value="SRR639755_sample_by_coord.sam" ftype="sam" /> + <param name="pair_mode" value="strict" /> + <param name="output_choice" value="pos" /> + <output name="output_pos" file="SRR639755_sample_strict.fastq" ftype="fastqsanger" /> + </test> + </tests> + <help> +**What it does** + +By default it divides a FASTA, FASTQ or Standard Flowgram Format (SFF) file in +two, those sequences (or read pairs) which do or don't map in the provided +SAM/BAM file. You can opt to have a single output file of just the mapping reads, +or just the non-mapping ones. + +**Example Usage** + +You might wish to perform a contamination screan by mapping your reads against +known contaminant reference sequences, then use this tool to select only the +unmapped reads for further analysis (e.g. *de novo* assembly). + +Similarly you might wish to map your reads against a known bacterial reference, +then take the non-mapping sequences forward for analysis if looking for novel +plasmids. + + +**References** + +If you use this Galaxy tool in work leading to a scientific publication please +cite: + +Peter J.A. Cock (2014), Galaxy tool for filtering reads by mapping +http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping + +This tool uses Biopython to read and write SFF files, so you may also wish to +cite the Biopython application note (and Galaxy too of course): + +Cock et al (2009). Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + +This tool is available to install into other Galaxy Instances via the Galaxy +Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping + </help> + <citations> + <citation type="doi">10.1093/bioinformatics/btp163</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_filter_by_mapping/tool_dependencies.xml Tue Jan 27 08:31:13 2015 -0500 @@ -0,0 +1,9 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="biopython" version="1.64"> + <repository changeset_revision="5477a05cc158" name="package_biopython_1_64" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + <package name="samtools" version="0.1.19"> + <repository changeset_revision="923adc89c666" name="package_samtools_0_1_19" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>