Mercurial > repos > peterjc > seq_filter_by_mapping
changeset 1:8ff0ac66f1a3 draft
v0.0.4; Report FASTQ counts; misc internal changes
author | peterjc |
---|---|
date | Wed, 13 May 2015 11:08:58 -0400 |
parents | 1d773da0ccf0 |
children | 48e71dfd51b3 |
files | tools/seq_filter_by_mapping/README.rst tools/seq_filter_by_mapping/seq_filter_by_mapping.py tools/seq_filter_by_mapping/seq_filter_by_mapping.xml tools/seq_filter_by_mapping/tool_dependencies.xml |
diffstat | 4 files changed, 54 insertions(+), 34 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/seq_filter_by_mapping/README.rst Tue Jan 27 08:31:13 2015 -0500 +++ b/tools/seq_filter_by_mapping/README.rst Wed May 13 11:08:58 2015 -0400 @@ -1,7 +1,7 @@ Galaxy tool to filter FASTA, FASTQ or SFF sequences by SAM/BAM mapping ====================================================================== -This tool is copyright 2014 by Peter Cock, The James Hutton Institute +This tool is copyright 2014-2015 by Peter Cock, The James Hutton Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. See the licence text below. @@ -62,6 +62,10 @@ ------- ---------------------------------------------------------------------- v0.0.1 - Initial version. v0.0.2 - Fixed some error messages. +v0.0.3 - Report counts for FASTQ as done for FASTA and SFF files. +v0.0.4 - Use the ``format_source=...`` tag. + - Reorder XML elements (internal change only). + - Planemo for Tool Shed upload (``.shed.yml``, internal change only). ======= ====================================================================== @@ -74,22 +78,31 @@ Much of the code was copied from my older tool: https://github.com/peterjc/pico_galaxy/tree/master/tools/seq_filter_by_id -For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use -the following command from the Galaxy root folder:: +For pushing a release to the test or main "Galaxy Tool Shed", use the following +Planemo commands (which require that you have set your Tool Shed access details in +``~/.planemo.yml`` and that you have access rights on the Tool Shed):: + + $ planemo shed_upload --shed_target testtoolshed --check_diff ~/repositories/pico_galaxy/tools/seq_filter_by_mapping/ + ... 
+ +or:: - $ tar -czf seq_filter_by_mapping.tar.gz tools/seq_filter_by_mapping/README.rst tools/seq_filter_by_mapping/seq_filter_by_mapping.py tools/seq_filter_by_mapping/seq_filter_by_mapping.xml tools/seq_filter_by_mapping/tool_dependencies.xml test-data/SRR639755_mito_pairs.fastq.gz test-data/SRR639755_sample_by_coord.sam test-data/SRR639755_sample_strict.fastq test-data/SRR639755_sample_lax.fastq + $ planemo shed_upload --shed_target toolshed --check_diff ~/repositories/pico_galaxy/tools/seq_filter_by_mapping/ + ... + +To just build and check the tar ball, use:: -Check this worked:: - - $ tar -tzf seq_filter_by_mapping.tar.gz + $ planemo shed_upload --tar_only ~/repositories/pico_galaxy/tools/seq_filter_by_mapping/ + ... + $ tar -tzf shed_upload.tar.gz + test-data/SRR639755_mito_pairs.fastq.gz + test-data/SRR639755_sample_by_coord.sam + test-data/SRR639755_sample_lax.fastq + test-data/SRR639755_sample_strict.fastq tools/seq_filter_by_mapping/README.rst tools/seq_filter_by_mapping/seq_filter_by_mapping.py tools/seq_filter_by_mapping/seq_filter_by_mapping.xml tools/seq_filter_by_mapping/tool_dependencies.xml - test-data/SRR639755_mito_pairs.fastq.gz - test-data/SRR639755_sample_by_coord.sam - test-data/SRR639755_sample_strict.fastq - test-data/SRR639755_sample_lax.fastq Licence (MIT)
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.py Tue Jan 27 08:31:13 2015 -0500 +++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.py Wed May 13 11:08:58 2015 -0400 @@ -64,7 +64,7 @@ options, args = parser.parse_args() if options.version: - print "v0.0.2" + print "v0.0.3" sys.exit(0) in_file = options.input @@ -282,6 +282,7 @@ def fastq_filter(in_file, pos_file, neg_file, wanted): """FASTQ filter.""" from Bio.SeqIO.QualityIO import FastqGeneralIterator + pos_count = neg_count = 0 handle = open(in_file, "r") if out_positive_file is not None and out_negative_file is not None: print "Generating two FASTQ files" @@ -292,8 +293,10 @@ # print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) if clean_name(title.split(None, 1)[0]) in ids: positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + pos_count += 1 else: negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + neg_count += 1 positive_handle.close() negative_handle.close() elif out_positive_file is not None: @@ -302,16 +305,23 @@ for title, seq, qual in FastqGeneralIterator(handle): if clean_name(title.split(None, 1)[0]) in ids: positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + pos_count += 1 + else: + neg_count += 1 positive_handle.close() elif out_negative_file is not None: print "Generating non-matching FASTQ file" negative_handle = open(out_negative_file, "w") for title, seq, qual in FastqGeneralIterator(handle): - if clean_name(title.split(None, 1)[0]) not in ids: + if clean_name(title.split(None, 1)[0]) in ids: + pos_count += 1 + else: negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + neg_count += 1 negative_handle.close() handle.close() - # This does not currently bother to record record counts (faster) + return pos_count, neg_count + def sff_filter(in_file, pos_file, neg_file, wanted): """SFF filter.""" @@ -353,18 +363,15 @@ if seq_format.lower()=="sff": - # Now write filtered SFF file based on IDs wanted - pos_count, neg_count = 
sff_filter(in_file, out_positive_file, out_negative_file, ids) - # At the time of writing, Galaxy doesn't show SFF file read counts, - # so it is useful to put them in stdout and thus shown in job info. - print "%i with and %i without specified IDs" % (pos_count, neg_count) + sequence_filter = sff_filter elif seq_format.lower()=="fasta": - # Write filtered FASTA file based on IDs from tabular file - pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) - print "%i with and %i without specified IDs" % (pos_count, neg_count) + sequence_filter = fasta_filter elif seq_format.lower().startswith("fastq"): - #Write filtered FASTQ file based on IDs from mapping file - fastq_filter(in_file, out_positive_file, out_negative_file, ids) - # This does not currently track the counts + sequence_filter = fastq_filter else: sys_exit("Unsupported file type %r" % seq_format) + +pos_count, neg_count = sequence_filter(in_file, out_positive_file, out_negative_file, ids) +print("%i mapped and %i unmapped reads." % (pos_count, neg_count)) +fraction = float(pos_count) * 100.0 / float(pos_count + neg_count) +print("In total %i reads, of which %0.1f%% mapped." % (pos_count + neg_count, fraction))
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml Tue Jan 27 08:31:13 2015 -0500 +++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml Wed May 13 11:08:58 2015 -0400 @@ -1,4 +1,4 @@ -<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.2"> +<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.4"> <description>from SAM/BAM file</description> <requirements> <requirement type="package" version="1.64">biopython</requirement> @@ -6,6 +6,11 @@ <requirement type="binary">samtools</requirement> <requirement type="package" version="0.1.19">samtools</requirement> </requirements> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <version_command interpreter="python">seq_filter_by_mapping.py --version</version_command> <command interpreter="python"> seq_filter_by_mapping.py -i "$input_file" -f "$input_file.ext" -m $pair_mode @@ -19,11 +24,6 @@ ## Now loop over all the mapping files #for i in $mapping_file#${i} #end for# </command> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range="1:" /> - <exit_code range=":-1" /> - </stdio> <inputs> <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to be filtered" help="FASTA, FASTQ, or SFF format." /> <param name="mapping_file" type="data" format="sam,bam" multiple="true" label="SAM/BAM mapping of those sequences" help="SAM or BAM format." 
/> @@ -47,10 +47,10 @@ </param> </inputs> <outputs> - <data name="output_pos" format="input" metadata_source="input_file" label="$input_file.name (mapped)"> + <data name="output_pos" format_source="input_file" metadata_source="input_file" label="$input_file.name (mapped)"> <filter>output_choice_cond["output_choice"] != "neg"</filter> </data> - <data name="output_neg" format="input" metadata_source="input_file" label="$input_file.name (unmapped)"> + <data name="output_neg" format_source="input_file" metadata_source="input_file" label="$input_file.name (unmapped)"> <filter>output_choice_cond["output_choice"] != "pos"</filter> </data> </outputs>
--- a/tools/seq_filter_by_mapping/tool_dependencies.xml Tue Jan 27 08:31:13 2015 -0500 +++ b/tools/seq_filter_by_mapping/tool_dependencies.xml Wed May 13 11:08:58 2015 -0400 @@ -4,6 +4,6 @@ <repository changeset_revision="5477a05cc158" name="package_biopython_1_64" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> <package name="samtools" version="0.1.19"> - <repository changeset_revision="923adc89c666" name="package_samtools_0_1_19" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="96aab723499f" name="package_samtools_0_1_19" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> </tool_dependency>