comparison tools/seq_filter_by_id/seq_filter_by_id.xml @ 3:44ab4c0f7683 draft

Uploaded v0.0.6, automatic dependency on Biopython 1.62, new README file, citation information, MIT licence
author peterjc
date Fri, 11 Oct 2013 04:37:12 -0400
parents
children 832c1fd57852
comparison
equal deleted inserted replaced
2:abdd608c869b 3:44ab4c0f7683
1 <tool id="seq_filter_by_id" name="Filter sequences by ID" version="0.0.6">
2 <description>from a tabular file</description>
3 <requirements> <!-- Dependencies: the Biopython 1.62 package and its Bio Python module (the help text says Biopython is used to read and write SFF files) -->
4 <requirement type="package" version="1.62">biopython</requirement>
5 <requirement type="python-module">Bio</requirement>
6 </requirements>
7 <version_command interpreter="python">seq_filter_by_id.py --version</version_command> <!-- Runs the wrapped script with its version option so the script version can be reported -->
8 <!-- Argument order: input file, its format, positive output, negative output (a lone "-" stands in for an output that was not requested), then the literal UNION, the ID table and the selected column list --> <command interpreter="python">
9 seq_filter_by_id.py "$input_file" "$input_file.ext"
10 #if $output_choice_cond.output_choice=="both"
11 $output_pos $output_neg
12 #elif $output_choice_cond.output_choice=="pos"
13 $output_pos -
14 #elif $output_choice_cond.output_choice=="neg"
15 - $output_neg
16 #end if
17 ## TODO - Decide on best way to expose multiple ID files via the XML wrapper.
18 ## Single tabular file, can call the Python script with either UNION or INTERSECTION
19 UNION "$input_tabular" "$columns"
20 </command>
21 <stdio>
22 <!-- Anything other than zero is an error: range "1:" covers every positive exit code and ":-1" covers every negative one -->
23 <exit_code range="1:" />
24 <exit_code range=":-1" />
25 </stdio>
26 <inputs>
27 <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to filter on the identifiers" help="FASTA, FASTQ, or SFF format." />
28 <param name="input_tabular" type="data" format="tabular" label="Tabular file containing sequence identifiers"/>
29 <param name="columns" type="data_column" data_ref="input_tabular" multiple="True" numerical="False" label="Column(s) containing sequence identifiers" help="Multi-select list - hold the appropriate key while clicking to select multiple columns">
30 <validator type="no_options" message="Pick at least one column"/>
31 </param>
32 <conditional name="output_choice_cond">
33 <param name="output_choice" type="select" label="Output positive matches, negative matches, or both?">
34 <option value="both">Both positive matches (ID on list) and negative matches (ID not on list), as two files</option>
35 <option value="pos">Just positive matches (ID on list), as a single file</option>
36 <option value="neg">Just negative matches (ID not on list), as a single file</option>
37 </param>
38 <!-- It seems these empty "when" entries are needed, one per output_choice option, even though no option adds extra parameters; compare this to indels/indel_sam2interval.xml -->
39 <when value="both" />
40 <when value="pos" />
41 <when value="neg" />
42 </conditional>
43 </inputs>
44 <outputs>
45 <data name="output_pos" format="fasta" label="With matched ID">
46 <!-- TODO - Replace this with format="input:input_file" if/when that works; until then the default "fasta" is overridden below to match the extension of input_file -->
47 <change_format>
48 <when input_dataset="input_file" attribute="extension" value="sff" format="sff" />
49 <when input_dataset="input_file" attribute="extension" value="fastq" format="fastq" />
50 <when input_dataset="input_file" attribute="extension" value="fastqsanger" format="fastqsanger" />
51 <when input_dataset="input_file" attribute="extension" value="fastqsolexa" format="fastqsolexa" />
52 <when input_dataset="input_file" attribute="extension" value="fastqillumina" format="fastqillumina" />
53 <when input_dataset="input_file" attribute="extension" value="fastqcssanger" format="fastqcssanger" />
54 </change_format>
55 <filter>output_choice_cond["output_choice"] != "neg"</filter> <!-- Keep this output unless only negative matches were requested -->
56 </data>
57 <data name="output_neg" format="fasta" label="Without matched ID">
58 <!-- TODO - Replace this with format="input:input_file" if/when that works; until then the default "fasta" is overridden below to match the extension of input_file -->
59 <change_format>
60 <when input_dataset="input_file" attribute="extension" value="sff" format="sff" />
61 <when input_dataset="input_file" attribute="extension" value="fastq" format="fastq" />
62 <when input_dataset="input_file" attribute="extension" value="fastqsanger" format="fastqsanger" />
63 <when input_dataset="input_file" attribute="extension" value="fastqsolexa" format="fastqsolexa" />
64 <when input_dataset="input_file" attribute="extension" value="fastqillumina" format="fastqillumina" />
65 <when input_dataset="input_file" attribute="extension" value="fastqcssanger" format="fastqcssanger" />
66 </change_format>
67 <filter>output_choice_cond["output_choice"] != "pos"</filter> <!-- Keep this output unless only positive matches were requested -->
68 </data>
69 </outputs>
70 <tests>
71 <test> <!-- Covers only the "pos" choice on a FASTA input; the "neg" and "both" branches have no test here -->
72 <param name="input_file" value="k12_ten_proteins.fasta" ftype="fasta" />
73 <param name="input_tabular" value="k12_hypothetical.tabular" ftype="tabular" />
74 <param name="columns" value="1" />
75 <param name="output_choice" value="pos" />
76 <output name="output_pos" file="k12_hypothetical.fasta" ftype="fasta" />
77 </test>
78 </tests>
79 <help>
80 **What it does**
81
82 By default it divides a FASTA, FASTQ or Standard Flowgram Format (SFF) file in
83 two: those sequences with an ID present in the tabular file column(s) specified,
84 and those without. You can opt to have a single output file of just the matching
85 records, or just the non-matching ones.
86
87 Note that the order of sequences in the original sequence file is preserved, as
88 is any Roche XML Manifest in an SFF file. Also, if any sequences share an
89 identifier (which would be very unusual in SFF files), duplicates are not removed.
90
91 **Example Usage**
92
93 You may have performed some kind of contamination search, for example running
94 BLASTN against a database of cloning vectors or bacteria, giving you a tabular
95 file containing read identifiers. You could use this tool to extract only the
96 reads without BLAST matches (i.e. those which do not match your contaminant
97 database).
98
99 You may have a file of FASTA sequences which has been used with some analysis
100 tool giving tabular output, which has then been filtered on some criteria.
101 You can then use this tool to divide the original FASTA file into those entries
102 matching or not matching your criteria (those with or without their identifier
103 in the filtered tabular file).
104
105 **References**
106
107 If you use this Galaxy tool in work leading to a scientific publication please
108 cite the following papers:
109
110 Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
111 Galaxy tools and workflows for sequence analysis with applications
112 in molecular plant pathology. PeerJ 1:e167
113 http://dx.doi.org/10.7717/peerj.167
114
115 This tool uses Biopython to read and write SFF files, so you may also wish to
116 cite the Biopython application note (and Galaxy too of course):
117
118 Cock et al (2009). Biopython: freely available Python tools for computational
119 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
120 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
121
122 This tool is available to install into other Galaxy Instances via the Galaxy
123 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_id
124 </help>
125 </tool>