comparison tools/filters/seq_filter_by_id.xml @ 0:5844f6a450ed

Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 17:24:30 -0400
parents
children 262f08104540
comparison
equal deleted inserted replaced
-1:000000000000 0:5844f6a450ed
1 <tool id="seq_filter_by_id" name="Filter sequences by ID" version="0.0.1">
2 <description>from a tabular file</description>
<!-- Arguments in order: tabular ID file, selected column(s), sequence file, its extension, positive output, negative output. A "-" is passed in place of an output that was not requested. Substituted values are quoted so whitespace or shell metacharacters cannot split an argument. -->
3 <command interpreter="python">
4 seq_filter_by_id.py "$input_tabular" "$columns" "$input_file" "$input_file.ext"
5 #if $output_choice_cond.output_choice=="both"
6 "$output_pos" "$output_neg"
7 #elif $output_choice_cond.output_choice=="pos"
8 "$output_pos" -
9 #elif $output_choice_cond.output_choice=="neg"
10 - "$output_neg"
11 #end if
12 </command>
13 <inputs>
14 <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to filter on the identifiers" help="FASTA, FASTQ, or SFF format." />
15 <param name="input_tabular" type="data" format="tabular" label="Tabular file containing sequence identifiers"/>
16 <param name="columns" type="data_column" data_ref="input_tabular" multiple="True" numerical="False" label="Column(s) containing sequence identifiers" help="Multi-select list - hold the appropriate key while clicking to select multiple columns">
17 <validator type="no_options" message="Pick at least one column"/>
18 </param>
19 <conditional name="output_choice_cond">
20 <param name="output_choice" type="select" label="Output positive matches, negative matches, or both?">
21 <option value="both">Both positive matches (ID on list) and negative matches (ID not on list), as two files</option>
22 <option value="pos">Just positive matches (ID on list), as a single file</option>
23 <option value="neg">Just negative matches (ID not on list), as a single file</option>
24 </param>
25 <!-- An empty <when> entry per option seems to be required even though no option adds extra parameters; compare indels/indel_sam2interval.xml -->
26 <when value="both" />
27 <when value="pos" />
28 <when value="neg" />
29 </conditional>
30 </inputs>
31 <outputs>
32 <data name="output_pos" format="fasta" label="With matched ID">
33 <!-- Defaults to FASTA; change_format copies an SFF/FASTQ-variant input extension. TODO - Replace this with format="input:input_file" if/when that works -->
34 <change_format>
35 <when input_dataset="input_file" attribute="extension" value="sff" format="sff" />
36 <when input_dataset="input_file" attribute="extension" value="fastq" format="fastq" />
37 <when input_dataset="input_file" attribute="extension" value="fastqsanger" format="fastqsanger" />
38 <when input_dataset="input_file" attribute="extension" value="fastqsolexa" format="fastqsolexa" />
39 <when input_dataset="input_file" attribute="extension" value="fastqillumina" format="fastqillumina" />
40 <when input_dataset="input_file" attribute="extension" value="fastqcssanger" format="fastqcssanger" />
41 </change_format>
42 <filter>output_choice_cond["output_choice"] != "neg"</filter>
43 </data>
44 <data name="output_neg" format="fasta" label="Without matched ID">
45 <!-- Defaults to FASTA; change_format copies an SFF/FASTQ-variant input extension. TODO - Replace this with format="input:input_file" if/when that works -->
46 <change_format>
47 <when input_dataset="input_file" attribute="extension" value="sff" format="sff" />
48 <when input_dataset="input_file" attribute="extension" value="fastq" format="fastq" />
49 <when input_dataset="input_file" attribute="extension" value="fastqsanger" format="fastqsanger" />
50 <when input_dataset="input_file" attribute="extension" value="fastqsolexa" format="fastqsolexa" />
51 <when input_dataset="input_file" attribute="extension" value="fastqillumina" format="fastqillumina" />
52 <when input_dataset="input_file" attribute="extension" value="fastqcssanger" format="fastqcssanger" />
53 </change_format>
54 <filter>output_choice_cond["output_choice"] != "pos"</filter>
55 </data>
56 </outputs>
<!-- TODO: no functional tests are defined for this tool yet. -->
57 <tests>
58 </tests>
<!-- Python module dependency; the help text below credits Biopython for reading and writing SFF files. -->
59 <requirements>
60 <requirement type="python-module">Bio</requirement>
61 </requirements>
62 <help>
63
64 **What it does**
65
66 By default it divides a FASTA, FASTQ or Standard Flowgram Format (SFF) file in
67 two, those sequences with or without an ID present in the tabular file column(s)
68 specified. You can opt to have a single output file of just the matching records,
69 or just the non-matching ones.
70
71 Note that the order of sequences in the original sequence file is preserved, as
72 is any Roche XML Manifest in an SFF file. Also, if any sequences share an
73 identifier (which would be very unusual in SFF files), duplicates are not removed.
74
75 **Example Usage**
76
77 You may have performed some kind of contamination search, for example running
78 BLASTN against a database of cloning vectors or bacteria, giving you a tabular
79 file containing read identifiers. You could use this tool to extract only the
80 reads without BLAST matches (i.e. those which do not match your contaminant
81 database).
82
83 You may have a file of FASTA sequences which has been run through some analysis
84 tool giving tabular output, which has then been filtered on some criteria.
85 You can then use this tool to divide the original FASTA file into those entries
86 matching or not matching your criteria (those with or without their identifier
87 in the filtered tabular file).
88
89 **Citation**
90
91 This tool uses Biopython to read and write SFF files. If you use this tool in
92 scientific work leading to a publication, please cite the Biopython application
93 note (and Galaxy too of course):
94
95 Cock et al 2009. Biopython: freely available Python tools for computational
96 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
97 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
98
99 </help>
100 </tool>