comparison tools/seq_filter_by_id/seq_filter_by_id.xml @ 5:832c1fd57852 draft

v0.2.2; New options for IDs via text parameter, ignore paired read suffix; misc changes
author peterjc
date Wed, 13 May 2015 11:03:57 -0400
parents 44ab4c0f7683
children 03e134cae41a
comparison
equal deleted inserted replaced
4:1c36cf8ef133 5:832c1fd57852
1 <tool id="seq_filter_by_id" name="Filter sequences by ID" version="0.0.6"> 1 <tool id="seq_filter_by_id" name="Filter sequences by ID" version="0.2.2">
2 <description>from a tabular file</description> 2 <description>from a tabular file</description>
3 <requirements> 3 <requirements>
4 <requirement type="package" version="1.62">biopython</requirement> 4 <requirement type="package" version="1.64">biopython</requirement>
5 <requirement type="python-module">Bio</requirement> 5 <requirement type="python-module">Bio</requirement>
6 </requirements> 6 </requirements>
7 <version_command interpreter="python">seq_filter_by_id.py --version</version_command>
8 <command interpreter="python">
9 seq_filter_by_id.py "$input_file" "$input_file.ext"
10 #if $output_choice_cond.output_choice=="both"
11 $output_pos $output_neg
12 #elif $output_choice_cond.output_choice=="pos"
13 $output_pos -
14 #elif $output_choice_cond.output_choice=="neg"
15 - $output_neg
16 #end if
17 ## TODO - Decide on best way to expose multiple ID files via the XML wrapper.
18 ## Single tabular file, can call the Python script with either UNION or INTERSECTION
19 UNION "$input_tabular" "$columns"
20 </command>
21 <stdio> 7 <stdio>
22 <!-- Anything other than zero is an error --> 8 <!-- Anything other than zero is an error -->
23 <exit_code range="1:" /> 9 <exit_code range="1:" />
24 <exit_code range=":-1" /> 10 <exit_code range=":-1" />
25 </stdio> 11 </stdio>
12 <version_command interpreter="python">seq_filter_by_id.py --version</version_command>
13 <command interpreter="python">
14 seq_filter_by_id.py -i "$input_file" -f "$input_file.ext"
15 #if $output_choice_cond.output_choice=="both"
16 -p $output_pos -n $output_neg
17 #elif $output_choice_cond.output_choice=="pos"
18 -p $output_pos
19 #elif $output_choice_cond.output_choice=="neg"
20 -n $output_neg
21 #end if
22 #if $adv_opts.adv_opts_selector=="advanced" and $adv_opts.strip_suffix
23 -s
24 #end if
25 #if $id_opts.id_opts_selector=="tabular":
26 ## TODO - Decide on best way to expose multiple ID files via the XML wrapper.
27 ## Single tabular file, can call the Python script with either UNION or INTERSECTION
28 -l UNION "$id_opts.input_tabular" "$id_opts.columns"
29 #else
30 -t "$id_opts.id_list"
31 #end if
32 </command>
26 <inputs> 33 <inputs>
27 <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to filter on the identifiers" help="FASTA, FASTQ, or SFF format." /> 34 <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to be filtered" help="FASTA, FASTQ, or SFF format." />
28 <param name="input_tabular" type="data" format="tabular" label="Tabular file containing sequence identifiers"/> 35 <conditional name="id_opts">
29 <param name="columns" type="data_column" data_ref="input_tabular" multiple="True" numerical="False" label="Column(s) containing sequence identifiers" help="Multi-select list - hold the appropriate key while clicking to select multiple columns"> 36 <param name="id_opts_selector" type="select" label="Filter using the ID list from">
30 <validator type="no_options" message="Pick at least one column"/> 37 <option value="tabular" selected="True">tabular file</option>
31 </param> 38 <option value="list">provided list</option>
39 <!-- add UNION or INTERSECTION of multiple tabular files here? -->
40 </param>
41 <when value="tabular">
42 <param name="input_tabular" type="data" format="tabular" label="Tabular file containing sequence identifiers"/>
43 <param name="columns" type="data_column" data_ref="input_tabular" multiple="True" numerical="False"
44 label="Column(s) containing sequence identifiers"
45 help="Multi-select list - hold the appropriate key while clicking to select multiple columns">
46 <validator type="no_options" message="Pick at least one column"/>
47 </param>
48 </when>
49 <when value="list">
50 <param name="id_list" type="text" size="20x80" area="True" format="tabular"
51 label="List of sequence identifiers (white space separated)"
52 help="You can use both spaces and new lines to separate your identifiers.">
53 <sanitizer>
54 <valid>
55 <!-- default includes underscore, hyphen, etc -->
56 <add value="%"/>
57 <add value="|"/>
58 </valid>
59 </sanitizer>
60 </param>
61 </when>
62 </conditional>
32 <conditional name="output_choice_cond"> 63 <conditional name="output_choice_cond">
33 <param name="output_choice" type="select" label="Output positive matches, negative matches, or both?"> 64 <param name="output_choice" type="select" label="Output positive matches, negative matches, or both?">
34 <option value="both">Both positive matches (ID on list) and negative matches (ID not on list), as two files</option> 65 <option value="both">Both positive matches (ID on list) and negative matches (ID not on list), as two files</option>
35 <option value="pos">Just positive matches (ID on list), as a single file</option> 66 <option value="pos">Just positive matches (ID on list), as a single file</option>
36 <option value="neg">Just negative matches (ID not on list), as a single file</option> 67 <option value="neg">Just negative matches (ID not on list), as a single file</option>
38 <!-- Seems need these dummy entries here, compare this to indels/indel_sam2interval.xml --> 69 <!-- Seems need these dummy entries here, compare this to indels/indel_sam2interval.xml -->
39 <when value="both" /> 70 <when value="both" />
40 <when value="pos" /> 71 <when value="pos" />
41 <when value="neg" /> 72 <when value="neg" />
42 </conditional> 73 </conditional>
74 <conditional name="adv_opts">
75 <param name="adv_opts_selector" type="select" label="Advanced Options">
76 <option value="basic" selected="True">Hide Advanced Options</option>
77 <option value="advanced">Show Advanced Options</option>
78 </param>
79 <when value="basic" />
80 <when value="advanced">
81 <param name="strip_suffix" type="boolean" value="false" label="Remove typical pair read name suffices when matching identifiers?" help="Will remove suffices including Illumina /1 and /2, Roche 454 .f and .r, and assorted Sanger names like .p* and .q*" />
82 </when>
83 </conditional>
43 </inputs> 84 </inputs>
44 <outputs> 85 <outputs>
45 <data name="output_pos" format="fasta" label="With matched ID"> 86 <data name="output_pos" format_source="input_file" metadata_source="input_file" label="$input_file.name with matched ID">
46 <!-- TODO - Replace this with format="input:input_fastq" if/when that works -->
47 <change_format>
48 <when input_dataset="input_file" attribute="extension" value="sff" format="sff" />
49 <when input_dataset="input_file" attribute="extension" value="fastq" format="fastq" />
50 <when input_dataset="input_file" attribute="extension" value="fastqsanger" format="fastqsanger" />
51 <when input_dataset="input_file" attribute="extension" value="fastqsolexa" format="fastqsolexa" />
52 <when input_dataset="input_file" attribute="extension" value="fastqillumina" format="fastqillumina" />
53 <when input_dataset="input_file" attribute="extension" value="fastqcssanger" format="fastqcssanger" />
54 </change_format>
55 <filter>output_choice_cond["output_choice"] != "neg"</filter> 87 <filter>output_choice_cond["output_choice"] != "neg"</filter>
56 </data> 88 </data>
57 <data name="output_neg" format="fasta" label="Without matched ID"> 89 <data name="output_neg" format_source="input_file" metadata_source="input_file" label="$input_file.name without matched ID">
58 <!-- TODO - Replace this with format="input:input_fastq" if/when that works -->
59 <change_format>
60 <when input_dataset="input_file" attribute="extension" value="sff" format="sff" />
61 <when input_dataset="input_file" attribute="extension" value="fastq" format="fastq" />
62 <when input_dataset="input_file" attribute="extension" value="fastqsanger" format="fastqsanger" />
63 <when input_dataset="input_file" attribute="extension" value="fastqsolexa" format="fastqsolexa" />
64 <when input_dataset="input_file" attribute="extension" value="fastqillumina" format="fastqillumina" />
65 <when input_dataset="input_file" attribute="extension" value="fastqcssanger" format="fastqcssanger" />
66 </change_format>
67 <filter>output_choice_cond["output_choice"] != "pos"</filter> 90 <filter>output_choice_cond["output_choice"] != "pos"</filter>
68 </data> 91 </data>
69 </outputs> 92 </outputs>
70 <tests> 93 <tests>
71 <test> 94 <test>
73 <param name="input_tabular" value="k12_hypothetical.tabular" ftype="tabular" /> 96 <param name="input_tabular" value="k12_hypothetical.tabular" ftype="tabular" />
74 <param name="columns" value="1" /> 97 <param name="columns" value="1" />
75 <param name="output_choice" value="pos" /> 98 <param name="output_choice" value="pos" />
76 <output name="output_pos" file="k12_hypothetical.fasta" ftype="fasta" /> 99 <output name="output_pos" file="k12_hypothetical.fasta" ftype="fasta" />
77 </test> 100 </test>
101 <test>
102 <param name="input_file" value="k12_ten_proteins.fasta" ftype="fasta" />
103 <param name="input_tabular" value="k12_hypothetical_alt.tabular" ftype="tabular" />
104 <param name="columns" value="1" />
105 <param name="output_choice" value="pos" />
106 <param name="adv_opts_selector" value="advanced" />
107 <param name="strip_suffix" value="true" />
108 <output name="output_pos" file="k12_hypothetical.fasta" ftype="fasta" />
109 </test>
110 <test>
111 <param name="input_file" value="k12_ten_proteins.fasta" ftype="fasta" />
112 <param name="id_opts_selector" value="list" />
113 <param name="id_list" value="gi|16127999|ref|NP_414546.1|" />
114 <param name="output_choice" value="pos" />
115 <output name="output_pos" file="k12_hypothetical.fasta" ftype="fasta" />
116 </test>
117 <test>
118 <param name="input_file" value="sanger-pairs-mixed.fastq" ftype="fastq" />
119 <param name="id_opts_selector" value="list" />
120 <param name="id_list" value="WTSI_1055_1a05 WTSI_1055_1g02" />
121 <param name="output_choice" value="pos" />
122 <param name="adv_opts_selector" value="advanced" />
123 <param name="strip_suffix" value="true" />
124 <output name="output_pos" file="sanger-sample.fastq" ftype="fastq" />
125 </test>
126 <test>
127 <param name="input_file" value="sanger-pairs-mixed.fastq" ftype="fastq" />
128 <param name="id_opts_selector" value="tabular" />
129 <param name="input_tabular" value="sanger-pairs-names.tabular" ftype="tabular" />
130 <param name="columns" value="1" />
131 <param name="output_choice" value="both" />
132 <param name="adv_opts_selector" value="advanced" />
133 <param name="strip_suffix" value="true" />
134 <output name="output_pos" file="sanger-pairs-mixed.fastq" ftype="fastq" />
135 <output name="output_neg" file="empty_file.dat" ftype="fastq" />
136 </test>
137 <test>
138 <param name="input_file" value="sanger-pairs-mixed.fastq" ftype="fastq" />
139 <param name="input_tabular" value="sanger-pairs-names.tabular" ftype="tabular" />
140 <param name="columns" value="1" />
141 <param name="output_choice" value="both" />
142 <param name="adv_opts_selector" value="advanced" />
143 <param name="strip_suffix" value="false" />
144 <output name="output_pos" file="empty_file.dat" ftype="fastq" />
145 <output name="output_neg" file="sanger-pairs-mixed.fastq" ftype="fastq" />
146 </test>
78 </tests> 147 </tests>
79 <help> 148 <help>
80 **What it does** 149 **What it does**
81 150
82 By default it divides a FASTA, FASTQ or Standard Flowgram Format (SFF) file in 151 By default it divides a FASTA, FASTQ or Standard Flowgram Format (SFF) file in
83 two, those sequences with or without an ID present in the tabular file column(s) 152 two, those sequences with or without an ID present in the tabular file column(s)
84 specified. You can opt to have a single output file of just the matching records, 153 specified. You can opt to have a single output file of just the matching records,
85 or just the non-matching ones. 154 or just the non-matching ones.
155
156 Instead of providing the identifiers in a tabular file, you can alternatively
157 provide them as a parameter (type or paste them into the text box). This is a
158 useful shortcut for extracting a few sequences of interest without first having
159 to prepare a tabular file.
86 160
87 Note that the order of sequences in the original sequence file is preserved, as 161 Note that the order of sequences in the original sequence file is preserved, as
88 is any Roche XML Manifest in an SFF file. Also, if any sequences share an 162 is any Roche XML Manifest in an SFF file. Also, if any sequences share an
89 identifier (which would be very unusual in SFF files), duplicates are not removed. 163 identifier (which would be very unusual in SFF files), duplicates are not removed.
90 164
120 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. 194 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
121 195
122 This tool is available to install into other Galaxy Instances via the Galaxy 196 This tool is available to install into other Galaxy Instances via the Galaxy
123 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_id 197 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_id
124 </help> 198 </help>
199 <citations>
200 <citation type="doi">10.7717/peerj.167</citation>
201 <citation type="doi">10.1093/bioinformatics/btp163</citation>
202 </citations>
125 </tool> 203 </tool>