comparison graphprot_train_predict.xml @ 1:20429f4c1b95 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit f3fb925b83a4982e0cf9a0c11ff93ecbb8e4e6d5"
author bgruening
date Wed, 22 Jan 2020 10:14:41 -0500
parents
children 7bbb7bf6304f
comparison
equal deleted inserted replaced
0:215925e588c4 1:20429f4c1b95
1 <tool id="graphprot_predict_profile" name="GraphProt" version="1.1.7+galaxy1">
2 <description>- Train models and predict RBP binding profiles</description>
3 <requirements>
4 <requirement type="package" version="1.1.7">graphprot</requirement>
5 </requirements>
6 <!-- Runs the training or the prediction wrapper script, depending on the selected action. Every dataset path is single-quoted so that it reaches the wrapper as one shell argument. -->
7 <command detect_errors="exit_code"><![CDATA[
8 #if $action_type.action_type_selector == 'train':
9 python '$__tool_directory__/graphprot_train_wrapper.py'
10 --data-id GraphProt
11 --pos '$action_type.pos_fasta_file'
12 --neg '$action_type.neg_fasta_file'
13 $action_type.train_str_model
14 #if $action_type.hpo_options.hpo_mode_type.hpo_mode_type_selector == 'take':
15 --opt-set-size $action_type.hpo_options.hpo_mode_type.opt_set_size
16 #elif $action_type.hpo_options.hpo_mode_type.hpo_mode_type_selector == 'supply':
17 --opt-pos '$action_type.hpo_options.hpo_mode_type.pos_parop_fasta'
18 --opt-neg '$action_type.hpo_options.hpo_mode_type.neg_parop_fasta'
19 #end if
20 $action_type.training_options.disable_cv
21 $action_type.training_options.disable_motifs
22 --min-train $action_type.training_options.min_train
23 ## Prediction mode: the model and parameter files are supplied as input datasets.
24 #elif $action_type.action_type_selector == 'predict':
25 python '$__tool_directory__/graphprot_predict_wrapper.py'
26 --data-id GraphProt
27 --fasta '$action_type.input_fasta_file'
28 --model '$action_type.model_file'
29 --params '$action_type.params_file'
30 #if $action_type.genomic_sites_bed_file:
31 --gen-site-bed '$action_type.genomic_sites_bed_file'
32 #end if
33 --sc-thr $action_type.prediction_options.score_thr
34 --max-merge-dist $action_type.prediction_options.max_merge_dist
35 --ap-extlr $action_type.prediction_options.ap_extlr
36 $action_type.prediction_options.conf_out
37 $action_type.prediction_options.ws_pred_out
38 #end if
39
40
41 ]]></command>
42
43 <inputs>
44 <conditional name="action_type">
45
46 <param name="action_type_selector" type="select" label="Select an action">
47 <option value="train" selected="true">Train a model</option>
48 <option value="predict">Predict on input sequences</option>
49 </param>
50
51 <when value="train">
52 <param name="pos_fasta_file" type="data" format="fasta"
53 label="Positive sequences FASTA file" argument="-fasta"
54 help="Positive sequences (== RBP binding sites) FASTA file for model training"/>
55 <param name="neg_fasta_file" type="data" format="fasta"
56 label="Negative sequences FASTA file" argument="-negfasta"
57 help="Negative sequences FASTA file for model training"/>
58 <param name="train_str_model" label="Train a structure model" type="boolean"
59 truevalue="--str-model" falsevalue="" checked="False"
60 help="Train a structure model (default: train a sequence model)"/>
61
62 <section name="hpo_options" title="Hyperparameter optimization settings">
63
64 <conditional name="hpo_mode_type">
65 <param name="hpo_mode_type_selector" type="select" label="Select strategy">
66 <option value="take" selected="true">Take sequences for optimization from input</option>
67 <option value="supply">Supply sequences for optimization</option>
68 </param>
69 <when value="take">
70 <param name="opt_set_size" type="integer" value="500"
71 label="Number of sequences for hyperparameter optimization"
72 help="Hyperparameter optimization set size (taken away from both positive and negative input sequences) (default: 500)"/>
73 </when>
74 <when value="supply">
75 <param name="pos_parop_fasta" type="data" format="fasta"
76 label="Positive sequences FASTA file"
77 help="Positive (== RBP binding sites) sequences FASTA file for hyperparameter optimization"/>
78 <param name="neg_parop_fasta" type="data" format="fasta"
79 label="Negative sequences FASTA file"
80 help="Negative sequences FASTA file for hyperparameter optimization"/>
81 </when>
82 </conditional>
83 </section>
84
85 <section name="training_options" title="Training options">
86 <param name="disable_cv" label="Disable 10-fold cross validation" type="boolean"
87 truevalue="--disable-cv" falsevalue="" checked="False"
88 help="Disable 10-fold cross validation step. As a result, no generalization performance results (.cv_results) are output. On the other hand, run time is reduced considerably (default: false)"/>
89 <param name="disable_motifs" label="Disable motif generation" type="boolean"
90 truevalue="--disable-motifs" falsevalue="" checked="False"
91 help="Disable motif generation step, therefore no _motif and _motif.png files are output (default: false)"/>
92 <param name="min_train" type="integer" value="750"
93 label="Minimum number of training sites demanded"
94 help="Minimum number of training sites demanded (for both negatives and positives). In general, try to get more training sites if possible (>> 1000), before lowering this number (default: 750)"/>
95 </section>
96 </when>
97 <when value="predict">
98
99 <param name="input_fasta_file" type="data" format="fasta"
100 label="Input FASTA file" argument="-fasta"
101 help="FASTA file with sequences for which to predict binding profiles or whole site scores"/>
102 <param name="model_file" type="data" format="data"
103 label="GraphProt model file" argument="-model"
104 help="GraphProt model to use for predictions"/>
105 <param name="params_file" type="data" format="txt"
106 label="Model parameter file" argument="-params"
107 help="Parameter file containing model parameters"/>
108 <param name="genomic_sites_bed_file" type="data" format="bed" optional="True"
109 label="Genomic BED file with coordinates from input sequences"
110 help="BED file specifying the genomic regions of the input sequences, to also output peak regions with their genomic coordinates (default: false)"/>
111
112 <section name="prediction_options" title="Prediction options">
113 <param name="score_thr" type="float" value="0"
114 label="Set GraphProt average profile peak score threshold for reporting peak regions"
115 help="Regions with peak score higher than or equal to the given value are reported (default: 0)"/>
116
117 <param name="ap_extlr" type="integer" value="5" min="0" max="10"
118 label="Smoothing parameter for calculating the average profile"
119 help="Defines the average profile up- and downstream extension to produce the average profile. The mean over small sequence windows (window_length = set_value*2 + 1) is used to get the average profile position-wise scores. A value of 0 means no additional smoothing (== original profile scores), while 10 applies fairly strong smoothing (default: 5)"/>
120
121 <param name="max_merge_dist" type="integer" value="0" min="0" max="10"
122 label="Maximum distance between two peak regions for merging" argument="-merge-dist"
123 help="By default all non-overlapping regions will be reported. E.g. a distance of 1 means that two regions above the set threshold score will be merged if there is 1 nucleotide that separates the two regions"/>
124
125 <param name="ws_pred_out" label="Predict whole site instead of profile scores" type="boolean"
126 truevalue="--ws-pred" falsevalue="" checked="False"
127 help="Run a whole site prediction instead of calculating profiles (default: false)"/>
128 <param name="conf_out" label="Output high-confidence (p50) peak regions" type="boolean"
129 truevalue="--conf-out" falsevalue="" checked="False"
130 help="Output filtered peak regions BED file or predictions file (if whole site scores prediction enabled), using the median positive training site score (stored in .params file) for filtering (default: false)"/>
131 </section>
132 </when>
133 </conditional>
134 </inputs>
135
136 <outputs>
137 <data name="model_out_file" format="txt" from_work_dir="GraphProt.model" label="${tool.name} on ${on_string}: GraphProt model file">
138 <filter>action_type["action_type_selector"] == "train"</filter>
139 </data>
140
141 <data name="params_out_file" format="txt" from_work_dir="GraphProt.params" label="${tool.name} on ${on_string}: GraphProt model parameters file">
142 <filter>action_type["action_type_selector"] == "train"</filter>
143 </data>
144
145 <data name="cv_results_out_file" format="txt" from_work_dir="GraphProt.cv_results" label="${tool.name} on ${on_string}: GraphProt cross validation results file">
146 <filter>action_type["action_type_selector"] == "train" and not action_type["training_options"]["disable_cv"]</filter>
147 </data>
148
149 <data name="seq_motif_out_file" format="txt" from_work_dir="GraphProt.sequence_motif" label="${tool.name} on ${on_string}: GraphProt sequence motif text file">
150 <filter>action_type["action_type_selector"] == "train" and not action_type["training_options"]["disable_motifs"]</filter>
151 </data>
152 <data name="seq_motif_png_out_file" format="png" from_work_dir="GraphProt.sequence_motif.png" label="${tool.name} on ${on_string}: GraphProt sequence motif png file">
153 <filter>action_type["action_type_selector"] == "train" and not action_type["training_options"]["disable_motifs"]</filter>
154 </data>
155 <!-- Both structure motif outputs (text and PNG) are declared only when structure model training is enabled. -->
156 <data name="str_motif_out_file" format="txt" from_work_dir="GraphProt.structure_motif" label="${tool.name} on ${on_string}: GraphProt structure motif text file">
157 <filter>action_type["action_type_selector"] == "train" and not action_type["training_options"]["disable_motifs"] and action_type["train_str_model"]</filter>
158 </data>
159
160 <data name="str_motif_png_out_file" format="png" from_work_dir="GraphProt.structure_motif.png" label="${tool.name} on ${on_string}: GraphProt structure motif png file">
161 <filter>action_type["action_type_selector"] == "train" and not action_type["training_options"]["disable_motifs"] and action_type["train_str_model"]</filter>
162 </data>
163
164 <data name="avg_profile_out_file" format="txt" from_work_dir="GraphProt.avg_profile" label="${tool.name} on ${on_string}: GraphProt average profile file">
165 <filter>action_type["action_type_selector"] == "predict" and not action_type["prediction_options"]["ws_pred_out"]</filter>
166 </data>
167
168 <data name="peaks_out_file" format="bed" from_work_dir="GraphProt.avg_profile.peaks.bed" label="${tool.name} on ${on_string}: GraphProt average profile peaks BED file">
169 <filter>action_type["action_type_selector"] == "predict" and not action_type["prediction_options"]["ws_pred_out"]</filter>
170 </data>
171
172 <data name="p50_peaks_out_file" format="bed" from_work_dir="GraphProt.avg_profile.p50.peaks.bed" label="${tool.name} on ${on_string}: GraphProt average profile p50 peaks BED file">
173 <filter>action_type["action_type_selector"] == "predict" and action_type["prediction_options"]["conf_out"] and not action_type["prediction_options"]["ws_pred_out"]</filter>
174 </data>
175
176 <data name="genomic_peaks_out_file" format="bed" from_work_dir="GraphProt.avg_profile.genomic_peaks.bed" label="${tool.name} on ${on_string}: GraphProt average profile genomic peaks BED file">
177 <filter>action_type["action_type_selector"] == "predict" and action_type["genomic_sites_bed_file"] and not action_type["prediction_options"]["ws_pred_out"]</filter>
178 </data>
179
180 <data name="genomic_p50_peaks_out_file" format="bed" from_work_dir="GraphProt.avg_profile.p50.genomic_peaks.bed" label="${tool.name} on ${on_string}: GraphProt average profile p50 genomic peaks BED file">
181 <filter>action_type["action_type_selector"] == "predict" and action_type["prediction_options"]["conf_out"] and action_type["genomic_sites_bed_file"] and not action_type["prediction_options"]["ws_pred_out"]</filter>
182 </data>
183
184 <data name="predictions_out_file" format="txt" from_work_dir="GraphProt.predictions" label="${tool.name} on ${on_string}: GraphProt whole site predictions file">
185 <filter>action_type["action_type_selector"] == "predict" and action_type["prediction_options"]["ws_pred_out"]</filter>
186 </data>
187
188 <data name="p50_predictions_out_file" format="txt" from_work_dir="GraphProt.p50.predictions" label="${tool.name} on ${on_string}: GraphProt whole site p50 predictions file">
189 <filter>action_type["action_type_selector"] == "predict" and action_type["prediction_options"]["ws_pred_out"] and action_type["prediction_options"]["conf_out"]</filter>
190 </data>
191
192 </outputs>
193
194 <tests>
195 <!-- Sequence model training with cross validation disabled: model, params, sequence motif text and sequence motif PNG are the four declared outputs. -->
196 <test expect_num_outputs="4">
197 <param name="action_type_selector" value="train"/>
198 <param name="pos_fasta_file" value="test_positives.train.fa" ftype="fasta"/>
199 <param name="neg_fasta_file" value="test_negatives.train.fa" ftype="fasta"/>
200 <param name="hpo_mode_type_selector" value="supply"/>
201 <param name="pos_parop_fasta" value="test_positives.parop.fa" ftype="fasta"/>
202 <param name="neg_parop_fasta" value="test_negatives.parop.fa" ftype="fasta"/>
203 <param name="disable_cv" value="True"/>
204 <param name="disable_motifs" value="False"/>
205 <param name="min_train" value="500"/>
206 <output name="model_out_file" file="test.model"/>
207 <output name="params_out_file" file="test.params"/>
208 <output name="seq_motif_out_file" file="test.sequence_motif"/>
209 </test>
210
211 <test expect_num_outputs="5">
212 <param name="action_type_selector" value="predict"/>
213 <param name="input_fasta_file" value="test_predict.fa" ftype="fasta"/>
214 <param name="model_file" value="test.model" ftype="txt"/>
215 <param name="params_file" value="test.params" ftype="txt"/>
216 <param name="genomic_sites_bed_file" value="test_predict.bed" ftype="bed"/>
217 <param name="conf_out" value="True"/>
218 <output name="genomic_peaks_out_file" file="test_predict.avg_profile.genomic_peaks.bed"/>
219 <output name="avg_profile_out_file" file="test_predict.avg_profile"/>
220 <output name="peaks_out_file" file="test_predict.avg_profile.peaks.bed"/>
221 <output name="p50_peaks_out_file" file="test_predict.avg_profile.p50.peaks.bed"/>
222 <output name="genomic_p50_peaks_out_file" file="test_predict.avg_profile.p50.genomic_peaks.bed"/>
223 </test>
224
225 <test expect_num_outputs="2">
226 <param name="action_type_selector" value="predict"/>
227 <param name="input_fasta_file" value="test_predict.fa" ftype="fasta"/>
228 <param name="model_file" value="test.model" ftype="txt"/>
229 <param name="params_file" value="test.params" ftype="txt"/>
230 <param name="ws_pred_out" value="True"/>
231 <param name="conf_out" value="True"/>
232 <output name="predictions_out_file" file="test_predict.predictions"/>
233 <output name="p50_predictions_out_file" file="test_predict.p50.predictions"/>
234 </test>
235
236 </tests>
237 <help>
238
239 Use GraphProt to train a model or to predict RBP binding profiles using a pretrained RBP model.
240
241
242 **Model training**
243
244 To train a GraphProt model, a FASTA file with positive sequences (= RBP binding sites, usually determined by CLIP-seq) and a FASTA file with negative sequences (non-binding, e.g. randomly selected genomic sites) need to be supplied. By default a sequence model is trained, since sequence models often show performance similar to structure models while taking considerably less time to train. For hyperparameter optimization, a portion of the input FASTA sequences (usually n = 500) is taken away, but you can also provide separate optimization sets. After hyperparameter optimization, a model is trained using the input training sequences (minus the optimization set if not specified otherwise) with the determined optimized parameters. After that, a 10-fold cross validation is run on the training sequences to estimate the generalization performance of the model. Sequence and structure motifs (if structure model training is enabled) are also output. Both cross validation and motif output can be disabled to further decrease the runtime.
245
246 By default, the model training output files are:
247
248 1) a .model file storing the model parameters
249
250 2) a .params file storing model hyperparameters and additional information
251
252 3) a .cv_results file containing the cross validation results
253
254 4) _motif and _motif.png files (sequence and / or structure)
255
256
257 **Profile prediction**
258
259 This mode computes whole site or position-wise (= profile) binding scores for a given set of input FASTA sequences.
260
261 By default, binding profiles are calculated, followed by average profile computation and extraction of peak regions from the average profiles. The average binding profile is smoother with regard to the position-wise (per nucleotide) scores than the initial profile GraphProt outputs and is the recommended way to extract peaks. Note that the amount of smoothness can be controlled in the prediction options (with the lowest value 0 equaling the initial profile). A peak is defined as a contiguous region in the average profile with scores >= the set score threshold (by default 0, can be changed). In addition, a set of high confidence peak regions (p50) can be output. Here the threshold gets set to the median of the scores obtained from the positive training set during model training (information stored in parameters file). Moreover, the peak regions can be converted to genomic regions, if the genomic regions for the input FASTA sequences are supplied.
262
263 Apart from predicting binding profiles, whole site predictions can be output as well. Here the output files are the scores for each input sequence, and optionally the p50 filtered set just like with the average profile peaks.
264
265
266 Summing up, the profile predictions output files are:
267
268 1) an avg_profile file containing the position-wise (per nucleotide) binding profile scores
269
270 2) one or several BED files containing the peak regions (all peaks, p50 peaks, all genomic peaks, p50 genomic peaks)
271
272 3) if whole site prediction is enabled, a .predictions file and optionally a .p50.predictions file
273
274 </help>
275 <citations>
276 <citation type="doi">10.1186/gb-2014-15-1-r17</citation>
277 </citations>
278 </tool>
279
280