comparison hicTrainTADClassifier.xml @ 0:2ddd36e02c20 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hicexplorer commit 07802a6bd441d9da888cfb8283f8c2135704f7c9
author iuc
date Wed, 18 Oct 2023 11:13:37 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2ddd36e02c20
1 <tool id="hicexplorer_hictraintadclassifier" name="@BINARY@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
2 <description>train a TAD detection ML model</description>
3 <macros>
4 <token name="@BINARY@">hicTrainTADClassifier</token> <!-- executable name; substituted into the tool name attribute and the command line -->
5 <import>macros.xml</import>
6 </macros>
7 <expand macro="requirements" />
8 <command detect_errors="exit_code"><![CDATA[
9 #import re
10
11 @BINARY@
12 --matrices '$matrix_cooler_multiple'
13 --mode train_new
14 --domain_file '$domainFile'
15 --resolution $resolution
16 @CHROMOSOME_LIST@
17 --normalization_method $normalization_method_select
18 --threads @THREADS@
19 ## Parameters declared inside a conditional must be referenced through the name of that conditional.
20 #if $protein_conditional.protein_selector == 'yes':
21 --protein_file '$protein_conditional.proteinFile'
22 #if $protein_conditional.threshold:
23 --threshold $protein_conditional.threshold
24 #end if
25 #if $protein_conditional.leniency:
26 --leniency $protein_conditional.leniency
27 #end if
28 #end if
29
30 #if $ml_conditional.ml_selector == 'yes':
31 $ml_conditional.unselect_border_cases
32 $ml_conditional.concatenate_before_resample
33 $ml_conditional.use_cleanlab
34 --estimators_per_step $ml_conditional.estimators_per_step
35 --distance $ml_conditional.distance
36 --impute_value $ml_conditional.impute_value
37 --resampling_method $ml_conditional.sampling_method_selector
38 #end if
39 --out_file trained_model
40 ]]>
41 </command>
42 <inputs>
43 <expand macro="matrix_cooler_multiple_macro" />
44 <param name="domainFile" format="tabular" type="data" multiple="true" label="Domain file or list of files containing TAD boundaries"/>
45 <param name="normalization_method_select" type="select" label="Normalization method">
46 <option value="obs_exp" selected="True">Observed / expected</option>
47 <option value="range">0 - 1 range</option>
48 </param>
49 <param argument="--resolution" type="integer" min="0" max="100000000" label="Resolution of the matrix" help="The matrix resolution of the Hi-C interaction matrix." value="10000" />
50
51 <conditional name="protein_conditional">
52 <param name="protein_selector" type="select" label="Consider protein locations">
53 <option value="yes">Yes</option>
54 <option value="no" selected="True">No</option>
55 </param>
56 <when value="yes">
57 <param name="proteinFile" type="data" format="bed" label="Protein file in bed format" />
58 <param name="threshold" type="float" optional="True" label="Threshold" help="Consider only protein peaks with at least the threshold value"/>
59 <param name="leniency" type="float" optional="True" label="Leniency" help="Leniency for protein quality check. Widens peaks of protein file by leniency*resolution"/>
60 </when>
61 <when value="no" />
62 </conditional>
63
64 <conditional name="ml_conditional">
65 <param name="ml_selector" type="select" label="Configure ML model options">
66 <option value="yes">Yes</option>
67 <option value="no" selected="True">No</option>
68 </param>
69 <when value="yes">
70 <param argument="--unselect_border_cases" type="boolean" truevalue="--unselect_border_cases" falsevalue="" label="Unselect border cases" help="Set whether genes at the border of the matrix up to set distance will not be used for training and testing" />
71 <param argument="--concatenate_before_resample" type="boolean" truevalue="--concatenate_before_resample" falsevalue="" label="Concatenate before resample" help="Whether features built from the matrix list are concatenated and resampled together or resampled separately per matrix. Not important for random undersampling, but for other resampling methods try both settings and check whether performance increases." />
72 <param argument="--use_cleanlab" type="boolean" truevalue="--use_cleanlab" falsevalue="" label="Use cleanlab" help="Use Confident Learning with the cleanlab library" />
73 <param argument="--estimators_per_step" type="integer" min="5" max="1000" label="Estimators per step" help="How many estimators are added in each training step for the classifier" value="20" />
74 <param argument="--distance" type="integer" min="5" max="30" label="Distance" help="max distance between TADs to be used in calculation" value="15" />
75 <param argument="--impute_value" type="float" label="Impute value" help="non-numerical float values in matrix will be replaced by this value" value="-1.0" />
76 <param name="sampling_method_selector" type="select" label="Resampling method">
77 <option value="undersample_cluster_centroids">Undersample cluster centroids</option>
78 <option value="undersample_random" selected="True">undersample random</option>
79 <option value="passed_method">passed method</option>
80 </param>
81 </when>
82 <when value="no" />
83 </conditional>
84
85 <expand macro="chromosome_list" />
86 <!-- NOTE(review): saved_classifier is not referenced anywhere in the command template; confirm whether it should be passed to the tool or dropped. -->
87 <param name="saved_classifier" format="binary" optional="True" type="data" label="Use a self-trained classifier"/>
88
89 </inputs>
90 <outputs>
91 <data name="trained_classifier" format="binary" label="Trained classifier" from_work_dir="trained_model.BIN" />
92 </outputs>
93 <tests>
94 <test>
95 <param name="matrix_cooler_multiple" value="hicTrainTADClassifier/small_test_matrix.cool" />
96 <param name="domainFile" value="hicTrainTADClassifier/multiFDR_dekker_domains.bed" />
97 <param name="normalization_method_select" value="obs_exp" />
98 <param name="resolution" value="10000" />
99 <conditional name="protein_conditional">
100 <param name="protein_selector" value="no" />
101 </conditional>
102 <conditional name="ml_conditional">
103 <param name="ml_selector" value="yes" />
104 <param name="unselect_border_cases" value="true" />
105 </conditional>
106 <output name="trained_classifier" ftype="binary" file="hicTrainTADClassifier/model.BIN" />
107 </test>
108 </tests>
109 <help><![CDATA[
110
111 Train TAD predictor
112 ====================
113
114 This program can be used to train new classifiers for hicTADClassifier. These classifiers can later be run to call boundaries for TADs. By default, an EasyEnsembleClassifier as described in Liu et al.: “Exploratory Undersampling for Class-Imbalance Learning” will be trained, but you can pass any sklearn classifier that allows for a warm start. You may also vary the resampling method and a range of hyperparameters to fine-tune the model. Make sure to set the correct normalization method and resolution for the classifier. The program will check and raise warnings when resolutions and normalization methods are mixed up.
115 Also, a protein track file in the narrowPeak format with a threshold value may be passed to filter out low quality boundaries.
116
117 The resulting classifier will be pickled at the specified out_file. A quick example can be seen here, where we varied the feature distance:
118
119
120 Usage
121 -----
122
123 .. code-block:: text
124
125 $ hicTrainTADClassifier -m 'train_new' -f 'my_test_matrix.cool' -d 'domains.bed' -o 'new_classifier.data' -n 'range' -r 10000 --distance 18
126
127
128 For more information about HiCExplorer please consider our documentation on readthedocs.io_.
129
130 .. _readthedocs.io: http://hicexplorer.readthedocs.io/en/latest/index.html
131
132 ]]> </help>
133 <expand macro="citations" />
134 </tool>