annotate immuneml_train_recept.xml @ 2:9bf78cb6b91d draft

"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
author immuneml
date Thu, 01 Jul 2021 14:43:41 +0000
parents 629e7e403e19
children ed3932e6d616
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
1 <tool id="immuneml_train_classifiers" name="Train immune receptor classifiers (simplified interface)" version="@VERSION@.0">
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
2 <description></description>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
3 <macros>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
4 <import>prod_macros.xml</import>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
5 </macros>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
6 <expand macro="requirements" />
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
7 <command><![CDATA[
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
8
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
9 #if $iml_input
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
10 cp -r ${iml_input.extra_files_path}/result/* . &&
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
11 (mv repertoires/* . &>/dev/null || : ) &&
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
12 rm -rf repertoires &&
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
13 #end if
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
14
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
15 python '$__tool_directory__/build_yaml_from_arguments_wrapper.py' --output_path $specs.files_path
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
16 #if $labels
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
17 --labels "$labels"
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
18 #end if
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
19 #if $ml_methods
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
20 #set methods_splitted = str($ml_methods).replace(",", " ")
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
21 --ml_methods $methods_splitted
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
22 #end if
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
23 #if $training_percentage
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
24 --training_percentage $training_percentage
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
25 #end if
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
26 #if $split_count
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
27 --split_count $split_count
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
28 #end if
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
29
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
30 --gap_type $gap_cond.gap_type
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
31 #if $gap_cond.gap_type == "ungapped"
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
32 --k $gap_cond.k
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
33 #end if
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
34 #if $gap_cond.gap_type == "gapped"
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
35 --k_left $gap_cond.k_left
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
36 --k_right $gap_cond.k_right
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
37 --min_gap $gap_cond.min_gap
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
38 --max_gap $gap_cond.max_gap
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
39 #end if
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
40 --position_type $position_type
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
41
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
42 && cp ${specs.files_path}/specs.yaml yaml_copy &&
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
43
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
44 immune-ml ./yaml_copy ${html_outfile.files_path} --tool GalaxyTrainMLModel
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
45
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
46 && mv ${html_outfile.files_path}/index.html ${html_outfile}
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
47 && mv ${specs.files_path}/specs.yaml ${specs}
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
48 && mv ${html_outfile.files_path}/immuneML_output.zip $archive
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
49 && mv ${html_outfile.files_path}/exported_models/*.zip ${optimal_model}
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
50 ]]>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
51 </command>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
52 <inputs>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
53 <param name="iml_input" type="data" format="iml_dataset" label="Input dataset (immune receptors)" help="This field accepts receptor datasets in the ImmuneML dataset format, as created by the Create Dataset tool."/>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
54 <param type="text" name="labels" optional="false" label="Which property (“label”) of the receptors would you like to predict?" help="A receptor property to predict could for example be the epitope it binds to. Such properties must be present as receptor sequence metadata. For example, when using data in VDJdb format, the default fields are named ‘epitope’, ‘epitope_gene’ and ‘epitope_species’. One of these labels must be chosen here."/>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
55
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
56 <conditional name="gap_cond">
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
57 <param type="select" name="gap_type" label="The immune receptors will be classified based on subsequences found in the CDR3 region. I assume that the signal that determines the receptor label is:" display="radio">
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
58 <option value="ungapped">A contiguous subsequence of amino acids</option>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
59 <option value="gapped">Two subsequences of amino acids, possibly separated by a gap</option>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
60 </param>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
61 <when value="gapped">
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
62 <param type="integer" name="k_left" label="Given a gapped signal, the sequence length before the gap is:" value="2" min="0"/>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
63 <param type="integer" name="k_right" label="And the sequence length after the gap is:" value="2" min="0"/>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
64 <param type="integer" name="min_gap" label="While the minimal gap length is:" value="0" min="0"/>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
65 <param type="integer" name="max_gap" label="And the maximal gap length is:" value="5" min="0"/>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
66 </when>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
67 <when value="ungapped">
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
68 <param type="integer" name="k" label="Given a contiguous subsequence of amino acids containing a signal, the expected length of this subsequence is:" value="3" min="0"/>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
69 </when>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
70 </conditional>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
71 <param type="boolean" name="position_type" label="If the same subsequence occurs in a different position in two receptors, is this expected to be the same signal? "
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
72 truevalue="invariant" falsevalue="positional" checked="true"/>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
73
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
74 <param type="select" name="ml_methods" label="Which ML methods would you like to include?" help="For each ML method, the optimal hyper parameter settings are determined and the performance of the methods is compared to each other."
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
75 display="checkboxes" multiple="true">
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
76 <option value="RandomForestClassifier">Random forest</option>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
77 <option value="SimpleLogisticRegression">Logistic regression</option>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
78 <option value="SVM">Support Vector Machine</option>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
79 <option value="KNN">K-nearest neighbors</option>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
80 </param>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
81
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
82 <param type="integer" name="training_percentage" label="Percentage of data that is used for training + validation (the remainder is used for testing):" value="70" min="50" max="90" help="This part of the data is used for training the classifier i.e., learning the relevant patterns in the data and determining the optimal hyper parameter settings for the classifier. The remaining data is used to test the performance of the classifier. There is no golden rule that determines the optimal percentage of training data, but typically a value between 60 and 80% is chosen."/>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
83 <param type="integer" name="split_count" label="Number of times to repeat the training process with different random splits of data:" value="5" min="0" help="The more often the experiment is repeated, the better the performance of the ML models can be estimated, but the longer it will take for the analysis to complete."/>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
84
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
85 </inputs>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
86 <outputs>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
87 <data format="txt" name="specs" label="receptor_classification.yaml"/>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
88 <data format="zip" name="optimal_model" label="optimal_ml_settings.zip"/>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
89 <data format="zip" name="archive" label="Archive: receptor classification"/>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
90 <data format="html" name="html_outfile" label="Summary: receptor classification"/>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
91 </outputs>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
92
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
93
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
94 <help><![CDATA[
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
95 The purpose of this tool is to train machine learning (ML) models to predict a characteristic per immune receptor, such as
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
96 antigen specificity. One or more ML models are trained to classify receptors based on the information within the CDR3 sequence(s). Finally, the performance
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
97 of the different methods is compared.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
98 Alternatively, if you want to predict a property per immune repertoire, such as disease status, check out the
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
99 `Train immune repertoire classifiers (simplified interface) <https://galaxy.immuneml.uio.no/root?tool_id=novice_immuneml_interface>`_ tool instead.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
100
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
101 The full documentation can be found `here <https://docs.immuneml.uio.no/galaxy/galaxy_simple_receptors.html>`_.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
102
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
103 **Basic terminology**
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
104
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
105 In the context of ML, the characteristics to predict per receptor are called **labels** and the values that these labels can
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
106 take on are **classes**. One could thus have a label named ‘epitope’ with possible classes ‘binding_gluten’ and ‘not_binding_gluten’.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
107 The labels and classes must be present in the receptor metadata.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
108
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
109 When training an ML model, the goal is for the model to learn **signals** within the data which discriminate between the different
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
110 classes. An ML model that predicts classes is also referred to as a **classifier**. A signal can have a variety of definitions,
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
111 including the presence of a specific subsequence or conserved positions. Our assumptions about what makes up a ‘signal’
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
112 determine how we should represent our data to the ML model. This representation is called **encoding**. In this tool, the encoding is automatically chosen based on
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
113 the user's assumptions about the dataset.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
114
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
115 .. image:: https://docs.immuneml.uio.no/_images/receptor_classification_overview.png
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
116 :height: 500
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
117
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
118 |
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
119 |
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
120
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
121 **An overview of the components of the immuneML receptor classification tool.**
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
122 ImmuneML reads in receptor data with labels (+ and -), encodes the data, trains user-specified ML models and summarizes
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
123 the performance statistics per ML method.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
124 Encoding: position dependent and invariant encoding are shown. The specificity-associated subsequences are highlighted
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
125 with color. The different colors represent independent elements of the antigen specificity signal. Each color represents
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
126 one subsequence, and position dependent subsequences can only have the same color when they occur in the same position,
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
127 although different colors (i.e., nucleotide or amino acid sequences) may occur in the same position.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
128 Training: the training and validation data is used to train ML models and find the optimal hyperparameters through
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
129 5-fold cross-validation. The test set is left out and is used to obtain a fair estimate of the model performance.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
130
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
131
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
132 **Encoding**
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
133
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
134 Encodings for immune receptor data represent the immune receptor based on the subsequences (e.g., 3 – 5 amino acids long, also referred to as k-mers)
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
135 in the CDR3 regions. The CDR3 regions are divided into overlapping subsequences and the (antigen specificity)
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
136 signal may be characterized by the presence or absence of certain sequence motifs in the CDR3 region.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
137 A graphical representation of how a CDR3 sequence can be divided into k-mers, and how these k-mers can relate to specific positions in a 3D immune receptor
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
138 (here: antibody) is shown in this figure:
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
139
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
140 .. image:: https://docs.immuneml.uio.no/_images/3mer_to_3d.png
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
141 :height: 250
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
142
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
143 |
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
144
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
145 The subsequences may be position dependent or invariant. Position invariant means that if a subsequence, e.g.,
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
146 ‘EDNA’ occurs in different positions in the CDR3 it will still be considered the same signal. This is not the case for
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
147 position dependent subsequences: if ‘EDNA’ often occurs at the beginning of the CDR3 in antigen binding receptors,
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
148 then finding ‘EDNA’ at the end of a CDR3 in a new receptor will be considered unrelated. Positions are determined based
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
149 on the IMGT numbering scheme.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
150
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
151 Finally, it is possible to introduce gaps in the encoding of subsequences (not shown in the Figure). In this case, a
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
152 motif is defined by two subsequences separated by a region of varying nucleotide or amino acid length. Thus, the
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
153 subsequences ‘EDNA’, ‘EDGNA’ and ‘EDGAGAGNA’ may all be considered to be part of the same motif: ‘ED’ followed by ‘NA’
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
154 with a gap of 0 – 5 amino acids in between.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
155
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
156 Note that in any case, the subsequences that are associated with the ‘positive’ class may still be present in the ‘negative’
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
157 class, albeit at a lower rate.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
158
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
159 **Training a machine learning model**
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
160
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
161 Training an ML model means optimizing the **parameters** for the model with the goal of predicting the correct class of an (unseen) immune receptor.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
162 Different ML methods require different procedures for training. In addition to the model parameters there are the **hyperparameters**; these
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
163 hyperparameters do not directly change the predictions of a model, but they control the learning process (for example: the learning speed).
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
164
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
165 The immune receptors are divided into sets with different purposes: the training and validation sets are used for finding the optimal parameters
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
166 and hyperparameters respectively. The test set is held out, and is only used to estimate the performance of a trained model.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
167
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
168 In this tool, a range of plausible hyperparameters have been predefined for each ML method. The optimal hyperparameters are found by splitting the
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
169 training/validation data into 5 equal portions, where 4 portions are used to train the ML model (with different hyperparameters) and the remaining
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
170 portion is used to validate the performance of these hyperparameter settings. This is repeated 5 times such that each portion has been used for
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
171 validation once. With the best hyperparameters found in the 5 repetitions, a final model is trained using all 5 portions of the data. This procedure
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
172 is also referred to as 5-fold cross-validation. Note that this 5-fold cross-validation is separate from the number of times the splitting into
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
173 training + validation and testing sets is done (see the overview figure).
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
174
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
175 Finally, the whole process is repeated one or more times with different randomly selected receptors in the test set, to see how robust the performance
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
176 of the ML methods is. The number of times to repeat this splitting into training + validation and test sets is determined in the last question.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
177
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
178 **Tool output**
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
179
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
180 This Galaxy tool will produce the following history elements:
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
181
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
182 - Summary: receptor classification: an HTML page that allows you to browse through all results, including prediction accuracies on
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
183 the various data splits and plots showing the performance of classifiers and learned parameters.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
184
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
185 - Archive: receptor classification: a .zip file containing the complete output folder as it was produced by immuneML. This folder
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
186 contains the output of the TrainMLModel instruction including all trained models and their predictions, and report results.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
187 Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
188
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
189 - optimal_ml_settings.zip: a .zip file containing the raw files for the optimal trained ML settings (ML model, encoding).
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
190 This .zip file can subsequently be used as an input when `applying previously trained ML models to a new AIRR dataset in Galaxy <https://docs.immuneml.uio.no/galaxy/galaxy_apply_ml_models.html>`_.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
191
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
192 - receptor_classification.yaml: the YAML specification file that was used by immuneML internally to run the analysis. This file can be
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
193 downloaded, altered, and run again by immuneML using the `Train machine learning models <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_train_ml_model>`_ Galaxy tool.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
194
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
195 **More analysis options**
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
196
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
197 A limited selection of immuneML options is available through this tool. If you wish to have full control of the analysis, consider using
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
198 the `Train machine learning models <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_train_ml_model>`_ Galaxy tool.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
199 This tool provides other encodings and machine learning methods to choose from, as well as
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
200 data preprocessing and settings for hyperparameter optimization. The interface of the YAML-based tool expects more independence and knowledge about
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
201 machine learning from the user.
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
202
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
203 ]]>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
204 </help>
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
205
629e7e403e19 "planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
immuneml
parents:
diff changeset
206 </tool>