changeset 0:629e7e403e19 draft

"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
author immuneml
date Thu, 01 Jul 2021 11:36:43 +0000
parents
children 08042588bd02
files README.md build_dataset_yaml_wrapper.py build_yaml_from_arguments_wrapper.py immuneml_create_dataset.xml immuneml_simulate_dataset.xml immuneml_simulate_events.xml immuneml_train_ml_model.xml immuneml_train_recept.xml immuneml_train_repert.xml immuneml_yaml.xml metadata.png prod_macros.xml repertoire_classification_overview.png test.py
diffstat 14 files changed, 985 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md	Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,23 @@
+# immuneml_tools
+Galaxy tool wrappers for immuneML.
+https://immuneml.uio.no/
+
+## Installation:
+The tools can be installed from a Galaxy toolshed. You can also install them offline by editing Galaxy config files in the usual way.
+
+### New datatype `iml_dataset`
+No matter how you install the tools, you will need to define a new datatype, which is done as follows:
+
+1. In your `galaxy.yml` look up the name of your `datatypes_config_file`. If the name is not yet defined, set
+```
+datatypes_config_file: datatypes_conf.xml
+```
+2. Unless a `datatypes_config_file` was already defined, create `datatypes_conf.xml` by copying `datatypes_conf.xml.sample`.
+3. Add the following line to your `datatypes_config_file`:
+```
+<datatype extension="iml_dataset" type="galaxy.datatypes.text:Html" subclass="True"/>
+```
+The line has to be inside `<registration>` along with the other datatypes.
+
+### The immuneML conda package
+Galaxy will need to install the immuneML conda package. This conda installation typically takes several minutes. 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/build_dataset_yaml_wrapper.py	Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,4 @@
+import sys
+from immuneML.api.galaxy.build_dataset_yaml import main
+if __name__ == "__main__":  # script entry point: runs only when executed directly, not on import
+    main(sys.argv[1:])  # forward the command-line arguments (script name stripped) to immuneML's build_dataset_yaml main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/build_yaml_from_arguments_wrapper.py	Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,4 @@
+import sys
+from immuneML.api.galaxy.build_yaml_from_arguments import main
+if __name__ == "__main__":  # script entry point: runs only when executed directly, not on import
+    main(sys.argv[1:])  # forward the command-line arguments (script name stripped) to immuneML's build_yaml_from_arguments main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_create_dataset.xml	Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,178 @@
+<tool id="immune_ml_dataset" name="Create dataset" version="@VERSION@.0">
+  <description></description>
+  <macros>
+        <import>prod_macros.xml</import>
+  </macros>
+  <expand macro="requirements" />
+  <command><![CDATA[
+      ## Step 1: link every selected data file into the working directory under its original name.
+      #for $input in $interface_cond.data_input
+         #if $input
+            ## A name that already exists in the working directory is reported and skipped.
+            ([ -e ./"$input.element_identifier" ] && echo "File '$input.element_identifier' already exists in the input folder, skipping." || ln -s '$input' "$input.element_identifier") &&
+         #end if
+      #end for
+      ## Step 2: obtain create_dataset.yaml, either generated from the form (simple) or copied from the supplied YAML (advanced).
+      #if $interface_cond.interface == "simple"
+        python3 '$__tool_directory__/build_dataset_yaml_wrapper.py'
+        --output_path . --file_name specs.yaml
+        #if $interface_cond.dataset_cond.dataset_type == "repertoire"
+            --is_repertoire True
+            --format "$interface_cond.dataset_cond.metadata_cond.data_format"
+            #if $interface_cond.dataset_cond.metadata_cond.data_format != "IReceptor"
+                --metadata_file '$interface_cond.dataset_cond.metadata_cond.metadata_input' &&
+                cp '$interface_cond.dataset_cond.metadata_cond.metadata_input' "$interface_cond.dataset_cond.metadata_cond.metadata_input.element_identifier"
+            #end if
+        #else
+            --is_repertoire False
+            --format "$interface_cond.dataset_cond.data_format"
+            --metadata_columns "$interface_cond.dataset_cond.metadata_columns"
+            #if $interface_cond.dataset_cond.dataset_type == "sequence"
+                --paired False
+            #elif $interface_cond.dataset_cond.dataset_type == "receptor"
+                --paired True
+                --receptor_chains $interface_cond.dataset_cond.receptor_type
+            #end if
+        #end if
+         && mv ./specs.yaml create_dataset.yaml &&
+      #else
+        cp '$interface_cond.yaml_input' create_dataset.yaml &&
+      #end if
+      ## Step 3: run immuneML, writing its output into the files directory of the HTML output dataset.
+      immune-ml ./create_dataset.yaml '${html_outfile.files_path}' --tool DatasetGenerationTool &&
+      ## Step 4: expose the summary page and the YAML specification as the two history items.
+      mv '${html_outfile.files_path}/index.html' '${html_outfile}' &&
+      mv ./create_dataset.yaml '${specs}'
+
+      ]]>
+  </command>
+  <inputs>
+      <conditional name="interface_cond">
+          <param type="select" name="interface" label="Which interface would you like to use?" display="radio">
+              <option value="simple">Simplified (limited options)</option>
+              <option value="advanced">Advanced (full control through YAML)</option>
+          </param>
+          <when value="simple">
+              <conditional name="dataset_cond">
+                  <param type="select" name="dataset_type" label="Dataset type" display="radio" help="Repertoire datasets
+                   should be used when making predictions per repertoire, such as predicting a disease state. Sequence or
+                    receptor datasets should be used when predicting values for unpaired (single-chain) and paired immune
+                     receptors respectively, like antigen specificity.">
+                      <option value="repertoire">Repertoire dataset</option>
+                      <option value="sequence">Sequence dataset (single chain)</option>
+                      <option value="receptor">Receptor dataset (paired chains)</option>
+                  </param>
+                  <when value="repertoire">
+                      <conditional name="metadata_cond">
+                          <param type="select" name="data_format" label="Data format" display="radio">
+                              <option value="AIRR">AIRR</option>
+                              <option value="IReceptor">iReceptor Gateway</option>
+                              <option value="ImmunoSEQRearrangement">immunoSEQ: rearrangement-level files</option>
+                              <option value="ImmunoSEQSample">immunoSEQ: sample-level files</option>
+                              <option value="MiXCR">MiXCR</option>
+                              <option value="VDJdb">VDJdb</option>
+                              <option value="TenxGenomics">10x Genomics ‘Clonotype consensus annotations’ (CSV)</option>
+                          </param>
+                          <when value="AIRR">
+                              <param name="metadata_input" type="data" format="txt" label="Metadata file" multiple="false" help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire AIRR file described in the metadata file must be selected under 'Data files'."/>
+                          </when>
+                          <when value="ImmunoSEQRearrangement">
+                              <param name="metadata_input" type="data" format="txt" label="Metadata file" multiple="false" help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire immunoSEQ rearrangement file described in the metadata file must be selected under 'Data files'."/>
+                          </when>
+                          <when value="ImmunoSEQSample">
+                              <param name="metadata_input" type="data" format="txt" label="Metadata file" multiple="false" help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire immunoSEQ sample file described in the metadata file must be selected under 'Data files'."/>
+                          </when>
+                          <when value="MiXCR">
+                              <param name="metadata_input" type="data" format="txt" label="Metadata file" multiple="false" help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire MiXCR file described in the metadata file must be selected under 'Data files'."/>
+                          </when>
+                          <when value="VDJdb">
+                              <param name="metadata_input" type="data" format="txt" label="Metadata file" multiple="false" help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire VDJdb file described in the metadata file must be selected under 'Data files'."/>
+                          </when>
+                          <when value="TenxGenomics">
+                              <param name="metadata_input" type="data" format="txt" label="Metadata file" multiple="false" help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire 10x Genomics file described in the metadata file must be selected under 'Data files'."/>
+                          </when>
+                      </conditional>
+                  </when>
+                  <when value="sequence">
+                      <param type="select" name="data_format" label="Data format" display="radio">
+                          <option value="AIRR">AIRR</option>
+                          <option value="IReceptor">iReceptor Gateway</option>
+                          <option value="ImmunoSEQRearrangement">immunoSEQ: rearrangement-level files</option>
+                          <option value="ImmunoSEQSample">immunoSEQ: sample-level files</option>
+                          <option value="MiXCR">MiXCR</option>
+                          <option value="VDJdb">VDJdb</option>
+                          <option value="TenxGenomics">10x Genomics ‘Clonotype consensus annotations’ (CSV)</option>
+                      </param>
+                      <param type="text" name="metadata_columns" optional="false" label="Metadata columns" help="Please
+                      specify the names of the columns that contain metadata. The metadata columns specified here can be
+                      used as labels for prediction. Multiple metadata columns may be specified and separated by comma,
+                      for example: Epitope,Epitope gene,Epitope species"/>
+                  </when>
+                  <when value="receptor">
+                      <param type="select" name="data_format" label="Data format" display="radio">
+                          <option value="AIRR">AIRR</option>
+                          <option value="IReceptor">iReceptor Gateway</option>
+                          <option value="VDJdb">VDJdb</option>
+                          <option value="TenxGenomics">10x Genomics ‘Clonotype consensus annotations’ (CSV)</option>
+                      </param>
+                      <param type="select" name="receptor_type" label="Receptor type" display="radio">
+                          <option value="TRA_TRB">T cell alpha/beta</option>
+                          <option value="TRG_TRD">T cell gamma/delta</option>
+                          <option value="IGH_IGL">B cell heavy/light</option>
+                          <option value="IGH_IGK">B cell heavy/kappa</option>
+                      </param>
+                      <param type="text" name="metadata_columns" optional="false" label="Metadata columns" help="Please
+                      specify the names of the columns that contain metadata. The metadata columns specified here can be
+                      used as labels for prediction. Multiple metadata columns may be specified and separated by comma,
+                      for example: Epitope,Epitope gene,Epitope species"/>
+                  </when>
+              </conditional>
+              <param name="data_input" type="data" multiple="true" label="Data files" min="1" max="2000" help="This field should include individual repertoire or receptor files, or iReceptor zip files. Multiple files can be selected by holding down the control/command or shift key, or by clicking 'browse datasets' (folder button on the right). Important: make sure all the files you want to include in the dataset are highlighted in blue or gray."/>
+          </when>
+          <when value="advanced">
+              <param name="yaml_input" type="data" format="txt" label="YAML specification" multiple="false"/>
+              <param name="data_input" type="data" multiple="true" label="Data and metadata files" optional="true" help="This field should include individual repertoire or receptor files, or iReceptor zip files, and optionally a metadata file. Multiple files can be selected by holding down the control/command or shift key, or by clicking 'browse datasets' (folder button on the right). Important: make sure all the files you want to include in the dataset are highlighted."/>
+          </when>
+      </conditional>
+  </inputs>
+    <outputs>
+        <data format="txt" name="specs" label="create_dataset.yaml"/>
+        <data format="iml_dataset" name="html_outfile" label="ImmuneML dataset"/>
+    </outputs>
+
+    <!-- User-facing help text (reStructuredText). -->
+    <help><![CDATA[
+
+        In Galaxy, an immuneML dataset is simply a Galaxy collection containing all relevant files (including an optional metadata file).
+        The Create dataset Galaxy tool allows users to import data from various formats and create immuneML datasets in Galaxy.
+        These datasets are in an optimized binary (Pickle) format, which ensures that you can quickly import the dataset into
+        Galaxy tools without having to repeatedly specify the import parameters.
+
+        Before creating a dataset, the relevant data files must first be uploaded to the Galaxy interface. This can be done either
+        by uploading files from your local computer (use the 'Upload file' tool under the 'Get local data' menu), or by fetching
+        remote data from the iReceptor Plus Gateway or VDJdb (see `How to import remote AIRR datasets in Galaxy <https://docs.immuneml.uio.no/galaxy/galaxy_import_remote_data.html>`_).
+
+        The imported immuneML dataset is stored in a Galaxy collection, which will appear as a history item on the right side of the screen,
+        and can later be selected as input to other tools.
+
+        The tool has a simplified and an advanced interface. The simplified interface is fully button-based, and relies
+        on default settings for importing datasets. The advanced interface gives full control over import settings through a YAML
+        specification. In most cases, the simplified interface will suffice.
+
+        For the exhaustive documentation of this tool and more information about immuneML datasets, see the tutorial `How to make an immuneML dataset in Galaxy <https://docs.immuneml.uio.no/galaxy/galaxy_dataset.html>`_.
+
+        **Tool output**
+
+        This Galaxy tool will produce the following history elements:
+
+        - ImmuneML dataset: a sequence, receptor or repertoire dataset which can be used as an input to other immuneML tools. The history element contains a summary HTML page describing general characteristics of the dataset, including the name of the dataset
+          (which is used in the dataset definition of a yaml specification), the dataset type and size, available labels, and a link to download the raw data files.
+
+        - create_dataset.yaml: the YAML specification file that was used by immuneML to create the dataset.
+          This file can be downloaded and altered (for example to export files in AIRR format, or use non-standard import parameters),
+          and run again using the 'Advanced' interface.
+
+    ]]>
+    </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_simulate_dataset.xml	Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,61 @@
+<tool id="immuneml_simulate_dataset" name="Simulate a synthetic immune receptor or repertoire dataset" version="@VERSION@.0">
+  <description></description>
+    <macros>
+        <import>prod_macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+  <command><![CDATA[
+      ## Run immuneML on a working-directory copy of the YAML specification, then publish the summary page and the zipped output.
+      cp '$yaml_input' yaml_copy &&
+      immune-ml ./yaml_copy '${html_outfile.files_path}' --tool DataSimulationTool &&
+
+      mv '${html_outfile.files_path}/index.html' '${html_outfile}' &&
+      mv '${html_outfile.files_path}/immuneML_output.zip' '$archive'
+
+      ]]>
+  </command>
+  <inputs>
+      <param name="yaml_input" type="data" format="txt" label="YAML specification" multiple="false"/>
+  </inputs>
+    <outputs>
+        <data format="zip" name="archive" label="Archive: dataset simulation"/>
+        <data format="iml_dataset" name="html_outfile" label="ImmuneML dataset (simulated sequences)"/>
+    </outputs>
+
+    <!-- User-facing help text (reStructuredText). -->
+  <help><![CDATA[
+
+        This Galaxy tool allows you to quickly make a dummy dataset.
+        The tool generates a SequenceDataset, ReceptorDataset or RepertoireDataset consisting of random CDR3 sequences, which could be used for benchmarking machine learning methods or encodings,
+        or testing out other functionalities.
+        The amino acids in the sequences are chosen from a uniform random distribution, and there is no underlying structure in the sequences.
+
+        You can control:
+
+        - The number of sequences in the dataset, and in the case of a RepertoireDataset, the number of repertoires
+
+        - The length of the generated sequences
+
+        - Labels, which can be used as a target when training ML models
+
+        Note that since these labels are randomly assigned, they do not bear any meaning and it is not possible to train an ML model with high classification accuracy on this data.
+        Meaningful labels can be added using the `Simulate immune events into an existing repertoire dataset <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_simulation>`_ Galaxy tool.
+
+        For the exhaustive documentation of this tool and an example YAML specification, see the tutorial `How to simulate an AIRR dataset in Galaxy <https://docs.immuneml.uio.no/galaxy/galaxy_simulate_dataset.html>`_.
+
+        **Tool output**
+
+        This Galaxy tool will produce the following history elements:
+
+        - ImmuneML dataset (simulated sequences): a sequence, receptor or repertoire dataset which can be used as an input to other immuneML tools. The history element contains a summary HTML page describing general characteristics of the dataset, including the name of the dataset
+          (which is used in the dataset definition of a yaml specification), the dataset type and size, available labels, and a link to download the raw data files.
+
+        - Archive: dataset simulation: a .zip file containing the complete output folder as it was produced by immuneML. This folder
+          contains the output of the DatasetExport instruction including raw data files.
+          Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.
+
+
+    ]]>
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_simulate_events.xml	Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,59 @@
+<tool id="immuneml_simulation" name="Simulate immune events into an existing repertoire dataset" version="@VERSION@.0">
+  <description></description>
+    <macros>
+        <import>prod_macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+  <command><![CDATA[
+      ## If a dataset is supplied, copy its result files into the working directory and flatten repertoires/ (a failing mv is ignored).
+      #if $iml_input
+         cp -r '${iml_input.extra_files_path}'/result/* . &&
+         (mv repertoires/* . >/dev/null 2>&1 || :) &&
+         rm -rf repertoires &&
+      #end if
+      ## Run immuneML on a working-directory copy of the YAML specification, then publish the summary page and the zipped output.
+      cp '$yaml_input' yaml_copy &&
+      immune-ml ./yaml_copy '${html_outfile.files_path}' --tool GalaxySimulationTool &&
+
+      mv '${html_outfile.files_path}/index.html' '${html_outfile}' &&
+      mv '${html_outfile.files_path}/immuneML_output.zip' '$archive'
+
+      ]]>
+  </command>
+  <inputs>
+      <param name="yaml_input" type="data" format="txt" label="YAML specification" multiple="false"/>
+      <param name="iml_input" type="data" format="iml_dataset" label="Dataset input" optional="true" help="This field accepts datasets in iml_dataset format, as created by the Create dataset tool."/>
+  </inputs>
+    <outputs>
+        <data format="zip" name="archive" label="Archive: immune signal simulation"/>
+        <data format="iml_dataset" name="html_outfile" label="ImmuneML dataset (simulated immune signals)"/>
+    </outputs>
+
+    <!-- User-facing help text (reStructuredText). -->
+  <help><![CDATA[
+
+        This Galaxy tool can be used to implant short disease-associated motifs into an existing Repertoire dataset.
+        Such a dataset with simulated immune signals can be used to benchmark different ML methods.
+        Any type of repertoire dataset (experimental or simulated) can be used as a starting point for an immune event simulation, as long as it contains amino acid sequences.
+
+        If you instead want to simulate a synthetic dataset from scratch, start with the
+        tool `Simulate a synthetic immune receptor or repertoire dataset <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_simulate_dataset>`_.
+
+        For the exhaustive documentation of this tool and an example YAML specification, see the tutorial `How to simulate immune events into an existing AIRR dataset in Galaxy <https://docs.immuneml.uio.no/galaxy/galaxy_simulate_signals.html>`_.
+
+        **Tool output**
+
+        This Galaxy tool will produce the following history elements:
+
+        - ImmuneML dataset (simulated immune signals): a repertoire dataset which can be used as an input to other immuneML tools. The history element contains a summary HTML page describing general characteristics of the dataset, including the name of the dataset
+          (which is used in the dataset definition of a yaml specification), the dataset type and size, available labels, and a link to download the raw data files.
+
+        - Archive: immune signal simulation: a .zip file containing the complete output folder as it was produced by immuneML. This folder
+          contains the output of the Simulation instruction including all raw data files.
+          Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.
+
+
+    ]]>
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_train_ml_model.xml	Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,72 @@
+<tool id="immuneml_train_ml_model" name="Train machine learning models" version="@VERSION@.0">
+  <description></description>
+    <macros>
+        <import>prod_macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+  <command><![CDATA[
+      ## If a dataset is supplied, copy its result files into the working directory and flatten repertoires/ (a failing mv is ignored).
+      #if $iml_input
+         cp -r '${iml_input.extra_files_path}'/result/* . &&
+         (mv repertoires/* . >/dev/null 2>&1 || :) &&
+         rm -rf repertoires &&
+      #end if
+
+      ## Link any additional files into the working directory under their original names.
+      #if $data_input
+          #for $input in $data_input
+             ## A name that already exists in the working directory is reported and skipped.
+             ([ -e ./"$input.element_identifier" ] && echo "File '$input.element_identifier' already exists in the input folder, skipping." || ln -s '$input' "$input.element_identifier") &&
+          #end for
+      #end if
+
+      cp '$yaml_input' yaml_copy &&
+      immune-ml ./yaml_copy '${html_outfile.files_path}' --tool GalaxyTrainMLModel &&
+      mv '${html_outfile.files_path}/index.html' '${html_outfile}' &&
+      mv '${html_outfile.files_path}'/exported_models/*.zip '${optimal_model}' &&
+      mv '${html_outfile.files_path}/immuneML_output.zip' '$archive'
+      ]]>
+  </command>
+  <inputs>
+      <param name="yaml_input" type="data" format="txt" label="YAML specification" multiple="false"/>
+      <param name="data_input" type="data" multiple="true" label="Additional files" optional="true" help="This field should include individual repertoire files, metadata files, receptor data and others."/>
+      <param name="iml_input" type="data" format="iml_dataset" label="Dataset input" optional="true" help="This field accepts an ImmuneML dataset, as created by the Create dataset tool."/>
+  </inputs>
+    <outputs>
+        <data format="zip" name="optimal_model" label="optimal_ml_settings.zip"/>
+        <data format="zip" name="archive" label="Archive: ML model training"/>
+        <data format="html" name="html_outfile" label="Summary: ML model training"/>
+    </outputs>
+
+    <!-- User-facing help text (reStructuredText). -->
+  <help>
+      <![CDATA[
+
+      This tool can be used to run hyperparameter optimization over several different ML settings,
+      which include ML models and their parameters, encodings and preprocessing steps. Nested cross-validation is used to identify the optimal combination of ML settings.
+
+      This is a YAML-based Galaxy tool. If you prefer a button-based interface that assumes less ML knowledge,
+      see `Train immune receptor classifiers (simplified interface) <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_train_classifiers>`_ and
+      `Train immune repertoire classifiers (easy interface) <https://galaxy.immuneml.uio.no/root?tool_id=novice_immuneml_interface>`_.
+
+      For more details on how to train ML models in Galaxy, see `the documentation <https://docs.immuneml.uio.no/galaxy/galaxy_train_ml_models.html>`_.
+
+      **Tool output**
+
+      This Galaxy tool will produce the following history elements:
+
+      - Summary: ML model training: an HTML page that allows you to browse through all results, including prediction accuracies on
+        the various data splits and report results.
+
+      - Archive: ML model training: a .zip file containing the complete output folder as it was produced by immuneML. This folder
+        contains the output of the TrainMLModel instruction including all trained models and their predictions, and report results.
+        Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.
+
+      - optimal_ml_settings.zip: a .zip file containing the raw files for the optimal trained ML settings (ML model, encoding, and
+        optionally preprocessing steps). This .zip file can subsequently be used as an input when `applying previously trained ML models to a new AIRR dataset in Galaxy <https://docs.immuneml.uio.no/galaxy/galaxy_apply_ml_models.html>`_.
+
+    ]]>
+
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_train_recept.xml	Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,206 @@
+<tool id="immuneml_train_classifiers" name="Train immune receptor classifiers (simplified interface)" version="@VERSION@.0">
+  <description></description>
+    <macros>
+        <import>prod_macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <command><![CDATA[
+        ## Stage the input dataset: copy its result folder here and move any files under repertoires/ up one level.
+        #if $iml_input
+           cp -r '${iml_input.extra_files_path}'/result/* . &&
+           (mv repertoires/* . &>/dev/null || : ) &&
+           rm -rf repertoires &&
+        #end if
+        ## Call the wrapper script with the form values; specs.yaml is later read from the output path given here.
+        python '$__tool_directory__/build_yaml_from_arguments_wrapper.py' --output_path '$specs.files_path'
+        #if $labels
+            --labels '$labels'
+        #end if
+        #if $ml_methods
+            #set methods_splitted = str($ml_methods).replace(",", " ")
+            --ml_methods $methods_splitted
+        #end if
+        #if $training_percentage
+            --training_percentage $training_percentage
+        #end if
+        #if $split_count
+            --split_count $split_count
+        #end if
+        ## Only the arguments that belong to the selected gap type are passed on.
+        --gap_type $gap_cond.gap_type
+        #if $gap_cond.gap_type == "ungapped"
+            --k $gap_cond.k
+        #end if
+        #if $gap_cond.gap_type == "gapped"
+            --k_left $gap_cond.k_left
+            --k_right $gap_cond.k_right
+            --min_gap $gap_cond.min_gap
+            --max_gap $gap_cond.max_gap
+        #end if
+       --position_type $position_type
+        ## immuneML runs on a copy so that the generated specs.yaml can still be moved to its own output below.
+       && cp '${specs.files_path}/specs.yaml' yaml_copy &&
+
+      immune-ml ./yaml_copy '${html_outfile.files_path}' --tool GalaxyTrainMLModel
+        ## Move the results to the Galaxy outputs; the glob stays outside the quotes so the shell expands it.
+      && mv '${html_outfile.files_path}/index.html' '${html_outfile}'
+      && mv '${specs.files_path}/specs.yaml' '${specs}'
+      && mv '${html_outfile.files_path}/immuneML_output.zip' '$archive'
+      && mv '${html_outfile.files_path}'/exported_models/*.zip '${optimal_model}'
+      ]]>
+</command>
+    <inputs>
+        <param name="iml_input" type="data" format="iml_dataset" label="Input dataset (immune receptors)" help="This field accepts receptor datasets in the ImmuneML dataset format, as created by the Create Dataset tool."/>
+        <param type="text" name="labels" optional="false" label="Which property (“label”) of the receptors would you like to predict?" help="A receptor property to predict could for example be the epitope it binds to. Such properties must be present as receptor sequence metadata. For example, when using data in VDJdb format, the default fields are named ‘epitope’, ‘epitope_gene’ and ‘epitope_species’. One of these labels must be chosen here."/>
+
+        <conditional name="gap_cond">
+            <param type="select" name="gap_type" label="The immune receptors will be classified based on subsequences found in the CDR3 region. I assume that the signal that determines the receptor label is:" display="radio">
+                <option value="ungapped">A contiguous subsequence of amino acids</option>
+                <option value="gapped">Two subsequences of amino acids, possibly separated by a gap</option>
+            </param>
+            <when value="gapped">
+                <param type="integer" name="k_left" label="Given a gapped signal, the sequence length before the gap is:" value="2" min="0"/>
+                <param type="integer" name="k_right" label="And the sequence length after the gap is:" value="2" min="0"/>
+                <param type="integer" name="min_gap" label="While the minimal gap length is:" value="0" min="0"/>
+                <param type="integer" name="max_gap" label="And the maximal gap length is:" value="5" min="0"/>
+            </when>
+            <when value="ungapped">
+                <param type="integer" name="k" label="Given a contiguous subsequence of amino acids containing a signal, the expected length of this subsequence is:" value="3" min="0"/>
+            </when>
+        </conditional>
+        <param type="boolean" name="position_type" label="If the same subsequence occurs in a different position in two receptors, is this expected to be the same signal? "
+                      truevalue="invariant" falsevalue="positional" checked="true"/>
+
+        <param type="select" name="ml_methods" label="Which ML methods would you like to include?" help="For each ML method, the optimal hyper parameter settings are determined and the performance of the methods is compared to each other."
+            display="checkboxes" multiple="true">
+          <option value="RandomForestClassifier">Random forest</option>
+          <option value="SimpleLogisticRegression">Logistic regression</option>
+          <option value="SVM">Support Vector Machine</option>
+          <option value="KNN">K-nearest neighbors</option>
+        </param>
+
+        <param type="integer" name="training_percentage" label="Percentage of data that is used for training + validation (the remainder is used for testing):" value="70" min="50" max="90" help="This part of the data is used for training the classifier i.e., learning the relevant patterns in the data and determining the optimal hyper parameter settings for the classifier. The remaining data is used to test the performance of the classifier. There is no golden rule that determines the optimal percentage of training data, but typically a value between 60 and 80% is chosen."/>
+        <param type="integer" name="split_count" label="Number of times to repeat the training process with different random splits of data:" value="5" min="1" help="The more often the experiment is repeated, the better the performance of the ML models can be estimated, but the longer it will take for the analysis to complete."/> <!-- min is 1: the command template omits the split_count argument when the value is 0 -->
+
+    </inputs>
+    <outputs>
+        <data format="txt" name="specs" label="receptor_classification.yaml"/>
+        <data format="zip" name="optimal_model" label="optimal_ml_settings.zip"/>
+        <data format="zip" name="archive" label="Archive: receptor classification"/>
+        <data format="html" name="html_outfile" label="Summary: receptor classification"/>
+    </outputs>
+
+
+    <help><![CDATA[
+        The purpose of this tool is to train machine learning (ML) models to predict a characteristic per immune receptor, such as
+        antigen specificity. One or more ML models are trained to classify receptors based on the information within the CDR3 sequence(s). Finally, the performance
+        of the different methods is compared.
+        Alternatively, if you want to predict a property per immune repertoire, such as disease status, check out the
+        `Train immune repertoire classifiers (simplified interface) <https://galaxy.immuneml.uio.no/root?tool_id=novice_immuneml_interface>`_ tool instead.
+
+        The full documentation can be found `here <https://docs.immuneml.uio.no/galaxy/galaxy_simple_receptors.html>`_.
+
+        **Basic terminology**
+
+        In the context of ML, the characteristics to predict per receptor are called **labels** and the values that these labels can
+        take on are **classes**. One could thus have a label named ‘epitope’ with possible classes ‘binding_gluten’ and ‘not_binding_gluten’.
+        The labels and classes must be present in the receptor metadata.
+
+        When training an ML model, the goal is for the model to learn **signals** within the data which discriminate between the different
+        classes. An ML model that predicts classes is also referred to as a **classifier**. A signal can have a variety of definitions,
+        including the presence of a specific subsequence or conserved positions. Our assumptions about what makes up a ‘signal’
+        determines how we should represent our data to the ML model. This representation is called **encoding**. In this tool, the encoding is automatically chosen based on
+        the user's assumptions about the dataset.
+
+        .. image:: https://docs.immuneml.uio.no/_images/receptor_classification_overview.png
+            :height: 500
+
+        |
+        |
+
+        **An overview of the components of the immuneML receptor classification tool.**
+        immuneML reads in receptor data with labels (+ and -), encodes the data, trains user-specified ML models and summarizes
+        the performance statistics per ML method.
+        Encoding: position dependent and invariant encoding are shown. The specificity-associated subsequences are highlighted
+        with color. The different colors represent independent elements of the antigen specificity signal. Each color represents
+        one subsequence, and position dependent subsequences can only have the same color when they occur in the same position,
+        although different colors (i.e., nucleotide or amino acid sequences) may occur in the same position.
+        Training: the training and validation data is used to train ML models and find the optimal hyperparameters through
+        5-fold cross-validation. The test set is left out and is used to obtain a fair estimate of the model performance.
+
+
+        **Encoding**
+
+        Encodings for immune receptor data represent the immune receptor based on the subsequences (e.g., 3 – 5 amino acids long, also referred to as k-mers)
+        in the CDR3 regions. The CDR3 regions are divided into overlapping subsequences and the (antigen specificity)
+        signal may be characterized by the presence or absence of certain sequence motifs in the CDR3 region.
+        A graphical representation of how a CDR3 sequence can be divided into k-mers, and how these k-mers can relate to specific positions in a 3D immune receptor
+        (here: antibody) is shown in this figure:
+
+        .. image:: https://docs.immuneml.uio.no/_images/3mer_to_3d.png
+            :height: 250
+
+        |
+
+        The subsequences may be position dependent or invariant. Position invariant means that if a subsequence, e.g.,
+        ‘EDNA’ occurs in different positions in the CDR3 it will still be considered the same signal. This is not the case for
+        position dependent subsequences: if ‘EDNA’ often occurs in the beginning of the CDR3 in antigen binding receptors,
+        then finding ‘EDNA’ in the end of a CDR3 in a new receptor will be considered unrelated. Positions are determined based
+        on the IMGT numbering scheme.
+
+        Finally, it is possible to introduce gaps in the encoding of subsequences (not shown in the Figure). In this case, a
+        motif is defined by two subsequences separated by a region of varying nucleotide or amino acid length. Thus, the
+        subsequences ‘EDNA’, ‘EDGNA’ and ‘EDGAGAGNA’ may all be considered to be part of the same motif: ‘ED’ followed by ‘NA’
+        with a gap of 0 – 5 amino acids in between.
+
+        Note that in any case, the subsequences that are associated with the ‘positive’ class may still be present in the ‘negative’
+        class, albeit at a lower rate.
+
+        **Training a machine learning model**
+
+        Training an ML model means optimizing the **parameters** for the model with the goal of predicting the correct class of an (unseen) immune receptor.
+        Different ML methods require different procedures for training. In addition to the model parameters there are the **hyperparameters**, which
+        do not directly change the predictions of a model, but control the learning process (for example: the learning speed).
+
+        The immune receptors are divided into sets with different purposes: the training and validation sets are used for finding the optimal parameters
+        and hyperparameters respectively. The test set is held out, and is only used to estimate the performance of a trained model.
+
+        In this tool, a range of plausible hyperparameters have been predefined for each ML method. The optimal hyperparameters are found by splitting the
+        training/validation data into 5 equal portions, where 4 portions are used to train the ML model (with different hyperparameters) and the remaining
+        portion is used to validate the performance of these hyperparameter settings. This is repeated 5 times such that each portion has been used for
+        validation once. With the best hyperparameters found in the 5 repetitions, a final model is trained using all 5 portions of the data. This procedure
+        is also referred to as 5-fold cross-validation. Note that this 5-fold cross-validation is separate from the number of times the splitting into
+        training + validation and testing sets is done (see the overview figure).
+
+        Finally, the whole process is repeated one or more times with different randomly selected receptors in the test set, to see how robust the performance
+        of the ML methods is. The number of times to repeat this splitting into training + validation and test sets is determined in the last question.
+
+        **Tool output**
+
+        This Galaxy tool will produce the following history elements:
+
+        - Summary: receptor classification: an HTML page that allows you to browse through all results, including prediction accuracies on
+          the various data splits and plots showing the performance of classifiers and learned parameters.
+
+        - Archive: receptor classification: a .zip file containing the complete output folder as it was produced by immuneML. This folder
+          contains the output of the TrainMLModel instruction including all trained models and their predictions, and report results.
+          Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.
+
+        - optimal_ml_settings.zip: a .zip file containing the raw files for the optimal trained ML settings (ML model, encoding).
+          This .zip file can subsequently be used as an input when `applying previously trained ML models to a new AIRR dataset in Galaxy <https://docs.immuneml.uio.no/galaxy/galaxy_apply_ml_models.html>`_.
+
+        - receptor_classification.yaml: the YAML specification file that was used by immuneML internally to run the analysis. This file can be
+          downloaded, altered, and run again by immuneML using the `Train machine learning models <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_train_ml_model>`_ Galaxy tool.
+
+        **More analysis options**
+
+        A limited selection of immuneML options is available through this tool. If you wish to have full control of the analysis, consider using
+        the `Train machine learning models <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_train_ml_model>`_ Galaxy tool.
+        This tool provides other encodings and machine learning methods to choose from, as well as
+        data preprocessing and settings for hyperparameter optimization. The interface of the YAML-based tool expects more independence and knowledge about
+        machine learning from the user.
+
+    ]]>
+    </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_train_repert.xml	Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,235 @@
+<tool id="novice_immuneml_interface" name="Train immune repertoire classifiers (simplified interface)" version="@VERSION@.0">
+  <description></description>
+    <macros>
+        <import>prod_macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <command><![CDATA[
+      ## Stage the input dataset: copy its result folder here and move any files under repertoires/ up one level.
+      #if $iml_input
+         cp -r '${iml_input.extra_files_path}'/result/* . &&
+         (mv repertoires/* . &>/dev/null || :) &&
+         rm -rf repertoires &&
+      #end if
+        ## Call the wrapper script with the form values; specs.yaml is later read from the output path given here.
+        python '$__tool_directory__/build_yaml_from_arguments_wrapper.py' --output_path '$specs.files_path'
+        #if $labels
+            --labels '$labels'
+        #end if
+        #if $ml_methods
+            #set methods_splitted = str($ml_methods).replace(",", " ")
+            --ml_methods $methods_splitted
+        #end if
+        #if $training_percentage
+            --training_percentage $training_percentage
+        #end if
+        #if $split_count
+            --split_count $split_count
+        #end if
+        #if $sequence_cond.sequence_type
+            --sequence_type $sequence_cond.sequence_type
+        #end if
+        #if $sequence_cond.sequence_type == "subsequence"
+            --position_type $sequence_cond.position_type
+            --gap_type $sequence_cond.gap_cond.gap_type
+            #if $sequence_cond.gap_cond.gap_type == "ungapped"
+                --k $sequence_cond.gap_cond.k
+            #end if
+            #if $sequence_cond.gap_cond.gap_type == "gapped"
+                --k_left $sequence_cond.gap_cond.k_left
+                --k_right $sequence_cond.gap_cond.k_right
+                --min_gap $sequence_cond.gap_cond.min_gap
+                --max_gap $sequence_cond.gap_cond.max_gap
+            #end if
+        #end if
+        #if $reads
+            --reads $reads
+        #end if
+        ## immuneML runs on a copy so that the generated specs.yaml can still be moved to its own output below.
+       && cp '${specs.files_path}/specs.yaml' yaml_copy &&
+
+      immune-ml ./yaml_copy '${html_outfile.files_path}' --tool GalaxyTrainMLModel
+        ## Move the results to the Galaxy outputs; the glob stays outside the quotes so the shell expands it.
+      && mv '${html_outfile.files_path}/index.html' '${html_outfile}'
+      && mv '${specs.files_path}/specs.yaml' '${specs}'
+      && mv '${html_outfile.files_path}/immuneML_output.zip' '$archive'
+      && mv '${html_outfile.files_path}'/exported_models/*.zip '${optimal_model}'
+      ]]>
+</command>
+    <inputs>
+        <param name="iml_input" type="data" format="iml_dataset"  label="Input dataset (immune repertoires)" help="Here you can select an ImmuneML dataset containing a repertoire dataset, as produced by the ‘Create dataset’ tool. Please make sure your dataset contains enough repertoires, we recommend using at least 50. The minimum number of repertoires needed to run this tool successfully is 14 (for example: 7 diseased and 7 healthy). More repertoires are needed if your dataset is imbalanced (many more diseased or many more healthy), or if you decrease the percentage of data that is used for training. "/>
+        <param type="text" name="labels" optional="false" label="Which property (“label”) of the repertoires would you like to predict?" help="Repertoire property to predict could for example be a disease status. This property must be present as a label in the repertoire metadata."/>
+
+        <conditional name="sequence_cond">
+            <param type="select" name="sequence_type" label="I assume that the true class of a repertoire (for example: disease status) can be determined based on the presence of..." display="radio" help="See 'Encoding' in the tool description.">
+              <option value="subsequence">Similar (but not identical) CDR3 sequences, or identical subsequences</option>
+              <option value="complete">Complete and identical receptor sequences</option>
+            </param>
+            <when value="subsequence">
+               <param type="boolean" name="position_type" label="If the same CDR3 subsequence occurs in a different position in two receptors, is this expected to be the same signal? "
+                      truevalue="invariant" falsevalue="positional" checked="true"/>
+                <conditional name="gap_cond">
+                    <param type="select" name="gap_type" label="The signal is expected to correspond to:" display="radio">
+                        <option value="ungapped">Contiguous subsequences of amino acids</option>
+                        <option value="gapped">Subsequences of amino acids separated by a gap</option>
+                    </param>
+                    <when value="ungapped">
+                        <param type="integer" name="k" label="Given a contiguous subsequence of amino acids containing a signal, the expected length of this subsequence is:" value="3" min="0"/>
+                    </when>
+                    <when value="gapped">
+                        <param type="integer" name="k_left" label="Given a gapped signal, the sequence length before the gap is:" value="2" min="0"/>
+                        <param type="integer" name="k_right" label="And the sequence length after the gap is:" value="2" min="0"/>
+                        <param type="integer" name="min_gap" label="While the minimal gap length is:" value="0" min="0"/>
+                        <param type="integer" name="max_gap" label="And the maximal gap length is:" value="5" min="0"/>
+                    </when>
+                </conditional>
+            </when>
+        </conditional>
+        <param type="select" name="reads" label="I assume that" display="radio" help="If only the presence/absence of a clonotype matters, the read frequency (‘count’) information is ignored. Otherwise, the importance of a sequence or subsequence is scaled by its read frequency, and large clonotypes will have more influence on the ML model and its results.">
+            <option value="unique">Only the presence/absence of a clone matters</option>
+            <option value="all">The frequency of a clone matters</option>
+        </param>
+
+        <param type="select" name="ml_methods" label="Which ML methods would you like to include?" help="For each ML method, the optimal hyper parameter settings are determined and the performance of the methods is compared to each other."
+            display="checkboxes" multiple="true">
+          <option value="RandomForestClassifier">Random forest</option>
+          <option value="SimpleLogisticRegression">Logistic regression</option>
+          <option value="SVM">Support Vector Machine</option>
+          <option value="KNN">K-nearest neighbors</option>
+        </param>
+
+        <param type="integer" name="training_percentage" label="Percentage of data that is used for training + validation (the remainder is used for testing):" value="70" min="50" max="90" help="This part of the data is used for training the classifier i.e., learning the relevant patterns in the data and determining the optimal hyper parameter settings for the classifier. The remaining data is used to test the performance of the classifier. There is no golden rule that determines the optimal percentage of training data, but typically a value between 60 and 80% is chosen."/>
+        <param type="integer" name="split_count" label="Number of times to repeat the training process with different random splits of data:" value="5" min="1" help="This is the number of times we split into random sets for training + validation and testing. The more often the experiment is repeated, the better the performance of the ML models can be estimated, but the longer it will take for the analysis to complete. "/> <!-- min is 1: the command template omits the split_count argument when the value is 0 -->
+
+    </inputs>
+    <outputs>
+        <data format="txt" name="specs" label="repertoire_classification.yaml"/>
+        <data format="zip" name="optimal_model" label="optimal_ml_settings.zip"/>
+        <data format="zip" name="archive" label="Archive: repertoire classification"/>
+        <data format="html" name="html_outfile" label="Summary: repertoire classification"/>
+    </outputs>
+
+
+    <help><![CDATA[
+        The purpose of this tool is to train machine learning (ML) models to predict a characteristic per immune repertoire, such as
+        a disease status. One or more ML models are trained to classify repertoires based on the information within the sets of CDR3 sequences. Finally, the performance
+        of the different methods is compared.
+        Alternatively, if you want to predict a property per immune receptor, such as antigen specificity, check out the
+        `Train immune receptor classifiers (simplified interface) <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_train_classifiers>`_ tool instead.
+
+        The full documentation can be found `here <https://docs.immuneml.uio.no/galaxy/galaxy_simple_repertoires.html>`_.
+
+        **Basic terminology**
+
+        In the context of ML, the characteristics to predict per repertoire are called **labels** and the values that these labels can take on are **classes**.
+        One could thus have a label named ‘CMV_status’ with possible classes ‘positive’ and ‘negative’. The labels and classes must be present in the metadata
+        file, in columns where the header and values correspond to the label and classes respectively.
+
+        .. image:: https://docs.immuneml.uio.no/_images/metadata_repertoire_classification.png
+            :height: 150
+
+        |
+
+        When training an ML model, the goal is for the model to learn **signals** within the data which discriminate between the different classes. An ML model
+        that predicts classes is also referred to as a **classifier**. A signal can have a variety of definitions, including the presence of specific receptors,
+        groups of similar receptors or short CDR3 subsequences in an immune repertoire. Our assumptions about what makes up a ‘signal’ determines how we
+        should represent our data to the ML model. This representation is called **encoding**. In this tool, the encoding is automatically chosen based on
+        the user's assumptions about the dataset.
+
+
+        .. image:: https://docs.immuneml.uio.no/_images/repertoire_classification_overview.png
+            :height: 500
+
+        |
+        |
+
+        **An overview of the components of the immuneML repertoire classification tool.**
+        immuneML reads in repertoire data with labels (+ and -), encodes the
+        data, trains user-specified ML models and summarizes the performance statistics per ML method.
+        Encoding: different forms of encoding are shown; full sequence encoding and position dependent and invariant subsequence encoding.
+        The disease-associated sequences or sub-sequences are highlighted with color. The different colors represent independent elements of the disease signal.
+        Each color represents one (sub)sequence, and position dependent subsequences can only have the same color when they occur in the same position,
+        although different colors (i.e., nucleotide or amino acid sequences) may occur in the same position.
+        Training: the training and validation data is used to train ML models and find the optimal hyperparameters through 5-fold cross-validation.
+        The test set is left out and is used to obtain a fair estimate of the model performance.
+
+        **Encoding**
+
+        The simplest encoding represents an immune repertoire based on the full CDR3 sequences that it contains. This means the ML models will learn to look
+        at which CDR3 sequences are more often present in the ‘positive’ or ‘negative’ classes. It also means that two similar (non-identical) CDR3 sequences
+        are treated as independent pieces of information; if a particular sequence often occurs in diseased repertoires, then finding a similar sequence in a
+        new repertoire is no evidence for this repertoire also being diseased.
+
+        Other encoding variants are based on shorter subsequences (e.g., 3 – 5 amino acids long, also referred to as k-mers) in the CDR3 regions of an immune repertoire. With this
+        encoding, the CDR3 regions are divided into overlapping subsequences and the (disease) signal may be characterized by the presence or absence of
+        certain sequence motifs in the CDR3 regions. Here, two similar CDR3 sequences are no longer independent, because they contain many identical subsequences.
+        A graphical representation of how a CDR3 sequence can be divided into k-mers, and how these k-mers can relate to specific positions in a 3D immune receptor
+        (here: antibody) is shown in this figure:
+
+        .. image:: https://docs.immuneml.uio.no/_images/3mer_to_3d.png
+            :height: 250
+
+        |
+
+        The subsequences may be position-dependent or invariant. Position invariant means that if a subsequence, e.g., ‘EDNA’ occurs in different positions
+        in the CDR3 it will still be considered the same signal. This is not the case for position dependent subsequences: if ‘EDNA’ often occurs in the
+        beginning of the CDR3 in diseased repertoires, then finding ‘EDNA’ in the end of a CDR3 in a new repertoire will be considered unrelated. Positions
+        are determined based on the IMGT numbering scheme.
+
+        Finally, it is possible to introduce gaps in the encoding of subsequences (not shown in the Figure). In this case, a motif is defined by two
+        subsequences separated by a region of varying nucleotide or amino acid length. Thus, the subsequences ‘EDNA’, ‘EDGNA’ and ‘EDGAGAGNA’ may all be
+        considered to be part of the same motif: ‘ED’ followed by ‘NA’ with a gap of 0 – 5 amino acids in between.
+
+        Note that in any case, the (sub)sequences that are associated with the ‘positive’ class may still be present in the ‘negative’ class, albeit at a lower rate.
+
+
+
+        **Training a machine learning model**
+
+        Training an ML model means optimizing the **parameters** for the model with the goal of predicting the correct class of an (unseen) immune repertoire.
+        Different ML methods require different procedures for training. In addition to the model parameters there are the **hyperparameters**, which
+        do not directly change the predictions of a model, but control the learning process (for example: the learning speed).
+
+        The immune repertoires are divided into sets with different purposes: the training and validation sets are used for finding the optimal parameters
+        and hyperparameters respectively. The test set is held out, and is only used to estimate the performance of a trained model.
+
+        In this tool, a range of plausible hyperparameters have been predefined for each ML method. The optimal hyperparameters are found by splitting the
+        training/validation data into 5 equal portions, where 4 portions are used to train the ML model (with different hyperparameters) and the remaining
+        portion is used to validate the performance of these hyperparameter settings. This is repeated 5 times such that each portion has been used for
+        validation once. With the best hyperparameters found in the 5 repetitions, a final model is trained using all 5 portions of the data. This procedure
+        is also referred to as 5-fold cross-validation. Note that this 5-fold cross-validation is separate from the number of times the splitting into
+        training + validation and testing sets is done (see the overview figure).
+
+        Finally, the whole process is repeated one or more times with different randomly selected repertoires in the test set, to see how robust the performance
+        of the ML methods is. The number of times to repeat this splitting into training + validation and test sets is determined in the last question.
+
+
+        **Tool output**
+
+        This Galaxy tool will produce the following history elements:
+
+        - Summary: repertoire classification: an HTML page that allows you to browse through all results, including prediction accuracies on
+          the various data splits and plots showing the performance of classifiers and learned parameters.
+
+        - Archive: repertoire classification: a .zip file containing the complete output folder as it was produced by immuneML. This folder
+          contains the output of the TrainMLModel instruction including all trained models and their predictions, and report results.
+          Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.
+
+        - optimal_ml_settings.zip: a .zip file containing the raw files for the optimal trained ML settings (ML model, encoding).
+          This .zip file can subsequently be used as an input when `applying previously trained ML models to a new AIRR dataset in Galaxy <https://docs.immuneml.uio.no/galaxy/galaxy_apply_ml_models.html>`_.
+
+        - repertoire_classification.yaml: the YAML specification file that was used by immuneML internally to run the analysis. This file can be
+          downloaded, altered, and run again by immuneML using the `Train machine learning models <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_train_ml_model>`_ Galaxy tool.
+
+        **More analysis options**
+
+        A limited selection of immuneML options is available through this tool. If you wish to have full control of the analysis, consider using
+        the `Train machine learning models <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_train_ml_model>`_ Galaxy tool.
+        This tool provides other encodings and machine learning methods to choose from, as well as
+        data preprocessing and settings for hyperparameter optimization. The interface of the YAML-based tool expects more independence and knowledge about
+        machine learning from the user.
+
+    ]]>
+    </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_yaml.xml	Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,74 @@
+<tool id="immune_ml" name="Run immuneML with any YAML specification" version="@VERSION@.0">
+  <description></description>
+    <macros>
+        <import>prod_macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+  <!-- Stages the optional dataset and additional files in the working directory, runs immuneML on a copy of the YAML specification, then moves the HTML summary and the zip archive to the Galaxy outputs. -->
+  <command><![CDATA[
+
+      ## Copy the contents of an existing immuneML dataset; files under repertoires/ are moved up into the working directory.
+      #if $iml_input
+         cp -r '${iml_input.extra_files_path}'/result/* . &&
+         (mv repertoires/* . >/dev/null 2>&1 || :) &&
+         rm -rf repertoires &&
+      #end if
+
+      ## Link every additional file under its original name unless a file with that name is already present.
+      #if $data_input
+          #for $input in $data_input
+             ([ -e ./"$input.element_identifier" ] && echo "File '$input.element_identifier' already exists in the input folder, skipping." || ln -s '$input' "$input.element_identifier") &&
+          #end for
+      #end if
+
+      cp "$yaml_input" yaml_copy &&
+      immune-ml ./yaml_copy '${html_outfile.files_path}' --tool GalaxyYamlTool &&
+      mv '${html_outfile.files_path}'/index.html '$html_outfile' &&
+      mv '${html_outfile.files_path}'/immuneML_output.zip '$archive'
+
+      ]]>
+  </command>
+  <!-- Tool parameters; the command template refers to them as $yaml_input, $data_input and $iml_input. Only the YAML specification is mandatory. -->
+  <inputs>
+      <param name="yaml_input" type="data" format="txt" label="YAML specification" multiple="false"/>
+      <param name="data_input" type="data" multiple="true" label="Additional files" optional="true" help="This field should include individual repertoire files, metadata files, receptor data and others."/>
+      <param name="iml_input" type="data"  format="iml_dataset" label="ImmuneML dataset" optional="true" help="This field accepts ImmuneML datasets, as created by the Create Dataset tool."/>
+  </inputs>
+    <!-- History elements produced by the tool: the command template moves immuneML_output.zip to $archive and index.html to $html_outfile. -->
+    <outputs>
+        <data format="zip" name="archive" label="Archive: immuneML Analysis"/>
+        <data format="html" name="html_outfile" label="Summary: immuneML analysis"/>
+    </outputs>
+
+
+      <help><![CDATA[
+
+
+        This Galaxy tool can be used to run any possible YAML-based immuneML analysis in Galaxy.
+
+        It is typically recommended to use the analysis-specific Galaxy tools for
+        `creating datasets <https://galaxy.immuneml.uio.no/root?tool_id=immune_ml_dataset>`_,
+        `simulating synthetic data <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_simulate_dataset>`_,
+        `implanting synthetic immune signals <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_simulation>`_ or
+        `training <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_train_ml_model>`_ and
+        `applying <https://galaxy.immuneml.uio.no/root?tool_id=immuneml_apply_ml_model>`_ ML models instead of this tool.
+        These other tools are able to export the relevant output files to Galaxy history elements.
+
+        However, when you want to run the `ExploratoryAnalysis <https://docs.immuneml.uio.no/specification.html#exploratoryanalysis>`_ instruction,
+        or other analyses that do not have a corresponding Galaxy tool, this generic tool can be used.
+
+        For the exhaustive documentation of this tool and an example YAML specification for exploratory analysis, see the tutorial `How to run any AIRR ML analysis in Galaxy <https://docs.immuneml.uio.no/galaxy/galaxy_general_yaml.html>`_.
+
+
+        **Tool output**
+
+        This Galaxy tool will produce the following history elements:
+
+        - Summary: immuneML analysis: an HTML page that allows you to browse through all results.
+
+        - Archive: immuneML Analysis: a .zip file containing the complete output folder as it was produced by immuneML. This folder
+          contains the output of the instruction that was used, including all raw data files.
+          Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.
+
+
+    ]]>
+    </help>
+
+</tool>
Binary file metadata.png has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/prod_macros.xml	Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,9 @@
+<!-- Shared macros imported by the immuneML tool wrappers. -->
+<macros>
+  <!-- Version token: used below as the immuneML package version and in the tool's version attribute ("@VERSION@.0"). -->
+  <token name="@VERSION@">2.0.1</token>
+    <!-- Requirements macro; tools pull it in with <expand macro="requirements" />. -->
+    <xml name="requirements">
+        <requirements>
+          <requirement type="package" version="@VERSION@">immuneML</requirement>
+            <!-- Placeholder for content supplied by the expanding tool (e.g. further requirements). -->
+            <yield/>
+        </requirements>
+    </xml>
+</macros>
Binary file repertoire_classification_overview.png has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test.py	Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,60 @@
+import argparse
+import os
+from shutil import copyfile
+
+#immuneml --inputs file1 file2 file3 --output_dir /some/path --yaml_path abc.yml --metadata abc.csv --tool galaxy_yaml_tool
+
+def get_args():
+    parser = argparse.ArgumentParser(description='Tool for detecting known and novel MicroRNAs')
+    parser.add_argument('-o', '--output_dir', help='Output directory', default='.', required=True)
+    parser.add_argument('-i', '--inputs', help='Input directory', default='.', required=True, nargs='+')
+    parser.add_argument('-y', '--yaml', help='Yaml input', default='.', required=True)
+    parser.add_argument('-m', '--metadata', help='Metadata input', default='.', required=False)
+    parser.add_argument('-t', '--tool', help='Tool', default='.', required=False)
+
+    return parser.parse_args()
+
+
+def main():
+    print('main')
+    args = get_args()
+
+    print(args.output_dir)
+    print(args.inputs)
+
+    #os.mkdir(args.output_dir)
+    i = 0
+    html_files_links = ''
+    for f in args.inputs:
+        filename = str(i) + '.txt'
+        copyfile(f, os.path.join(args.output_dir, str(i) + '.txt'))
+        i += 1
+        html_files_links += '<li><a href="' + filename + '" title="' + filename + '">Input file ' + str(i) + '</a></li>'
+
+    copyfile(args.yaml, os.path.join(args.output_dir, 'yaml_file.txt'))
+    copyfile(args.metadata, os.path.join(args.output_dir, 'metadata_file.txt'))
+    copyfile('pipout.txt', os.path.join(args.output_dir, 'pipout.txt'))
+    copyfile('immuneout.txt', os.path.join(args.output_dir, 'immuneout.txt'))
+    html_files_links += '<li><a href="yaml_file.txt" title="YAML file">YAML file</a></li>'
+    html_files_links += '<li><a href="metadata_file.txt" title="Metadata file">Metadata file</a></li>'
+    html_files_links += '<li><a href="pipout.txt" title="Pip output">Pip output</a></li>'
+    html_files_links += '<li><a href="immuneout.txt" title="ImmuneML output">ImmuneML output</a></li>'
+
+    html_output = open(os.path.join(args.output_dir, 'output.html'), 'w')
+
+    html_test = '''<nav>
+    <ul>'''
+
+    html_test += html_files_links
+    html_test += '''</ul>
+    </nav>'''
+    html_output.write(html_test)
+    html_output.close()
+
+
+# Run the script only when executed directly, not when imported.
+if __name__ == '__main__':
+    main()
+
+
+
+