diff immuneml_create_dataset.xml @ 0:629e7e403e19 draft

"planemo upload commit 2fed2858d4044a3897a93a5604223d1d183ceac0-dirty"
author immuneml
date Thu, 01 Jul 2021 11:36:43 +0000
parents
children ed3932e6d616
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/immuneml_create_dataset.xml	Thu Jul 01 11:36:43 2021 +0000
@@ -0,0 +1,178 @@
+<tool id="immune_ml_dataset" name="Create dataset" version="@VERSION@.0">
+  <description/>
+  <macros>
+    <import>prod_macros.xml</import>
+  </macros>
+  <expand macro="requirements"/>
+  <command><![CDATA[
+      ## Expose every selected data file in the working directory under its element identifier.
+      #for $input in $interface_cond.data_input
+         #if $input
+            ## An identifier that already exists in the working directory is reported and skipped, not overwritten.
+            ([ -e ./"$input.element_identifier" ] && echo "File '$input.element_identifier' already exists in the input folder, skipping." || ln -s '$input' "$input.element_identifier") &&
+         #end if
+      #end for
+      ## Simple interface: generate specs.yaml with the wrapper script. Advanced interface: use the supplied YAML.
+      #if $interface_cond.interface == "simple"
+        python3 '$__tool_directory__/build_dataset_yaml_wrapper.py'
+        --output_path . --file_name specs.yaml
+        #if $interface_cond.dataset_cond.dataset_type == "repertoire"
+            --is_repertoire True
+            --format "$interface_cond.dataset_cond.metadata_cond.data_format"
+            #if $interface_cond.dataset_cond.metadata_cond.data_format != "IReceptor"
+                --metadata_file "$interface_cond.dataset_cond.metadata_cond.metadata_input" &&
+                cp '$interface_cond.dataset_cond.metadata_cond.metadata_input' "$interface_cond.dataset_cond.metadata_cond.metadata_input.element_identifier"
+            #end if
+        #else
+            --is_repertoire False
+            --format "$interface_cond.dataset_cond.data_format"
+            --metadata_columns "$interface_cond.dataset_cond.metadata_columns"
+            #if $interface_cond.dataset_cond.dataset_type == "sequence"
+                --paired False
+            #elif $interface_cond.dataset_cond.dataset_type == "receptor"
+                --paired True
+                --receptor_chains '$interface_cond.dataset_cond.receptor_type'
+            #end if
+        #end if
+         && mv ./specs.yaml create_dataset.yaml &&
+      #else
+        cp '$interface_cond.yaml_input' create_dataset.yaml &&
+      #end if
+      ## immune-ml receives the HTML output's files_path as its second argument.
+      immune-ml ./create_dataset.yaml '${html_outfile.files_path}' --tool DatasetGenerationTool &&
+      ## index.html found there becomes the HTML output itself; the YAML that was run becomes the specs output.
+      mv '${html_outfile.files_path}/index.html' '${html_outfile}' &&
+      mv ./create_dataset.yaml '${specs}'
+
+      ]]>
+  </command>
+  <inputs>
+      <conditional name="interface_cond"> <!-- "simple": form-driven import; "advanced": the user supplies the YAML specification -->
+          <param type="select" name="interface" label="Which interface would you like to use?" display="radio">
+              <option value="simple">Simplified (limited options)</option>
+              <option value="advanced">Advanced (full control through YAML) </option>
+          </param>
+          <when value="simple">
+              <conditional name="dataset_cond">
+                  <param type="select" name="dataset_type" label="Dataset type" display="radio" help="Repertoire datasets
+                   should be used when making predictions per repertoire, such as predicting a disease state. Sequence or
+                    receptor datasets should be used when predicting values for unpaired (single-chain) and paired immune
+                     receptors respectively, like antigen specificity.">
+                      <option value="repertoire">Repertoire dataset</option>
+                      <option value="sequence">Sequence dataset (single chain)</option>
+                      <option value="receptor">Receptor dataset (paired chains)</option>
+                  </param>
+                  <when value="repertoire">
+                      <conditional name="metadata_cond"> <!-- no metadata file is requested for the IReceptor format -->
+                          <param type="select" name="data_format" label="Data format" display="radio">
+                              <option value="AIRR">AIRR</option>
+                              <option value="IReceptor">iReceptor Gateway</option>
+                              <option value="ImmunoSEQRearrangement">immunoSEQ: rearrangement-level files</option>
+                              <option value="ImmunoSEQSample">immunoSEQ: sample-level files</option>
+                              <option value="MiXCR">MiXCR</option>
+                              <option value="VDJdb">VDJdb</option>
+                              <option value="TenxGenomics">10x Genomics ‘Clonotype consensus annotations’ (CSV)</option>
+                          </param>
+                          <when value="AIRR">
+                              <param name="metadata_input" type="data" format="txt" label="Metadata file" multiple="false" help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire AIRR file described in the metadata file must be selected under 'Data files'."/>
+                          </when>
+                          <when value="ImmunoSEQRearrangement">
+                              <param name="metadata_input" type="data" format="txt" label="Metadata file" multiple="false" help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire immunoSEQ rearrangement file described in the metadata file must be selected under 'Data files'."/>
+                          </when>
+                          <when value="ImmunoSEQSample">
+                              <param name="metadata_input" type="data" format="txt" label="Metadata file" multiple="false" help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire immunoSEQ sample file described in the metadata file must be selected under 'Data files'."/>
+                          </when>
+                          <when value="MiXCR">
+                              <param name="metadata_input" type="data" format="txt" label="Metadata file" multiple="false" help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire MiXCR file described in the metadata file must be selected under 'Data files'."/>
+                          </when>
+                          <when value="VDJdb">
+                              <param name="metadata_input" type="data" format="txt" label="Metadata file" multiple="false" help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire VDJdb file described in the metadata file must be selected under 'Data files'."/>
+                          </when>
+                          <when value="TenxGenomics">
+                              <param name="metadata_input" type="data" format="txt" label="Metadata file" multiple="false" help="The metadata file describes metadata information for all repertoires included in the dataset. Every repertoire 10x Genomics file described in the metadata file must be selected under 'Data files'."/>
+                          </when>
+                      </conditional>
+                  </when>
+                  <when value="sequence">
+                      <param type="select" name="data_format" label="Data format" display="radio">
+                          <option value="AIRR">AIRR</option>
+                          <option value="IReceptor">iReceptor Gateway</option>
+                          <option value="ImmunoSEQRearrangement">ImmunoSEQ: rearrangement-level files</option>
+                          <option value="ImmunoSEQSample">ImmunoSEQ: sample-level files</option>
+                          <option value="MiXCR">MiXCR</option>
+                          <option value="VDJdb">VDJdb</option>
+                          <option value="TenxGenomics">10xGenomics ‘Clonotype consensus annotations’ (CSV)</option>
+                      </param>
+                      <param type="text" name="metadata_columns" optional="false" label="Metadata columns" help="Please
+                      specify the names of the columns that contain metadata. The metadata columns specified here can be
+                      used as labels for prediction. Multiple metadata columns may be specified and separated by comma,
+                      for example: Epitope,Epitope gene,Epitope species"/>
+                  </when>
+                  <when value="receptor">
+                      <param type="select" name="data_format" label="Data format" display="radio">
+                          <option value="AIRR">AIRR</option>
+                          <option value="IReceptor">iReceptor Gateway</option>
+                          <option value="VDJdb">VDJdb</option>
+                          <option value="TenxGenomics">10xGenomics ‘Clonotype consensus annotations’ (CSV)</option>
+                      </param>
+                      <param type="select" name="receptor_type" label="Receptor type" display="radio">
+                          <option value="TRA_TRB">T cell alpha/beta</option>
+                          <option value="TRG_TRD">T cell gamma/delta</option>
+                          <option value="IGH_IGL">B cell heavy/light</option>
+                          <option value="IGH_IGK">B cell heavy/kappa</option>
+                      </param>
+                      <param type="text" name="metadata_columns" optional="false" label="Metadata columns" help="Please
+                      specify the names of the columns that contain metadata. The metadata columns specified here can be
+                      used as labels for prediction. Multiple metadata columns may be specified and separated by comma,
+                      for example: Epitope,Epitope gene,Epitope species"/>
+                  </when>
+              </conditional>
+              <param name="data_input" type="data" multiple="true" label="Data files" min="1" max="2000" help="This field should include individual repertoire or receptor files, or iReceptor zip files. Multiple files can be selected by holding down the control/command or shift key, or by clicking 'browse datasets' (folder button on the right). Important: make sure all the files you want to include in the dataset are highlighted in blue or gray."/>
+          </when>
+          <when value="advanced">
+              <param name="yaml_input" type="data" format="txt" label="YAML specification" multiple="false"/>
+              <param name="data_input" type="data" multiple="true" label="Data and metadata files" optional="true" help="This field should include individual repertoire or receptor files, or iReceptor zip files, and optionally a metadata file. Multiple files can be selected by holding down the control/command or shift key, or by clicking 'browse datasets' (folder button on the right). Important: make sure all the files you want to include in the dataset are highlighted."/>
+          </when>
+      </conditional>
+  </inputs>
+    <outputs>
+        <data name="specs" format="txt" label="create_dataset.yaml"/> <!-- receives the create_dataset.yaml that was run -->
+        <data name="html_outfile" format="iml_dataset" label="ImmuneML dataset"/> <!-- receives index.html from its own files_path -->
+    </outputs>
+
+
+    <help><![CDATA[
+
+        In Galaxy, an immuneML dataset is simply a Galaxy collection containing all relevant files (including an optional metadata file).
+        The Create dataset Galaxy tool allows users to import data from various formats and create immuneML datasets in Galaxy.
+        These datasets are in an optimized binary (Pickle) format, which ensures that you can quickly import the dataset into
+        Galaxy tools without having to repeatedly specify the import parameters.
+
+        Before creating a dataset, the relevant data files must first be uploaded to the Galaxy interface. This can be done either
+        by uploading files from your local computer (use the 'Upload file' tool under the 'Get local data' menu), or by fetching
+        remote data from the iReceptor Plus Gateway or VDJdb (see `How to import remote AIRR datasets in Galaxy <https://docs.immuneml.uio.no/galaxy/galaxy_import_remote_data.html>`_).
+
+        The imported immuneML dataset is stored in a Galaxy collection, which will appear as a history item on the right side of the screen,
+        and can later be selected as input to other tools.
+
+        The tool has a simplified and an advanced interface. The simplified interface is fully button-based, and relies
+        on default settings for importing datasets. The advanced interface gives full control over import settings through a YAML
+        specification. In most cases, the simplified interface will suffice.
+
+        For the exhaustive documentation of this tool and more information about immuneML datasets, see the tutorial `How to make an immuneML dataset in Galaxy <https://docs.immuneml.uio.no/galaxy/galaxy_dataset.html>`_.
+
+        **Tool output**
+
+        This Galaxy tool will produce the following history elements:
+
+        - ImmuneML dataset: a sequence, receptor or repertoire dataset which can be used as an input to other immuneML tools. The history element contains a summary HTML page describing general characteristics of the dataset, including the name of the dataset
+          (which is used in the dataset definition of a YAML specification), the dataset type and size, available labels, and a link to download the raw data files.
+
+        - create_dataset.yaml: the YAML specification file that was used by immuneML to create the dataset.
+          This file can be downloaded and altered (for example to export files in AIRR format, or use non-standard import parameters),
+          and run again using the 'Advanced' interface.
+
+    ]]>
+    </help>
+
+</tool>