<!--
view dimet_metabologram.xml @ 7:06f8d9ba09cd draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/DIMet commit 6da96d865a3a557cfa3ad09e1cfa830519e73748
author iuc
date Tue, 06 Aug 2024 17:36:05 +0000
parents c7ff01bd331b
children
line wrap: on
line source
-->

<tool id="dimet_@EXECUTABLE@" name="dimet @EXECUTABLE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.05">
    <!-- Short description of the tool. -->
    <description>
        Integration of transcriptomics and (tracer) metabolomics differential data (by DIMet)
    </description>
    <macros>
        <!-- @EXECUTABLE@ is substituted into the id and name attributes of the enclosing tool element. -->
        <token name="@EXECUTABLE@">metabologram</token>
        <!-- The other tokens and the macros expanded in this file are not defined here; presumably they come from macros.xml (TODO confirm). -->
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <!--
        Command template (Cheetah).

        The @INIT_*@ tokens and @REMOVE_CONFIG@ are not defined in this file;
        presumably they come from macros.xml and set up the variables used below
        that are not tool inputs here (timepoints, groups, comparisons, pathways,
        transcripts, statistical_test, ...). TODO confirm against macros.xml.

        The visible part runs `python -m dimet` with Hydra-style '++key=value'
        overrides:
          * the Hydra run directory is the job working directory ('.');
          * figures are written to 'figures', the directory the output
            collection is discovered from; tables are written to 'tables';
          * the '++analysis={...}' override carries the dataset and method
            configuration, filled from the output_options section and other
            template variables;
          * the dataset entries for metadata, abundances and mean enrichment
            are appended only when the corresponding input is set.

        A non-zero exit code marks the job as failed (detect_errors="exit_code").
    -->
    <command detect_errors="exit_code"><![CDATA[
    @INIT_CONFIG@
    @INIT_METABOLOGRAM@
    @INIT_TRANSCRIPTS@
    @INIT_GROUPS@
    @INIT_COMPARISONS_METABOLOGRAM@
    @INIT_PATHWAYS@
    @INIT_STAT_TEST@
    HYDRA_FULL_ERROR=1 python -m dimet
    '++hydra.run.dir=.'
    '++figure_path=figures'
    '++table_path=tables'
    '++analysis={
        dataset:{
            _target_: dimet.data.DataIntegrationConfig,
            name: "I am a synthetic data example"
        },
        method:{
            _target_: dimet.method.MetabologramIntegrationConfig,
            label: "metabologram",
            name: "Perform data integration via metabologram visualization",
            abs_values_scale_color_bar: {transcripts: null, metabolites:null},
            colors_divergent_palette: ['royalblue', 'white', 'red'],
            edge_color: ['black','black'],
            line_width:['1','1.2'],
            display_label_and_value: ${output_options.write_values},
            font_size: ${output_options.font_size},
            fig_width: ${output_options.fig_width},
            figure_format:${output_options.figure_format},
            color_nan_elements:'gray',
            fig_height: ${output_options.fig_height}
        },
        columns_metabolites: {ID: metabolite, values: log2FC},
        columns_transcripts: {ID: ${deg_one_id}, values: ${deg_one_values}},
        compartment: ${compartments},
        label: metabologram
    }'
    '++analysis.method.qualityDistanceOverSpan='${qualityDistanceOverSpan}''
    '++analysis.dataset.label='
    '+analysis.timepoints=${timepoints}'
    '++analysis.method.statistical_test=${statistical_test}'
    '++analysis.method.grouping=${groups}'
    '++analysis.method.correction_method=${correction_method}'
    '++analysis.method.impute_values=${impute_values}'
    '++analysis.statistical_test=${statistical_test}'
    '++analysis.method.disfit_tail_option="auto"'
    '++analysis.dataset.subfolder='
    '++analysis.dataset.conditions=${$conditions}'
    '++analysis.dataset.pathways=${pathways}'
    '++analysis.dataset.transcripts=${transcripts}'
    '++analysis.comparisons=${comparisons}'
    #if $metadata_path:
        '++analysis.dataset.metadata=metadata'
    #end if
    #if str( $data_input.data_input_selector ) == "abundance":
        #if $data_input.abundance_file:
            '++analysis.dataset.abundances=abundance'
        #end if
    #else:
        #if $data_input.me_or_frac_contrib_file:
            '++analysis.dataset.mean_enrichment=me_or_frac_contrib'
        #end if
    #end if
    @REMOVE_CONFIG@
    ]]></command>
    <inputs>
        <!-- Data files, DEG lists and compartment selection: these macros are defined outside this file (presumably macros.xml). -->
        <expand macro="input_parameters_metabologram"/>
        <expand macro="deg_list"/>
        <expand macro="compartments_metabologram"/>

        <!-- Multiple-testing correction, passed to analysis.method.correction_method in the command. -->
        <param name="correction_method" type="select" value="bonferroni" display="radio" label="Select multiple test correction to apply" help="Please enter at max 1 method">
            <option value="bonferroni">bonferroni</option>
            <option value="holm-sidak">holm-sidak</option>
            <option value="holm">holm</option>
            <option value="simes-hochberg">simes-hochberg</option>
            <option value="hommel">hommel</option>
            <option value="fdr_bh">fdr_bh</option>
            <option value="fdr_by">fdr_by</option>
            <option value="fdr_tsbh">fdr_tsbh</option>
            <option value="fdr_tsbky">fdr_tsbky</option>
        </param>
        <!-- Passed to analysis.method.qualityDistanceOverSpan; restricted to the range [-1.0, -0.1]. -->
        <param name="qualityDistanceOverSpan" type="float" min="-1.0" max="-0.1" value="-0.3" label="quality Distance Over Span" help="Default value is -0.3."/>
        <!-- Figure appearance; the command reads these as ${output_options.<name>}. -->
        <section name="output_options" title="Output options">
            <param name="figure_format" type="select" value="pdf" display="radio" label="Select output figure format" help="Please enter at max 1 format">
                <option value="pdf">Pdf</option>
                <option value="svg">Svg</option>
            </param>
            <param name="write_values" type="boolean" value="false" label="Write the values alongside the metabolites and genes names (e.g. L-Alanine: -0.44)"
                   help="Default value is false."/>
            <param name="fig_width" type="integer" min="5" max="20" value="7" label="width of figures"
                   help="Default value is 7."/>
            <param name="fig_height" type="integer" min="5" max="20" value="7" label="height of each figure"
                   help="Default value is 7."/>
            <param name="font_size" type="integer" min="1" max="20" value="12" label="figure font size"
                   help="Default value is 12."/>
        </section>
    </inputs>


    <outputs>
        <!-- List collection built from the files found in the figures/ directory; element name and extension are taken from each file name. -->
        <collection type="list" name="report">
            <discover_datasets directory="figures" pattern="__designation_and_ext__"/>
        </collection>
    </outputs>
    <tests>
        <!-- Abundance input, one DEG comparison (L-Cycloserine vs Control at T0), SVG output. -->
        <test>
            <param name="path_kegg_metabolites" ftype="tabular" value="pathways_kegg_metabolites.csv"/>
            <param name="path_kegg_transcripts" ftype="tabular" value="pathways_kegg_transcripts.csv"/>
            <param name="abundance_file" ftype="tabular" value="rawAbundances.csv"/>
            <param name="metadata_path" ftype="tabular" value="example2_metadata.csv"/>
            <param name="statistical_test_type" value="parametric"/>
            <param name="stat_test" value="Tt"/>
            <repeat name="deg_list">
                <param name="input" ftype="tabular" value="DEG_comparison_1.csv"/>
                <param name="idcol" ftype="integer" value="2"/>
                <param name="valuecol" ftype="integer" value="3"/>
                <param name="timepoint" value="T0"/>
                <repeat name="factor_list">
                    <param name="condition" value="L-Cycloserine"/>
                </repeat>
                <repeat name="factor_list">
                    <param name="condition" value="Control"/>
                </repeat>
            </repeat>
            <!-- Parameter names must match those declared in the output_options section of the inputs (write_values, fig_width, fig_height). -->
            <section name="output_options">
                <param name="write_values" value="false"/>
                <param name="figure_format" value="svg"/>
                <param name="fig_width" value="7"/>
                <param name="fig_height" value="7"/>
                <param name="font_size" value="12"/>
            </section>
            <!-- Two pathway figures plus the shared legend; compared by size only (delta 100 bytes). -->
            <output_collection name="report" type="list" count="3">
                <element file="AMINOACIDS-L-Cycloserine-T0-Control-T0--DEG_comparison1-abundances-cell.svg" name="AMINOACIDS-L-Cycloserine-T0-Control-T0--DEG_comparison1-abundances-cell" ftype="svg" compare="sim_size" delta="100"/>
                <element file="CENTRAL_CARBON_METABOLISM-L-Cycloserine-T0-Control-T0--DEG_comparison1-abundances-cell.svg" name="CENTRAL_CARBON_METABOLISM-L-Cycloserine-T0-Control-T0--DEG_comparison1-abundances-cell" ftype="svg" compare="sim_size" delta="100"/>
                <element file="legend-abundances-cell.svg" name="legend-abundances-cell" ftype="svg" compare="sim_size" delta="100"/>
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
This module is part of DIMet: Differential analysis of Isotope-labeled targeted Metabolomics data (https://pypi.org/project/DIMet/).

DIMet Metabologram integrates tracer metabolomics and transcriptomics, in a pathway based fashion.
More precisely, the differential information (Fold Changes (log2 transformed, or not)) of both types of omics
must be given as input, plus the files defining the pathways. You can use the minimal example data from https://zenodo.org/records/10579891 to run the DIMet Metabologram tool.

The figures in .pdf format are of publication quality, and as they are vector images you can open them and customize their aesthetics with professional image software such as Inkscape, Adobe Illustrator, Sketch, CorelDRAW, etc.


**Input data files**

This tool requires the following tab-delimited .csv files:

1. **The metabolomics data**:

  1.1 The measures' (or quantifications') files, that can be of two types:

    - The total **abundances** (of the metabolites) file,
      OR,

    - The mean **enrichment** or labelled fractional contributions

  1.2 The metadata, a unique file with the description of the samples in your measures' files. This is compulsory, see section **Metadata File Information**.


2. **The transcriptomics data**:

  2.1 The table with the results of the differential expression analysis (performed with an external tool). Please provide the file with the results of the differential expression analysis of your transcriptomics data. It is recommended to use only the statistically significant **Differentially Expressed Genes (DEG)**. We provide examples on Zenodo (see section **Available data for testing**).



3. **The pathways files**:


  3.1 A file with the pathways and respective gene symbols, which must match with those present in the transcriptomics data. The names of the columns must be the pathways' names, see the minimal data example downloaded from zenodo as explained above. Example:

    ====================== ================== ==================
    **PENTOSE PHOSPHATE**  **FATTY ACIDS**    ...
    ====================== ================== ==================
    RPE                    SCD                ...
    PGM1                   FADS1              ...
    PKF                    BAAT               ...
    DERA                   ACOT2              ...
    ...                    ...                ...
    ====================== ================== ==================



  3.2 A file with the pathways and respective metabolites ID, that must match with those in your metabolomics data. The names of the columns must be the pathways' names, see the minimal data example downloaded from zenodo as explained above. Example:

    ====================== ================== ==================
    **PENTOSE PHOSPHATE**  **FATTY ACIDS**    ...
    ====================== ================== ==================
    Xyl_P                  Acetyl_CoA         ...
    Glc_6P                 Palmitate          ...
    Rib_6P                 Stearate           ...
    ...                    ...                ...
    ====================== ================== ==================

**Measures' files**

The measures' files must be organized as matrices:

- The first column must contain Metabolite IDs that are unique (not repeated) within the file.

- The rest of the columns correspond to the samples

- The rows correspond to the metabolites

- The values must be tab separated, with the first row containing the sample/column labels.

See the following examples of measures files:


Example - Metabolites **abundances**:

    =============== ================== ================== ================== ================== ================== ==================
    ID              **MCF001089_TD01** **MCF001089_TD02** **MCF001089_TD03** **MCF001089_TD04** **MCF001089_TD05** **MCF001089_TD06**
    =============== ================== ================== ================== ================== ================== ==================
    2_3-PG          8698823.9926       10718737.7217      10724373.9         8536484.5          22060650           28898956
    2-OHGLu         36924336           424336             92060650           45165              84951950           965165051
    Glc6P           2310               2142               2683               1683               012532068          1252172
    Gly3P           399298             991656565          525195             6365231            89451625           4952651963
    IsoCit          0                  0                  0                  84915613           856236             954651610
    =============== ================== ================== ================== ================== ================== ==================

Example - mean **enrichment** or labeled fractional contributions:

    =============== ================== ================== ================== ================== ================== ==================
    ID              **MCF001089_TD01** **MCF001089_TD02** **MCF001089_TD03** **MCF001089_TD04** **MCF001089_TD05** **MCF001089_TD06**
    =============== ================== ================== ================== ================== ================== ==================
    2_3-PG          0.9711             0.968              0.9909             0.991              0.40               0.9
    2-OHGLu         0.01719            0.0246             0.554              0.555              0.73               0.68
    Glc6P           0.06               0.66               2683               0.06               2068               2172
    Gly3P           0.06               0.06               0.06               1                  5                  3
    IsoCit          0.06               1                  0.49               0.36               6                  10
    =============== ================== ================== ================== ================== ================== ==================





**Metadata File Information**

Provide a tab-separated file that has the names of the samples in the first column and one header row.
Column names must be exactly in this order::

   name_to_plot
   condition
   timepoint
   timenum
   compartment
   original_name


Example **Metadata File**:


    ==================== =============== ============= ============ ================ =================
    **name_to_plot**     **condition**   **timepoint** **timenum**  **compartment**  **original_name**
    ==================== =============== ============= ============ ================ =================
    Control_cell_T0-1    Control         T0            0            cell             MCF001089_TD01
    Control_cell_T0-2    Control         T0            0            cell             MCF001089_TD02
    Control_cell_T0-3    Control         T0            0            cell             MCF001089_TD03
    Tumoral_cell_T0-1    Tumoral         T0            0            cell             MCF001089_TD04
    Tumoral_cell_T0-2    Tumoral         T0            0            cell             MCF001089_TD05
    Tumoral_cell_T0-3    Tumoral         T0            0            cell             MCF001089_TD06
    Tumoral_cell_T24-1   Tumoral         T24           24           cell             MCF001089_TD07
    Tumoral_cell_T24-2   Tumoral         T24           24           cell             MCF001089_TD08
    Tumoral_cell_T24-3   Tumoral         T24           24           cell             MCF001090_TD01
    Control_med_T24-1    Control         T24           24           med              MCF001090_TD02
    Control_med_T24-2    Control         T24           24           med              MCF001090_TD03
    Tumoral_med_T24-1    Tumoral         T24           24           med              MCF001090_TD04
    Tumoral_med_T24-2    Tumoral         T24           24           med              MCF001090_TD05
    Control_med_T0-1     Control         T0            0            med              MCF001090_TD06
    Tumoral_med_T0-1     Tumoral         T0            0            med              MCF001090_TD07
    Tumoral_med_T0-2     Tumoral         T0            0            med              MCF001090_TD08
    ==================== =============== ============= ============ ================ =================


The column **original_name** must have the names of the samples as given in your data.

The column **name_to_plot** must have the names as you want them to be (or set identical to original_name if you prefer). To set names that
are meaningful is a better choice, as we will take them to display the results.

The column **timenum** must contain only the numeric part of the timepoint, for example 2, 0, 10, 100 (that is, without letters such as "T", "t", "s", "h", or any other symbol).
Make sure these time numbers are in the same units (but do not write the units here!).

The column **compartment** is an abbreviation, coined by you, for the compartments. This will be used for the results' files names: the longer the
compartments names are, the longer the output files' names! Please pick short and clear abbreviations to fill this column.


**Running the analysis**

You can specify how you want your analysis to be executed with the parameters, which are of two types:

**a. Parameters proper to the metabolome differential analysis (that runs automatically before the integration)**:

- **Conditions and timepoint**:

For each comparison to run on the metabolomics data, you can define the Conditions to compare, at a chosen time point, through the box 'Deregulated gene set': follow the instructions and choose the correct order of Conditions (the last must be the reference or control).
To add a second comparison, click the 'Insert Deregulated gene set' button.
You can insert and define as many boxes ('Deregulated gene set') as comparisons you want to perform.

- **statistical_test**: choose the specific statistical test to be applied.

 Kruskal-Wallis, Mann-Whitney, Wilcoxon’s signed rank test, Wilcoxon’s rank sum test
 t-test, and permutation test are currently offered (we use the trusted functions from scipy library https://docs.scipy.org/doc/scipy/reference/stats.html).

For the permutation test, we have established as test statistic, the absolute difference of geometric means of the two compared groups.

- **qualityDistanceOverSpan**: a normalized distance between the intervals of values of the compared groups, used as the cutoff for
  considering a minimal acceptable "separation" that makes the comparison suitable for statistical testing. A 'distance/span' == 1 is a perfect separation,
  whereas if 'distance/span' < 0 there is no separation.
  Use with caution if your intra-group values are highly dispersed. Default is -0.3 (not stringent).

- **correction_method**: one of the methods for multiple testing correction available in statsmodels library (bonferroni, fdr_bh, sidak, among others, see https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html).

- **compartment**: one of the compartments present in your data.

**b. Parameters proper to the integration with transcriptomics (that runs automatically after the metabolome differential analysis)**:

- **Deregulated genes set file**: Corresponds to the transcriptomics data, as explained in **Input data files** section.

For each 'Deregulated gene set' box, a single **Differentially Expressed Genes (DEG)** file must be added, and must match with the metabolomics comparison set in the same box. Additional files can be added with the 'Insert deregulated gene set' button,
see also the subsection 'Conditions and timepoint' above.

- **pathways** : files for the pathways, as explained in **Input data files** section


Overall, the metabologram module provides hints on use that will guide you, in the same menus and boxes of the parameters.


**Output**

The output consists of one figure per metabologram, and a color-key legend valid for all the metabolograms produced.

**Available data for testing**

You can test our tool with the data from our manuscript https://zenodo.org/record/10579862 (the pertinent
files for you are located in the subfolders inside the data folder).
You can also use the minimal data examples from https://zenodo.org/record/10579891

 ]]>
    </help>
    <expand macro="citations" />
</tool>