view dimsp/dimsp_filter.xml @ 2:a8155d5840e9 draft default tip

Uploaded
author metaboflow_cam
date Tue, 22 Oct 2019 04:06:06 -0400
parents
children
line wrap: on
line source

<!--
wl-29-11-2018, Thu: commence
wl-30-11-2018, Fri: add some help information
wl-18-03-2019, Mon: add macro and test
wl-21-03-2019, Thu: planemo test for sample, qc and blank
wl-01-04-2019, Mon: add more test and all pass.
--> 

<tool id="dimsp_filter" name="dims_filtering" version="0.1.0">
  <!-- One-line summary of what the tool does. -->
  <description>
    Direct-infusion mass spectrometry lipidomics peak filtering
  </description>
  <!-- Shared macro definitions are imported from macros.xml, which is not
       part of this file. -->
  <macros>
    <import>macros.xml</import>
  </macros>
  <!-- Expand the "requirements" and "stdio" macros from the imported file.
       NOTE(review): the command element below also sets
       detect_errors="exit_code"; confirm that this does not conflict with
       whatever the "stdio" macro declares. -->
  <expand macro="requirements" />
  <expand macro="stdio" />

  <!-- =============================================================== -->
  <command detect_errors="exit_code">
    <![CDATA[

      ## Run the R script that sits next to this wrapper. Documentation is
      ## written as Cheetah comments (double hash) so it never reaches the
      ## shell. The script path is single-quoted so that a tool directory
      ## containing spaces or other shell-special characters is still handed
      ## to Rscript as one argument.
      Rscript '$__tool_directory__/dimsp_filter.R'
        --peak_file '$peak_file'

        ## input group information directly or load a file?
        --grp_file_sel '$grp.grp_file_sel'
        #if $grp.grp_file_sel == 'yes':
          --grp_file '$grp.grp_file'
        #else
          --groups '$grp.groups'
        #end if

        ## QC filtering: options are only emitted when the filter is enabled
        #if $qc_filter.qc:
          --qc_rsd_thres '$qc_filter.qc_set.qc_rsd_thres'
          --qc_mv_filter '$qc_filter.qc_set.qc_mv_filter'
          --qc_mv_qc_sam '$qc_filter.qc_set.qc_mv_qc_sam'
          --qc_mv_thres  '$qc_filter.qc_set.qc_mv_thres'
        #end if

        ## blank filtering: options are only emitted when the filter is enabled
        #if $bl_filter.bl:
          --bl_method    '$bl_filter.bl_set.bl_method'
          --bl_factor    '$bl_filter.bl_set.bl_factor'
          --bl_mv_filter '$bl_filter.bl_set.bl_mv_filter'
          --bl_mv_thres  '$bl_filter.bl_set.bl_mv_thres'
        #end if

        ## MV filtering on samples
        #if $mv_filter.mv:
          --mv_thres '$mv_filter.mv_set.mv_thres'
        #end if

        ## Merge data (sample, qc and blank); quoted like every other value
        --merge '$merge'

        ## MV imputation
        --mv_impute '$mv_impute'

        ## output
        --pdf_file     '$pdf_file'
        --filter_file  '$filter_file'

    ]]>
  </command>

  <!-- =============================================================== -->
  <inputs>
    <!-- Input dataset: the peak table handed to the script as peak_file. -->
    <param
      name="peak_file"
      format="tabular"
      type="data"
      help="DIMS peak table with columns of peaks and rows of samples. 
                 The first two columns are annotation and mz values."
      label="DIMS peak table"/>

    <!-- =============================================================== -->
    <!-- How the group labels are supplied: from a one-column dataset ("yes")
         or typed in as a comma-separated list ("no"). The selected branch
         decides whether the command passes grp_file or groups. -->
    <conditional name="grp">
      <param name="grp_file_sel" type="select"
             label="Group information for filtering. Input from a file?"
             help="You can load group from a file or enter below manually.">
        <option value="yes" selected="true">Use group file</option>
        <option value="no">Enter group manually</option>
      </param>

      <when value="yes">
        <!-- wl-30-11-2018, Fri: cannot use format="tabular". It does
             not treat one column data file as tabular format.  -->
        <param name="grp_file" type="data"  format="txt"
               label="Group file"
               help="A data matrix containing only one column. The cell value 
               must be 'sample','qc' or 'blank'. The contents in the column 
               must include: 1) sample; or 2) sample, qc; or 3) sample,blank
               or 4) sample, qc, blank. And number of each content must be no
               less than 2.  " />
      </when>
      <when value="no">
        <!-- An empty list can never match the number of replicates, so it is
             rejected in the form instead of failing later inside the script. -->
        <param name="groups" type="text" value=""
               label="Specify group information"
               help="Enter group contents delimited by a comma. E.g., 'sample',
               'qc','blank'. The length of contents must be the same as the 
               number of replicates. Any other names are not allowed.">
          <validator type="empty_field"
                     message="Please enter the group labels, e.g. sample, sample, qc, qc, blank, blank" />
        </param>
      </when>

    </conditional>

    <!-- =============================================================== -->
    <!-- QC filtering switch. When "qc" is TRUE the values of the qc_set
         section are forwarded by the command as the qc_* options; when it is
         FALSE no QC option is passed at all. -->
    <conditional name="qc_filter">
      <param
        name="qc"
        type="boolean"
        checked="true"
        truevalue="TRUE"
        falsevalue="FALSE"
        label="QC Filtering"
        help="Performs QC filtering if you provide QC information"/>

      <when value="TRUE">
        <section title="QC Filtering" name="qc_set">

          <!-- RSD cut-off used by the QC filter. -->
          <param
            name="qc_rsd_thres"
            type="float"
            value="20"
            label="Specify RSD threshold"
            help="Inter RSD threshold. The default is 20."/>

          <!-- Whether missing-value filtering runs as part of QC filtering. -->
          <param
            name="qc_mv_filter"
            type="boolean"
            checked="true"
            truevalue="TRUE"
            falsevalue="FALSE"
            label="MV filtering or not"
            help="Performs MV filtering inside QC filtering or not"/>

          <!-- Which replicates the missing-value filter looks at. -->
          <param
            name="qc_mv_qc_sam"
            type="boolean"
            checked="true"
            truevalue="TRUE"
            falsevalue="FALSE"
            label="MV filtering on 'qc' or 'sample' replicates"
            help="Performs MV filtering on 'qc' or 'sample' replicates. 
                       TRUE for 'qc' and FALSE for 'sample'."/>

          <!-- Missing-value cut-off used inside QC filtering. -->
          <param
            name="qc_mv_thres"
            type="float"
            value="0.30"
            label="Specify MV percentage threshold"
            help="MV percentage threshold for mv filtering inside QC
                       filtering"/>
        </section>
      </when>

      <!-- QC filtering switched off: nothing further to configure. -->
      <when value="FALSE"/>

    </conditional>

    <!-- =============================================================== -->
    <!-- Blank filtering switch. When "bl" is TRUE the values of the bl_set
         section are forwarded by the command as the bl_* options; when it is
         FALSE no blank option is passed at all. -->
    <conditional name="bl_filter">
      <param
        name="bl"
        type="boolean"
        checked="true"
        truevalue="TRUE"
        falsevalue="FALSE"
        label="Blank Filtering"
        help="Performs blank filtering if you provide blank information"/>

      <when value="TRUE">
        <section title="Blank Filtering" name="bl_set">

          <!-- Statistic used by the blank filter; "mean" is preselected. -->
          <param
            name="bl_method"
            type="select"
            label="Select a method for blank filtering"
            help="Blank filtering method. Currently support 'mean', 
                       'median' and 'max'.">
            <option value="mean" selected="true">mean</option>
            <option value="median">median</option>
            <option value="max">max</option>
          </param>

          <!-- Multiplier used by the blank filter. -->
          <param
            name="bl_factor"
            type="float"
            value="1"
            label="Specify factor"
            help="Multiplying factor for blank filtering"/>

          <!-- Whether missing-value filtering runs as part of blank filtering. -->
          <param
            name="bl_mv_filter"
            type="boolean"
            checked="true"
            truevalue="TRUE"
            falsevalue="FALSE"
            label="MV filtering or not"
            help="Performs MV filtering on 'sample' inside blank 
                       filtering or not"/>

          <!-- Missing-value cut-off used inside blank filtering. -->
          <param
            name="bl_mv_thres"
            type="float"
            value="0.30"
            label="Specify MV percentage threshold"
            help="MV percentage threshold for MV filtering inside 
                       blank filtering"/>
        </section>
      </when>

      <!-- Blank filtering switched off: nothing further to configure. -->
      <when value="FALSE"/>

    </conditional>

    <!-- =============================================================== -->
    <!-- Stand-alone missing-value filtering switch. When "mv" is TRUE the
         threshold from the mv_set section is forwarded by the command as
         mv_thres; when it is FALSE the option is omitted. -->
    <conditional name="mv_filter">
      <param
        name="mv"
        type="boolean"
        checked="true"
        truevalue="TRUE"
        falsevalue="FALSE"
        label="MV Filtering on 'sample'"
        help="Performs MV filtering on 'sample'. Should do if no MV 
                   filtering is performed on 'sample' inside QC or blank filtering."/>

      <when value="TRUE">
        <section title="Missing value Filtering" name="mv_set">
          <param
            name="mv_thres"
            type="float"
            value="0.30"
            label="Specify MV percentage threshold"
            help="MV percentage threshold for MV filtering"/>
        </section>
      </when>

      <!-- MV filtering switched off: nothing further to configure. -->
      <when value="FALSE"/>
    </conditional>

    <!-- =============================================================== -->
    <!-- Whether the three groups are merged; forwarded by the command as the
         merge option.
         NOTE(review): this parameter emits "True"/"False" while every other
         boolean in this file emits "TRUE"/"FALSE"; confirm the R script
         accepts both spellings before harmonising. -->
    <param
      name="merge"
      type="boolean"
      checked="false"
      truevalue="True"
      falsevalue="False"
      label="Merge 'sample', 'qc' and 'blank' or not"
      help="Merge 'sample', 'qc' and 'blank' for further analysis or 
                 not"/>

    <!-- Imputation method for missing values; "knn" is preselected. The
         chosen value is forwarded by the command as mv_impute. -->
    <param
      name="mv_impute"
      type="select"
      label="Select a method for MV imputation"
      help="MV imputation method. Currently support 'mean', 'median', 
                 'min', 'knn' and 'pca'.">
      <option value="knn" selected="true">knn</option>
      <option value="pca">pca</option>
      <option value="mean">mean</option>
      <option value="median">median</option>
      <option value="min">min</option>
    </param>

  </inputs>


  <!-- =============================================================== -->
  <outputs>
    <!-- Filtered peak table; the command passes its path as filter_file. -->
    <data format="tabular" name="filter_file"
          label="Filtered peak signals on ${on_string}"/>
    <!-- Diagnostic plots; the command passes the path as pdf_file. The label
         is kept on a single line: a line break inside the attribute would put
         a long run of spaces into the dataset name. -->
    <data format="pdf" name="pdf_file"
          label="Histogram and boxplot for both RSD and MV percentage on ${on_string}" />
  </outputs>

  <!-- =============================================================== -->
  <!-- Functional tests. Each case feeds the same peak table with a different
       combination of groups and filters and compares the tabular result
       line by line (diff) and the PDF by size only (sim_size). -->
  <tests>
    <!-- test for sample, qc and blank with group file -->
    <test>
      <param name="peak_file" value="pos_peak.tsv" />
      <param name="grp_file_sel" value="yes" />
      <param name="grp_file" value="grp_sam_qc_bl.tsv" />
      <param name="qc" value="TRUE" />
      <param name="qc_rsd_thres" value="60.0" />
      <param name="qc_mv_qc_sam" value="TRUE" />
      <param name="bl" value="TRUE" />
      <param name="mv" value="FALSE" />
      <param name="merge" value="TRUE" />
      <output name="filter_file" file="res/sam_qc_bl_peak_filter.tsv" compare="diff"/>
      <output name="pdf_file" file="res/sam_qc_bl_hist_box.pdf" compare="sim_size"/>
    </test>
    <!-- test for sample, qc and blank with group. Case-insensitive -->
    <test>
      <param name="peak_file" value="pos_peak.tsv" />
      <param name="grp_file_sel" value="no" />
      <param name="groups" value="Sample,sample, samplE, sample, sample, sample, sample, sample, sample, sample, qc, qc, blank, blank" />
      <param name="qc_rsd_thres" value="60.0" />
      <param name="mv" value="TRUE" />
      <param name="merge" value="false" />
      <param name="mv_impute" value="mean" />
      <output name="filter_file" file="res/sam_qc_bl_peak_filter_1.tsv" compare="diff"/>
      <output name="pdf_file" file="res/sam_qc_bl_hist_box_1.pdf" compare="sim_size"/>
    </test>
    <!-- test for sample and qc with group file -->
    <test>
      <param name="peak_file" value="pos_peak.tsv" />
      <param name="grp_file_sel" value="yes" />
      <param name="grp_file" value="grp_sam_qc.tsv" />
      <param name="qc" value="TRUE" />
      <param name="qc_rsd_thres" value="60.0" />
      <param name="qc_mv_qc_sam" value="TRUE" />
      <param name="bl" value="FALSE" />
      <param name="mv" value="FALSE" />
      <param name="merge" value="TRUE" />
      <output name="filter_file" file="res/sam_qc_peak_filter.tsv" compare="diff"/>
      <output name="pdf_file" file="res/sam_qc_hist_box.pdf" compare="sim_size"/>
    </test>
    <!-- test for sample and bl with group file -->
    <test>
      <param name="peak_file" value="pos_peak.tsv" />
      <param name="grp_file_sel" value="yes" />
      <param name="grp_file" value="grp_sam_bl.tsv" />
      <param name="qc" value="FALSE" />
      <param name="bl" value="TRUE" />
      <param name="mv" value="FALSE" />
      <param name="merge" value="FALSE" />
      <output name="filter_file" file="res/sam_bl_peak_filter.tsv" compare="diff"/>
      <output name="pdf_file" file="res/sam_bl_hist_box.pdf" compare="sim_size"/>
    </test>
    <!-- test for sample only with group file. MV filtering will be performed
         even if 'mv' is FALSE -->
    <test>
      <param name="peak_file" value="pos_peak.tsv" />
      <param name="grp_file_sel" value="yes" />
      <param name="grp_file" value="grp_sam.tsv" />
      <param name="qc" value="FALSE" />
      <param name="bl" value="FALSE" />
      <param name="mv" value="FALSE" />
      <param name="merge" value="TRUE" />
      <output name="filter_file" file="res/sam_peak_filter.tsv" compare="diff"/>
      <output name="pdf_file" file="res/sam_hist_box.pdf" compare="sim_size"/>
    </test>
  </tests>

  <!-- =============================================================== -->
<help>
DIMS Filtering
==============

Description
-----------

This tool performs filtering on the DIMS peaks produced by
**DIMS Processing**. Currently three methods are implemented:

QC filtering
  Filtering based on the RSD of QC samples. The peaks with RSD larger than
  the designated threshold will be removed. Also missing value (MV)
  filtering can be applied to 'qc' or 'sample'.

Blank filtering
  Filtering based on the characteristics of blank samples. The peaks whose
  characteristics in 'sample' are lower than in 'blank' will be removed.
  Optionally, MV filtering on 'sample' can also be applied.

MV filtering
  Filtering based on the missing value percentages. The peaks with the
  percentages larger than the designated threshold will be removed. The
  operation is performed on 'sample'. If MV filtering is not performed on
  'sample' inside 'QC filtering' and 'Blank filtering', this procedure
  should be employed. Without MV filtering, even 'MV imputation' will fail
  in some cases.

Inputs
------

**\1. Peak Table**

The input file is a peak table in tabular format. It can be produced by
**DIMS Processing**. The following is an example in which the first two
columns hold information such as annotation results and mz values, and the
remaining columns are samples. Each row corresponds to a peak. The filtering
is only performed on intensity data and leaves other data (such as 'name'
and 'mz') unchanged.

+----------------------+----------+-----------+-----------+----------+----------+----------+----------+
| name                 | mz       | 07_sample | 08_sample |  23_pool |  24_pool | 71_blank | 72_blank |
+----------------------+----------+-----------+-----------+----------+----------+----------+----------+
| TG_6:0_[M+NH4]1+     | 236.1129 |  22828.96 |  19742.27 | 11294.65 | 11772.57 |  4715.92 | 17952.02 |
+----------------------+----------+-----------+-----------+----------+----------+----------+----------+
| TG_6:0_[M+Na]1+      | 241.0683 |      0.00 |      0.00 |   149.14 |     0.00 |   231.95 |    99.92 |
+----------------------+----------+-----------+-----------+----------+----------+----------+----------+
| LPC_2:0_[M+H]1+      | 300.1207 |      0.00 |      0.00 |     0.00 |    98.52 |     0.00 |     0.00 |
+----------------------+----------+-----------+-----------+----------+----------+----------+----------+
| LPC-2O_3:0_[M+H]1+   | 300.1571 |      0.00 |    107.11 |   301.06 |   825.36 |     0.00 |   375.61 |
+----------------------+----------+-----------+-----------+----------+----------+----------+----------+
| LPC_3:1_[M+H]1+      | 312.1207 |    128.80 |     60.71 |     0.00 |     0.00 |     0.00 |   106.92 |
+----------------------+----------+-----------+-----------+----------+----------+----------+----------+
| PC-O_3:0_[M+H]1+     | 314.1363 |      0.00 |     59.78 |     0.00 |   108.20 |     0.00 |    62.81 |
+----------------------+----------+-----------+-----------+----------+----------+----------+----------+
| LPC-2O_4:0_[M+H]1+   | 314.1727 |    342.96 |    193.56 |   627.61 |  4370.63 |   410.80 |   718.93 |
+----------------------+----------+-----------+-----------+----------+----------+----------+----------+
| PC-O_4:0_[M+H]1+     |  328.152 |    109.82 |      0.00 |   148.11 |   227.95 |     0.00 |    92.32 |
+----------------------+----------+-----------+-----------+----------+----------+----------+----------+
| LPC_O-5:0_[M+H]1+    | 328.1884 |    105.31 |    377.61 |     0.00 |   204.87 |    73.78 |    92.58 |
+----------------------+----------+-----------+-----------+----------+----------+----------+----------+
| PC_4:0_[M+H]1+       | 342.1312 |      0.00 |      0.00 |     0.00 |     0.00 |     0.00 |     0.00 |
+----------------------+----------+-----------+-----------+----------+----------+----------+----------+
| LPC_5:0_[M+H]1+      | 342.1676 |     83.94 |      0.00 |   143.46 |  2550.10 |     0.00 |   290.45 |
+----------------------+----------+-----------+-----------+----------+----------+----------+----------+
| Cer_20:1_[M+H]1+     | 342.3003 |    585.12 |   2147.71 |  1506.26 |  1202.96 |     0.00 |   511.88 |
+----------------------+----------+-----------+-----------+----------+----------+----------+----------+
| MG_16:0_[M+NH4]1+    | 348.3108 |   7284.15 |  13460.34 | 22220.10 |  1939.08 |  4642.73 |  3129.56 |
+----------------------+----------+-----------+-----------+----------+----------+----------+----------+

|

**\2. Group**
  
This group information indicates the categorical group for the samples
in peak table. The group item must be **sample**, **qc** or **blank**. 
Each group must at least include **sample** with or without **qc** or
**blank**.  Number of each item must be no less than 2. Two methods for
providing the group information:
   
- **Input manually**: type items separated by commas. For example, group has 
  'sample', 'qc' and 'blank':
  ::

    sample, sample, qc, qc, blank, blank

  Group has 'sample' and 'qc': 
  ::

    qc, sample, sample, sample, qc, qc
   
  Group has only 'sample':
  ::

    sample, sample, sample, sample, sample, sample

- **Load a tabular file**: The user can load one-column tabular data. For
  example:


  +--------+   
  | sample |        
  +--------+        
  | sample |        
  +--------+        
  | qc     |        
  +--------+        
  | qc     |        
  +--------+        
  | blank  |        
  +--------+        
  | blank  |        
  +--------+        

    |

  Note that this table does not include any header.

Parameters
----------

**\1. QC Filtering**

Use this option if you provide 'qc' in **Group** and intend to perform
QC filtering.

- **RSD threshold**: RSD threshold on 'qc'. The default value is 20.
  **Note** that if a large quantity of peaks are removed, user can
  increase this threshold based on the RSD distribution and run again.

- **MV filtering**: A boolean value indicating whether MV filtering is
  applied or not.

- **MV filtering on 'sample' or 'qc'**: A boolean value indicating whether
  MV filtering is applied to 'qc' or to 'sample'. If TRUE, MV filtering
  is performed on 'qc'; if FALSE, it is performed on 'sample'.

- **MV threshold**: MV percentage threshold. The default is 0.30 (i.e. 30%).

**\2. Blank Filtering**

Use this option if you provide 'blank' in **Group** and intend to perform
blank filtering.

- **Method**: method to calculate the characteristic of peaks. The
  available methods are: *mean* (default), *median* and *max*.

- **Multiplying factor**: A factor to multiply the characteristics of
  *blank*. The default is 1.

- **MV filtering**: A boolean value indicating whether MV filtering is
  applied to 'sample' or not.

- **MV threshold**: MV percentage threshold. The default is 0.30 (i.e. 30%).

**\3. MV Filtering**

Use this option if MV filtering is not performed on 'sample' inside **QC
Filtering** or **Blank Filtering**, or if **Group** only includes 'sample'.

- **MV threshold**: MV percentage threshold. The default is 0.30 (i.e. 30%).


**\4. Merge Data**

A boolean value indicating whether or not to merge 'sample', 'qc' and
'blank'. If not, only 'sample' will be imputed and output.

**\5. MV Imputation**

Missing value imputation. Currently two categorical methods are available:

- **Univariate**: includes *mean*, *median* and *min*
- **Multivariate**: includes *knn* (default) and *pca*


Outputs
----------

**\1. Filtered Peak Table**

A tabular format file containing filtered peaks. The file structure is the
same as input peak table.

**\2. Distribution plots of RSD and MV percentage**

A PDF file consisting of histograms and boxplots for both RSD and MV. The
plots give rough guidance on how to select the right thresholds of RSD
and MV. If the filtering results are not satisfactory, for example because
too many or too few peaks are removed, the user can run the filtering again
with new thresholds of RSD and MV based on the distribution plots.

It should be noted that the RSD threshold has a large influence on filtering.
If too many peaks have been removed, the user should consider increasing the
threshold or not performing QC filtering at all.

</help>
  <citations>
  </citations>
</tool>