view w4mclassfilter.xml @ 12:38f509903a0b draft

"planemo upload for repository https://github.com/HegemanLab/w4mclassfilter_galaxy_wrapper/tree/master commit b9712e554d16ed26f6c6d0c2e8cd74552b49f694"
author eschen42
date Tue, 01 Oct 2019 16:57:58 -0400
parents 9f5c0e23c205
children c18040b6e8b9
line wrap: on
line source

<tool id="w4mclassfilter" name="W4m Data Subset" version="0.98.13">
    <description>Filter W4m data by values or metadata</description>
    <!-- Here is the hyphenation standard that I *try* to apply consistently in my documentation: http://www.sandranoonan.com/dont-let-hyphenation-drive-crazy/ -->
    <!-- Package dependencies of the tool; the w4mclassfilter package version
         matches this tool's own version attribute (0.98.13). -->
    <requirements>
        <requirement type="package" version="3.6.1">r-base</requirement>
        <!-- NOTE(review): r-batch is presumably what the wrapper script uses to read
             the name/value argument pairs built in the command block; confirm. -->
        <requirement type="package" version="1.1_5">r-batch</requirement>
        <requirement type="package" version="0.98.13">w4mclassfilter</requirement>
    </requirements>
    <!--
      Invokes the R wrapper with a flat list of name/value pairs: every argument
      name is followed by its single-quoted value.
      * The script path is single-quoted like every other argument so that a tool
        directory containing spaces or shell metacharacters cannot split or alter
        the command line.
      * samplenameColumn is fixed to 'sampleMetadata'; it is not exposed as a
        tool parameter.
      * NOTE(review): R_HOME is unset first, presumably so that an inherited value
        cannot point Rscript at a different R installation; confirm.
    -->
    <command detect_errors="aggressive"><![CDATA[
    unset R_HOME;
    Rscript '$__tool_directory__/w4mclassfilter_wrapper.R'
    dataMatrix_in         '$dataMatrix_in'
    sampleMetadata_in     '$sampleMetadata_in'
    variableMetadata_in   '$variableMetadata_in'
    sampleclassNames      '$sampleclassNames'
    inclusive             '$inclusive'
    wildcards             '$wildcards'
    classnameColumn       '$classnameColumn'
    samplenameColumn      'sampleMetadata'
    variable_range_filter '$variableRangeFilter'
    transformation        '$transformation'
    imputation            '$imputation'
    dataMatrix_out        '$dataMatrix_out'
    sampleMetadata_out    '$sampleMetadata_out'
    variableMetadata_out  '$variableMetadata_out'
    ]]></command>
    <inputs>
        <!-- Tabular input dataset; passed to the wrapper as 'dataMatrix_in'. -->
        <param name="dataMatrix_in"
               type="data"
               format="tabular"
               label="Data matrix file"
               help="variables &#10006; samples"/>
        <!-- Tabular input dataset; passed to the wrapper as 'sampleMetadata_in'. -->
        <param name="sampleMetadata_in"
               type="data"
               format="tabular"
               label="Sample metadata file"
               help="sample metadata, one row per sample"/>
        <!-- Tabular input dataset; passed to the wrapper as 'variableMetadata_in'. -->
        <param name="variableMetadata_in"
               type="data"
               format="tabular"
               label="Variable metadata file"
               help="variable metadata, one row per variable"/>
        <!-- Free-text name of the sample-metadata column that holds the class labels
             (initial value 'class'). The sanitizer whitelists letters, digits,
             period and underscore. -->
        <param name="classnameColumn"
               type="text"
               value="class"
               label="Column that names the sample-class"
               help="name of the column in sample metadata that has the values to be tested against the 'Names of sample-classes' input parameter - defaults to 'class'">
            <sanitizer>
                <valid initial="string.letters">
                    <add preset="string.digits"/>
                    <add value="&#46;"/> <!-- period (dot) -->
                    <add value="&#95;"/> <!-- underscore -->
                </valid>
            </sanitizer>
        </param>
        <!-- Comma-separated class names or patterns (initially empty). The sanitizer
             whitelists letters and digits plus the punctuation listed below, which
             covers the wild-card and regular-expression characters used by the tests. -->
        <param name="sampleclassNames"
               type="text"
               value=""
               label="Names of sample-classes"
               help="comma-separated names (or regular expressions to match names) of sample-classes to filter in or out; defaults to no names">
            <sanitizer>
                <valid initial="string.letters">
                    <add preset="string.digits"/>
                    <add value="&#123;"/> <!-- left curly bracket -->
                    <add value="&#124;"/> <!-- vertical bar (pipe) -->
                    <add value="&#125;"/> <!-- right curly bracket -->
                    <add value="&#36;"/>  <!-- dollar sign -->
                    <add value="&#40;"/>  <!-- left parenthesis -->
                    <add value="&#41;"/>  <!-- right parenthesis -->
                    <add value="&#42;"/>  <!-- asterisk -->
                    <add value="&#43;"/>  <!-- plus sign -->
                    <add value="&#45;"/>  <!-- hyphen (dash) -->
                    <add value="&#44;"/>  <!-- comma -->
                    <add value="&#46;"/>  <!-- period (dot) -->
                    <add value="&#58;"/>  <!-- colon -->
                    <add value="&#59;"/>  <!-- semicolon -->
                    <add value="&#63;"/>  <!-- question mark -->
                    <add value="&#91;"/>  <!-- left square bracket -->
                    <add value="&#92;"/>  <!-- backslash -->
                    <add value="&#93;"/>  <!-- right square bracket -->
                    <add value="&#94;"/>  <!-- caret -->
                    <add value="&#95;"/>  <!-- underscore -->
                </valid>
            </sanitizer>
        </param>
        <!-- Pattern syntax selector: TRUE means wild-cards (preselected),
             FALSE means regular expressions. -->
        <param name="wildcards"
               type="select"
               label="Use 'wild cards' or 'regular expressions'"
               help="'wild-cards' (the default) - use '*' and '?' to match class names; 'regular-expressions' - use regular expressions to match class names">
            <option selected="true" value="TRUE">wild-cards</option>
            <option value="FALSE">regular-expressions</option>
        </param>
        <!-- Filter direction: TRUE means filter-in, FALSE means filter-out
             (preselected). -->
        <param name="inclusive"
               type="select"
               label="Exclude/include named classes"
               help="'filter-out' (the default) - exclude only the named sample-classes; 'filter-in' - include only the named sample-classes">
            <option value="TRUE">filter-in</option>
            <option selected="true" value="FALSE">filter-out</option>
        </param>
        <!-- Comma-separated 'column:min:max' range filters (initially empty); passed
             to the wrapper as 'variable_range_filter'. The sanitizer whitelists
             letters, digits, comma, period, colon and underscore. -->
        <param name="variableRangeFilter"
               type="text"
               value=""
               label="Variable-range filters"
               help="comma-separated filters, each specified as 'variableMetadataColumnName:min:max'; default is no filters.    (See help below.)">
            <sanitizer>
                <valid initial="string.letters">
                    <add preset="string.digits"/>
                    <add value="&#44;"/> <!-- comma -->
                    <add value="&#46;"/> <!-- period (dot) -->
                    <add value="&#58;"/> <!-- colon -->
                    <add value="&#95;"/> <!-- underscore -->
                </valid>
            </sanitizer>
        </param>
        <!-- Transformation applied to data-matrix values: none (preselected), log2
             or log10. The help no longer claims that negative and missing values are
             always set to zero: the 'imputation' parameter offers 'none' and
             'center', and the imputation='none' test expects NA in the output. -->
        <param name="transformation" label="Data-transformation" type="select"
            help="'none' (the default) - do not transform data; 'log2' - log base 2 of data; 'log10' - log base 10 of data; see 'Imputation of missing values' for how negative and missing values are handled">
            <option value="none" selected="true">none</option>
            <option value="log2">log2</option>
            <option value="log10">log10</option>
        </param>
        <!-- Missing-value strategy: zero (preselected), center or none. -->
        <param name="imputation"
               type="select"
               label="Imputation of missing values"
               help="'zero' (the default) - replace missing values with zero; 'center' - replace missing values with feature-median; 'none' - perform no imputation">
            <option selected="true" value="zero">zero</option>
            <option value="center">center</option>
            <option value="none">none</option>
        </param>
    </inputs>
    <!-- One tabular output per input dataset; each label is the name of the
         corresponding input with '.subset' appended. -->
    <outputs>
        <data name="dataMatrix_out"
              format="tabular"
              label="${dataMatrix_in.name}.subset"/>
        <data name="sampleMetadata_out"
              format="tabular"
              label="${sampleMetadata_in.name}.subset"/>
        <data name="variableMetadata_out"
              format="tabular"
              label="${variableMetadata_in.name}.subset"/>
    </outputs>
    <tests>
      <!-- test 1: filter-in of class "M" in column "gender" (regular-expression mode,
           no transformation) combined with three variable-range filters
           (FEATMAX min 2e6, mz min 200, rt max 800); checks selected values and
           identifiers in all three outputs -->
      <test>
        <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
        <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
        <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
        <param name="classnameColumn" value="gender"/>
        <param name="sampleclassNames" value="M"/>
        <param name="wildcards" value="FALSE"/>
        <param name="inclusive" value="TRUE"/>
        <param name="variableRangeFilter" value="FEATMAX:2e6:,mz:200:,rt::800"/>
        <param name="transformation" value="none"/>
        <output name="dataMatrix_out">
          <assert_contents>
            <has_text text="747080" />
            <not_has_text text="13420742" />
            <not_has_text text="47259" />
          </assert_contents>
        </output>
        <output name="sampleMetadata_out">
          <assert_contents>
            <has_text text="HU_017" />
            <has_text text="HU_034" />
            <has_text text="HU_078" />
            <has_text text="HU_091" />
            <has_text text="HU_093" />
            <has_text text="HU_099" />
            <has_text text="HU_130" />
            <has_text text="HU_134" />
            <has_text text="HU_138" />
            <not_has_text text="HU_028" />
            <not_has_text text="HU_051" />
            <not_has_text text="HU_060" />
            <not_has_text text="HU_110" />
            <not_has_text text="HU_149" />
            <not_has_text text="HU_152" />
            <not_has_text text="HU_175" />
            <not_has_text text="HU_178" />
            <not_has_text text="HU_185" />
            <not_has_text text="HU_204" />
            <not_has_text text="HU_208" />
          </assert_contents>
        </output>
        <output name="variableMetadata_out">
          <assert_contents>
            <has_text     text="HMDB00208" />
            <has_text     text="HMDB01032" />
            <has_text     text="HMDB01101.1" />
            <has_text     text="HMDB13189" />
            <not_has_text text="HMDB00191" />
            <not_has_text text="HMDB00251" />
            <not_has_text text="HMDB00299" />
            <not_has_text text="HMDB00512" />
            <not_has_text text="HMDB00518" />
            <not_has_text text="HMDB00715" />
            <not_has_text text="HMDB00822" />
            <not_has_text text="HMDB03193" />
            <not_has_text text="HMDB04824" />
            <not_has_text text="HMDB10348" />
            <not_has_text text="HMDB59717" />
          </assert_contents>
        </output>
      </test>
      <!-- test 2: regular-expression filter-in on the "sampleMetadata" column using a
           pattern with character ranges, plus variable-range filters applied
           together with the log10 transformation and zero imputation; pins
           dataMatrix_out by md5 and checks identifiers in the metadata outputs.
           Indentation uses spaces only, like the other tests. -->
      <test>
        <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
        <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
        <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
        <!-- test that hyphens in regular expressions work -->
        <param name="sampleclassNames" value="HU_[0-9][0-9][0-9]"/>
        <param name="inclusive" value="TRUE"/>
        <param name="wildcards" value="FALSE"/>
        <param name="classnameColumn" value="sampleMetadata"/>
        <!-- test that variableRangeFilter works with transformation -->
        <param name="variableRangeFilter" value="FEATMAX:6.30103:,mz:200:,rt::800"/>
        <param name="transformation" value="log10"/>
        <param name="imputation" value="zero"/>
        <output name="dataMatrix_out" md5="5644d2ea01d072ee1d0c40e29e9d0089">
          <assert_contents>
            <has_text text="5.8733671" />
          </assert_contents>
        </output>
        <output name="sampleMetadata_out">
          <assert_contents>
            <has_text text="HU_017" />
            <has_text text="HU_028" />
            <has_text text="HU_034" />
            <has_text text="HU_051" />
            <has_text text="HU_060" />
            <has_text text="HU_078" />
            <has_text text="HU_091" />
            <has_text text="HU_093" />
            <has_text text="HU_099" />
            <has_text text="HU_110" />
            <has_text text="HU_130" />
            <has_text text="HU_134" />
            <has_text text="HU_138" />
            <has_text text="HU_149" />
            <has_text text="HU_152" />
            <has_text text="HU_175" />
            <has_text text="HU_178" />
            <has_text text="HU_185" />
            <has_text text="HU_208" />
            <not_has_text text="HU_204" />
          </assert_contents>
        </output>
        <output name="variableMetadata_out">
          <assert_contents>
            <has_text     text="HMDB00191" />
            <has_text     text="HMDB00208" />
            <has_text     text="HMDB01032" />
            <has_text     text="HMDB01101.1" />
            <has_text     text="HMDB13189" />
            <not_has_text text="HMDB00251" />
            <not_has_text text="HMDB00299" />
            <not_has_text text="HMDB00512" />
            <not_has_text text="HMDB00518" />
            <not_has_text text="HMDB00715" />
            <not_has_text text="HMDB00822" />
            <not_has_text text="HMDB03193" />
            <not_has_text text="HMDB04824" />
            <not_has_text text="HMDB10348" />
            <not_has_text text="HMDB59717" />
          </assert_contents>
        </output>
      </test>
      <!-- test 3: filter-in of class "M" in column "gender" with no transformation,
           no variable-range filter, and the "wildcards" parameter left at its
           default; checks both sample names and feature identifiers in
           dataMatrix_out -->
      <test>
        <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
        <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
        <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
        <param name="classnameColumn" value="gender"/>
        <param name="sampleclassNames" value="M"/>
        <param name="inclusive" value="TRUE"/>
        <param name="transformation" value="none"/>
        <output name="dataMatrix_out">
          <assert_contents>
            <not_has_text text="HU_028" />
            <not_has_text text="HU_051" />
            <not_has_text text="HU_060" />
            <not_has_text text="HU_110" />
            <not_has_text text="HU_149" />
            <not_has_text text="HU_152" />
            <not_has_text text="HU_175" />
            <not_has_text text="HU_178" />
            <not_has_text text="HU_185" />
            <not_has_text text="HU_204" />
            <not_has_text text="HU_208" />
            <has_text     text="HU_017" />
            <has_text     text="HU_034" />
            <has_text     text="HU_078" />
            <has_text     text="HU_091" />
            <has_text     text="HU_093" />
            <has_text     text="HU_099" />
            <has_text     text="HU_130" />
            <has_text     text="HU_134" />
            <has_text     text="HU_138" />
            <has_text     text="HMDB03193" />
            <not_has_text text="HMDB00822" />
            <has_text     text="HMDB01101" />
            <has_text     text="HMDB01101.1" />
            <has_text     text="HMDB10348" />
            <has_text     text="HMDB59717" />
            <has_text     text="HMDB13189" />
            <has_text     text="HMDB00299" />
            <has_text     text="HMDB00191" />
            <has_text     text="HMDB00518" />
            <has_text     text="HMDB00715" />
            <has_text     text="HMDB01032" />
            <has_text     text="HMDB00208" />
            <has_text     text="HMDB04824" />
            <has_text     text="HMDB00512" />
            <has_text     text="HMDB00251" />
          </assert_contents>
        </output>
      </test>
      <!-- test 4: wild-card filter-in with pattern "*" on column "gender" and zero
           imputation; pins dataMatrix_out by md5, requires that it contains no
           "NA", and expects every listed sample except HU_204 in
           sampleMetadata_out -->
      <test>
        <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
        <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
        <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
        <param name="classnameColumn" value="gender"/>
        <param name="sampleclassNames" value="*"/>
        <param name="wildcards" value="TRUE"/>
        <param name="inclusive" value="TRUE"/>
        <param name="imputation" value="zero"/>
        <output name="dataMatrix_out" md5="b2eac4946d3803a07606286b50451af4">
          <assert_contents>
            <not_has_text text="NA" />
          </assert_contents>
        </output>
        <output name="sampleMetadata_out">
          <assert_contents>
            <not_has_text text="HU_204" />
            <has_text text="HU_028" />
            <has_text text="HU_051" />
            <has_text text="HU_060" />
            <has_text text="HU_110" />
            <has_text text="HU_149" />
            <has_text text="HU_152" />
            <has_text text="HU_175" />
            <has_text text="HU_178" />
            <has_text text="HU_185" />
            <has_text text="HU_208" />
            <has_text text="HU_017" />
            <has_text text="HU_034" />
            <has_text text="HU_078" />
            <has_text text="HU_091" />
            <has_text text="HU_093" />
            <has_text text="HU_099" />
            <has_text text="HU_130" />
            <has_text text="HU_134" />
            <has_text text="HU_138" />
          </assert_contents>
        </output>
      </test>
      <!-- test 5: regular-expression filter-in of class "M" in column "gender" with
           no variable-range filter; checks which samples remain in
           sampleMetadata_out -->
      <test>
        <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
        <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
        <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
        <param name="classnameColumn" value="gender"/>
        <param name="sampleclassNames" value="M"/>
        <param name="wildcards" value="FALSE"/>
        <param name="inclusive" value="TRUE"/>
        <output name="sampleMetadata_out">
          <assert_contents>
            <not_has_text text="HU_028" />
            <not_has_text text="HU_051" />
            <not_has_text text="HU_060" />
            <not_has_text text="HU_110" />
            <not_has_text text="HU_149" />
            <not_has_text text="HU_152" />
            <not_has_text text="HU_175" />
            <not_has_text text="HU_178" />
            <not_has_text text="HU_185" />
            <not_has_text text="HU_204" />
            <not_has_text text="HU_208" />
            <has_text     text="HU_017" />
            <has_text     text="HU_034" />
            <has_text     text="HU_078" />
            <has_text     text="HU_091" />
            <has_text     text="HU_093" />
            <has_text     text="HU_099" />
            <has_text     text="HU_130" />
            <has_text     text="HU_134" />
            <has_text     text="HU_138" />
          </assert_contents>
        </output>
      </test>
      <!-- test 6: regular-expression filter-in of class "M" in column "gender" with
           no variable-range filter; checks which features remain in
           variableMetadata_out (only HMDB00822 is expected to be absent) -->
      <test>
        <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
        <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
        <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
        <param name="classnameColumn" value="gender"/>
        <param name="sampleclassNames" value="M"/>
        <param name="wildcards" value="FALSE"/>
        <param name="inclusive" value="TRUE"/>
        <output name="variableMetadata_out">
          <assert_contents>
            <has_text     text="HMDB03193" />
            <not_has_text text="HMDB00822" />
            <has_text     text="HMDB01101" />
            <has_text     text="HMDB01101.1" />
            <has_text     text="HMDB10348" />
            <has_text     text="HMDB59717" />
            <has_text     text="HMDB13189" />
            <has_text     text="HMDB00299" />
            <has_text     text="HMDB00191" />
            <has_text     text="HMDB00518" />
            <has_text     text="HMDB00715" />
            <has_text     text="HMDB01032" />
            <has_text     text="HMDB00208" />
            <has_text     text="HMDB04824" />
            <has_text     text="HMDB00512" />
            <has_text     text="HMDB00251" />
          </assert_contents>
        </output>
      </test>
      <!-- test 7: regular-expression filter-in of class "M" in column "gender", but
           with input_nofilter_dataMatrix.tsv as the data matrix; with this matrix
           HMDB13189 is expected to be absent from variableMetadata_out in addition
           to HMDB00822 -->
      <test>
        <param name="dataMatrix_in" value="input_nofilter_dataMatrix.tsv"/>
        <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
        <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
        <param name="classnameColumn" value="gender"/>
        <param name="sampleclassNames" value="M"/>
        <param name="wildcards" value="FALSE"/>
        <param name="inclusive" value="TRUE"/>
        <output name="variableMetadata_out">
          <assert_contents>
            <has_text     text="HMDB03193" />
            <not_has_text text="HMDB00822" />
            <has_text     text="HMDB01101" />
            <has_text     text="HMDB01101.1" />
            <has_text     text="HMDB10348" />
            <has_text     text="HMDB59717" />
            <not_has_text text="HMDB13189" />
            <has_text     text="HMDB00299" />
            <has_text     text="HMDB00191" />
            <has_text     text="HMDB00518" />
            <has_text     text="HMDB00715" />
            <has_text     text="HMDB01032" />
            <has_text     text="HMDB00208" />
            <has_text     text="HMDB04824" />
            <has_text     text="HMDB00512" />
            <has_text     text="HMDB00251" />
          </assert_contents>
        </output>
      </test>
      <!-- test 8: two comma-separated regular expressions ("[Mm]" and "[fF]") used as
           a filter-in on column "gender"; every listed sample except HU_204 is
           expected in sampleMetadata_out -->
      <test>
        <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
        <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
        <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
        <param name="classnameColumn" value="gender"/>
        <param name="sampleclassNames" value="[Mm],[fF]"/>
        <param name="wildcards" value="FALSE"/>
        <param name="inclusive" value="TRUE"/>
        <output name="sampleMetadata_out">
          <assert_contents>
            <has_text text="HU_028" />
            <has_text text="HU_051" />
            <has_text text="HU_060" />
            <has_text text="HU_110" />
            <has_text text="HU_149" />
            <has_text text="HU_152" />
            <has_text text="HU_175" />
            <has_text text="HU_178" />
            <has_text text="HU_185" />
            <not_has_text text="HU_204" />
            <has_text text="HU_208" />
            <has_text text="HU_017" />
            <has_text text="HU_034" />
            <has_text text="HU_078" />
            <has_text text="HU_091" />
            <has_text text="HU_093" />
            <has_text text="HU_099" />
            <has_text text="HU_130" />
            <has_text text="HU_134" />
            <has_text text="HU_138" />
          </assert_contents>
        </output>
      </test>
      <!-- test 9: classnameColumn is set to the empty string while a class name "M"
           is still supplied; the assertions expect every listed sample except
           HU_204 to be kept, i.e. no sample is removed by class in this case -->
      <test>
        <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
        <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
        <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
        <param name="classnameColumn" value=""/>
        <param name="sampleclassNames" value="M"/>
        <param name="wildcards" value="FALSE"/>
        <param name="inclusive" value="TRUE"/>
        <output name="sampleMetadata_out">
          <assert_contents>
            <has_text text="HU_028" />
            <has_text text="HU_051" />
            <has_text text="HU_060" />
            <has_text text="HU_110" />
            <has_text text="HU_149" />
            <has_text text="HU_152" />
            <has_text text="HU_175" />
            <has_text text="HU_178" />
            <has_text text="HU_185" />
            <not_has_text text="HU_204" />
            <has_text text="HU_208" />
            <has_text text="HU_017" />
            <has_text text="HU_034" />
            <has_text text="HU_078" />
            <has_text text="HU_091" />
            <has_text text="HU_093" />
            <has_text text="HU_099" />
            <has_text text="HU_130" />
            <has_text text="HU_134" />
            <has_text text="HU_138" />
          </assert_contents>
        </output>
      </test>
      <!-- test 10 - extends test4 with no imputation rather than zero imputation:
           same wild-card "*" filter-in on column "gender", imputation "none";
           dataMatrix_out is pinned by md5 and must still contain "NA" -->
      <test>
        <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
        <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
        <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
        <param name="classnameColumn" value="gender"/>
        <param name="sampleclassNames" value="*"/>
        <param name="wildcards" value="TRUE"/>
        <param name="inclusive" value="TRUE"/>
        <param name="imputation" value="none"/>
        <output name="dataMatrix_out" md5="cc9ab8bdb70b68b43b19b7327d285166">
          <assert_contents>
            <not_has_text text="HU_204" />
            <has_text text="NA" />
            <has_text text="HU_028" />
          </assert_contents>
        </output>
        <output name="sampleMetadata_out">
          <assert_contents>
            <not_has_text text="HU_204" />
            <has_text text="HU_028" />
          </assert_contents>
        </output>
      </test>
      <!-- test 11 - extends test4 with center imputation rather than zero imputation:
           same wild-card "*" filter-in on column "gender", imputation "center";
           dataMatrix_out is pinned by md5 and must contain no "NA" -->
      <test>
        <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
        <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
        <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
        <param name="classnameColumn" value="gender"/>
        <param name="sampleclassNames" value="*"/>
        <param name="wildcards" value="TRUE"/>
        <param name="inclusive" value="TRUE"/>
        <param name="imputation" value="center"/>
        <output name="dataMatrix_out" md5="75a4802bb8887709e4d4dec8c2c3d3cf">
          <assert_contents>
            <not_has_text text="HU_204" />
            <not_has_text text="NA" />
            <has_text text="HU_028" />
          </assert_contents>
        </output>
        <output name="sampleMetadata_out">
          <assert_contents>
            <not_has_text text="HU_204" />
            <has_text text="HU_028" />
          </assert_contents>
        </output>
      </test>
    </tests>
    <help><![CDATA[


**Author** Arthur Eschenlauer (University of Minnesota, esch0041@umn.edu)

--------------------------------------------------------------------------


**R package**

The *w4mclassfilter* package (which is used by the W4m Data Subset tool) is available from the Hegeman lab github repository (https://github.com/HegemanLab/w4mclassfilter/releases).

-----------------------------------------------------------------------------------------------------------------------------------------


**Tool updates**

See the **NEWS** section at the bottom of this page

---------------------------------------------------

===========================================================
"W4m Data Subset" - Filter Workflow4Metabolomics data files
===========================================================

----------
Motivation
----------

GC-MS and LC-MS experiments seek to resolve as features chemicals that have distinct chromatographic retention-time ("rt") and (after ionization) mass-to-charge ratio ("m/z" or "mz").
(If the MS protocol includes fragmentation, several features may result for each chemical.)
Data for a sample are collected as MS intensities, each of which is associated with a position on a 2D plane with dimensions of rt and m/z.
Ideally, features would be sufficiently reproducible among sample-runs to distinguish features that are common among samples from those that differ.

The chromatographic retention-time for a chemical can vary from one chromatography run to the next.
Workflow4Metabolomics (W4m, [Giacomoni *et al.*, 2014, Guitton *et al.* 2017]) is a "flavor" of Galaxy that uses the XCMS preprocessing tools for "retention-time correction" to align features among samples.
Features may be better aligned if pooled samples and blanks are included.

Multivariate statistical techniques may be used to discover clusters of similar samples (Th]]>&#233;<![CDATA[venot *et al.*, 2015).
However, once retention-time alignment of features has been achieved among samples in GC-MS and LC-MS datasets:

- The presence of pools and blanks may confound identification and separation of clusters.
- Multivariate statistical algorithms may be impacted by missing values or dimensions that have zero variance.

-----------
Description
-----------

The **W4m Data Subset** tool **selects subsets of samples, features, or data values** for further analysis.

- The tool takes as input the data matrix, sample metadata, and variable metadata datasets produced by W4m's XCMS [Smith *et al.*, 2006] and CAMERA [Kuhl *et al.*, 2012] tools.
- The tool produces the same trio of output datasets, modified as follows.

This tool can perform several operations to reduce the number of samples or features to be analyzed (although **this should be done only in a statistically sound manner** consistent with the nature of the experiment):

- Samples may be eliminated by filtering on a designated “sample class” column in sampleMetadata.
- Features may be eliminated by specifying minimum or maximum value (or both) allowable in columns of variableMetadata.
- Features may be eliminated by “range of row-maximum for each feature”, i.e., by specifying minimum or maximum intensity (or both) allowable in each row of the dataMatrix (i.e., for the feature across all samples).

This tool also performs several operations to address several data issues that may impede downstream statistical analysis:

- Samples that are missing from either sampleMetadata or dataMatrix are eliminated.
- Features that are missing from either variableMetadata or dataMatrix are eliminated.
- Features and samples that have zero variance are eliminated.
- Samples and features are sorted alphabetically in rows and columns of dataMatrix and in rows of variableMetadata and sampleMetadata.
- The names of the first columns of variableMetadata and sampleMetadata are set respectively to "variableMetadata" and "sampleMetadata".
- If desired, the values in the dataMatrix may be log-transformed.
- If desired, each missing value in dataMatrix is replaced with zero or the median value observed for the corresponding feature.

This tool may be applied several times sequentially, which may be useful for:

- analyzing subsets of samples for progressively smaller sets of treatment-levels, or
- choosing subsets of samples based on criteria in several columns of the sampleMetadata table.

-----------------
Workflow Position
-----------------

This tool can be used at any point downstream of Preprocessing.

- Possible upstream tool categories: Preprocessing, Quality Control, Statistical Analysis, Filter and Sort
- Possible downstream tool categories: Normalisation, Statistical Analysis, Quality Control, Filter and Sort

-----------
Input files
-----------

+---------------------------+------------+
| File                      |   Format   |
+===========================+============+
|     Data matrix           |   tabular  |
+---------------------------+------------+
|     Sample metadata       |   tabular  |
+---------------------------+------------+
|     Variable metadata     |   tabular  |
+---------------------------+------------+


----------
Parameters
----------

Data matrix file
	| variable x sample **dataMatrix** (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical, respectively, to the rownames of the variable metadata file and sample metadata file
	|

Sample metadata file
	| sample x metadata **sampleMetadata** (tabular separated values) file of the numeric and/or character sample metadata, with . as decimal and NA for missing values
	|

Variable metadata file
	| variable x metadata **variableMetadata** (tabular separated values) file of the numeric and/or character variable metadata, with . as decimal and NA for missing values
	|

Column that names the sample-class (default = '``class``')
	| name of the column in **sampleMetadata** that has the values to be tested against the '``Names of sample-classes``' input parameter; only letters, digits, periods, and underscores are permitted.
	|

Names of sample-classes (default = no names)
	| comma-separated names (or regular expressions to match names) of sample-classes to include or exclude
	|

'Wild cards' or 'regular expressions' (default = '``wild-cards``')
	| '``wild-cards``' - use wild cards to match names of sample-classes (see the 'Wild card patterns to match class-names' section below)
	| '``regular-expressions``' - use regular expressions to match the named sample-classes (see the 'Regular expression patterns to match class-names' section below)
	|

Exclude/include named classes (default = '``filter-out``')
	| '``filter-in``' - include only the named sample-classes
	| '``filter-out``' - exclude only the named sample-classes
	|

Variable-range filters (default = no filters)
	| comma-separated names of variable-range filters (see the 'Variable-range filters' section below)
	|

Data-transformation (default = '``none``')
	| '``none``' - Do not transform data matrix values.
	| '``log2``' - Take the log base 2 of the values in the data matrix.
	| '``log10``' - Take the log base 10 of the values in the data matrix.
	|

Data-imputation (default = '``zero``')
	| '``none``' - Do not impute data matrix values.
	| '``zero``' - Negative and missing values are imputed to zero.
	| '``center``' - For each feature, negative and missing values are imputed to the median of other values.
	|


------------
Output files
------------

sampleMetadata
	| (tabular separated values) file identical to the **sampleMetadata** file given as an input argument, except that it lacks rows for samples that have been filtered out (by the sample-class filter, or because of zero variance, or because they were missing in the input data matrix)
	|

variableMetadata
	| (tabular separated values) file identical to the **variableMetadata** file given as an input argument, except that it lacks rows for variables (xC-MS features) that have been filtered out (by the variable-range filter, or because of zero variance, or because they were missing in the input data matrix)
	|

dataMatrix
	| (tabular separated values) file identical to the **dataMatrix** file given as an input argument, except that it lacks the rows and columns for variables and samples, respectively, that have been filtered out
	|


-----------------------------------------
'Wild card' patterns to match class-names
-----------------------------------------

W4m Data Subset supports use of R "wild card" patterns to select class-names.

- use '``?``' to match a single character
- use '``*``' to match zero or more characters
- the pattern must match the entire class-name

For example

- '``??.samp*``' matches '``my.sample``' but not '``my.own.sample``'
- '``*.sample``' matches '``my.sample``' and '``my.own.sample``'
- '``*.sampl``' matches neither '``my.sample``' nor '``my.own.sample``'

--------------------------------------------------
'Regular expression' patterns to match class-names
--------------------------------------------------

W4m Data Subset supports use of R "regular expression" patterns to select class-names.

R uses POSIX 1003.2 standard regular expressions, which allow precise pattern-matching and are exhaustively defined at:
http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html

However, only a few basic building blocks of regular expressions need to be mastered for most cases:

- '``^``' matches the beginning of a class-name
- '``$``' matches the end of a class-name
- '``.``' outside of square brackets matches a single character
- '``*``' matches the character (or bracketed set) specified immediately before it, zero or more times
- square brackets specify a set of characters to be matched.

Within square brackets

- '``^``' as the first character specifies that the list of characters are those that should **not** be matched.
- '``-``' is used to specify ranges of characters

Caveat: The tool wrapper uses the comma ('``,``') to split a list of sample-class names, so **commas may not be used within regular expressions for this tool**

First Example: Consider a field of class-names consisting of '``marq3,marq6,marq9,marq12,front3,front6,front9,front12``'

- The regular expression '``^front[0-9][0-9]*$``' will match the same sample-classes as '``front3,front6,front9,front12``'
- The regular expression '``^[a-z][a-z]*3$``' will match the same sample-classes as '``front3,marq3``'
- The regular expression '``^[a-z][a-z]*12$``' will match the same sample-classes as '``front12,marq12``'
- The regular expression '``^[a-z][a-z]*[0-9]$``' will match the same sample-classes as '``front3,front6,front9,marq3,marq6,marq9``'

Second Example: Consider these regular expression patterns as possible matches to a sample-class name '``AB0123``':

- '``^[A-Z][A-Z][0-9][0-9]*$``' - MATCHES '``**^AB0123$**``'
- '``^[A-Z][A-Z]*[0-9][0-9]*$``' - MATCHES '``**^AB0123$**``'
- '``^[A-Z][0-9]*``' - MATCHES  '``**^A** B0123$``' - first character is a letter, '``*``' can specify zero characters, and end of line did not need to be matched.
- '``^[A-Z][A-Z][0-9]``' - MATCHES  '``**^AB0** 123$``' - first two characters are letters and the third is a digit.
- '``^[A-Z][A-Z]*[0-9][0-9]$``' - NO MATCH - the name does not end with the pattern '``[A-Z][0-9][0-9]$``', i.e., it ends with four digits, not two.
- '``^[A-Z][0-9]*$``' - NO MATCH - the pattern specifies that second character and all those that follow, if present, must be digits.

----------------------
Variable-range filters
----------------------

An array of range-specification strings may be supplied in the `variableRangeFilter`
argument.  If supplied, only features having numerical values in the specified column
of `variableMetadata` that fall within the specified ranges will be retained
in the output.  Each range is a string of three colon-separated values (e.g., "mz:200:800") in the
following order:

- the **name of a column of `variableMetadata`** which must have numerical data (only letters, digits, periods, and underscores are permitted in the name itself), e.g., 'mz';
- the **minimum allowed value** in that column for the feature to be retained, e.g., '200';
- the **maximum allowed value**, e.g., '800'.

Note for the range specification strings:

- **If the "maximum" is less than the "minimum", then the range is exclusive**  (e.g., "mz:800:200" means retain only features whose mz is NOT in the range 200-800)
- **If the name supplied in the first field is 'FEATMAX',**  then the string is defining the minimum (and possibly, though less useful, maximum) intensity for each feature in the dataMatrix.  For example, "FEATMAX:1e6:" would specify  that any feature would be excluded if no sample had an intensity for that feature greater than 1000000.

  - Note, however, that when the "maximum" is greater than the "minimum" for the FEATMAX range specification, then the specification is ignored.

----------------------------------
Data transformation and imputation
----------------------------------

Data may optionally be log2- or log10-transformed.

Negative intensity values are always substituted with zeros.

Missing intensity data values may optionally be imputed.  Missing values may be substituted with zeros (as may be appropriate for univariate analysis) or with the median for the feature (as may be appropriate for multivariate analysis).  (Note that the median feature-intensity is computed for the samples *before* variable-range filters are applied.)

-----------------------------------------------------------------------------

----------------
WORKING EXAMPLES
----------------

-----------
Input Files
-----------

+------------------------------------------------------------------------------------------------------------------------------------------------------+
| Input File URL                                                                                                                                       |
+======================================================================================================================================================+
| https://raw.githubusercontent.com/HegemanLab/w4mclassfilter_galaxy_wrapper/master/tools/w4mclassfilter/test-data/input_dataMatrix.tsv                |
+------------------------------------------------------------------------------------------------------------------------------------------------------+
| https://raw.githubusercontent.com/HegemanLab/w4mclassfilter_galaxy_wrapper/master/tools/w4mclassfilter/test-data/input_sampleMetadata.tsv            |
+------------------------------------------------------------------------------------------------------------------------------------------------------+
| https://raw.githubusercontent.com/HegemanLab/w4mclassfilter_galaxy_wrapper/master/tools/w4mclassfilter/test-data/input_variableMetadata.tsv          |
+------------------------------------------------------------------------------------------------------------------------------------------------------+

-------------------------------
Running Without Range-Filtering
-------------------------------

This example retains only samples whose 'gender' attribute is 'M'.

**Input parameters**

+---------------------------------------------+-------------------------------+
| Input Parameter                             | Value                         |
+=============================================+===============================+
| Column that names the sample-class          | gender                        |
+---------------------------------------------+-------------------------------+
| Names of sample-classes                     | M                             |
+---------------------------------------------+-------------------------------+
| Use 'wild-cards' or 'regular expressions'   | wild-cards                    |
+---------------------------------------------+-------------------------------+
| Exclude/include named classes               | filter-in                     |
+---------------------------------------------+-------------------------------+
| Variable range-filters                      | (Leave this field empty.)     |
+---------------------------------------------+-------------------------------+
| Data transformation                         | none                          |
+---------------------------------------------+-------------------------------+
| Missing-value imputation                    | center                        |
+---------------------------------------------+-------------------------------+

**Expected outputs**

+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
| Expected Output   | Download from URL                                                                                                                               |
+===================+=================================================================================================================================================+
| Data matrix       | https://raw.githubusercontent.com/HegemanLab/w4mclassfilter_galaxy_wrapper/master/tools/w4mclassfilter/test-data/expected_dataMatrix.tsv        |
+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
| Sample metadata   | https://raw.githubusercontent.com/HegemanLab/w4mclassfilter_galaxy_wrapper/master/tools/w4mclassfilter/test-data/expected_sampleMetadata.tsv    |
+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
| Variable metadata | https://raw.githubusercontent.com/HegemanLab/w4mclassfilter_galaxy_wrapper/master/tools/w4mclassfilter/test-data/expected_variableMetadata.tsv  |
+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+

----------------------------
Running With Range-Filtering
----------------------------

This example retains only features whose mz is greater than 200, whose rt is less than 800, and whose maximum intensity across all samples is greater than 2,000,000 (the FEATMAX minimum of 20.93157 is the log2 of 2,000,000, matching the log2 transformation selected below).
This example retains all samples (except those having zero variance for all features), although it would be possible to filter on samples as well.

**Input parameters**

+---------------------------------------------+-----------------------------------+
| Input Parameter                             | Value                             |
+=============================================+===================================+
| Column that names the sample-class          | sampleMetadata                    |
+---------------------------------------------+-----------------------------------+
| Names of sample-classes                     | HU_13[48]                         |
+---------------------------------------------+-----------------------------------+
| Use 'wild-cards' or 'regular expressions'   | regular-expressions               |
+---------------------------------------------+-----------------------------------+
| Exclude/include named classes               | filter-out                        |
+---------------------------------------------+-----------------------------------+
| Variable range-filters                      | FEATMAX:20.93157:,mz:200:,rt::800 |
+---------------------------------------------+-----------------------------------+
| Data transformation                         | log2                              |
+---------------------------------------------+-----------------------------------+
| Missing-value imputation                    | zero                              |
+---------------------------------------------+-----------------------------------+

**Expected outputs**

+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
| Expected Output   | Download from URL                                                                                                                                 |
+===================+===================================================================================================================================================+
| Data matrix       | https://raw.githubusercontent.com/HegemanLab/w4mclassfilter_galaxy_wrapper/master/tools/w4mclassfilter/test-data/rangefilter_dataMatrix.tsv       |
+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
| Sample metadata   | https://raw.githubusercontent.com/HegemanLab/w4mclassfilter_galaxy_wrapper/master/tools/w4mclassfilter/test-data/rangefilter_sampleMetadata.tsv   |
+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
| Variable metadata | https://raw.githubusercontent.com/HegemanLab/w4mclassfilter_galaxy_wrapper/master/tools/w4mclassfilter/test-data/rangefilter_variableMetadata.tsv |
+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
    ]]></help>
    <citations>
        <!-- Giacomoni_2014 W4m 2.5 -->
        <citation type="doi">10.1093/bioinformatics/btu813</citation>
        <!-- Guitton_2017 W4m 3.0 -->
        <citation type="doi">10.1016/j.biocel.2017.07.002</citation>
        <!-- Kuhl_2012 CAMERA -->
        <citation type="doi">10.1021/ac202450g</citation>
        <!-- Smith_2006 XCMS -->
        <citation type="doi">10.1021/ac051437y</citation>
        <!-- Thevenot_2015 Urinary metabolome statistics -->
        <citation type="doi">10.1021/acs.jproteome.5b00354</citation>
    </citations>
    <!--
     vim:noet:sw=4:ts=4
--> </tool>