Mercurial > repos > eschen42 > w4mclassfilter

--- a/README	Mon Jan 29 21:20:07 2018 -0500
+++ b/README	Sat Mar 03 22:58:14 2018 -0500
@@ -1,7 +1,8 @@
 Galaxy Wrapper for the w4mclassfilter R Package
+<https://doi.org/10.5281/zenodo.1034793>

-This is a planemo <http://planemo.readthedocs.io/en/latest/>
-oriented galaxy-tool-wrapper <https://docs.galaxyproject.org/en/latest/dev/schema.htm>
+This is a Galaxy tool-wrapper <https://docs.galaxyproject.org/en/latest/dev/schema.html>
 to wrap the w4mclassfilter R package <https://github.com/HegemanLab/w4mclassfilter>
 for use with the Workflow4Metabolomics <http://workflow4metabolomics.org/>
-flavor of Galaxy <https://galaxyproject.org/>
+flavor of Galaxy <https://galaxyproject.org/>.
+The tool is built with Planemo <http://planemo.readthedocs.io/en/latest/>.
--- a/test-data/rangefilter_dataMatrix.tsv	Mon Jan 29 21:20:07 2018 -0500
+++ b/test-data/rangefilter_dataMatrix.tsv	Sat Mar 03 22:58:14 2018 -0500
@@ -1,6 +1,6 @@
 	HU_017	HU_028	HU_034	HU_051	HU_060	HU_078	HU_091	HU_093	HU_099	HU_110	HU_130	HU_134	HU_138	HU_149	HU_152	HU_175	HU_178	HU_185	HU_208
-HMDB00191	560002	771533	575790	392284	888498	785428	645785	591569	960658	910201	639437	1092885	1409045	2292023	1246459	1945577	710519	773384	622898
-HMDB00208	747080	13420742	595872	1172376	7172632	3143654	4059767	1433702	5593888	5402629	2477288	3346077	4230072	7621236	8960828	10335722	7037373	1574738	2540044
-HMDB01032	2569205	26023086	1604999	430453	8103558	26222916	257139	675754	59906109	263055	31151730	18648127	14989438	1554658	20249262	5588731	871010	15920	44276
-HMDB01101.1	6877586	52217	3158	10789748	229568	4763576	3878773	976436	831937	608298	1605075	72021	442510	1107705	1464339	31250	2724553	72900	30689
-HMDB13189	2644620	727587	1661412	619181	136278	2755434	593863	837865	3526136	2003278	1608814	3446611	1941527	113937	3132404	2893445	2092753	1034666	841661
+HMDB00191	19.0950724540801	19.5573683394871	19.1351832076319	18.581538968171	19.7610090032025	19.5831195045026	19.3006944055142	19.1741869272827	19.8736633887651	19.7958256457729	19.2864427002132	20.0597101691826	20.4262862563728	21.1281900906526	20.2494039981067	20.8917666482316	19.438513703552	19.5608253922588	19.2486364146654
+HMDB00208	19.5109032146715	23.6779611010349	19.1846429313023	20.1610039089984	22.7740711828923	21.5840110083096	21.9529654992985	20.451313755289	22.4154199380002	22.3652301844309	21.2403301698498	21.6740392144135	22.0122507889717	22.8615935600174	23.0952006159265	23.3011358356974	22.746605551146	20.5866803867983	21.2764220576728
+HMDB01032	21.2928905785523	24.6332887213057	20.6141409677961	18.7154961966847	22.9501240553341	24.6443247870262	17.9721889132336	19.3661386209868	25.8361997953493	18.0050049466979	24.8928089492302	24.1525273990616	23.8374429574498	20.5681658146509	24.2713659930314	22.4140893053724	19.7323297568288	13.958552715431	15.4342372710269
+HMDB01101.1	22.7134708439962	15.6722319530667	11.6247954558602	23.3631578345615	17.8085620299575	22.183613575742	21.8871689158671	19.8971659609365	19.6661147561338	19.214418735272	20.6142092807528	16.1361300108053	18.7553505325418	20.079142288524	20.4818181509538	14.9315685693242	21.3775881248719	16.1536311941017	14.9054340159519
+HMDB13189	21.3346290086766	19.4727602406215	20.6639784491577	19.2400016764968	17.0561931543153	21.3938481405109	19.1797706242895	19.6763582845664	21.7496566885488	20.9339312108399	20.6175661105571	21.7167470481744	20.8887603396637	16.7978767996065	21.5788388647145	21.4643567902489	20.9969706149343	19.9807336965562	19.6828797432404
--- a/w4mclassfilter.xml	Mon Jan 29 21:20:07 2018 -0500
+++ b/w4mclassfilter.xml	Sat Mar 03 22:58:14 2018 -0500
@@ -1,6 +1,7 @@
-<tool id="w4mclassfilter" name="Sample_Subset" version="0.98.7">
-  <!-- this file is utf-8, not ASCII, because it contains the character é -->
-  <description>Filter W4M data by sample class</description>
+<tool id="w4mclassfilter" name="W4m Data Subset" version="0.98.8">
+  <description>Filter W4m data by values or metadata</description>
+
+  <!-- Here is the hyphenation standard that I *try* to apply consistently in my documentation: http://www.sandranoonan.com/dont-let-hyphenation-drive-crazy/ -->

   <requirements>
     <!-- <requirement type="package" version="6.2">readline</requirement> -->
@@ -23,8 +24,9 @@
   inclusive '$inclusive'
   wildcards '$wildcards'
   classnameColumn '$classnameColumn'
-  samplenameColumn '$samplenameColumn'
+  samplenameColumn 'sampleMetadata'
   variable_range_filter '$variableRangeFilter'
+	transformation '$transformation'
   dataMatrix_out '$dataMatrix_out'
   sampleMetadata_out '$sampleMetadata_out'
   variableMetadata_out '$variableMetadata_out'
@@ -34,12 +36,23 @@
     <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" />
     <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" />
     <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" />
-    <param name="samplenameColumn" label="Column that names the sample" type="text" value = "sampleMetadata" help="name of the column in the sample metadata file that has the name of the sample - defaults to 'sampleMetadata'" />
-    <param name="classnameColumn" label="Column that names the sample-class" type="text" value = "class" help="name of the column in sample metadata that has the values to be tested against the 'classes' input parameter - defaults to 'class'" />
-    <param name="sampleclassNames" label="Names of sample classes" type="text" value = "" help="comma-separated names (or comma-less regular expressions to match names) of sample-classes to filter in or out; defaults to no names">
+    <param name="classnameColumn" label="Column that names the sample-class" type="text" value = "class" help="name of the column in sample metadata that has the values to be tested against the 'Names of sample-classes' input parameter - defaults to 'class'">
       <sanitizer>
         <valid initial="string.letters">
           <add preset="string.digits"/>
+          <add value="&#45;"  /> <!-- dash, hyphen -->
+          <add value="&#46;"  /> <!-- dot, period -->
+          <add value="&#95;"  /> <!-- underscore -->
+        </valid>
+      </sanitizer>
+    </param>
+    <param name="sampleclassNames" label="Names of sample-classes" type="text" value = "" help="comma-separated names (or regular expressions to match names) of sample-classes to filter in or out; defaults to no names">
+      <sanitizer>
+        <valid initial="string.letters">
+          <add preset="string.digits"/>
+          <add value="&#123;" /> <!-- l-cube, left-curly-bracket -->
+          <add value="&#124;" /> <!-- pipe -->
+          <add value="&#125;" /> <!-- r-cube, right-curly-bracket -->
           <add value="&#36;"  /> <!-- dollar, dollar-sign -->
           <add value="&#40;"  /> <!-- left-paren -->
           <add value="&#41;"  /> <!-- right-paren -->
@@ -54,38 +67,43 @@
           <add value="&#92;"  /> <!-- whack, backslash -->
           <add value="&#93;"  /> <!-- r-squib, right-squre-bracket -->
           <add value="&#94;"  /> <!-- hat, caret -->
-          <add value="&#123;" /> <!-- l-cube, left-curly-bracket -->
-          <add value="&#124;" /> <!-- pipe -->
-          <add value="&#125;" /> <!-- r-cube, right-curly-bracket -->
+          <add value="&#95;"  /> <!-- underscore -->
         </valid>
       </sanitizer>
     </param>

-    <param name="wildcards" label="Use wild-cards or regular-expressions" type="select" help="wild-cards (the default) - use '*' and '?' to match class names; regular-expressions - use comma-less regular expressions to match class names">
+    <param name="wildcards" label="Use 'wild cards' or 'regular expressions'" type="select" help="'wild-cards' (the default) - use '*' and '?' to match class names; 'regular-expressions' - use regular expressions to match class names">
       <option value="TRUE" selected="true">wild-cards</option>
       <option value="FALSE">regular-expressions</option>
     </param>
-    <param name="inclusive" label="Include named classes" type="select" help="filter-in - include only the named sample classes; filter-out (the default) - exclude only the named sample classes">
+    <param name="inclusive" label="Exclude/include named classes" type="select" help="'filter-out' (the default) - exclude only the named sample-classes; 'filter-in' - include only the named sample-classes">
       <option value="TRUE">filter-in</option>
       <option value="FALSE" selected="true">filter-out</option>
     </param>

-    <param name="variableRangeFilter" label="Variable range-filters" type="text" value = "" help="comma-separated filters, each specified as 'variableMetadataColumnName:min:max'; default is no filters.  (See help below.)">
+    <param name="variableRangeFilter" label="Variable-range filters" type="text" value = "" help="comma-separated filters, each specified as 'variableMetadataColumnName:min:max'; default is no filters.  (See help below.)">
       <sanitizer>
         <valid initial="string.letters">
           <add preset="string.digits"/>
           <add value="&#44;"  /> <!-- comma -->
+          <add value="&#45;"  /> <!-- dash, hyphen -->
+          <add value="&#46;"  /> <!-- dot, period -->
           <add value="&#58;"  /> <!-- colon -->
-          <add value="&#46;"  /> <!-- dot, period -->
+          <add value="&#95;"  /> <!-- underscore -->
         </valid>
       </sanitizer>
     </param>
+    <param name="transformation" label="Data-transformation" type="select" help="'none' (the default) - do not transform data; 'log2' - log base 2 of data; 'log10' - log base 10 of data; in all cases, negative and missing values are imputed to zero">
+      <option value="none" selected="true">none</option>
+      <option value="log2">log2</option>
+      <option value="log10">log10</option>
+    </param>

   </inputs>
   <outputs>
-    <data name="dataMatrix_out" label="${tool.name}_${dataMatrix_in.name}" format="tabular" ></data>
-    <data name="sampleMetadata_out" label="${tool.name}_${sampleMetadata_in.name}" format="tabular" ></data>
-    <data name="variableMetadata_out" label="${tool.name}_${variableMetadata_in.name}" format="tabular" ></data>
+    <data name="dataMatrix_out" label="${dataMatrix_in.name}.subset" format="tabular" ></data>
+    <data name="sampleMetadata_out" label="${sampleMetadata_in.name}.subset" format="tabular" ></data>
+    <data name="variableMetadata_out" label="${variableMetadata_in.name}.subset" format="tabular" ></data>
   </outputs>

   <tests>
@@ -93,12 +111,76 @@
       <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
       <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
       <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
+      <param name="classnameColumn" value="gender"/>
+      <param name="sampleclassNames" value="M"/>
+      <param name="wildcards" value="FALSE"/>
+      <param name="inclusive" value="filter-in"/>
+      <param name="variableRangeFilter" value="FEATMAX:6.30103:,mz:200:,rt::800"/>
+      <param name="transformation" value="log10"/>
+      <output name="dataMatrix_out">
+        <assert_contents>
+          <has_text text="5.87336711011293" />
+        </assert_contents>
+      </output>
+      <output name="sampleMetadata_out">
+        <assert_contents>
+          <not_has_text text="HU_028" />
+          <not_has_text text="HU_051" />
+          <not_has_text text="HU_060" />
+          <not_has_text text="HU_110" />
+          <not_has_text text="HU_149" />
+          <not_has_text text="HU_152" />
+          <not_has_text text="HU_175" />
+          <not_has_text text="HU_178" />
+          <not_has_text text="HU_185" />
+          <not_has_text text="HU_204" />
+          <not_has_text text="HU_208" />
+          <has_text text="HU_017" />
+          <has_text text="HU_034" />
+          <has_text text="HU_078" />
+          <has_text text="HU_091" />
+          <has_text text="HU_093" />
+          <has_text text="HU_099" />
+          <has_text text="HU_130" />
+          <has_text text="HU_134" />
+          <has_text text="HU_138" />
+        </assert_contents>
+      </output>
+      <output name="variableMetadata_out">
+        <assert_contents>
+          <not_has_text text="HMDB00191" />
+          <has_text     text="HMDB00208" />
+          <not_has_text text="HMDB00251" />
+          <not_has_text text="HMDB00299" />
+          <not_has_text text="HMDB00512" />
+          <not_has_text text="HMDB00518" />
+          <not_has_text text="HMDB00715" />
+          <not_has_text text="HMDB00822" />
+          <has_text     text="HMDB01032" />
+          <has_text     text="HMDB01101.1" />
+          <not_has_text text="HMDB03193" />
+          <not_has_text text="HMDB04824" />
+          <not_has_text text="HMDB10348" />
+          <has_text     text="HMDB13189" />
+          <not_has_text text="HMDB59717" />
+        </assert_contents>
+      </output>
+    </test>
+    <test>
+      <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
+      <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
+      <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
       <param name="classnameColumn" value="class"/>
       <param name="sampleclassNames" value=""/>
       <param name="wildcards" value="FALSE"/>
-      <param name="samplenameColumn" value="sampleMetadata"/>
       <param name="inclusive" value="filter-out"/>
-      <param name="variableRangeFilter" value="FEATMAX:2e6:,mz:200:,rt::800"/>
+      <param name="variableRangeFilter" value="FEATMAX:20.93157:,mz:200:,rt::800"/>
+      <param name="transformation" value="log2"/>
+      <output name="dataMatrix_out">
+        <assert_contents>
+          <has_text text="19.5109032146715" />
+        </assert_contents>
+      </output>
       <output name="sampleMetadata_out">
         <assert_contents>
           <has_text text="HU_028" />
@@ -149,8 +231,8 @@
       <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
       <param name="classnameColumn" value="gender"/>
       <param name="sampleclassNames" value="M"/>
-      <param name="samplenameColumn" value="sampleMetadata"/>
       <param name="inclusive" value="filter-in"/>
+      <param name="transformation" value="none"/>
       <output name="dataMatrix_out">
         <assert_contents>
           <not_has_text text="HU_028" />
@@ -199,7 +281,6 @@
       <param name="classnameColumn" value="gender"/>
       <param name="sampleclassNames" value="*"/>
       <param name="wildcards" value="TRUE"/>
-      <param name="samplenameColumn" value="sampleMetadata"/>
       <param name="inclusive" value="filter-in"/>
       <output name="sampleMetadata_out">
         <assert_contents>
@@ -233,7 +314,6 @@
       <param name="classnameColumn" value="gender"/>
       <param name="sampleclassNames" value="M"/>
       <param name="wildcards" value="FALSE"/>
-      <param name="samplenameColumn" value="sampleMetadata"/>
       <param name="inclusive" value="filter-in"/>
       <output name="sampleMetadata_out">
         <assert_contents>
@@ -267,7 +347,6 @@
       <param name="classnameColumn" value="gender"/>
       <param name="sampleclassNames" value="M"/>
       <param name="wildcards" value="FALSE"/>
-      <param name="samplenameColumn" value="sampleMetadata"/>
       <param name="inclusive" value="filter-in"/>
       <output name="variableMetadata_out">
         <assert_contents>
@@ -297,7 +376,6 @@
       <param name="classnameColumn" value="gender"/>
       <param name="sampleclassNames" value="M"/>
       <param name="wildcards" value="FALSE"/>
-      <param name="samplenameColumn" value="sampleMetadata"/>
       <param name="inclusive" value="filter-in"/>
       <output name="variableMetadata_out">
         <assert_contents>
@@ -327,7 +405,6 @@
       <param name="classnameColumn" value="gender"/>
       <param name="sampleclassNames" value="[Mm],[fF]"/>
       <param name="wildcards" value="FALSE"/>
-      <param name="samplenameColumn" value="sampleMetadata"/>
       <param name="inclusive" value="filter-in"/>
       <output name="sampleMetadata_out">
         <assert_contents>
@@ -361,7 +438,6 @@
       <param name="classnameColumn" value=""/>
       <param name="sampleclassNames" value="M"/>
       <param name="wildcards" value="FALSE"/>
-      <param name="samplenameColumn" value="sampleMetadata"/>
       <param name="inclusive" value="filter-in"/>
       <output name="sampleMetadata_out">
         <assert_contents>
@@ -402,7 +478,7 @@

 **R package**

-The *w4mclassfilter* package is available from the Hegeman lab github repository (https://github.com/HegemanLab/w4mclassfilter/releases).
+The *w4mclassfilter* package (which is used by the W4m Data Subset tool) is available from the Hegeman lab github repository (https://github.com/HegemanLab/w4mclassfilter/releases).

 -----------------------------------------------------------------------------------------------------------------------------------------

@@ -413,44 +489,67 @@

 ---------------------------------------------------

-==============================================
-Filter Workflow4Metabolomics data matrix files
-==============================================
+===========================================================
+"W4m Data Subset" - Filter Workflow4Metabolomics data files
+===========================================================
+
+----------
+Motivation
+----------
+
+GC-MS and LC-MS experiments seek to resolve as features chemicals that have distinct chromatographic retention-time ("rt") and (after ionization) mass-to-charge ratio ("m/z" or "mz").
+(If the MS protocol includes fragmentation, several features may result for each chemical.)
+Data for a sample are collected as MS intensities, each of which is associated with a position on a 2D plane with dimensions of rt and m/z.
+Ideally, features would be sufficiently reproducible among sample-runs to distinguish features that are common among samples from those that differ.
+
+The chromatographic retention-time for a chemical can vary from one chromatography run to the next.
+Workflow4Metabolomics (W4m, [Giacomoni *et al.*, 2014, Guitton *et al.* 2017]) is a "flavor" of Galaxy that uses the XCMS preprocessing tools for "retention-time correction" to align features among samples.
+Features may be better aligned if pooled samples and blanks are included.
+
+Multivariate statistical techniques may be used to discover clusters of similar samples (Th]]>&#233;<![CDATA[venot *et al.*, 2015).
+However, once retention-time alignment of features has been achieved among samples in GC-MS and LC-MS datasets:
+
+- The presence of pools and blanks may confound identification and separation of clusters.
+- Multivariate statistical algorithms may be impacted by missing values or dimensions that have zero variance.

 -----------
 Description
 -----------

-Filter a set of retention-corrected W4M files (dataMatrix, sampleMetadata, variableMetadata) by sample-class
+The **W4m Data Subset** tool **selects subsets of samples, features, or data values** for further analysis.
+
+- The tool takes as input the data matrix, sample metadata, and variable metadata datasets produced by W4m's XCMS [Smith *et al.*, 2006] and CAMERA [Kuhl *et al.*, 2012] tools.
+- The tool produces the same trio of output datasets, modified as follows.
+
+This tool can perform several operations to reduce the number of samples or features to be analyzed (although **this should be done only in a statistically sound manner** consistent with the nature of the experiment):
+
+- Samples may be eliminated by filtering on a designated "sample class" column in sampleMetadata.
+- Features may be eliminated by specifying minimum or maximum value (or both) allowable in columns of variableMetadata.
+- Features may be eliminated by "range of row-maximum for each feature", i.e., by specifying minimum or maximum intensity (or both) allowable in each row of the dataMatrix (i.e., for the feature across all samples).
+
+This tool also performs several operations to address several data issues that may impede downstream statistical analysis:
+
+- Missing values in dataMatrix are imputed to zero.
+- The values in the dataMatrix may be log-transformed if desired.
+- Samples that are missing from either sampleMetadata or dataMatrix are eliminated.
+- Features that are missing from either variableMetadata or dataMatrix are eliminated.
+- Features and samples that have zero variance are eliminated.
+- Samples and features are sorted alphabetically in rows and columns of dataMatrix and in rows of variableMetadata and sampleMetadata.
+- The names of the first columns of variableMetadata and sampleMetadata are set respectively to "variableMetadata" and "sampleMetadata".
+
+This tool may be applied several times sequentially, which may be useful for:
+
+- analyzing subsets of samples for progressively smaller sets of treatment-levels, or
+- choosing subsets of samples based on criteria in several columns of the sampleMetadata table.

 -----------------
 Workflow Position
 -----------------

-- Upstream tool category: Preprocessing
-- Downstream tool categories: Normalisation, Statistical Analysis, Quality Control, Filter and Sort
-
-----------
-Motivation
-----------
+This tool can be used at any point downstream of Preprocessing.

-GC-MS1 and LC-MS1 experiments seek to resolve chemicals as features that have distinct chromatographic behavior and (after ionization) mass-to-charge ratio.
-Data for a sample are collected as MS intensities, each of which is associated with a position on a 2D plane with dimensions of m/z ratio and chromatographic retention time.
-Ideally, features would be sufficiently reproducible from sample-run to sample-run to identify features that are commmon among samples and those that differ.
-However, the chromatographic retention time for a chemical can vary from one run to another.
-In the Workflow4Metabolomics (W4M, [Giacomoni *et al.*, 2014, Guitton *et al.* 2017]) "flavor" of Galaxy, the XCMS [Smith *et al.*, 2006] preprocessing tools provide for "retention time correction" to align features among samples, but features may be better aligned if pooled samples and blanks are included.
-
-Multivariate statistical techniques may be used to discover clusters of similar samples, and sometimes it is desirable to apply clustering iteratively to smaller and smaller subsets of samples until observable separation of clusters is no longer significant.
-Once feature-alignment has been achieved among samples in GC-MS and LC-MS datasets, however, the presence of pools and blanks may confound identification and separation of clusters.
-Multivariate statistical algorithms also may be impacted by missing values or dimensions that have zero variance (Thévenot *et al.*, 2015).
-
-The w4mclassfilter tool provides a way to choose subsets of samples for further analysis.
-The tool takes as input the data matrix, sample metadata, and variable metadata Galaxy datasets produced by W4M and produces the same trio of datasets with data only for the selected samples.
-The tool uses a "sample-class" column in the sample metadata as the basis for including or eliminating samples for further analysis.
-Class-values to be considered are provided by the user as a comma-separated list.
-The user also provides an indication whether the list specifies classes to be included in further analysis ("filter-in") or rather to be excluded from it ("filter-out").
-Next, missing and negative intensites for features of the remaining samples are imputed to zero.
-Finally, samples or features with zero variance are eliminated.
+- Possible upstream tool categories: Preprocessing, Quality Control, Statistical Analysis, Filter and Sort
+- Possible downstream tool categories: Normalisation, Statistical Analysis, Quality Control, Filter and Sort

 -----------
 Input files
@@ -472,7 +571,7 @@
 ----------

 Data matrix file
-	| variable x sample **dataMatrix** (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical to the rownames of the sample and variable metadata, respectively (see below)
+	| variable x sample **dataMatrix** (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical, respectively, to the rownames of the sample metadata file and variable metadata file
 	|

 Sample metadata file
@@ -483,57 +582,58 @@
 	| variable x metadata **variableMetadata** (tabular separated values) file of the numeric and/or character variable metadata, with . as decimal and NA for missing values
 	|

-Column that names the sample (default = '``sampleMetadata``')
-	| name of the column in sample metadata that has the name of the sample
+Column that names the sample-class (default = '``class``')
+	| name of the column in **sampleMetadata** that has the values to be tested against the '``Names of sample-classes``' input parameter
 	|

-Column that names the sample-class (default = '``class``')
-	| name of the column in sample metadata that has the values to be tested against the '``classes``' input parameter
+Names of sample-classes (default = no names)
+	| comma-separated names (or regular expressions to match names) of sample-classes to include or exclude
 	|

-Names of sample classes (default = no names)
-	| comma-separated names of sample classes to include or exclude
+'Wild cards' or 'regular expressions' (default = '``wild-cards``')
+	| '``wild-cards``' - use wild cards to match names of sample-classes (see the 'Wild card patterns to match class-names' section below)
+	| '``regular-expressions``' - use regular expressions to match the named sample-classes (see the 'Regular expression patterns to match class-names' section below)
 	|

-Wild-cards (default = '``wild-cards``')
-	| '``wild-cards``' - use wild-cards to match names of sample classes (see 'Wild card patterns to match class-names' below)
-	| '``regular-expressions``' - exclude only the named sample classes (see 'Regular expression patterns to match class-names' below)
-	|
-
-Include named classes (default = '``filter-out``')
-	| '``filter-in``' - include only the named sample classes
-	| '``filter-out``' - exclude only the named sample classes
+Exclude/include named classes (default = '``filter-out``')
+	| '``filter-in``' - include only the named sample-classes
+	| '``filter-out``' - exclude only the named sample-classes
 	|

 Variable-range filters (default = no filters)
-	| comma-separated names of variable-range filters (see 'Variable-range filters' below)
+	| comma-separated names of variable-range filters (see the 'Variable-range filters' section below)
 	|

+Data-transformation (default = '``none``')
+	| '``none``' - do not transform data matrix values
+	| '``log2``' - take the log base 2 of the values in the data matrix
+	| '``log10``' - take the log base 10 of the values in the data matrix
+	| In all cases, negative and missing values are imputed to zero.
+	|


 ------------
 Output files
 ------------

-
 sampleMetadata
-	| (tabular separated values) file identical to the **sampleMetadata** file given as an input argument, excepting lacking rows for samples (xC-MS features) that have been filtered out (by the sample-class filter or because of zero variance)
+	| (tabular separated values) file identical to the **sampleMetadata** file given as an input argument, excepting lacking rows for samples that have been filtered out (by the sample-class filter, or because of zero variance, or because they were missing in the input data matrix)
 	|

 variableMetadata
-	| (tabular separated values) file identical to the **variableMetadata** file given as an input argument, excepting lacking rows for variables (xC-MS features) that have been filtered out (because of zero variance)
+	| (tabular separated values) file identical to the **variableMetadata** file given as an input argument, excepting lacking rows for variables (xC-MS features) that have been filtered out (by the variable-range filter, or because of zero variance, or because they were missing in the input data matrix)
 	|

 dataMatrix
-	| (tabular separated values) file identical to the **dataMatrix** file given as an input argument, excepting lacking rows for variables (xC-MS features) that have been filtered out (because of zero variance) and columns that have been filtered out (by the sample-class filter or because of zero variance)
+	| (tabular separated values) file identical to the **dataMatrix** file given as an input argument, excepting lacking rows and columns for variables and samples that have been filtered out, respectively
 	|


----------------------------------------
-Wild card patterns to match class-names
----------------------------------------
+-----------------------------------------
+'Wild card' patterns to match class-names
+-----------------------------------------

-Beginning with v0.98.2, w4mclassfilter supports use of R "wild card" patterns to select class-names.
+W4m Data Subset supports use of R "wild card" patterns to select class-names.

 - use '``?``' to match a single character
 - use '``*``' to match zero or more characters
@@ -545,11 +645,11 @@
 - '``*.sample``' matches '``my.sample``' and '``my.own.sample``'
 - '``*.sampl``' matches neither '``my.sample``' nor '``my.own.sample``'

-------------------------------------------------
-Regular expression patterns to match class-names
-------------------------------------------------
+--------------------------------------------------
+'Regular expression' patterns to match class-names
+--------------------------------------------------

-Beginning with v0.98.2, w4mclassfilter supports use of R "regular expression" patterns to select class-names.
+W4m Data Subset supports use of R "regular expression" patterns to select class-names.

 R uses POSIX 1003.2 standard regular expressions, which allow precise pattern-matching and are exhaustively defined at:
 http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
@@ -638,17 +738,19 @@

 **Input parameters**

-+------------------------------------+-----------------+
-| Input Parameter                    | Value           |
-+====================================+=================+
-| Names of sample classes            | M               |
-+------------------------------------+-----------------+
-| Include named classes              | filter-in       |
-+------------------------------------+-----------------+
-| Column that names the sample-class | gender          |
-+------------------------------------+-----------------+
-| Column that names the sample       | sampleMetadata  |
-+------------------------------------+-----------------+
++------------------------------------+-------------------------------+
+| Input Parameter                    | Value                         |
++====================================+===============================+
+| Names of sample-classes            | M                             |
++------------------------------------+-------------------------------+
+| Exclude/include named classes      | filter-in                     |
++------------------------------------+-------------------------------+
+| Column that names the sample-class | gender                        |
++------------------------------------+-------------------------------+
+| Variable-range filters             | (Leave this field empty.)     |
++------------------------------------+-------------------------------+
+| Data-transformation                | none                          |
++------------------------------------+-------------------------------+

 **Expected outputs**

@@ -673,19 +775,19 @@

 **Input parameters**

-+------------------------------------+-------------------------------+
-| Input Parameter                    | Value                         |
-+====================================+===============================+
-| Names of sample classes            | (Leave this field empty.)     |
-+------------------------------------+-------------------------------+
-| Include named classes              | filter-out                    |
-+------------------------------------+-------------------------------+
-| Column that names the sample-class | class                         |
-+------------------------------------+-------------------------------+
-| Column that names the sample       | sampleMetadata                |
-+------------------------------------+-------------------------------+
-| Variable range-filters             | FEATMAX:2e6:,mz:200:,rt::800  |
-+------------------------------------+-------------------------------+
++------------------------------------+------------------------------------+
+| Input Parameter                    | Value                              |
++====================================+====================================+
+| Names of sample-classes            | (Leave this field empty.)          |
++------------------------------------+------------------------------------+
+| Include named classes              | filter-out                         |
++------------------------------------+------------------------------------+
+| Column that names the sample-class | gender                             |
++------------------------------------+------------------------------------+
+| Variable range-filters             | FEATMAX:20.93157:,mz:200:,rt::800  |
++------------------------------------+------------------------------------+
+| Data transformation                | log2                               |
++------------------------------------+------------------------------------+

 **Expected outputs**

@@ -705,76 +807,89 @@
 NEWS
 ----

-CHANGES IN VERSION 0.98.7
+Changes in version 0.98.8
+=========================
+
+New features
+
+- The tool now appears in Galaxy with a new, more representative name: "W4m Data Subset". (Earlier versions of this tool appeared in Galaxy with the name "Sample Subset".)
+- Option was added to log-transform data matrix values.
+- Output datasets are named in conformance with the W4m convention of appending the name of each preprocessing tool to the input dataset name.
+- Superfluous "Column that names the sample" input parameter was eliminated.
+- Some documentation was updated or clarified.
+
+Internal modifications
+
+- None
+
+Changes in version 0.98.7
 =========================

 New features

-* First column of output variableMetadata (that has feature names) now is always named "variableMetadata".
-* First column of output sampleMetadata now (that has sample names) is always named "sampleMetadata".
+- First column of output variableMetadata (that has feature names) now is always named "variableMetadata".
+- First column of output sampleMetadata (that has sample names) now is always named "sampleMetadata".

 Internal modifications

-* Now uses w4mclassfilter R package v0.98.7.
+- Now uses w4mclassfilter R package v0.98.7.

-CHANGES IN VERSION 0.98.6
+Changes in version 0.98.6
 =========================

 New features

-* Added support for filtering out features whose attributes fall outside specified ranges.
+- Added support for filtering out features whose attributes fall outside specified ranges.
   For more detail, see "Variable-range filters" above.

 Internal modifications

-* Now uses w4mclassfilter R package v0.98.6.
-* Now sorts sample names and feature names in output files because some statistical tools expect the same order in `dataMatrix` row and column names as in the corresponding metadata files.
+- Now uses w4mclassfilter R package v0.98.6.
+- Now sorts sample names and feature names in output files because some statistical tools expect the same order in `dataMatrix` row and column names as in the corresponding metadata files.

 Changes in version 0.98.3
 =========================

 Internal modifications

-* Improved input handling.
-* Now uses w4mclassfilter R package v0.98.3, although that version has no functional implications for this tool.
-* Improved reference-list.
+- Improved input handling.
+- Now uses w4mclassfilter R package v0.98.3, although that version has no functional implications for this tool.
+- Improved reference-list.

 Changes in version 0.98.2
 =========================

 New features

-* Added support for R-flavored regular expression pattern-matching when selecting names of sample-classes.
-* Empty classes argument or zero-length class_column result in no samples filtered out.
+- Added support for R-flavored regular expression pattern-matching when selecting names of sample-classes.
+- Empty classes argument or zero-length class_column result in no samples filtered out.

 Internal modifications

-* Support and tests for new features.
+- Support and tests for new features.

 Changes in version 0.98.1
 =========================

-First release - Wrap the w4mclassfilter R package that implements filtering of W4M data matrix, variable metadata, and sample metadata by class of sample.
+First release - Wrap the w4mclassfilter R package that implements filtering of W4m data matrix, variable metadata, and sample metadata by class of sample.

 New features

-* *dataMatrix* *is* modified by the tool, so it *does* appear as an output file
-* *sampleMetadata* *is* modified by the tool, so it *does* appear as an output file
-* *variableMetadata* *is* modified by the tool, so it *does* appear as an output file
-
-Internal modifications
-
-* N/A
+- Output *dataMatrix*       is input dataMatrix       as modified by the tool
+- Output *sampleMetadata*   is input sampleMetadata   as modified by the tool
+- Output *variableMetadata* is input variableMetadata as modified by the tool

   ]]></help>
   <citations>
-    <!-- Giacomoni_2014 W4M 2.5 -->
+    <!-- Giacomoni_2014 W4m 2.5 -->
     <citation type="doi">10.1093/bioinformatics/btu813</citation>
-    <!-- Guitton_2017 W4M 3.0 -->
+    <!-- Guitton_2017 W4m 3.0 -->
     <citation type="doi">10.1016/j.biocel.2017.07.002</citation>
+    <!-- Kuhl_2012 CAMERA -->
+    <citation type="doi">10.1021/ac202450g</citation>
     <!-- Smith_2006 XCMS -->
     <citation type="doi">10.1021/ac051437y</citation>
-    <!-- Th_venot_2015 Urinary metabolome statistics -->
+    <!-- Thevenot_2015 Urinary metabolome statistics -->
     <citation type="doi">10.1021/acs.jproteome.5b00354</citation>
   </citations>
   <!--
--- a/w4mclassfilter_wrapper.R	Mon Jan 29 21:20:07 2018 -0500
+++ b/w4mclassfilter_wrapper.R	Sat Mar 03 22:58:14 2018 -0500
@@ -82,6 +82,7 @@

 # other parameters

+transformation <- as.character(argVc["transformation"])
 wildcards <- as.logical(argVc["wildcards"])
 sampleclassNames <- as.character(argVc["sampleclassNames"])
 sampleclassNames <- strsplit(x = sampleclassNames, split = ",", fixed = TRUE)[[1]]
@@ -96,6 +97,42 @@
 variable_range_filter <- as.character(argVc["variable_range_filter"])
 variable_range_filter <- strsplit(x = variable_range_filter, split = ",", fixed = TRUE)[[1]]

+## -----------------------------
+## Transformation and imputation: pick the function later passed as 'data_imputation'
+## -----------------------------
+my_w4m_filter_imputation <- if (transformation == "log10") {  # log10-transform the matrix, then impute
+  function(m) {
+    if (!is.matrix(m))
+      stop("Cannot impute and transform data - the supplied data is not in matrix form")
+    if (nrow(m) == 0)
+      stop("Cannot impute and transform data - data matrix has no rows")
+    if (ncol(m) == 0)
+      stop("Cannot impute and transform data - data matrix has no columns")
+    suppressWarnings(
+      # Warnings are suppressed because negative values yield NaN (with a warning); the author expects w4m_filter_imputation to fix these. NOTE(review): log10(0) is -Inf with no warning - confirm it is handled too.
+      m <- log10(m)
+    )
+    return ( w4m_filter_imputation(m) )
+  }
+} else if (transformation == "log2") {  # log2-transform the matrix, then impute
+  function(m) {
+    if (!is.matrix(m))
+      stop("Cannot impute and transform data - the supplied data is not in matrix form")
+    if (nrow(m) == 0)
+      stop("Cannot impute and transform data - data matrix has no rows")
+    if (ncol(m) == 0)
+      stop("Cannot impute and transform data - data matrix has no columns")
+    suppressWarnings(
+      # Warnings are suppressed because negative values yield NaN (with a warning); the author expects w4m_filter_imputation to fix these. NOTE(review): log2(0) is -Inf with no warning - confirm it is handled too.
+      m <- log2(m)
+    )
+    return ( w4m_filter_imputation(m) )
+  }
+} else {  # any other value of 'transformation': no log transform
+  # Use w4m_filter_imputation unchanged (presumably supplied by the w4mclassfilter package - confirm).
+  w4m_filter_imputation
+}
+
 ##------------------------------
 ## Computation
 ##------------------------------
@@ -113,6 +150,7 @@
 , samplename_column     = samplenameColumn
 , variable_range_filter = variable_range_filter
 , failure_action        = my_print
+, data_imputation       = my_w4m_filter_imputation
 )

 my_print("\nResult of '", modNamC, "' Galaxy module call to 'w4mclassfilter::w4m_filter_by_sample_class' R function: ",