Mercurial > repos > eschen42 > w4mclassfilter
changeset 4:499c7ecfa834 draft
planemo upload for repository https://github.com/HegemanLab/w4mclassfilter_galaxy_wrapper/tree/master commit 7049f74a86f6e47565a68336d6496d112713cbba
author | eschen42 |
---|---|
date | Mon, 19 Jun 2017 23:42:33 -0400 |
parents | 191a720488ce |
children | 2cdf7d5982c8 |
files | w4mclassfilter.xml w4mclassfilter_wrapper.R |
diffstat | 2 files changed, 210 insertions(+), 39 deletions(-) [+] |
line wrap: on
line diff
--- a/w4mclassfilter.xml Thu May 11 00:05:51 2017 -0400 +++ b/w4mclassfilter.xml Mon Jun 19 23:42:33 2017 -0400 @@ -1,10 +1,10 @@ -<tool id="w4mclassfilter" name="Sample_Subset" version="0.98.1"> +<tool id="w4mclassfilter" name="Sample_Subset" version="0.98.2"> <description>Filter W4M data by sample class</description> <requirements> <requirement type="package" version="3.3.1">r-base</requirement> <requirement type="package" version="1.1_4">r-batch</requirement> - <requirement type="package" version="0.98.1">w4mclassfilter</requirement> + <requirement type="package" version="0.98.2">w4mclassfilter</requirement> </requirements> <stdio> @@ -14,29 +14,36 @@ <command detect_errors="aggressive"><![CDATA[ Rscript $__tool_directory__/w4mclassfilter_wrapper.R - dataMatrix_in "$dataMatrix_in" - sampleMetadata_in "$sampleMetadata_in" - variableMetadata_in "$variableMetadata_in" - sampleclassNames "$sampleclassNames" - inclusive "$inclusive" - classnameColumn "$classnameColumn" - samplenameColumn "$samplenameColumn" - dataMatrix_out "$dataMatrix_out" - sampleMetadata_out "$sampleMetadata_out" - variableMetadata_out "$variableMetadata_out" + dataMatrix_in '$dataMatrix_in' + sampleMetadata_in '$sampleMetadata_in' + variableMetadata_in '$variableMetadata_in' + sampleclassNames '$sampleclassNames' + inclusive '$inclusive' + wildcards '$wildcards' + classnameColumn '$classnameColumn' + samplenameColumn '$samplenameColumn' + dataMatrix_out '$dataMatrix_out' + sampleMetadata_out '$sampleMetadata_out' + variableMetadata_out '$variableMetadata_out' ]]></command> <inputs> <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" /> <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" /> <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable 
x metadata columns, separator: tab" /> - <param name="sampleclassNames" label="Names of sample classes" type="text" value = "" help="comma-separated names of sample classes to filter in or out; defaults to no names" /> + <param name="samplenameColumn" label="Column that names the sample" type="text" value = "sampleMetadata" help="name of the column in the sample metadata file that has the name of the sample - defaults to 'sampleMetadata'" /> + <param name="classnameColumn" label="Column that names the sample-class" type="text" value = "class" help="name of the column in sample metadata that has the values to be tested against the 'classes' input parameter - defaults to 'class'" /> + <param name="sampleclassNames" label="Names of sample classes" type="text" value = "" help="comma-separated names (or comma-less regular expressions to match names) of sample-classes to filter in or out; defaults to no names"> + <sanitizer sanitize="False"/> + </param> + <param name="wildcards" label="Use wild-cards or regular-expressions" type="select" help="wild-cards (the default) - use '*' and '?' 
to match class names; regular-expressions - use comma-less regular expressions to match class names"> + <option value="TRUE" selected="true">wild-cards</option> + <option value="FALSE">regular-expressions</option> + </param> <param name="inclusive" label="Include named classes" type="select" help="filter-in - include only the named sample classes; filter-out (the default) - exclude only the named sample classes"> <option value="TRUE">filter-in</option> <option value="FALSE" selected="true">filter-out</option> </param> - <param name="classnameColumn" label="Column that names the sample-class" type="text" value = "class" help="name of the column in sample metadata that has the values to be tested against the 'classes' input parameter - defaults to 'class'" /> - <param name="samplenameColumn" label="Column that names the sample" type="text" value = "sampleMetadata" help="name of the column in sample metadata that has the name of the sample - defaults to 'sampleMetadata'" /> </inputs> <outputs> <data name="dataMatrix_out" label="${tool.name}_${dataMatrix_in.name}" format="tabular" ></data> @@ -156,6 +163,101 @@ </assert_contents> </output> </test> + <test> + <param name="dataMatrix_in" value="input_nofilter_dataMatrix.tsv"/> + <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> + <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> + <param name="classnameColumn" value="gender"/> + <param name="sampleclassNames" value="M"/> + <param name="samplenameColumn" value="sampleMetadata"/> + <param name="inclusive" value="filter-in"/> + <output name="variableMetadata_out"> + <assert_contents> + <has_text text="HMDB03193" /> + <not_has_text text="HMDB00822" /> + <has_text text="HMDB01101" /> + <has_text text="HMDB01101.1" /> + <has_text text="HMDB10348" /> + <has_text text="HMDB59717" /> + <not_has_text text="HMDB13189" /> + <has_text text="HMDB00299" /> + <has_text text="HMDB00191" /> + <has_text text="HMDB00518" /> + <has_text text="HMDB00715" 
/> + <has_text text="HMDB01032" /> + <has_text text="HMDB00208" /> + <has_text text="HMDB04824" /> + <has_text text="HMDB00512" /> + <has_text text="HMDB00251" /> + </assert_contents> + </output> + </test> + <test> + <param name="dataMatrix_in" value="input_dataMatrix.tsv"/> + <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> + <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> + <param name="classnameColumn" value="gender"/> + <param name="sampleclassNames" value="[Mm],[fF]"/> + <param name="samplenameColumn" value="sampleMetadata"/> + <param name="inclusive" value="filter-in"/> + <output name="sampleMetadata_out"> + <assert_contents> + <has_text text="HU_028" /> + <has_text text="HU_051" /> + <has_text text="HU_060" /> + <has_text text="HU_110" /> + <has_text text="HU_149" /> + <has_text text="HU_152" /> + <has_text text="HU_175" /> + <has_text text="HU_178" /> + <has_text text="HU_185" /> + <not_has_text text="HU_204" /> + <has_text text="HU_208" /> + <has_text text="HU_017" /> + <has_text text="HU_034" /> + <has_text text="HU_078" /> + <has_text text="HU_091" /> + <has_text text="HU_093" /> + <has_text text="HU_099" /> + <has_text text="HU_130" /> + <has_text text="HU_134" /> + <has_text text="HU_138" /> + </assert_contents> + </output> + </test> + <test> + <param name="dataMatrix_in" value="input_dataMatrix.tsv"/> + <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> + <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> + <param name="classnameColumn" value=""/> + <param name="sampleclassNames" value="M"/> + <param name="samplenameColumn" value="sampleMetadata"/> + <param name="inclusive" value="filter-in"/> + <output name="sampleMetadata_out"> + <assert_contents> + <has_text text="HU_028" /> + <has_text text="HU_051" /> + <has_text text="HU_060" /> + <has_text text="HU_110" /> + <has_text text="HU_149" /> + <has_text text="HU_152" /> + <has_text text="HU_175" /> + <has_text text="HU_178" 
/> + <has_text text="HU_185" /> + <not_has_text text="HU_204" /> + <has_text text="HU_208" /> + <has_text text="HU_017" /> + <has_text text="HU_034" /> + <has_text text="HU_078" /> + <has_text text="HU_091" /> + <has_text text="HU_093" /> + <has_text text="HU_099" /> + <has_text text="HU_130" /> + <has_text text="HU_134" /> + <has_text text="HU_138" /> + </assert_contents> + </output> + </test> </tests> @@ -163,13 +265,11 @@ <help> <![CDATA[ -.. class:: infomark **Author** Arthur Eschenlauer (University of Minnesota, esch0041@umn.edu) -------------------------------------------------------------------------- -.. class:: infomark **R package** @@ -177,7 +277,6 @@ ----------------------------------------------------------------------------------------------------------------------------------------- -.. class:: infomark **Tool updates** @@ -199,16 +298,16 @@ Workflow Position ----------------- - - Upstream tool category: Preprocessing - - Downstream tool categories: Normalisation, Statistical Analysis, Quality Control +- Upstream tool category: Preprocessing +- Downstream tool categories: Normalisation, Statistical Analysis, Quality Control, Filter and Sort ---------- Motivation ---------- -GC-MS1 and LC-MS1 experiments seek to resolve chemicals as features that have distinct chromatographic behavior and (after ionization) mass-to-charge ratio. +GC-MS1 and LC-MS1 experiments seek to resolve chemicals as features that have distinct chromatographic behavior and (after ionization) mass-to-charge ratio. Data for a sample are collected as MS intensities, each of which is associated with a position on a 2D plane with dimensions of m/z ratio and chromatographic retention time. -Ideally, features would be sufficiently reproducible from sample-run to sample-run to identify features that are commmon among samples and those that differ. 
+Ideally, features would be sufficiently reproducible from sample-run to sample-run to identify features that are common among samples and those that differ. However, the chromatographic retention time for a chemical can vary from one run to another. In the Workflow4Metabolomics (W4M, [Giacomoni *et al.*, 2014]) "flavor" of Galaxy, the XCMS [Smith *et al.*, 2006] preprocessing tools provide for "retention time correction" to align features among samples, but features may be better aligned if pooled samples and blanks are included. @@ -224,7 +323,6 @@ Next, missing and negative intensities for features of the remaining samples are imputed to zero. Finally, samples or features with zero variance are eliminated. - ----------- Input files ----------- @@ -256,23 +354,28 @@ | variable x metadata **variableMetadata** (tabular separated values) file of the numeric and/or character variable metadata, with . as decimal and NA for missing values | +Column that names the sample (default = '``sampleMetadata``') + | name of the column in sample metadata that has the name of the sample + | + +Column that names the sample-class (default = '``class``') + | name of the column in sample metadata that has the values to be tested against the '``classes``' input parameter + | + Names of sample classes (default = no names) | comma-separated names of sample classes to include or exclude | -Include named classes (default = filter-out) - | *filter-in* - include only the named sample classes - | *filter-out* - exclude only the named sample classes +Wild-cards (default = '``wild-cards``') + | '``wild-cards``' - use wild-cards to match names of sample classes (see 'Wild card patterns to match class-names' below) + | '``regular-expressions``' - use regular expressions to match names of sample classes (see 'Regular expression patterns to match class-names' below) | - -Column that names the sample-class (default = 'class') - | name of the column in sample metadata that has the values to be tested against the 
'classes' input parameter +Include named classes (default = '``filter-out``') + | '``filter-in``' - include only the named sample classes + | '``filter-out``' - exclude only the named sample classes | -Column that names the sample (default = 'sampleMetadata') - | name of the column in sample metadata that has the name of the sample - | ------------ @@ -293,6 +396,63 @@ | +--------------------------------------- +Wild card patterns to match class-names +--------------------------------------- + +Beginning with v0.98.2, w4mclassfilter supports use of wild card patterns to select class-names. + +- use '``?``' to match a single character +- use '``*``' to match zero or more characters +- the pattern must match the entire class-name + +For example + +- '``??.samp*``' matches '``my.sample``' but not '``my.own.sample``' +- '``*.sample``' matches '``my.sample``' and '``my.own.sample``' +- '``*.sampl``' matches neither '``my.sample``' nor '``my.own.sample``' + +------------------------------------------------ +Regular expression patterns to match class-names +------------------------------------------------ + +Beginning with v0.98.2, w4mclassfilter supports use of R regular expression patterns to select class-names. + +R uses POSIX 1003.2 standard regular expressions, which allow precise pattern-matching and are exhaustively defined at: +http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html + +However, only a few basic building blocks of regular expressions need to be mastered for most cases: + +- '``^``' matches the beginning of a class-name +- '``$``' matches the end of a class-name +- '``.``' outside of square brackets matches a single character +- '``*``' matches the character (or bracketed set) specified immediately before it zero or more times +- square brackets specify a set of characters to be matched. + +Within square brackets + +- '``^``' as the first character specifies that the list of characters are those that should **not** be matched. 
+- '``-``' is used to specify ranges of characters + +Caveat: The tool wrapper uses the comma ('``,``') to split a list of sample-class names, so **commas may not be used within regular expressions for this tool** + +First Example: Consider a field of class-names consisting of '``marq3,marq6,marq9,marq12,front3,front6,front9,front12``' + +- The regular expression '``^front[0-9][0-9]*$``' will match the same sample-classes as '``front3,front6,front9,front12``' +- The regular expression '``^[a-z][a-z]*3$``' will match the same sample-classes as '``front3,marq3``' +- The regular expression '``^[a-z][a-z]*12$``' will match the same sample-classes as '``front12,marq12``' +- The regular expression '``^[a-z][a-z]*[0-9]$``' will match the same sample-classes as '``front3,front6,front9,marq3,marq6,marq9``' + +Second Example: Consider these regular expression patterns as possible matches to a sample-class name '``AB0123``': + +- '``^[A-Z][A-Z][0-9][0-9]*$``' - MATCHES '``**^AB0123$**``' +- '``^[A-Z][A-Z]*[0-9][0-9]*$``' - MATCHES '``**^AB0123$**``' +- '``^[A-Z][0-9]*``' - MATCHES '``**^A** B0123$``' - first character is a letter, '``*``' can specify zero characters, and end of line did not need to be matched. +- '``^[A-Z][A-Z][0-9]``' - MATCHES '``**^AB0** 123$``' - first two characters are letters and the third is a digit. +- '``^[A-Z][A-Z]*[0-9][0-9]$``' - NO MATCH - the name does not end with the pattern '``[A-Z][0-9][0-9]$``', i.e., it ends with four digits, not two. +- '``^[A-Z][0-9]*$``' - NO MATCH - the pattern specifies that second character and all those that follow, if present, must be digits. + + --------------- Working example --------------- @@ -348,6 +508,19 @@ NEWS ---- +CHANGES IN VERSION 0.98.2 +========================= + +NEW FEATURES + +* Added support for R-flavored regular expression pattern-matching when selecting names of sample-classes. +* Empty classes argument or zero-length class_column result in no samples filtered out. 
+ +INTERNAL MODIFICATIONS + +* Support and tests for new features. + + CHANGES IN VERSION 0.98.1 =========================
--- a/w4mclassfilter_wrapper.R Thu May 11 00:05:51 2017 -0400 +++ b/w4mclassfilter_wrapper.R Mon Jun 19 23:42:33 2017 -0400 @@ -83,15 +83,13 @@ # other parameters sampleclassNames <- as.character(argVc["sampleclassNames"]) -# if (sampleclassNames == "NONE_SPECIFIED") { -# sampleclassNames <- as.character(c()) -# -# } else { -# sampleclassNames <- strsplit(x = sampleclassNames, split = ",", fixed = TRUE)[[1]] -# } +wildcards <- as.logical(argVc["wildcards"]) sampleclassNames <- strsplit(x = sampleclassNames, split = ",", fixed = TRUE)[[1]] +if (wildcards) { + sampleclassNames <- gsub("[.]", "[.]", sampleclassNames) + sampleclassNames <- utils::glob2rx(sampleclassNames, trim.tail = FALSE) +} inclusive <- as.logical(argVc["inclusive"]) -# print(sprintf("inclusive = '%s'", as.character(inclusive))) classnameColumn <- as.character(argVc["classnameColumn"]) samplenameColumn <- as.character(argVc["samplenameColumn"])