view msclust.xml @ 41:e67149fbff20

small changes/improvements; new metams and xcms tools
author pieter.lukasse@wur.nl
date Thu, 06 Nov 2014 16:14:44 +0100
parents 2398cbcac2cb
children 26b93438f30e
line wrap: on
line source

<tool name="MsClust" id="msclust2" version="2.0.5">
	<description>Extracts fragmentation spectra from aligned data</description>
	<!-- 
	   For remote debugging, start your listener on port 8000 and use the following as command interpreter:
	       java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000 
	                    //////////////////////////
	       
	       TODO in command below: add conditionals according to options of using or NOT the tolerances/thresholds from previous steps 
	    -->
	<!--
	   Cheetah template that assembles the MsClust.jar command line from the tool parameters.
	   NOTE(review): keep "MsClust.jar" as the very first token of the command text; the
	   interpreter-based launch is assumed to resolve that first word relative to the tool
	   directory (to be confirmed against the Galaxy version in use).
	-->
	<command interpreter="java -jar ">
		MsClust.jar
		## input data and missing-value handling
		-peaksFileName $inputPeaks
		-dataType $dataType
		-imputationMethod $imputationMethod.type
		#if $imputationMethod.type == "valueRange"
			-rangeUpperLimit $imputationMethod.rangeUpperLimit
		#end if
		-plInputFormat "metalign"
		## algorithm selection
		-potDensFuncType $potDensFuncType.type
		-centerSelectionType $centerSelectionType.type
		-clusteringType $clusteringType.type
		## potential density function and center selection tolerances
		-neighborhoodWindowSize $potDensFuncType.pdf_neighborhoodWindowSize
		-clusterSearchStopCriterium $centerSelectionType.cs_stop_criterion
		-pearsonDistTreshold $potDensFuncType.pdf_pears_treshold
		-pearsonTresholdConfidence $potDensFuncType.pdf_pears_conf
		-pearsonPDReductionThreshold $centerSelectionType.cs_pears_pd_reductionTreshold
		-pearsonPDReductionSlope $centerSelectionType.cs_pears_pd_reductionSlope
		-rtDistTolUnit $potDensFuncType.rt_dist_tol_unit.type
		-rtDistTol $potDensFuncType.rt_dist_tol_unit.pdf_rt_toler
		-rtDistanceConfidence $potDensFuncType.pdf_scan_conf
		#if $clusteringType.type == "original"
			-clustMembershipCutoff $clusteringType.clust_membership_cutoff
		#end if
		## output files
		-centrotypesOut $centrotypesOut
		-simOut $simOut
		-micOut $micOut
		-mspOut $mspOut
		-classOut $classOut
		## NOTE(review): the report arguments are passed even when summaryReport is unchecked
		## and the htmlReportFile output is filtered out; confirm MsClust copes with that.
		-outReport $htmlReportFile
		-outReportPicturesPath $htmlReportFile.files_path
		## options that only apply to the (experimental) Fuzzy C-Means clustering
		#if $clusteringType.type == "fuzzyCMeans"
			-fcmMembershipWeightingExponent $clusteringType.fcmMembershipWeightingExponent
			-fcmStopCriterion $clusteringType.fcmStopCriterion
			-fcmCorrelationWeight $clusteringType.fcmCorrelationWeight
			-fcmFinalAssemblyType $clusteringType.finalClusterAssembly.type
			#if $clusteringType.finalClusterAssembly.type == "membershipBased"
				-fcmMembershipCutoff $clusteringType.finalClusterAssembly.fcmMembershipCutoff
			#end if
		#end if
		-verbose "false"
		## The advancedSettings switch is a boolean declared with truevalue="Yes" / falsevalue="No",
		## so test its rendered string value instead of comparing the parameter to Python True.
		#if str($advancedSettings.settings) == "Yes"
			-advancedSettings YES
			## Optional values are only passed when filled in; an empty value would leave the
			## flag without an argument. The defaults branch below omits these flags as well.
			#if str($advancedSettings.saturationLimit) not in ("", "None")
				-saturationLimit $advancedSettings.saturationLimit
			#end if
			-sampleSelectionSortType $advancedSettings.sampleSelectionSortType
			#if str($advancedSettings.simSelectionAlgorithm) not in ("", "None")
				-simSelectionAlgorithm $advancedSettings.simSelectionAlgorithm
			#end if
			#if str($advancedSettings.simMassFilter) not in ("", "None")
				-simMassFilter "$advancedSettings.simMassFilter"
			#end if
			#if str($advancedSettings.simMembershipThreshold) not in ("", "None")
				-simMembershipThreshold $advancedSettings.simMembershipThreshold
			#end if
			#if str($advancedSettings.simSaturationThreshold) not in ("", "None")
				-simSaturationThreshold $advancedSettings.simSaturationThreshold
			#end if
			#if str($advancedSettings.simAbsenseThreshold) not in ("", "None")
				-simAbsenseThreshold $advancedSettings.simAbsenseThreshold
			#end if
			#if str($advancedSettings.micMembershipThreshold) not in ("", "None")
				-micMembershipThreshold $advancedSettings.micMembershipThreshold
			#end if
			-peakIntensityCorrectionAlgorithm $advancedSettings.peakIntensityCorrectionAlgorithm
		#else
			## defaults used when the advanced settings section is switched off
			-advancedSettings YES
			-sampleSelectionSortType SIM_INTENSITY
			-peakIntensityCorrectionAlgorithm CORRELATION_BASED
		#end if

	</command>
	<inputs>

		<!-- Input data and how missing values in it were imputed -->
		<param name="inputPeaks" type="data" format="txt" label="Ion-wise aligned data (e.g. MetAlign output data)" />
		<param name="dataType" type="select" size="30" label="Data type">
			<option value="gcms" selected="true">GC-MS</option>
			<option value="lcms">LC-MS</option>
		</param>
		<conditional name="imputationMethod">
			<param name="type" type="select" size="30"
			       label="Select the approach used for imputing missing values (optional)"
			       help="select how you generated the values to fill in the data gaps">
				<option value="none">none</option>
				<option value="metot" selected="true">MeTot</option>
				<option value="valueRange">Values range</option>
			</param>
			<when value="valueRange">
				<param name="rangeUpperLimit" type="integer" size="10" value="0"
				       label="Range upper limit"
				       help="values up to this limit will be considered 'generated' values" />
			</when>
			<!-- no extra parameters for these two choices -->
			<when value="metot"/>
			<when value="none"/>
		</conditional>

		<!-- Potential density (PD) function: neighborhood size, peak width and correlation tolerances -->
		<conditional name="potDensFuncType">
			<param name="type" type="select" size="30" label="Select PD function type =====================================================">
				<option value="original" selected="true">Original</option>
			</param>
			<when value="original">
				<param name="pdf_neighborhoodWindowSize" type="integer" size="10" value="200" label="Effective Peaks" />
				<!-- the peak width is entered either in scans or in micro minutes -->
				<conditional name="rt_dist_tol_unit">
					<param name="type" type="select" size="30" label="Peak time unit">
						<option value="1" selected="true">scan nr</option>
						<option value="2">(average) micro minutes</option>
					</param>
					<when value="1">
						<param name="pdf_rt_toler" type="float" size="10" value="10" label="Peak Width, in scans" />
					</when>
					<when value="2">
						<param name="pdf_rt_toler" type="float" size="10" value="100000"
						       label="Peak Width, in micro minutes"
						       help="e.g. 100,000=6 seconds" />
					</when>
				</conditional>
				<param name="pdf_scan_conf" type="float" size="10" value="80"
				       label="Peak Width confidence (0.0 to 99.99)"
				       help="example: 0[no confidence]...50[good guess]...99.9[quite certain])" />
				<param name="pdf_pears_treshold" type="float" size="10" value="0.8" label="Correlation threshold (0.0 - 1.0)" />
				<param name="pdf_pears_conf" type="float" size="10" value="98.0"
				       label="Correlation threshold confidence (0.0 to 99.99)"
				       help="example: 0[no confidence]...50[good guess]...99.9[quite certain])" />
			</when>
		</conditional>

		<!-- Selection of the initial cluster centers -->
		<conditional name="centerSelectionType">
			<param name="type" type="select" label="Initial Centers selection type ==================================================">
				<option value="original" selected="true">Original - Subtractive potential reductions with stop criterion and REUSE tolerances (from PD function)</option>
			</param>
			<when value="original">
				<param name="cs_pears_pd_reductionTreshold" type="float" size="10" value="0.8" label="Potential Density reduction (0.0 - 1.0)" />
				<param name="cs_pears_pd_reductionSlope" type="float" size="10" value="0.01" label="Potential Density reduction softness " />
				<param name="cs_stop_criterion" type="float" size="10" value="2" label="Stop Criterion " />
			</when>
		</conditional>

		<!-- Classification of the peaks into clusters -->
		<conditional name="clusteringType">
			<param name="type" type="select" label="Classify using ===========================================================">
				<option value="original" selected="true">Original - Fuzzy clustering, keep original centers and REUSE (scan distance) tolerances</option>
				<option value="fuzzyCMeans">(experimental) Fuzzy C-Means - Fuzzy clustering, optimize centers</option>
			</param>
			<when value="original">
				<!-- NOTE(review): the default is empty, so the user has to type a cutoff; confirm this is intended -->
				<param name="clust_membership_cutoff" type="float" size="10" value=""
				       label="Membership cutoff (0.0 - 1.0)"
				       help="Items with membership below this value are NOT added to the cluster"/>
			</when>
			<!-- one idea would be to have clustering specific tolerance values, not reusing the centrotype selection ones
			<when value="originalNewTol">
			  <param name="clust_scan_toler" type="float" size="10" value="10" label="Peak Width, in scans"  />
			  <param name="clust_scan_slope" type="float" size="10" value="2" label="Peak Width margin softness"  />
			</when>
			-->
			<when value="fuzzyCMeans">
				<param name="fcmMembershipWeightingExponent" type="float" size="10" value="2.0"
				       label="Membership Weighting Exponent"
				       help="Influences cluster center repositioning in the iterations 1.1 (exploratory) to around 3.0 (conservative)" />
				<param name="fcmStopCriterion" type="float" size="10" value="0.05"
				       label="Stop Criterion"
				       help="When convergence is 'reached' (e.g. 0.05 means memberships only changed with 5% in last iteration)" />
				<param name="fcmCorrelationWeight" type="float" size="10" value="2"
				       label="Correlation weight factor"
				       help="Increase this if you think the correlation is reliable (e.g. you have a high number of samples)" />
				<conditional name="finalClusterAssembly">
					<param name="type" type="select" label="Final cluster assembly">
						<option value="original" selected="true">Original - distance based</option>
						<option value="membershipBased">Membership based</option>
					</param>
					<when value="membershipBased">
						<param name="fcmMembershipCutoff" type="select" label="Maximum allowed peak overlap">
							<option value="0.05">~7 clusters</option>
							<option value="0.10">~5 clusters</option>
							<option value="0.20">~3 clusters</option>
						</param>
					</when>
					<when value="original">
						<!-- nothing -->
					</when>
				</conditional>
			</when>
		</conditional>

		<!-- Controls whether the HTML report output is generated -->
		<param name="summaryReport" type="boolean" checked="true"
		       label="Generate summary report"
		       help="NB: this will increase the processing time (in some cases up to a few extra minutes)"/>

		<!-- Optional/experimental settings, only shown when the switch below is set to Yes -->
		<conditional name="advancedSettings">
			<param name="settings" type="boolean" truevalue="Yes" falsevalue="No" checked="false" label="Advanced settings ========================================================"/>
			<when value="Yes">
				<param name="saturationLimit" optional="true" type="integer" size="10"
				       label="Saturation limit (optional)"
				       help="fill in if you have saturation problems in your data" />
				<param name="sampleSelectionSortType" type="select"
				       label="Sample selection scheme for spectrum peak intensity correction algorithm (optional/experimental)"
				       help="The intensity values to use to select the samples for each cluster/metabolite in which it is most intense/abundant. These samples are used in the peak intensity correction (see parameter below). Use this option to try to avoid samples that have insufficient signal or saturation.">
					<option value="None">None</option>
					<!-- in order of best FORWARD scoring when tested on /test/data/report_test_sets/(P2) Relative peak heights in spectra/Input (Test set 1) -->
					<option value="SIM_INTENSITY" selected="true">SIM intensities</option>
					<option value="MAX_INTENSITY">Maximum intensities</option>
					<option value="CENTROTYPE_INTENSITY">Centrotype peak intensities</option>
					<option value="MIC_INTENSITY">MIC intensities</option>
				</param>
				<param name="peakIntensityCorrectionAlgorithm" type="select"
				       label="Spectrum peak intensity correction algorithm (optional/experimental)"
				       help="Whether spectrum peak heights should be adjusted according to their membership to the cluster or to their correlation to the cluster's centrotype ion">
					<option value="MEMBERSHIP_BASED">Membership based (msclust 1.0 mode)</option>
					<option value="CORRELATION_BASED" selected="true">Correlation based</option>
				</param>
				<param name="simSelectionAlgorithm" type="select"
				       label="SIM selection algorithm (experimental)"
				       help="Set this if you want to deviate from the standard which is: allow shared SIM peaks for GC-MS data, and force unique SIM peaks for LC-MS data">
					<option value="" selected="true"></option>
					<option value="uniqueSIM">Unique SIM peak</option>
					<option value="sharedSIM">Shared SIM peak</option>
				</param>
				<param name="simMassFilter" type="text" optional="true" size="30"
				       label="SIM mass exclusion list"
				       help="Comma-separated list of masses NOT to use as SIM peaks. E.g. '73,147,...' " />
				<param name="simMembershipThreshold" optional="true" type="float" size="10"
				       label="SIM membership threshold"
				       help="Minimum membership a peak should have to qualify as a SIM candidate. E.g. 0.8 " />
				<param name="simSaturationThreshold" optional="true" type="float" size="10"
				       label="SIM saturation threshold (%)"
				       help="Maximum % of samples in which a SIM candidate peak may be saturated. If the candidate peak exceeds this threshold, then another peak is chosen. If no peak can be found meeting this criteria, mass 0 is reported" />
				<param name="simAbsenseThreshold" optional="true" type="float" size="10"
				       label="SIM absence threshold (%)"
				       help="Maximum % of samples in which a SIM candidate peak may be absent. If the candidate peak exceeds this threshold, then another peak is chosen. If no peak can be found meeting this criteria, mass 0 is reported" />

				<param name="micMembershipThreshold" optional="true" type="float" size="10"
				       label="MIC membership threshold"
				       help="Minimum membership a peak should have to be counted in the MIC sum. E.g. 0.8 " />

			</when>
			<when value="No">
			</when>
		</conditional>


	</inputs>
	<outputs>
		<!-- Result files written by MsClust; each name matches the output argument of the same name in the command -->
		<data name="centrotypesOut"
		      format="msclust.csv"
		      label="${tool.name} on ${on_string} - centrotypes file"/>
		<data name="simOut"
		      format="msclust.csv"
		      label="${tool.name} on ${on_string} - SIM file"/>
		<data name="micOut"
		      format="msclust.csv"
		      label="${tool.name} on ${on_string} - MIC file"/>
		<data name="mspOut"
		      format="msp"
		      label="${tool.name} on ${on_string} - SPECTRA file"/>
		<data name="classOut"
		      format="msclust.csv"
		      label="${tool.name} on ${on_string} - Classification file"/>
		<data name="htmlReportFile"
		      format="html"
		      label="${tool.name} on ${on_string} - HTML report">
			<!-- The report is only created when this expression is true, i.e. when the summaryReport input is checked -->
			<filter>( summaryReport == True )</filter>
		</data>
	</outputs>
	<tests>
	  <!-- TODO: find out how to use the tests section and add functional tests for this tool -->
	</tests>
  <help>

<!-- see also http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#hyperlink-targets -->
  
.. class:: infomark
  
This tool extracts spectra from ion-wise aligned MS(/MS) results. It uses expression profiles and 
retention times of the putative ions to cluster them. Each cluster is then used to generate 
one spectrum containing the clustered ions (peaks). 

.. image:: msclust_summary.png 


-----

**Input**

The input file should contain the following columns (in this order), followed by the sample intensity columns (one column with the
intensity value for each sample):

*ScanNR*
	
*Ret(umin)*
	
*Mass(uD)*
	
*(Optional)retentionMean*

*(only required if retentionMean is present)retentionSD*

*N sample intensity columns...*


-----

**Output**

This tool returns a number of output files and a small report. 

**Parameters index**


*Select the approach used for imputing missing values:* only select this if you have used a specific method to 
fill in the data gaps in the input file. One example is replacing zero values by some randomly generated low value.
If MeTot is chosen, then a value is considered generated if: the value contains a dot '.' and some number 
other than 0 (zero) after the dot. 

*Effective Peaks:* Neighborhood window size to consider when calculating density. Smaller values increase 
performance but are less reliable.

*Peak Width, in scans:* Scan window width of scans to consider 'close'. One can see this as the 
'tolerated variation in scans' for the apex positions of the fragment peaks composing a cluster. 
Note: if MetAlign was used, this is the variation *after* pre-processing by MetAlign.   

*Peak Width confidence:* The higher the confidence, the stricter the threshold.

*Correlation threshold (0.0 - 1.0):* Tolerance center for pearson distance calculation. The higher this value, 
the higher the correlation between 2 items has to be for them to be considered 'close'. 

*Correlation threshold confidence:* The higher the confidence, the stricter the threshold. `More...`__

*Potential Density reduction (0.0 - 1.0):* Reduction tolerance center for pearson distance calculation. 
The higher this value, the less the low correlated items get reduced, getting a chance to form a cluster of their own. 

*Potential Density reduction softness:* Reduction curve slope for pearson distance tolerance. Lower 
values = stricter separation at the value determined in 'Potential Density reduction' above  
(TODO review this comment). 

*Stop Criterion:* When to stop reducing and looking for new clusters. Lower values = more iterations 

.. __: javascript:window.open('.. image:: confidence_and_slope_params_explain.png'.replace('.. image:: ', ''),'popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')


-----

**Output files described below**

-----

*SPECTRA:* this file can be submitted to NIST for identification of the spectra.

`Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation`_  

.. _Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation: javascript:window.open('.. image:: sample_sel_and_peak_height_correction.png'.replace('.. image:: ', ''),'popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')

-----

*MIC:* stands for Measured Ions Count -> it contains, for each cluster, the sum of the ion count 
values (corrected by their membership) for all MEASURED cluster ions in the given sample.

The MIC for a **cluster i** in **sample s**, where **cluster i** has **n** members, is thus: 

sum over all members j = 1..n of ( [intensity of member j in **sample s**] x [membership value of member j in **cluster i** ] )

-----

*SIM:* stands for Selective Ion Mode ->  it contains, for each cluster, the intensity values of the 
most representative member ion peak of this cluster. The most representative member peak is the one with the 
highest membership*average_intensity. This definition leads to conflicts as a peak can have a 
membership in two or more clusters. The assignment of a SIM peak to a cluster depends on 
the configured data type (LC or GC-MS). NB: this can be overruled in the "advanced settings":

(1) LC-MS SIM: select SIM peak only once and for the centrotype in which this specific mass has its 
highest membership; for neighboring centrotypes use its "second best SIM", etcetera. In other words,
if the SIM peak has been identified as the SIM in more than 1 cluster, assign as SIM to the cluster 
with highest membership. Continue searching for other SIM peaks to assign to the other clusters until 
all ambiguities are solved.

(2) GC-MS SIM: the SIM peak can be "shared" by multiple clusters. However, the intensity values are corrected
by the membership value of the peak in the cluster in case the SIM peak is "shared". If the SIM peak is not 
"shared" then the "raw" intensity values of the SIM peak are recorded in the SIM file. 

`Click here for more details on the SIM output file`_  

.. _Click here for more details on the SIM output file: javascript:window.open('.. image:: sample_SIM.png'.replace('.. image:: ', ''),'popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')


**References**

If you use this Galaxy tool in work leading to a scientific publication please
cite the following papers:

Y. M. Tikunov, S. Laptenok, R. D. Hall, A. Bovy, and R. C. H. de Vos (2012).
MSClust: a tool for unsupervised mass spectra extraction of 
chromatography-mass spectrometry ion-wise aligned data
http://dx.doi.org/10.1007%2Fs11306-011-0368-2

  </help>
</tool>