diff lcmsmatching.xml @ 5:fb9c0409d85c draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 608d9e59a0d2dcf85a037968ddb2c61137fb9bce
author prog
date Wed, 19 Apr 2017 10:00:05 -0400
parents b34c14151f25
children f86fec07f392
line wrap: on
line diff
--- a/lcmsmatching.xml	Tue Mar 14 12:40:22 2017 -0400
+++ b/lcmsmatching.xml	Wed Apr 19 10:00:05 2017 -0400
@@ -1,33 +1,43 @@
-<tool id="lcmsmatching" name="LC/MS matching" version="3.2.0" profile="16.01">
+<tool id="lcmsmatching" name="LC/MS matching" version="3.3.1" profile="16.01">
 
 	<description>Annotation of MS peaks using matching on a spectra database.</description>
 
 	<requirements>
+		<!--<requirement type="package" version="3.3.3">r</requirement>-->
+		<requirement type="package" version="7.0">readline</requirement> <!-- Try readline 7.0 -->
 		<requirement type="package" version="1.20.0">r-getopt</requirement>
 		<requirement type="package" version="1.0.0">r-stringr</requirement>
 		<requirement type="package" version="1.8.3">r-plyr</requirement>
 		<requirement type="package" version="3.98">r-xml</requirement>
 		<requirement type="package" version="1.0_6">r-bitops</requirement>
 		<requirement type="package" version="1.95">r-rcurl</requirement>
-		<requirement type="package" version="1.3">r-rjsonio</requirement>
+		<requirement type="package" version="1.1">r-jsonlite</requirement>
 	</requirements>
 
 	<code file="list-chrom-cols.py"/>
+	<code file="list-file-cols.py"/>
+	<code file="list-ms-mode-values.py"/>
 
-	<!--~~~~~~~
-	~ COMMAND ~
-	~~~~~~~~-->
+	<!--=======
+	= COMMAND =
+	========-->
 
 	<command>
 		<![CDATA[
 		## @@@BEGIN_CHEETAH@@@
-		$__tool_directory__/search-mz -i "$mzrtinput"
+		$__tool_directory__/search-mz
+
+		## Input file
+		-i "$mzrtinput"
+		--input-col-names "mz=$inputmzfield,rt=$inputrtfield"
+		--rtunit "$inputrtunit"
 
 		## Database
 		#if $db.dbtype == "inhouse"
 			-d file
-			--db-fields "$db.dbfields"
-			--db-ms-modes "$db.dbmsmodes"
+			--db-fields "mztheo=$db.dbmzreffield,chromcolrt=$db.dbchromcolrtfield,compoundid=$db.dbspectrumidfield,chromcol=$db.dbchromcolfield,msmode=$db.dbmsmodefield,peakattr=$db.dbpeakattrfield,pubchemcompid=$db.dbpubchemcompidfield,chebiid=$db.dbchebiidfield,hmdbid=$db.dbhmdbidfield,keggid=$db.dbkeggidfield"
+			--db-ms-modes "pos=$db.dbmsposmode,neg=$db.dbmsnegmode"
+			--db-rt-unit "$db.dbrtunit"
 		#end if
 		#if $db.dbtype == "peakforest"
 			-d peakforest
@@ -57,22 +67,14 @@
 		## HTML output 
 		--html-output-file "$htmloutput" --no-main-table-in-html-output
 
-		## Fields of input file
-		--input-col-names "$inputfields"
-
 		## Ouput setting
-		#if $out.enabled == "true"
-			--output-col-names "$out.outputfields"
-			--molids-sep "$out.molidssep"
-		#else
-			--molids-sep "|"
-		#end if
+		--molids-sep "$molidssep"
 		## @@@END_CHEETAH@@@
 	]]></command>
 
-	<!--~~~~~~
-	~ INPUTS ~
-	~~~~~~~-->
+	<!--======
+	= INPUTS =
+	=======-->
 
 	<inputs>
 
@@ -90,10 +92,26 @@
 				<param name="dburl" label="Database file" type="data" format="tabular,tsv" refresh_on_change="true" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. Retention time values must be in seconds."/>
 
 				<!-- File database field names -->
-				<param name="dbfields" label="File database column names" type="text" size="256" value="mztheo=mztheo,chromcolrt=chromcolrt,compoundid=compoundid,chromcol=chromcol,msmode=msmode,peakattr=peakattr,peakcomp=peakcomp,fullnames=fullnames,compoundmass=compoundmass,compoundcomp=compoundcomp,inchi=inchi,inchikey=inchikey,pubchemcompid=pubchemcompid,chebiid=chebiid,hmdbid=hmdbid,keggid=keggid" refresh_on_change="true" help=""/>
+				<param name="dbspectrumidfield" type="select" label="Database file Spectrum ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'spectrumid,accession,compoundid,molid')" help="Select the Spectrum ID column of the database file."/>
+				<param name="dbmzreffield" type="select" label="Database file Reference MZ column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'mztheo,mzexp,mz')" help="Select the Reference MZ column of the database file."/>
+				<param name="dbchromcolfield" type="select" label="Database file Chromatographic Column Name column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chromcol,col')" help="Select the Chromatographic Column Name column of the database file." refresh_on_change="true"/>
+				<param name="dbchromcolrtfield" type="select" label="Database file Chromatographic Column Retention Time column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chromcolrt,colrt,rt')" help="Select the Chromatographic Column Retention Time column of the database file."/>
+				<param name="dbmsmodefield" type="select" label="Database file MS Mode column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'msmode,mode')" help="Select the MS Mode column of the database file." refresh_on_change="true"/>
+				<param name="dbpeakattrfield" type="select" label="Database file Peak Attribution column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'peakattr,attr')" help="Select the Peak Attribution column of the database file."/>
+				<param name="dbpubchemcompidfield" type="select" label="Database file PubChem Compound ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'pubchemcompid,pubchemid,pubchemcomp,pubchem')" help="Select the PubChem Compound ID column of the database file."/>
+				<param name="dbchebiidfield" type="select" label="Database file ChEBI ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chebiid,chebi')" help="Select the ChEBI ID column of the database file."/>
+				<param name="dbhmdbidfield" type="select" label="Database file HMDB Metabolite ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'hmdbid,hmdb')" help="Select the HMDB Metabolite ID column of the database file."/>
+				<param name="dbkeggidfield" type="select" label="Database file KEGG Compound ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'keggid,kegg')" help="Select the KEGG Compound ID column of the database file."/>
 
 				<!-- File database MS modes -->
-				<param name="dbmsmodes" label="File database MS modes" type="text" size="32" value="pos=POS,neg=NEG" help=""/>
+				<param name="dbmsposmode" label="File database MS Positive mode" type="select" dynamic_options="get_ms_mode_value(file = db['dburl'], col = db['dbmsmodefield'], preferred = 'POS,pos,+')" help="Select the value used to identify the positive MS mode."/>
+				<param name="dbmsnegmode" label="File database MS Negative mode" type="select" dynamic_options="get_ms_mode_value(file = db['dburl'], col = db['dbmsmodefield'], preferred = 'NEG,neg,-')" help="Select the value used to identify the negative MS mode."/>
+
+				<!-- File database RT unit -->
+				<param name="dbrtunit" label="Retention time unit" type="select" display="radio" multiple="false" help="">
+					<option value="sec">Seconds</option>
+					<option value="min">Minutes</option>
+				</param>
 
 				<param name="dbtoken" type="text" size="32" value="" hidden="true"/>
 			</when>
@@ -103,17 +121,24 @@
 
 				<param name="dbtoken" label="Peakforest security token" type="text" size="32" value="" refresh_on_change="true" help="If you do not have yet a Peakforest token, go to Peakforest website and request one from your account."/>
 
-				<param name="dbfields" type="text" size="32" value="" hidden="true"/>
+				<param name="dbchromcolfield" type="text" size="32" value="" hidden="true"/>
 			</when>
 		</conditional>
 
 		<!-- INPUT -->
 
 			<!-- Input file -->
-			<param name="mzrtinput" label="Input file - MZ(/RT) values" type="data" format="tabular,tsv" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. RT values must be in seconds."/>
+			<param name="mzrtinput" label="Input file - MZ(/RT) values" type="data" format="tabular,tsv" refresh_on_change="true" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. RT values may be in seconds or minutes; select the unit with the 'Retention time unit' field below."/>
+
+			<!-- Input file field names -->
+			<param name="inputmzfield" type="select" label="Input file MZ column name" dynamic_options="get_file_cols(file = mzrtinput, preferred = 'mzmed,mz')" help="Select the MZ column of the input file."/>
+			<param name="inputrtfield" type="select" label="Input file RT column name" dynamic_options="get_file_cols(file = mzrtinput, preferred = 'rtmed,rt')" help="Select the RT column of the input file."/>
 
-			<!-- Input field names -->
-			<param name="inputfields" label="Input file column names" type="text" size="32" value="mz=mzmed,rt=rtmed" help=""/>
+			<!-- Input file RT unit -->
+			<param name="inputrtunit" label="Retention time unit" type="select" display="radio" multiple="false" help="">
+				<option value="sec">Seconds</option>
+				<option value="min">Minutes</option>
+			</param>
 
 		<!-- M/Z MATCHING -->
 
@@ -130,7 +155,7 @@
 		<!-- RETENTION TIME PARAMETERS -->
 
 			<!-- List of chromatographic columns -->
-			<param name="chromcols" type="select" label="Chromatographic columns" multiple="true" dynamic_options="get_chrom_cols(dbtype = db['dbtype'], dburl = db['dburl'], dbtoken = db['dbtoken'], dbfields = db['dbfields'])" help="Select here the set of chromatographic columns against which the retention time matching will be run."/>
+			<param name="chromcols" type="select" label="Chromatographic columns" multiple="true" dynamic_options="get_chrom_cols(dbtype = db['dbtype'], dburl = db['dburl'], dbtoken = db['dbtoken'], col_field = db['dbchromcolfield'])" help="Select here the set of chromatographic columns against which the retention time matching will be run."/>
 
 			<!-- Tolerances -->
 			<param name="tolx" label="RTX retention time tolerance, parameter x (in seconds)" type="float" help="" value="5"/>
@@ -174,38 +199,23 @@
 		</conditional>
 
 		<!-- OUTPUT -->
-		<conditional name="out">
-
-			<param name="enabled" label="Output settings" type="select">
-				<option value="false">Default</option>
-				<option value="true">Customized</option>
-			</param>
-
-			<when value="false"></when>
-			<when value="true">
-
-				<!-- Output field names -->
-				<param name="outputfields" label="Output column names" type="text" size="256" value="mz=mz,rt=rt,chromcol=chromcol,chromcolrt=chromcolrt,compoundid=compoundid,peakattr=peakattr,peakcomp=peakcomp,intensity=intensity,relative.intensity=relative.intensity,mzexp=mzexp,mztheo=mztheo,fullnames=fullnames,compoundmass=compoundmass,compoundcomp=compoundcomp,inchi=inchi,inchikey=inchikey,pubchemcompid=pubchemcompid,chebiid=chebiid,hmdbid=hmdbid,keggid=keggid" help=""/>
-
-				<!-- Molecule IDs separator character -->
-				<param name="molidssep" label="Molecule IDs separator character" type="text" size="3" value="|" help="">
-					<sanitizer>
-						<valid initial="string.printable">
-							<remove value='"'/>
-						</valid>
-						<mapping initial="none">
-							<add source='"' target='\"'/>
-						</mapping>
-					</sanitizer>
-				</param>
-			</when>
-		</conditional>
+		<!-- Molecule IDs separator character -->
+		<param name="molidssep" label="Molecule IDs separator character" type="text" size="3" value="|" help="">
+			<sanitizer>
+				<valid initial="string.printable">
+					<remove value='"'/>
+				</valid>
+				<mapping initial="none">
+					<add source='"' target='\"'/>
+				</mapping>
+			</sanitizer>
+		</param>
 
 	</inputs>
 
-	<!--~~~~~~~
-	~ OUTPUTS ~
-	~~~~~~~~-->
+	<!--=======
+	= OUTPUTS =
+	========-->
 
 	<outputs>
 
@@ -216,9 +226,9 @@
 
 	</outputs>
 
-	<!--~~~~~
-	~ TESTS ~
-	~~~~~~-->
+	<!--=====
+	= TESTS =
+	======-->
 
 	<tests>
 
@@ -229,7 +239,8 @@
 			<param name="dbfields" value=""/>
 			<param name="dbmsmodes" value=""/>
 			<param name="mzrtinput" value="mz-input-small.tsv"/>
-			<param name="inputfields" value=""/>
+			<param name="inputmzfield" value="mzmed"/>
+			<param name="inputrtfield" value="rtmed"/>
 			<param name="mzmode" value="pos"/>
 			<output name="mainoutput" file="filedb-small-mz-match-output.tsv"/>
 			<output name="peaksoutput" file="filedb-small-mz-match-peaks-output.tsv"/>
@@ -253,9 +264,9 @@
 -->
 	</tests>
 
-	<!--~~~~
-	~ HELP ~
-	~~~~~-->
+	<!--====
+	= HELP =
+	=====-->
 
 	<help>
 <!-- @@@BEGIN_RST@@@ -->
@@ -272,7 +283,7 @@
 
 When selecting the database, you have the choice between a Peakforest database or an in-house file.
 
-For the Peakforest database, a default REST web base address is already provided. But you can change it of you want to use a custom database. A field is also available for setting a token key in case the access to the Peakforest database you want to use is restricted. This is the case of the default database.
+For the Peakforest database, a default REST web base address is already provided. But you can change it to use a custom database. A field is also available for setting a token key in case the access to the Peakforest database you want to use is restricted. This is the case of the default database URL.
 
 For the in-house file, please refer to the paragraph "Single file database" below.
 
@@ -285,50 +296,13 @@
 Single file database
 ====================
 
-The database used is provided as a single file, in tabular format, through the *Database file* field. This file contains a list of MS peaks, with retention times.
-Peaks are "duplicated" as much as necessary. For instance if 3 retention times are available on a compound with 10 peaks in positive mode, then there will be 30 lines for this compounds in positive mode.
-
-The file must contain a header with the column names. The names are free, but must be provided through the *File database column names* field.
-In this field, each column is identified with a tag, and the columns names are listed as a comma separated list of tag/name couples (separated by character `=`). The allowed tags are the following ones:
+The database used is provided as a single file, in tabular format, through the *Database file* field. This file must contain a list of MS peaks, optionally with retention times.
+Peaks are "duplicated" as much as necessary. For instance if 3 retention times are available on a compound with 10 peaks in positive mode, then there will be 30 lines for this compound in positive mode.
 
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-| Column tag   | Compulsory | Values                                                                                                     |
-+==============+============+============================================================================================================+
-|    mztheo    |    Yes     | The m/z values.                                                                                            |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|    mode      |    Yes     | The MS mode.                                                                                               |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|     molid    |    Yes     | This is the identifier of your compound.                                                                   |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|     colrt    |    No      | The retention time values in seconds.                                                                      |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|      col     |    No      | The chromatographic column associated with the retention time. Compulsory if retention times are provided. |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|     attr     |    No      | The attribution of the peak (e.g.: ``[(M+H)-(H2O)-(NH3)]+``).                                              |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|     comp     |    No      | The composition of the peak (e.g.: ``C6 H10 N O``).                                                        |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|   molcomp    |    No      | The composition of the molecule. (e.g.: ``C6H14N2O2``).                                                    |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|   molmass    |    No      | The mass of the molecule.                                                                                  |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|   molnames   |    No      | The names of the molecule, as a semicolon separated list.                                                  |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|     inchi    |    No      | The InChI of the molecule.                                                                                 |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|    inchikey  |    No      | The InChI key of the molecule.                                                                             |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|   pubchem    |    No      | The PubChem ID of the molecule.                                                                            |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|   chebi      |    No      | The ChEBI ID of the molecule.                                                                              |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|    hmdb      |    No      | The HMDB ID of the molecule.                                                                               |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
-|    kegg      |    No      | The KEGG ID of the molecule.                                                                               |
-+--------------+------------+------------------------------------------------------------------------------------------------------------+
+The file must contain a header with the column names. The names are free, but must be provided through the different fields named *Database file ... column name*.
+Then you must provide the values used to identify the MS modes (positive and negative).
 
-The field *File database MS modes* allows you to personalize the MS mode identifiers. The value of the field is a comma separated list of mode/name couples (separated by character `=`)..
-For instance, if in your database file you use characters '+' and '-' to identify the modes, then you must set the field to `pos=+,neg=-`.
+Finally, you must specify the unit of the retention times stored in the database file: either seconds or minutes.
 
 Example of database file (totally fake, no meaning):
 
@@ -361,24 +335,12 @@
 MZ/RT input file
 ================
 
-The input to provide is a file, in a tabular format (or TSV: Tab Seperated Values), containing the list of MZ/RT values.
-
-The following columns will be used:
+The input to provide is a file, in a tabular format (or TSV: Tab Separated Values), containing the list of M/Z values, optionally with RT values.
 
-+--------------+------------+---------------------------------------+
-| Column tag   | Compulsory | Values                                |
-+==============+============+=======================================+
-|      mz      |    Yes     | The m/z values.                       |
-+--------------+------------+---------------------------------------+
-|      rt      |    No      | The retention time values in seconds. |
-+--------------+------------+---------------------------------------+
+The column names for the M/Z and RT values must be provided through the fields *Input file MZ column name* and *Input file RT column name*.
+As a consequence, the file must contain a header line.
 
-The file may contain a header line, in which case you have to provide the column names through the *Input file column names* field, which consists in a comma separated list of tag/name couples (separated by character `=`). If your file does not contain a header line, then you must provide the column numbers. Examples:
-
- * With a header line having name MASS for mz column and RET for rt column: `mz=MASS,rt=RET`.
- * With no header line: `mz=1,rt=2`.
-
-Since the MS spectrum mode can not be known from the file, an *MS mode* radio button field is provided for setting the mode.
+The unit of the retention time has to be provided with the field *Retention time unit*.
 
 Example of file input:
 
@@ -408,15 +370,15 @@
 
 The parameters *M/Z precision* and *M/Z shift* are used by the algorithm in the following formula in order to match an *m/z* value:
 
-	mz (1 + (- shift - precision) / 10^6) &lt; mztheo &lt; mz (1 + (- shift - precision) / 10^6)
+	mz (1 + (- shift - precision) / 10^6) &lt; mzref &lt; mz (1 + (- shift + precision) / 10^6)
 
-Where *mztheo* is the theoretical mass of the database peak that is tested. If this double inequality is true, then the *m/z* value is matched with this peak.
+Where *mzref* is the reference M/Z of the database peak that is tested. If this double inequality is true, then the *m/z* value is matched with this peak.
 
 --------------------
 Retention time match
 --------------------
 
-If at least one column is checked inside the *Columns* parameter section, then retention time is also matched, in addition to the *m/z* value, according to the following formula:
+If at least one column is selected inside the *Chromatographic columns* parameter section, then retention time is also matched, in addition to the *m/z* value, according to the following formula:
 
 	rt - x - rt^y &lt; colrt &lt; rt + x + rt^y
 
@@ -452,48 +414,6 @@
 Output settings
 ---------------
 
-The *Output column names* parameter is used to customize the columns of the output files. As with the *File database column names* parameter, each column is identified with a tag, and the columns names are listed as a comma separated list of tag/name couples (separated by character `=`). The allowed tags are the following ones:
-
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-| Column tag   | Values                                                                                                                          |
-+==============+=================================================================================================================================+
-|      mz      | The m/z values from the input file.                                                                                             |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|   mztheo     | The m/z values from the database.                                                                                               |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|     molid    | This is the identifier of your compound.                                                                                        |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|      rt      | The retention time values in seconds from the input file.                                                                       |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|      col     | The chromatographic column associated with the retention time.                                                                  |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|     colrt    | The retention time associated with the matched chromatographic column.                                                          |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|  msmatching  | The list IDs of matched molecules. IDs are separated by the character specified in the *Molecule IDs separator character* field |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|     attr     | The attribution of the peak (e.g.: ``[(M+H)-(H2O)-(NH3)]+``).                                                                   |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|     comp     | The composition of the peak (e.g.: ``C6 H10 N O``).                                                                             |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|   molcomp    | The composition of the molecule. (e.g.: ``C6H14N2O2``).                                                                         |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|   molmass    | The mass of the molecule.                                                                                                       |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|   molnames   | The names of the molecule, as a semicolon separated list.                                                                       |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|     inchi    | The InChI of the molecule.                                                                                                      |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|    inchikey  | The InChI key of the molecule.                                                                                                  |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|   pubchem    | The PubChem ID of the molecule.                                                                                                 |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|   chebi      | The ChEBI ID of the molecule.                                                                                                   |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|    hmdb      | The HMDB ID of the molecule.                                                                                                    |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-|    kegg      | The KEGG ID of the molecule.                                                                                                    |
-+--------------+---------------------------------------------------------------------------------------------------------------------------------+
-
 The *Molecule IDs separator character* is used to customize the character used to separate the molecule IDs of the **molid** column inside the *main* output file.
 
 Output files
@@ -540,9 +460,9 @@
 <!-- @@@END_RST@@@ -->
 	</help>
 
-	<!--~~~~~~~~~
-	~ CITATIONS ~
-	~~~~~~~~~~-->
+	<!--=========
+	= CITATIONS =
+	==========-->
 
 	<citations/>