Mercurial > repos > prog > lcmsmatching
diff lcmsmatching.xml @ 5:fb9c0409d85c draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 608d9e59a0d2dcf85a037968ddb2c61137fb9bce
author | prog |
---|---|
date | Wed, 19 Apr 2017 10:00:05 -0400 |
parents | b34c14151f25 |
children | f86fec07f392 |
line wrap: on
line diff
--- a/lcmsmatching.xml Tue Mar 14 12:40:22 2017 -0400 +++ b/lcmsmatching.xml Wed Apr 19 10:00:05 2017 -0400 @@ -1,33 +1,43 @@ -<tool id="lcmsmatching" name="LC/MS matching" version="3.2.0" profile="16.01"> +<tool id="lcmsmatching" name="LC/MS matching" version="3.3.1" profile="16.01"> <description>Annotation of MS peaks using matching on a spectra database.</description> <requirements> + <!--<requirement type="package" version="3.3.3">r</requirement>--> + <requirement type="package" version="7.0">readline</requirement> <!-- Try readline 7.0 --> <requirement type="package" version="1.20.0">r-getopt</requirement> <requirement type="package" version="1.0.0">r-stringr</requirement> <requirement type="package" version="1.8.3">r-plyr</requirement> <requirement type="package" version="3.98">r-xml</requirement> <requirement type="package" version="1.0_6">r-bitops</requirement> <requirement type="package" version="1.95">r-rcurl</requirement> - <requirement type="package" version="1.3">r-rjsonio</requirement> + <requirement type="package" version="1.1">r-jsonlite</requirement> </requirements> <code file="list-chrom-cols.py"/> + <code file="list-file-cols.py"/> + <code file="list-ms-mode-values.py"/> - <!--~~~~~~~ - ~ COMMAND ~ - ~~~~~~~~--> + <!--======= + = COMMAND = + ========--> <command> <![CDATA[ ## @@@BEGIN_CHEETAH@@@ - $__tool_directory__/search-mz -i "$mzrtinput" + $__tool_directory__/search-mz + + ## Input file + -i "$mzrtinput" + --input-col-names "mz=$inputmzfield,rt=$inputrtfield" + --rtunit "$inputrtunit" ## Database #if $db.dbtype == "inhouse" -d file - --db-fields "$db.dbfields" - --db-ms-modes "$db.dbmsmodes" + --db-fields "mztheo=$db.dbmzreffield,chromcolrt=$db.dbchromcolrtfield,compoundid=$db.dbspectrumidfield,chromcol=$db.dbchromcolfield,msmode=$db.dbmsmodefield,peakattr=$db.dbpeakattrfield,pubchemcompid=$db.dbpubchemcompidfield,chebiid=$db.dbchebiidfield,hmdbid=$db.dbhmdbidfield,keggid=$db.dbkeggidfield" + --db-ms-modes 
"pos=$db.dbmsposmode,neg=$db.dbmsnegmode" + --db-rt-unit $db.dbrtunit #end if #if $db.dbtype == "peakforest" -d peakforest @@ -57,22 +67,14 @@ ## HTML output --html-output-file "$htmloutput" --no-main-table-in-html-output - ## Fields of input file - --input-col-names "$inputfields" - ## Ouput setting - #if $out.enabled == "true" - --output-col-names "$out.outputfields" - --molids-sep "$out.molidssep" - #else - --molids-sep "|" - #end if + --molids-sep "$molidssep" ## @@@END_CHEETAH@@@ ]]></command> - <!--~~~~~~ - ~ INPUTS ~ - ~~~~~~~--> + <!--====== + = INPUTS = + =======--> <inputs> @@ -90,10 +92,26 @@ <param name="dburl" label="Database file" type="data" format="tabular,tsv" refresh_on_change="true" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. Retention time values must be in seconds."/> <!-- File database field names --> - <param name="dbfields" label="File database column names" type="text" size="256" value="mztheo=mztheo,chromcolrt=chromcolrt,compoundid=compoundid,chromcol=chromcol,msmode=msmode,peakattr=peakattr,peakcomp=peakcomp,fullnames=fullnames,compoundmass=compoundmass,compoundcomp=compoundcomp,inchi=inchi,inchikey=inchikey,pubchemcompid=pubchemcompid,chebiid=chebiid,hmdbid=hmdbid,keggid=keggid" refresh_on_change="true" help=""/> + <param name="dbspectrumidfield" type="select" label="Database file Spectrum ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'spectrumid,accession,compoundid,molid')" help="Select the Spectrum ID column of the database file."/> + <param name="dbmzreffield" type="select" label="Database file Reference MZ column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'mztheo,mzexp,mz')" help="Select the Reference MZ column of the database file."/> + <param name="dbchromcolfield" type="select" label="Database file Chromatographic Column Name column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chromcol,col')" help="Select the 
Chromatographic Column Name column of the database file." refresh_on_change="true"/> + <param name="dbchromcolrtfield" type="select" label="Database file Chromatographic Column Retention Time column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chromcolrt,colrt,rt')" help="Select the Chromatographic Column Retention Time column of the database file."/> + <param name="dbmsmodefield" type="select" label="Database file MS Mode column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'msmode,mode')" help="Select the MS Mode column of the database file." refresh_on_change="true"/> + <param name="dbpeakattrfield" type="select" label="Database file Peak Attribution column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'peakattr,attr')" help="Select the Peak Attribution column of the database file."/> + <param name="dbpubchemcompidfield" type="select" label="Database file PubChem Compound ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'pubchemcompid,pubchemid,pubchemcomp,pubchem')" help="Select the PubChem Compound ID column of the database file."/> + <param name="dbchebiidfield" type="select" label="Database file ChEBI ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chebiid,chebi')" help="Select the ChEBI ID column of the database file."/> + <param name="dbhmdbidfield" type="select" label="Database file HMDB Metabolite ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'hmdbid,hmdb')" help="Select the HMDB Metabolite ID column of the database file."/> + <param name="dbkeggidfield" type="select" label="Database file KEGG Compound ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'keggid,kegg')" help="Select the KEGG Compound ID column of the database file."/> <!-- File database MS modes --> - <param name="dbmsmodes" label="File database MS modes" type="text" size="32" 
value="pos=POS,neg=NEG" help=""/> + <param name="dbmsposmode" label="File database MS Positive mode" type="select" dynamic_options="get_ms_mode_value(file = db['dburl'], col = db['dbmsmodefield'], preferred = 'POS,pos,+')" help="Select the value used to identify the positive MS mode."/> + <param name="dbmsnegmode" label="File database MS Negative mode" type="select" dynamic_options="get_ms_mode_value(file = db['dburl'], col = db['dbmsmodefield'], preferred = 'NEG,neg,-')" help="Select the value used to identify the negative MS mode."/> + + <!-- File database RT unit --> + <param name="dbrtunit" label="Retention time unit" type="select" display="radio" multiple="false" help=""> + <option value="sec">Seconds</option> + <option value="min">Minutes</option> + </param> <param name="dbtoken" type="text" size="32" value="" hidden="true"/> </when> @@ -103,17 +121,24 @@ <param name="dbtoken" label="Peakforest security token" type="text" size="32" value="" refresh_on_change="true" help="If you do not have yet a Peakforest token, go to Peakforest website and request one from your account."/> - <param name="dbfields" type="text" size="32" value="" hidden="true"/> + <param name="dbchromcolfield" type="text" size="32" value="" hidden="true"/> </when> </conditional> <!-- INPUT --> <!-- Input file --> - <param name="mzrtinput" label="Input file - MZ(/RT) values" type="data" format="tabular,tsv" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. RT values must be in seconds."/> + <param name="mzrtinput" label="Input file - MZ(/RT) values" type="data" format="tabular,tsv" refresh_on_change="true" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. 
RT values must be in seconds."/> + + <!-- Input field field names --> + <param name="inputmzfield" type="select" label="Input file MZ column name" dynamic_options="get_file_cols(file = mzrtinput, preferred = 'mzmed,mz')" help="Select the MZ column of the input file."/> + <param name="inputrtfield" type="select" label="Input file RT column name" dynamic_options="get_file_cols(file = mzrtinput, preferred = 'rtmed,rt')" help="Select the RT column of the input file."/> - <!-- Input field names --> - <param name="inputfields" label="Input file column names" type="text" size="32" value="mz=mzmed,rt=rtmed" help=""/> + <!-- Input file RT unit --> + <param name="inputrtunit" label="Retention time unit" type="select" display="radio" multiple="false" help=""> + <option value="sec">Seconds</option> + <option value="min">Minutes</option> + </param> <!-- M/Z MATCHING --> @@ -130,7 +155,7 @@ <!-- RETENTION TIME PARAMETERS --> <!-- List of chromatographic columns --> - <param name="chromcols" type="select" label="Chromatographic columns" multiple="true" dynamic_options="get_chrom_cols(dbtype = db['dbtype'], dburl = db['dburl'], dbtoken = db['dbtoken'], dbfields = db['dbfields'])" help="Select here the set of chromatographic columns against which the retention time matching will be run."/> + <param name="chromcols" type="select" label="Chromatographic columns" multiple="true" dynamic_options="get_chrom_cols(dbtype = db['dbtype'], dburl = db['dburl'], dbtoken = db['dbtoken'], col_field = db['dbchromcolfield'])" help="Select here the set of chromatographic columns against which the retention time matching will be run."/> <!-- Tolerances --> <param name="tolx" label="RTX retention time tolerance, parameter x (in seconds)" type="float" help="" value="5"/> @@ -174,38 +199,23 @@ </conditional> <!-- OUTPUT --> - <conditional name="out"> - - <param name="enabled" label="Output settings" type="select"> - <option value="false">Default</option> - <option value="true">Customized</option> - 
</param> - - <when value="false"></when> - <when value="true"> - - <!-- Output field names --> - <param name="outputfields" label="Output column names" type="text" size="256" value="mz=mz,rt=rt,chromcol=chromcol,chromcolrt=chromcolrt,compoundid=compoundid,peakattr=peakattr,peakcomp=peakcomp,intensity=intensity,relative.intensity=relative.intensity,mzexp=mzexp,mztheo=mztheo,fullnames=fullnames,compoundmass=compoundmass,compoundcomp=compoundcomp,inchi=inchi,inchikey=inchikey,pubchemcompid=pubchemcompid,chebiid=chebiid,hmdbid=hmdbid,keggid=keggid" help=""/> - - <!-- Molecule IDs separator character --> - <param name="molidssep" label="Molecule IDs separator character" type="text" size="3" value="|" help=""> - <sanitizer> - <valid initial="string.printable"> - <remove value='"'/> - </valid> - <mapping initial="none"> - <add source='"' target='\"'/> - </mapping> - </sanitizer> - </param> - </when> - </conditional> + <!-- Molecule IDs separator character --> + <param name="molidssep" label="Molecule IDs separator character" type="text" size="3" value="|" help=""> + <sanitizer> + <valid initial="string.printable"> + <remove value='"'/> + </valid> + <mapping initial="none"> + <add source='"' target='\"'/> + </mapping> + </sanitizer> + </param> </inputs> - <!--~~~~~~~ - ~ OUTPUTS ~ - ~~~~~~~~--> + <!--======= + = OUTPUTS = + ========--> <outputs> @@ -216,9 +226,9 @@ </outputs> - <!--~~~~~ - ~ TESTS ~ - ~~~~~~--> + <!--===== + = TESTS = + ======--> <tests> @@ -229,7 +239,8 @@ <param name="dbfields" value=""/> <param name="dbmsmodes" value=""/> <param name="mzrtinput" value="mz-input-small.tsv"/> - <param name="inputfields" value=""/> + <param name="inputmzfield" value="mzmed"/> + <param name="inputrtfield" value="rtmed"/> <param name="mzmode" value="pos"/> <output name="mainoutput" file="filedb-small-mz-match-output.tsv"/> <output name="peaksoutput" file="filedb-small-mz-match-peaks-output.tsv"/> @@ -253,9 +264,9 @@ --> </tests> - <!--~~~~ - ~ HELP ~ - ~~~~~--> + <!--==== + 
= HELP = + =====--> <help> <!-- @@@BEGIN_RST@@@ --> @@ -272,7 +283,7 @@ When selecting the database, you have the choice between a Peakforest database or an in-house file. -For the Peakforest database, a default REST web base address is already provided. But you can change it of you want to use a custom database. A field is also available for setting a token key in case the access to the Peakforest database you want to use is restricted. This is the case of the default database. +For the Peakforest database, a default REST web base address is already provided. But you can change it to use a custom database. A field is also available for setting a token key in case the access to the Peakforest database you want to use is restricted. This is the case of the default database URL. For the in-house file, please refer to the paragraph "Single file database" below. @@ -285,50 +296,13 @@ Single file database ==================== -The database used is provided as a single file, in tabular format, through the *Database file* field. This file contains a list of MS peaks, with retention times. -Peaks are "duplicated" as much as necessary. For instance if 3 retention times are available on a compound with 10 peaks in positive mode, then there will be 30 lines for this compounds in positive mode. - -The file must contain a header with the column names. The names are free, but must be provided through the *File database column names* field. -In this field, each column is identified with a tag, and the columns names are listed as a comma separated list of tag/name couples (separated by character `=`). The allowed tags are the following ones: +The database used is provided as a single file, in tabular format, through the *Database file* field. This file must contain a list of MS peaks, with possibly retention times. +Peaks are "duplicated" as much as necessary. 
For instance if 3 retention times are available on a compound with 10 peaks in positive mode, then there will be 30 lines for this compound in positive mode. -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| Column tag | Compulsory | Values | -+==============+============+============================================================================================================+ -| mztheo | Yes | The m/z values. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| mode | Yes | The MS mode. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| molid | Yes | This is the identifier of your compound. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| colrt | No | The retention time values in seconds. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| col | No | The chromatographic column associated with the retention time. Compulsory if retention times are provided. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| attr | No | The attribution of the peak (e.g.: ``[(M+H)-(H2O)-(NH3)]+``). | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| comp | No | The composition of the peak (e.g.: ``C6 H10 N O``). | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| molcomp | No | The composition of the molecule. (e.g.: ``C6H14N2O2``). 
| -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| molmass | No | The mass of the molecule. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| molnames | No | The names of the molecule, as a semicolon separated list. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| inchi | No | The InChI of the molecule. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| inchikey | No | The InChI key of the molecule. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| pubchem | No | The PubChem ID of the molecule. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| chebi | No | The ChEBI ID of the molecule. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| hmdb | No | The HMDB ID of the molecule. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ -| kegg | No | The KEGG ID of the molecule. | -+--------------+------------+------------------------------------------------------------------------------------------------------------+ +The file must contain a header with the column names. The names are free, but must be provided through the different fields named *Database file ... column name*. +Then you must provide the values used to identify the MS modes (positive and negative). 
-The field *File database MS modes* allows you to personalize the MS mode identifiers. The value of the field is a comma separated list of mode/name couples (separated by character `=`).. -For instance, if in your database file you use characters '+' and '-' to identify the modes, then you must set the field to `pos=+,neg=-`. +The last piece of information required for the single file database is the unit of the retention times, either seconds or minutes. Example of database file (totally fake, no meaning): @@ -361,24 +335,12 @@ MZ/RT input file ================ -The input to provide is a file, in a tabular format (or TSV: Tab Seperated Values), containing the list of MZ/RT values. - -The following columns will be used: +The input to provide is a file, in a tabular format (or TSV: Tab Separated Values), containing the list of M/Z values, possibly with RT values as well. -+--------------+------------+---------------------------------------+ -| Column tag | Compulsory | Values | -+==============+============+=======================================+ -| mz | Yes | The m/z values. | -+--------------+------------+---------------------------------------+ -| rt | No | The retention time values in seconds. | -+--------------+------------+---------------------------------------+ +The column names for the M/Z and RT values must be provided through the fields *Input file MZ column name* and *Input file RT column name*. +As a consequence, the file must contain a header line. -The file may contain a header line, in which case you have to provide the column names through the *Input file column names* field, which consists in a comma separated list of tag/name couples (separated by character `=`). If your file does not contain a header line, then you must provide the column numbers. Examples: - - * With a header line having name MASS for mz column and RET for rt column: `mz=MASS,rt=RET`. - * With no header line: `mz=1,rt=2`. 
- -Since the MS spectrum mode can not be known from the file, an *MS mode* radio button field is provided for setting the mode. +The unit of the retention time has to be provided with the field *Retention time unit*. Example of file input: @@ -408,15 +370,15 @@ The parameters *M/Z precision* and *M/Z shift* are used by the algorithm in the following formula in order to match an *m/z* value: - mz (1 + (- shift - precision) / 10^6) < mztheo < mz (1 + (- shift - precision) / 10^6) + mz (1 + (- shift - precision) / 10^6) < mzref < mz (1 + (- shift + precision) / 10^6) -Where *mztheo* is the theoretical mass of the database peak that is tested. If this double inequality is true, then the *m/z* value is matched with this peak. +Where *mzref* is the M/Z of reference from the database peak that is tested. If this double inequality is true, then the *m/z* value is matched with this peak. -------------------- Retention time match -------------------- -If at least one column is checked inside the *Columns* parameter section, then retention time is also matched, in addition to the *m/z* value, according to the following formula: +If at least one column is selected inside the *Chromatographic columns* parameter section, then retention time is also matched, in addition to the *m/z* value, according to the following formula: rt - x - rt^y < colrt < rt + x + rt^y @@ -452,48 +414,6 @@ Output settings --------------- -The *Output column names* parameter is used to customize the columns of the output files. As with the *File database column names* parameter, each column is identified with a tag, and the columns names are listed as a comma separated list of tag/name couples (separated by character `=`). 
The allowed tags are the following ones: - -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| Column tag | Values | -+==============+=================================================================================================================================+ -| mz | The m/z values from the input file. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| mztheo | The m/z values from the database. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| molid | This is the identifier of your compound. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| rt | The retention time values in seconds from the input file. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| col | The chromatographic column associated with the retention time. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| colrt | The retention time associated with the matched chromatographic column. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| msmatching | The list IDs of matched molecules. IDs are separated by the character specified in the *Molecule IDs separator character* field | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| attr | The attribution of the peak (e.g.: ``[(M+H)-(H2O)-(NH3)]+``). 
| -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| comp | The composition of the peak (e.g.: ``C6 H10 N O``). | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| molcomp | The composition of the molecule. (e.g.: ``C6H14N2O2``). | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| molmass | The mass of the molecule. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| molnames | The names of the molecule, as a semicolon separated list. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| inchi | The InChI of the molecule. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| inchikey | The InChI key of the molecule. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| pubchem | The PubChem ID of the molecule. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| chebi | The ChEBI ID of the molecule. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| hmdb | The HMDB ID of the molecule. | -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ -| kegg | The KEGG ID of the molecule. 
| -+--------------+---------------------------------------------------------------------------------------------------------------------------------+ - The *Molecule IDs separator character* is used to customize the character used to separate the molecule IDs of the **molid** column inside the *main* output file. Output files @@ -540,9 +460,9 @@ <!-- @@@END_RST@@@ --> </help> - <!--~~~~~~~~~ - ~ CITATIONS ~ - ~~~~~~~~~~--> + <!--========= + = CITATIONS = + ==========--> <citations/>