Mercurial > repos > prog > lcmsmatching

<tool id="lcmsmatching" name="LC/MS matching" version="3.3.1" profile="16.01">

	<description>Annotation of MS peaks using matching on a spectra database.</description>

	<requirements>
		<!--<requirement type="package" version="3.3.3">r</requirement>-->
		<requirement type="package" version="7.0">readline</requirement> <!-- Try readline 7.0 -->
		<requirement type="package" version="1.20.0">r-getopt</requirement>
		<requirement type="package" version="1.0.0">r-stringr</requirement>
		<requirement type="package" version="1.8.3">r-plyr</requirement>
		<requirement type="package" version="3.98">r-xml</requirement>
		<requirement type="package" version="1.0_6">r-bitops</requirement>
		<requirement type="package" version="1.95">r-rcurl</requirement>
		<requirement type="package" version="1.1">r-jsonlite</requirement>
	</requirements>

	<code file="list-chrom-cols.py"/>
	<code file="list-file-cols.py"/>
	<code file="list-ms-mode-values.py"/>

	<!--=======
	= COMMAND =
	========-->

	<command>
		<![CDATA[
		## @@@BEGIN_CHEETAH@@@
		$__tool_directory__/search-mz

		## Input file
		-i "$mzrtinput"
		--input-col-names "mz=$inputmzfield,rt=$inputrtfield"
		--rtunit "$inputrtunit"

		## Database
		#if $db.dbtype == "inhouse"
			-d file
			--db-fields "mztheo=$db.dbmzreffield,chromcolrt=$db.dbchromcolrtfield,compoundid=$db.dbspectrumidfield,chromcol=$db.dbchromcolfield,msmode=$db.dbmsmodefield,peakattr=$db.dbpeakattrfield,pubchemcompid=$db.dbpubchemcompidfield,chebiid=$db.dbchebiidfield,hmdbid=$db.dbhmdbidfield,keggid=$db.dbkeggidfield"
			--db-ms-modes "pos=$db.dbmsposmode,neg=$db.dbmsnegmode"
			--db-rt-unit $db.dbrtunit
		#end if
		#if $db.dbtype == "peakforest"
			-d peakforest
			--db-token "$db.dbtoken"
		#end if
			--url "$db.dburl"

		## M/Z matching
		-m $mzmode -p $mzprec -s $mzshift

		## Precursor matching
		#if $prec.match == "true"
			--precursor-match --pos-prec "$prec.pos" --neg-prec "$prec.neg"
		#end if
		#if $prec.match == "true" and $chromcols:
			--precursor-rt-tol $tolz
		#end if

		## Chromatographic columns options and retention matching
		#if $chromcols:
			-c "$chromcols" --check-cols --rttolx $tolx --rttoly $toly
		#end if

		## Table outputs
		-o "$mainoutput" --peak-output-file "$peaksoutput" --same-rows --same-cols

		## HTML output
		--html-output-file "$htmloutput" --no-main-table-in-html-output

		## Ouput setting
		--molids-sep "$molidssep"
		## @@@END_CHEETAH@@@
	]]></command>

	<!--======
	= INPUTS =
	=======-->

	<inputs>

		<!-- DATABASE -->

		<conditional name="db">

			<param name="dbtype" label="Database" type="select" refresh_on_change="true">
				<option value="inhouse">In-house</option>
				<option value="peakforest">Peakforest</option>
			</param>

			<when value="inhouse">
				<!-- Database file -->
				<param name="dburl" label="Database file" type="data" format="tabular,tsv" refresh_on_change="true" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. Retention time values must be in seconds."/>

				<!-- File database field names -->
				<param name="dbspectrumidfield" type="select" label="Database file Spectrum ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'spectrumid,accession,compoundid,molid')" help="Select the Spectrum ID column of the database file."/>
				<param name="dbmzreffield" type="select" label="Database file Reference MZ column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'mztheo,mzexp,mz')" help="Select the Reference MZ column of the database file."/>
				<param name="dbchromcolfield" type="select" label="Database file Chromatographic Column Name column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chromcol,col')" help="Select the Chromatographic Column Name column of the database file." refresh_on_change="true"/>
				<param name="dbchromcolrtfield" type="select" label="Database file Chromatographic Column Retention Time column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chromcolrt,colrt,rt')" help="Select the Chromatographic Column Retention Time column of the database file."/>
				<param name="dbmsmodefield" type="select" label="Database file MS Mode column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'msmode,mode')" help="Select the MS Mode column of the database file." refresh_on_change="true"/>
				<param name="dbpeakattrfield" type="select" label="Database file Peak Attribution column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'peakattr,attr')" help="Select the Peak Attribution column of the database file."/>
				<param name="dbpubchemcompidfield" type="select" label="Database file PubChem Compound ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'pubchemcompid,pubchemid,pubchemcomp,pubchem')" help="Select the PubChem Compound ID column of the database file."/>
				<param name="dbchebiidfield" type="select" label="Database file ChEBI ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'chebiid,chebi')" help="Select the ChEBI ID column of the database file."/>
				<param name="dbhmdbidfield" type="select" label="Database file HMDB Metabolite ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'hmdbid,hmdb')" help="Select the HMDB Metabolite ID column of the database file."/>
				<param name="dbkeggidfield" type="select" label="Database file KEGG Compound ID column name" dynamic_options="get_file_cols(file = db['dburl'], preferred = 'keggid,kegg')" help="Select the KEGG Compound ID column of the database file."/>

				<!-- File database MS modes -->
				<param name="dbmsposmode" label="File database MS Positive mode" type="select" dynamic_options="get_ms_mode_value(file = db['dburl'], col = db['dbmsmodefield'], preferred = 'POS,pos,+')" help="Select the value used to identify the positive MS mode."/>
				<param name="dbmsnegmode" label="File database MS Negative mode" type="select" dynamic_options="get_ms_mode_value(file = db['dburl'], col = db['dbmsmodefield'], preferred = 'NEG,neg,-')" help="Select the value used to identify the negitive MS mode."/>

				<!-- File database RT unit -->
				<param name="dbrtunit" label="Retention time unit" type="select" display="radio" multiple="false" help="">
					<option value="sec">Seconds</option>
					<option value="min">Minutes</option>
				</param>

				<param name="dbtoken" type="text" size="32" value="" hidden="true"/>
			</when>

			<when value="peakforest">
				<param name="dburl" type="text" size="128" value="https://peakforest-alpha.inra.fr/rest" refresh_on_change="true"/>

				<param name="dbtoken" label="Peakforest security token" type="text" size="32" value="" refresh_on_change="true" help="If you do not have yet a Peakforest token, go to Peakforest website and request one from your account."/>

				<param name="dbchromcolfield" type="text" size="32" value="" hidden="true"/>
			</when>
		</conditional>

		<!-- INPUT -->

			<!-- Input file -->
			<param name="mzrtinput" label="Input file - MZ(/RT) values" type="data" format="tabular,tsv" refresh_on_change="true" help="Decimal: '.', missing: NA, mode: character and numerical, sep: tabular. RT values must be in seconds."/>

			<!-- Input field field names -->
			<param name="inputmzfield" type="select" label="Input file MZ column name" dynamic_options="get_file_cols(file = mzrtinput, preferred = 'mzmed,mz')" help="Select the MZ column of the input file."/>
			<param name="inputrtfield" type="select" label="Input file RT column name" dynamic_options="get_file_cols(file = mzrtinput, preferred = 'rtmed,rt')" help="Select the RT column of the input file."/>

			<!-- Input file RT unit -->
			<param name="inputrtunit" label="Retention time unit" type="select" display="radio" multiple="false" help="">
				<option value="sec">Seconds</option>
				<option value="min">Minutes</option>
			</param>

		<!-- M/Z MATCHING -->

			<!-- Mode -->
			<param name="mzmode" label="MS mode" type="select" display="radio" multiple="false" help="">
				<option value="pos">Positive</option>
				<option value="neg">Negative</option>
			</param>

			<!-- MZ matching parameters -->
			<param name="mzprec" label="M/Z precision (in ppm)" type="float" help="" value="5"/>
			<param name="mzshift" label="M/Z shift (in ppm)" type="float" help="" value="0"/>

		<!-- RETENTION TIME PARAMETERS -->

			<!-- List of chromatographic columns -->
			<param name="chromcols" type="select" label="Chromatographic columns" multiple="true" dynamic_options="get_chrom_cols(dbtype = db['dbtype'], dburl = db['dburl'], dbtoken = db['dbtoken'], col_field = db['dbchromcolfield'])" help="Select here the set of chromatographic columns against which the retention time matching will be run."/>

			<!-- Tolerances -->
			<param name="tolx" label="RTX retention time tolerance, parameter x (in seconds)" type="float" help="" value="5"/>
			<param name="toly" label="RTY retention time tolerance, parameter y" type="float" help="" value="0.8"/>
			<param name="tolz" label="RTZ retention time tolerance, used when precursor matching is enabled." type="float" help="" value="5"/>

		<!-- PRECURSOR MATCH -->
		<conditional name="prec">

			<param name="match" label="Precursor match" type="select">
				<option value="false">Off</option>
				<option value="true">On</option>
			</param>

			<when value="false"></when>
			<when value="true">
				<!-- Negative precursors -->
				<param name="neg" label="List of negative precursors" type="text" size="128" value="[(M-H)]-,[M-H]-,[(M+Cl)]-,[M+Cl]-" help="">
					<sanitizer>
						<valid initial="string.printable">
							<remove value='"'/>
						</valid>
						<mapping initial="none">
							<add source='"' target='\"'/>
						</mapping>
					</sanitizer>
				</param>

				<!-- Positive precursors -->
				<param name="pos" label="List of positive precursors" type="text" size="128" value="[(M+H)]+,[M+H]+,[(M+Na)]+,[M+Na]+,[(M+K)]+,[M+K]+" help="">
					<sanitizer>
						<valid initial="string.printable">
							<remove value='"'/>
						</valid>
						<mapping initial="none">
							<add source='"' target='\"'/>
						</mapping>
					</sanitizer>
				</param>
			</when>
		</conditional>

		<!-- OUTPUT -->
		<!-- Molecule IDs separator character -->
		<param name="molidssep" label="Molecule IDs separator character" type="text" size="3" value="|" help="">
			<sanitizer>
				<valid initial="string.printable">
					<remove value='"'/>
				</valid>
				<mapping initial="none">
					<add source='"' target='\"'/>
				</mapping>
			</sanitizer>
		</param>

	</inputs>

	<!--=======
	= OUTPUTS =
	========-->

	<outputs>

		<!-- Output file -->
		<data name="mainoutput"  label="lcmsmatch_${mzrtinput.name}" format="tabular"/>
		<data name="peaksoutput" label="lcmsmatch_${mzrtinput.name}_peaks" format="tabular"/>
		<data name="htmloutput"  label="lcmsmatch_${mzrtinput.name}.html" format="html"/>

	</outputs>

	<!--=====
	= TESTS =
	======-->

	<tests>

		<!-- File database test -->
		<test>
			<param name="dbtype" value="inhouse"/>
			<param name="dburl" value="filedb.tsv"/>
			<param name="dbfields" value=""/>
			<param name="dbmsmodes" value=""/>
			<param name="mzrtinput" value="mz-input-small.tsv"/>
			<param name="inputmzfield" value="mzmed"/>
			<param name="inputrtfield" value="rtmed"/>
			<param name="mzmode" value="pos"/>
			<output name="mainoutput" file="filedb-small-mz-match-output.tsv"/>
			<output name="peaksoutput" file="filedb-small-mz-match-peaks-output.tsv"/>
			<output name="htmloutput" file="filedb-small-mz-match-html-output.html"/>
		</test>

		<!-- File database test -->
<!--
		<test>
			<param name="dbtype" value="peakforest"/>
			<param name="dbtoken" value="@PEAKFOREST_TOKEN@"/>
			<param name="mzrtinput" value="mz-input-small.tsv"/>
			<param name="inputfields" value=""/>
			<param name="mzmode" value="pos"/>
			<output name="mainoutput">
				<assert_contents>
					<has_text text="mz"/>
				</assert_contents>
			</output>
		</test>
-->
	</tests>

	<!--====
	= HELP =
	=====-->

	<help>
<!-- @@@BEGIN_RST@@@ -->

==============
LC/MS matching
==============

This tool performs LC/MS matching on an input list of MZ/RT values, using either a provided in-house single file database or a connection to Peakforest database.

--------
Database
--------

When selecting the database, you have the choice between a Peakforest database or an in-house file.

For the Peakforest database, a default REST web base address is already provided. But you can change it to use a custom database. A field is also available for setting a token key in case the access to the Peakforest database you want to use is restricted. This is the case of the default database URL.

For the in-house file, please refer to the paragraph "Single file database" below.

-----------
Input files
-----------

Be careful to always provide UTF-8 encoded files, unless you do not use special characters at all. For instance, greek letters in molecule names give errors if the file is in latin1 (ISO 8859-1) or Windows 1252 (not distinguishable from latin1) encoding.

Single file database
====================

The database used is provided as a single file, in tabular format, through the *Database file* field. This file must contain a list of MS peaks, with possibly retention times.
Peaks are "duplicated" as much as necessary. For instance if 3 retention times are available on a compound with 10 peaks in positive mode, then there will be 30 lines for this compound in positive mode.

The file must contain a header with the column names. The names are free, but must be provided through the different fields named *Database file ... column name*.
Then you must provide the values used to identify the MS modes (positive and negative).

A last information about the single file database is the unit of the retention times, either in seconds or in minutes.

Example of database file (totally fake, no meaning):

+-------+-------+------------+--------------------+-------------------------+-----------+-------+---------------+-----------+--------------+
| molid | mode  | mz         | composition        | attribution             | col       | rt    | molcomp       | molmass   | molnames     |
+-------+-------+------------+--------------------+-------------------------+-----------+-------+---------------+-----------+--------------+
| A10   | "POS" | 112.07569  | "P9Z6W410 O"       | "[(M+H)-(H2O)-(NH3)]+"  | "colzz"   | 5.69  | "J114L6M62O2" | 146.10553 | "Blablaine'" |
+-------+-------+------------+--------------------+-------------------------+-----------+-------+---------------+-----------+--------------+
| A10   | "POS" | 112.07569  | "P9Z6W410 O"       | "[(M+H)-(H2O)-(NH3)]+"  | "col12"   | 0.8   | "J114L6M62O2" | 146.10553 | "Blablaine"  |
+-------+-------+------------+--------------------+-------------------------+-----------+-------+---------------+-----------+--------------+
| A10   | "POS" | 112.07569  | "P9Z6W410 O"       | "[(M+H)-(H2O)-(NH3)]+"  | "somecol" | 8.97  | "J114L6M62O2" | 146.10553 | "Blablaine"  |
+-------+-------+------------+--------------------+-------------------------+-----------+-------+---------------+-----------+--------------+
| A10   | "POS" | 191.076694 | "P92Z6W413 Na2 O2" | "[(M-H+2Na)]+"          | "colAA"   | 1.58  | "J114L6M62O2" | 146.10553 | "Blablaine"  |
+-------+-------+------------+--------------------+-------------------------+-----------+-------+---------------+-----------+--------------+
| A10   | "POS" | 191.076694 | "P92Z6W413 Na2 O2" | "[(M-H+2Na)]+"          | "colzz2"  | 4.08  | "J114L6M62O2" | 146.10553 | "Blablaine"  |
+-------+-------+------------+--------------------+-------------------------+-----------+-------+---------------+-----------+--------------+
| A10   | "POS" | 294.221687 | "U1113P94ZW429 O4" | "[(2M+H)]+ (13C)"       | "somecol" | 8.97  | "J114L6M62O2" | 146.10553 | "Blablaine"  |
+-------+-------+------------+--------------------+-------------------------+-----------+-------+---------------+-----------+--------------+
| A10   | "POS" | 72.080775  | "P9Z4W410 O0"      | "[(M+H)-(J15L2M6O2)]+"  | "hcoltt"  | 0.8   | "J114L6M62O2" | 146.10553 | "Blablaine"  |
+-------+-------+------------+--------------------+-------------------------+-----------+-------+---------------+-----------+--------------+
| A10   | "POS" | 112.07569  | "P9Z6W410 O"       | "[(M+H)-(H2O)-(NH3)]+"  | "colzz3"  | 4.54  | "J114L6M62O2" | 146.10553 | "Blablaine"  |
+-------+-------+------------+--------------------+-------------------------+-----------+-------+---------------+-----------+--------------+
| A10   | "POS" | 72.080775  | "P9Z4W410 O0"      | "[(M+H)-(J15L2M6O2)]+"  | "colzz3"  | 4.54  | "J114L6M62O2" | 146.10553 | "Blablaine"  |
+-------+-------+------------+--------------------+-------------------------+-----------+-------+---------------+-----------+--------------+
| A10   | "POS" | 72.080775  | "P9Z4W410 O0"      | "[(M+H)-(J15L2M6O2)]+"  | "colpp"   | 0.89  | "J114L6M62O2" | 146.10553 | "Blablaine"  |
+-------+-------+------------+--------------------+-------------------------+-----------+-------+---------------+-----------+--------------+
| A10   | "POS" | 145.097154 | "P92Z6W413 O2"     | "[(M+H)-(H2)]+"         | "hcoltt"  | 0.8   | "J114L6M62O2" | 146.10553 | "Blablaine"  |
+-------+-------+------------+--------------------+-------------------------+-----------+-------+---------------+-----------+--------------+

MZ/RT input file
================

The input to provide is a file, in a tabular format (or TSV: Tab Seperated Values), containing the list of M/Z values, with possibly also RT values.

The column names for the M/Z and RT values must be provided through the fields *Input file MZ column name* and *Input file RT column name*.
As a consequence, the file must contain a header line.

The unit of the retention time has to be provided with the field *Retention time unit*.

Example of file input:

+-------------+-------------+
| mz          | rt          |
+-------------+-------------+
| 75.02080998 | 49.38210915 |
+-------------+-------------+
| 75.05547146 | 0.658528069 |
+-------------+-------------+
| 75.08059797 | 1743.94267  |
+-------------+-------------+
| 76.03942694 | 51.23158899 |
+-------------+-------------+
| 76.07584477 | 50.51249853 |
+-------------+-------------+
| 76.07593168 | 0.149308136 |
+-------------+-------------+

------------
M/Z matching
------------

In the simplest form of the algorithm only the *m/z* values are matched against the database peaks. This happens if both *Retention time match* and *Precursor match* are off.

The first parameter is the MS mode, specified through the *MS mode* parameter.

The parameters *M/Z precision* and *M/Z shift* are used by the algorithm in the following formula in order to match an *m/z* value:

	mz (1 + (- shift - precision) / 10^6) &lt; mzref &lt; mz (1 + (- shift - precision) / 10^6)

Where *mzref* is the M/Z of reference from the database peak that is tested. If this double inequality is true, then the *m/z* value is matched with this peak.

--------------------
Retention time match
--------------------

If at least one column is selected inside the *Chromatographic columns* parameter section, then retention time is also matched, in addition to the *m/z* value, according to the following formula:

	rt - x - rt^y &lt; colrt &lt; rt + x + rt^y

Where *x* is the value of the parameter *RTX* and *y* the value of the parameter *RTY*.

If for a reference compound the database does not contain retention time for at least one of the specified columns, then only the *m/z* value is matched against the peaks of the reference compound. This means that in the results you can find compounds that do no match the provided retention time value.

The *RTZ* parameter is used in the *Precursor match* algorithm (see below).

---------------
Precursor match
---------------

If the "Precursor match" option is enabled inside the parameters section, then a more sophisticated version of the algorithm, which is executed in two steps, is used.

This algorithm takes two more parameters, one for each MS mode. These are the lists of precursors. Since the matching is run for one MS mode only, only one of the two parameters is used. Inside the single file database, all the peaks whose **attr** column value is equal to one of the precursor listed in *List of negative precursors* or *List of positive precursors*, depending on the mode, are considered as precursor peaks.

M/Z matching using precursor matching
=====================================

 1. Using the normal M/Z matching algorithm described above, we first look only for precursor peaks ([(M+H)]+, [(M+Na)]+, [(M+Cl)]-, ...).
 2. From step 1, we construct a list of matched molecules.
 3. We look at all peaks inside the molecule list obtained in step 2, using the normal M/Z matching algorithm described above.

MZ/RT matching using precursor matching
=======================================

 1. Using the normal MZ/RT matching algorithm described above, we first look only for precursor peaks ([(M+H)]+, [(M+Na)]+, [(M+Cl)]-, ...).
 2. From step 1, we construct a list of matched molecules, retaining the matched retention time of each molecule.
 3. For each input couple (m/z,rt), we look at all peaks inside the molecules taken from step 2, whose matched retention time between *rt - z* and *rt + z*, where *z* is the value of parameter *RTZ*.

---------------
Output settings
---------------

The *Molecule IDs separator character* is used to customize the character used to separate the molecule IDs of the **molid** column inside the *main* output file.

Output files
============

Three files are output by the tool.

+-------------+--------------------------------------+--------------------------------------------------------+
|   Outputs   |              File name               |                      Description                       |
+-------------+--------------------------------------+--------------------------------------------------------+
| Main output | lcmsmatching_{input_file_name}       | Contains the list of compounds that have been matched. |
+-------------+--------------------------------------+--------------------------------------------------------+
| Peak list   | lcmsmatching_peaks_{input_file_name} | Contains all matched database peaks.                   |
+-------------+--------------------------------------+--------------------------------------------------------+
| HTML output | lcmsmatching_{input_file_name}.html  | Contains the two tables on one page.                   |
+-------------+--------------------------------------+--------------------------------------------------------+

The **main** output is identical to the input file, to which is added an *msmatching* column. This column contains a list of IDs of the compounds that have been matched for this couple of (m/z, rt) values.

The **peak list** output contains all database peaks that have been matched, for each (m/z, rt) input couple. Thus for each (m/z, rt) couple, there will be zero, one or more matched peaks output. The columns output are *mz*, *rt*, *id*, *mztheo*, *col*, *colrt*, *attribution* and *composition*, where *id* is the compound ID, *mztheo* is the theoretical mass of the fragment, *col* is the matched column and *colrt* is the retention time measured on the column for the reference compound.

The **HTML** output contains the peak table with links toward HMDB, KEGG, ChEBI and PubChem public databases, when IDs are available.

=====
About
=====

.. class:: infomark

**Author**
	Pierrick Roger (pierrick.roger@cea.fr) wrote this MS matching method.
	MetaboHUB: The French National Infrastructure for Metabolomics and Fluxomics (http://www.metabohub.fr/en).

.. class:: infomark

**Acknowledgement**
	Data and algorithms have been kindly provided by Christophe Junot at *DSV/IBITEC-S/SPI* (*CEA/Saclay*), from a former application developped by Cyrille Petat and Arnaud Martel at *DSV/IBITEC-S/DIR* (*CEA/Saclay*).

.. class:: infomark

**Please cite**
	R Core Team (2013). R: A language and Environment for Statistical Computing. http://www.r-project.org

<!-- @@@END_RST@@@ -->
	</help>

	<!--=========
	= CITATIONS =
	==========-->

	<citations/>

</tool>
author	prog
date	Wed, 19 Apr 2017 10:00:05 -0400
parents	b34c14151f25
children	f86fec07f392