Mercurial > repos > pieterlukasse > prims_proteomics
view quantifere.xml @ 19:d31c6978d9d0
fixes for NapQ
author | pieter.lukasse@wur.nl |
---|---|
date | Mon, 26 Jan 2015 06:24:15 +0100 |
parents | 40ec8770780d |
children | 5215b5cfdc53 |
line wrap: on
line source
<tool name="Quantifere" id="quantifere1" version="1.0.3">
    <!--
        Galaxy tool wrapper for Quantifere.jar (run through the "java -jar" interpreter).

        Inputs : one or more peptide quantification files (APML), one or more peptide
                 identification files (APML or mzIdentML), an optional functional
                 annotation mapping file and a set of filter/clustering settings.
        Outputs: proteins list (CSV), inference log (CSV), an optional HTML report
                 (only when "summaryReport" is checked) and an optional functional
                 annotation summary (only when an annotation mapping file is given).

        The lists of selected files and the statistical measures configuration are
        handed to the jar through the generated files declared in <configfiles>.

        Layout note: the text inside <configfile> and <help> is kept at column 0 on
        purpose. Cheetah directives and reStructuredText are line oriented and any
        indentation of plain text lines would become part of the generated content.
    -->
    <description>Protein Inference by Peptide Quantification patterns</description>
    <!-- For remote debugging start your listener on port 8000 and use the following as command interpreter:
         java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000
         //////////////////////////
    -->
    <!-- NOTE(review): the report and annotation summary arguments below are always passed, although the
         matching outputs can be filtered out (see the filter elements under outputs). Confirm what Galaxy
         substitutes for a filtered output and that Quantifere tolerates it. -->
    <command interpreter="java -jar ">
        Quantifere.jar
            -annotatedQuantificationFilesList $annotatedQuantificationFilesList
            -identificationFilesList $identificationFilesList
            -statisticalMeasuresConfigFile $statisticalMeasuresConfigFile
            -quantificationDataToUse $quantificationDataToUse
            -minCorrel $minCorrel
            -minProtCoverage $minProtCoverage
            -minAboveAverageHits $minAboveAverageHits
            -minNrIdsForInferencePeptide $minNrIdsForInferencePeptide
            -refineModel $refineModel
            -functionalAnnotationCSV $functionalAnnotationCSV
            -outputCSV $outputCSV
            -outputInferenceLogCSV $outputInferenceLogCSV
            -outputSummaryAnnotationCSV $outputSummaryAnnotationCSV
            -outReport $htmlReportFile
            -outReportPicturesPath $htmlReportFile.files_path
            #if $is2D_LC_MS.fractions == True
                -namingConventionCodesForFractions $is2D_LC_MS.namingConventionCodesForFractions
            #end if
    </command>
    <inputs>
        <!-- One entry per quantification file; the selected paths end up in annotatedQuantificationFilesList -->
        <repeat name="annotatedQuantificationFiles"
                title="Peptide (filtered) quantification files (APML)"
                help="The APML contents as aligned, annotated and scored feature lists, as produced by MsFilt tool. Select one or more files. For 2D-LC-MS we expect one file per fraction.">
            <param name="annotatedQuantificationFile" size="50" type="data" format="apml" label="File (APML format)" />
        </repeat>
        <!-- One entry per identification file; path and datatype end up in identificationFilesList -->
        <repeat name="identificationFiles"
                title="Peptide (filtered) identification files (MS/MS identifications)"
                help="Full set of MS/MS peptide identification files, including peptides that could not be quantified. This set of identifications is ideally filtered on some quality and statistical measures (e.g. as is done by MsFilt).
Tip: to base the inference only on the selected peptide quantification files, you can select the same quantification files here as well. Select one or more files.">
            <param name="identificationFile" size="50" type="data" format="apml,mzid" label="File (APML or MZIDENTML format)" />
        </repeat>
        <!-- The fraction naming codes are only asked for (and only passed on the command line) for 2D LC-MS data -->
        <conditional name="is2D_LC_MS">
            <param name="fractions" type="boolean" truevalue="Yes" falsevalue="No" checked="false"
                   label="Data is from 2D LC-MS"
                   help="Data acquisition was done in multiple fractions."/>
            <when value="Yes">
                <param name="namingConventionCodesForFractions" type="text" size="100" value=""
                       label="Part of run/file name that identifies the 2D LC-MS fraction"
                       help="Add the CSV list of codes that occur in the file names and that stand for a fraction code. E.g. '_F1,_F2,_F3,etc.' In this way different peptide identifications from the same sample but measured in different fractions can be merged together. Otherwise each (fraction) file is seen as a separate sample."/>
                <!-- could do regular expressions as well but this would be hard for biologists, e.g. _F\d\b -->
                <!-- on help above: the given codes are removed from source name...separate features are clustered, not peptides, peptides are quantified based on summing features (raw), or summing patterns : TODO document the quantification columns present in the output CSV -->
            </when>
            <when value="No">
            </when>
        </conditional>
        <param name="statisticalMeasuresConfig" type="text" area="true" size="8x70"
               label="Statistical measures configuration"
               help="Here you may specify the statistical measures that are found in the ms/ms results (e.g. p or e-values). The format is: SM alias => SM name,type,mode[min/max]. Leaving this configuration out while these are present in the dataset will have the effect that they will be wrongly used as a regular scoring scheme, having effect on for example the filter criteria below like 'Minimum number of peptide matches with a score above average' ."
               value="smXTD => MS:1001330,XSLASH!Tandem:expect,min
pvCSVEX => p_value,CSV_EXPORT,min
smAUTO_LIKELIHOOD => AUTOMOD_LOGLIKELIHOOD,PLGS/Auto-mod,max
smLIKELIHOOD => LOGLIKELIHOOD,PLGS/Databank-search,max
smPercoProb => Percolator: probability,Percolator probability,max
smPercoPEP => Percolator: PEP,Percolator PEP,min
smPercoQval => Percolator: q-Value,Percolator q-Value,max "/>
        <!-- keep value attribute above aligned like this to avoid white spaces in the value -->
        <!-- NOTE(review): an XML parser replaces literal line breaks inside an attribute value by spaces, so the
             default above may reach the tool as one single line; if one measure per line is required, the
             character reference &#10; would be needed instead. Confirm against how Quantifere parses this. -->
        <!-- NOTE(review): smPercoQval uses mode "max" while smPercoPEP uses "min"; q-values are usually better
             when lower. Confirm that "max" is intended. -->
        <param name="quantificationDataToUse" type="select"
               label="Quantification data to use"
               help="Quantification data to use for the pattern clustering and inference steps. NB: check if the chosen data is also present in your file, or choose 'auto' to let Quantifere check which quantification type is present in most peptides.">
            <option value="auto" selected="true">auto</option>
            <option value="getIntensity">(TODO)raw intensities</option>
            <option value="getApexIntensity">(TODO)apex intensities</option>
            <option value="getNormalizedIntensity">(TODO)normalized intensities</option>
        </param>
        <!-- TODO let minCorrel default value vary according to quantification type chosen above -->
        <param name="minCorrel" type="float" size="10" value="0.85"
               label="Minimum correlation in a cluster"
               help="Features will be grouped by their protein annotation and sample intensity values correlation. Set here the minimum correlation expected between grouped members. This is used to guide the clustering algorithm."/>
        <!-- simple extra heuristics to remove some "noise" protein hits -->
        <param name="minProtCoverage" type="float" size="10" value="0.0"
               label="Minimum protein coverage (%)"
               help="Set this to e.g. 5.0 if you have protein coverage information in your data.
This will remove proteins that have a too small portion of their sequence covered by peptide matches."/>
        <!-- TODO : ADD warning to report if this is left 0 and no coverage is found ...or maybe validate the other way around-->
        <param name="minAboveAverageHits" type="integer" size="10" value="1"
               label="Minimum number of different peptide matches with a score above average"
               help="This will remove proteins that do not have enough reasonable peptide hits."/>
        <param name="minNrIdsForInferencePeptide" type="integer" size="10" value="1"
               label="Minimum number of peptide identifications for inference peptides"
               help="Minimum number of peptide identifications a peptide needs to be used as inference peptide for secondary proteins."/>
        <!-- currently, when one feature clusters with foreign peptide, then it is not inference peptide anymore...quite strict, could be less strict by letting user indicate for example: 90% of features should be inference features...then it is an inference pep. See QuantifereTool.inferSecondaryProteins() -->
        <param name="functionalAnnotationCSV" type="data" format="csv,txt,tsv" optional="true"
               label="(Functional)annotation mapping file (csv or tsv format)"
               help="Optional file that maps protein accessions to a network, pathway or other higher level annotations.
In this file a header line is expected with these 2 columns (names and lower case is important): accession,annotation"/>
        <param name="refineModel" type="boolean" checked="true"
               label="Refine matches model"
               help="This will let the algorithm search for a reduced set of secondary protein matches that still explains the variation in the peptide quantification patterns"/>
        <!-- Drives the filter on the htmlReportFile output below -->
        <param name="summaryReport" type="boolean" checked="true" label="Generate summary report"/>
    </inputs>
    <configfiles>
        <configfile name="annotatedQuantificationFilesList">## start comment
## iterate over the selected files and store their names in the config file
#for $i, $s in enumerate( $annotatedQuantificationFiles )
${s.annotatedQuantificationFile}
#end for
## end comment</configfile>
        <configfile name="identificationFilesList">## start comment
## iterate over the selected files and store their names in the config file
#for $i, $s in enumerate( $identificationFiles )
${s.identificationFile}
## also print out the datatype in the next line, based on previously configured datatype
#if isinstance( $s.identificationFile.datatype, $__app__.datatypes_registry.get_datatype_by_extension('apml').__class__):
apml
#else:
mzid
#end if
#end for
## end comment</configfile>
        <configfile name="statisticalMeasuresConfigFile">## start comment
${statisticalMeasuresConfig}
</configfile>
    </configfiles>
    <outputs>
        <data name="outputCSV" format="csv" label="${tool.name} on ${on_string}: Proteins list (CSV)" />
        <data name="outputInferenceLogCSV" format="csv" label="${tool.name} on ${on_string}: Inference log (CSV)"/>
        <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - HTML report">
            <!-- If the expression is false, the file is not created -->
            <filter>( summaryReport == True )</filter>
        </data>
        <data name="outputSummaryAnnotationCSV" format="csv" label="${tool.name} on ${on_string} - Functional annotation summary (CSV)">
            <!-- If the expression is false, the file is not created -->
            <filter>( functionalAnnotationCSV != None )</filter>
        </data>
    </outputs>
    <tests>
    </tests>
    <help>

.. class:: infomark

This tool takes Peptide Quantification patterns and uses this to do Protein Inference of both
Primary Protein identifications as well as Secondary Protein identifications.
This last class of protein identifications can not be done by traditional protein inference
methods that look only at peptide identifications and their quality parameters.

-----

**List of definitions**

Primary Protein identification: protein identification belonging to the minimum set of proteins needed
to account for the observed peptides.

Secondary Protein identification: extra protein identifications that do not belong to the minimum set of
proteins mentioned above.

raw intensities : is the intensity value resulting from the integration of the feature peak area

apex intensities: is the intensity value as on the highest point of the feature peak

normalized intensities : is the intensity normalized by some means

-----

**Minimum correlation in a cluster**

TODO - add doc.

-----

**Output details**

*Proteins list (CSV)*

This is the list of primary and secondary proteins and their calculated inference score.
Proteins with exactly the same peptide hits are also grouped together and labeled as
primary_group and secondary_group instead of simply primary and secondary.

*Inference log (CSV)*

This CSV table shows all data, both inferred and ruled out proteins. This can be used by the user
to troubleshoot the inference process and understand why certain proteins might have been ruled out.
The CSV is provided in such a format that the data can easily be explored in a Cytoscape network.
The figure below shows an example of the data being explored in Cytoscape using also the
`Cytoscape chartplugin`_ to visualize the quantification data when selecting the peptide nodes.

.. image:: $PATH_TO_IMAGES/quantifere_cyto_out.png

.

.. _Cytoscape chartplugin: http://apps.cytoscape.org/apps/chartplugin

**References**

If you use this Galaxy tool in work leading to a scientific publication please
cite the following papers:

Pieter N. J. Lukasse and Antoine H. P. America (2014). Protein Inference Using Peptide Quantification Patterns. http://dx.doi.org/10.1021/pr401072g

    </help>
    <citations>
        <citation type="doi">10.1021/pr401072g</citation>
        <!-- example see also https://wiki.galaxyproject.org/Admin/Tools/ToolConfigSyntax#A.3Ccitations.3E_tag_set -->
    </citations>
</tool>