0
+ − 1 <tool name="Quantifere" id="quantifere1" version="1.0.2">
+ − 2 <description>Protein Inference by Peptide Quantification patterns</description>
+ − 3 <!--
+ − 4 For remote debugging start your listener on port 8000 and use the following as command interpreter:
+ − 5 java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000
+ − 6 //////////////////////////
+ − 7 -->
<!-- Command line: runs Quantifere.jar with the config files, thresholds and output datasets declared in this wrapper. -->
+ − 8 <command interpreter="java -jar ">
+ − 9 Quantifere.jar
+ − 10 -annotatedQuantificationFilesList $annotatedQuantificationFilesList
+ − 11 -identificationFilesList $identificationFilesList
+ − 12 -statisticalMeasuresConfigFile $statisticalMeasuresConfigFile
+ − 13 -quantificationDataToUse $quantificationDataToUse
+ − 14 -minCorrel $minCorrel
+ − 15 -minProtCoverage $minProtCoverage
+ − 16 -minAboveAverageHits $minAboveAverageHits
+ − 17 -minNrIdsForInferencePeptide $minNrIdsForInferencePeptide
+ − 18 -refineModel $refineModel
+ − 19 -functionalAnnotationCSV $functionalAnnotationCSV
+ − 20 -outputCSV $outputCSV
+ − 21 -outputInferenceLogCSV $outputInferenceLogCSV
+ − 22 -outputSummaryAnnotationCSV $outputSummaryAnnotationCSV
+ − 23 -outReport $htmlReportFile
+ − 24 -outReportPicturesPath $htmlReportFile.files_path
## 'fractions' is a boolean param declared with truevalue="Yes"; compare its string form instead of the Python constant True
+ − 25 #if str($is2D_LC_MS.fractions) == "Yes"
+ − 26 -namingConventionCodesForFractions $is2D_LC_MS.namingConventionCodesForFractions
+ − 27 #end if
+ − 28 </command>
+ − 29
+ − 30 <inputs>
+ − 31
<!-- Repeat group: one or more APML quantification datasets. The selected files are listed by the annotatedQuantificationFilesList configfile. -->
+ − 32 <repeat name="annotatedQuantificationFiles" title="Peptide (filtered) quantification files (APML)"
+ − 33 help="The APML contents as aligned, annotated and scored feature lists,
+ − 34 as produced by MsFilt tool. Select one or more files. For 2D-LC-MS we expect one file per fraction.">
+ − 35 <param name="annotatedQuantificationFile" size="50" type="data" format="apml" label="File (APML format)" />
+ − 36 </repeat>
+ − 37
<!-- Repeat group: one or more MS/MS identification datasets (apml or mzid). The identificationFilesList configfile writes each file followed by its detected type. -->
+ − 38 <repeat name="identificationFiles" title="Peptide (filtered) identification files (MS/MS identifications)"
+ − 39 help="Full set of MS/MS peptide identification files, including peptides that could not be quantified.
+ − 40 This set of identifications is ideally filtered on some quality and
+ − 41 statistical measures (e.g. as is done by MsFilt). Tip: to base the inference only on the
+ − 42 selected peptide quantification files, you
+ − 43 can select the same quantification files here as well. Select one or more files.">
+ − 44 <param name="identificationFile" size="50" type="data" format="apml,mzid" label="File (APML or MZIDENTML format)" />
+ − 45 </repeat>
+ − 46
<!-- Conditional: when the data comes from 2D LC-MS the user supplies the fraction codes found in the file names; the command only passes them in the "Yes" case. -->
+ − 47 <conditional name="is2D_LC_MS">
+ − 48 <param name="fractions" type="boolean" truevalue="Yes" falsevalue="No" checked="false"
+ − 49 label="Data is from 2D LC-MS"
+ − 50 help="Data acquisition was done in multiple fractions."/>
+ − 51 <when value="Yes">
+ − 52 <param name="namingConventionCodesForFractions" type="text" size="100" value=""
+ − 53 label="Part of run/file name that identifies the 2D LC-MS fraction"
+ − 54 help="Add the CSV list of codes that occur in the file names
+ − 55 and that stand for a fraction code. E.g. '_F1,_F2,_F3,etc.' In this
+ − 56 way different peptide identifications from the same sample but measured
+ − 57 in different fractions can be merged together. Otherwise each (fraction) file
+ − 58 is seen as a separate sample."/> <!-- could do regular expressions as well but this would be hard for biologists, e.g. _F\d\b -->
+ − 59 </when>
<!-- Explicit empty case for the falsevalue (the default, since checked="false"), so every value of the test param has a matching when block. -->
<when value="No"/>
+ − 60 </conditional>
+ − 61
<!-- Free-text configuration of statistical measures, one "alias => name,type,mode" entry per line. The statisticalMeasuresConfigFile configfile writes this value to a file for the tool. -->
<!-- NOTE(review): the default value contains characters such as '=' followed by '>' and '!'; confirm that Galaxy text sanitizing leaves them intact before they reach the config file. -->
+ − 62 <param name="statisticalMeasuresConfig" type="text" area="true" size="6x70" label="Statistical measures configuration"
+ − 63 help="Here you may specify the statistical measures that are found in the ms/ms results (e.g. p or e-values).
+ − 64 The format is: SM alias => SM name,type,mode[min/max]. Leaving this configuration out while these are present in the
+ − 65 dataset will have the effect that they will be wrongly used as a regular scoring scheme, having effect on for example
+ − 66 the filter criteria below like 'Minimum number of peptide matches with a score above average' ."
+ − 67 value="smXTD => MS:1001330,XSLASH!Tandem:expect,min
+ − 68 
pvCSVEX => p_value,CSV_EXPORT,min
+ − 69 
smAUTO_LIKELIHOOD => AUTOMOD_LOGLIKELIHOOD,PLGS/Auto-mod,max
+ − 70 
smLIKELIHOOD => LOGLIKELIHOOD,PLGS/Databank-search,max
+ − 71 "/>
+ − 72 <!-- keep value attribute above aligned like this to avoid white spaces in the value -->
<!-- Selects which quantification value is passed to the tool via the quantificationDataToUse argument. 'auto' is the default; the remaining option labels are still marked (TODO) by the author. -->
+ − 73 <param name="quantificationDataToUse" type="select"
+ − 74 label="Quantification data to use"
+ − 75 help="Quantification data to use for the pattern clustering and inference steps. NB: check if the chosen data is also
+ − 76 present in your file, or choose 'auto' to let Quantifere check which quantification type is present in most peptides.">
+ − 77 <option value="auto" selected="true">auto</option>
+ − 78 <option value="getIntensity">(TODO)raw intensities</option>
+ − 79 <option value="getApexIntensity">(TODO)apex intensities</option>
+ − 80 <option value="getNormalizedIntensity">(TODO)normalized intensities</option>
+ − 81 </param>
<!-- Numeric thresholds handed to the tool as the minCorrel, minProtCoverage, minAboveAverageHits and minNrIdsForInferencePeptide arguments. Defaults: 0.85, 5.0, 1 and 1. -->
+ − 82 <!-- TODO let minCorrel default value vary according to quantification type chosen above -->
+ − 83 <param name="minCorrel" type="float" size="10" value="0.85" label="Minimum correlation in a cluster" help="Features will be grouped by their protein annotation and
+ − 84 sample intensity values correlation. Set here the minimum correlation expected between grouped members. This is used to guide the clustering algorithm."/>
+ − 85
+ − 86 <!-- simple extra heuristics to remove some "noise" protein hits -->
+ − 87 <param name="minProtCoverage" type="float" size="10" value="5.0" label="Minimum protein coverage (%)" help="This will remove proteins that have a too small
+ − 88 portion of their sequence covered by peptide matches."/>
+ − 89
+ − 90 <param name="minAboveAverageHits" type="integer" size="10" value="1" label="Minimum number of different peptide matches with a score above average"
+ − 91 help="This will remove proteins that do not have enough reasonable peptides hits."/>
+ − 92
+ − 93 <param name="minNrIdsForInferencePeptide" type="integer" size="10" value="1" label="Minimum number of peptide identifications for inference peptides"
+ − 94 help="Minimum number of peptide identifications a peptide needs to be used as inference peptide for secondary proteins."/>
+ − 95
+ − 96
<!-- Optional annotation mapping dataset; the outputSummaryAnnotationCSV output is filtered on whether this input is set. -->
+ − 97 <param name="functionalAnnotationCSV" type="data" format="csv,txt,tsv" optional="true"
+ − 98 label="(Functional)annotation mapping file (csv or tsv format)"
+ − 99 help="Optional file that maps protein accessions to a network, pathway or other higher level annotations. In this file a header line is expected with these 2 columns (names and lower case is important): accession,annotation"/>
+ − 100
<!-- Boolean switch passed to the tool as the refineModel argument; no truevalue/falsevalue is declared, so Galaxy's default string rendering is used. -->
+ − 101 <param name="refineModel" type="boolean" checked="true" label="Refine matches model"
+ − 102 help="This will let the algorithm search for a reduced set of secondary protein matches that still explains the variation in the peptide quantification patterns"/>
+ − 103
+ − 104
<!-- Not passed on the command line; only used by the output filter of htmlReportFile. -->
+ − 105 <param name="summaryReport" type="boolean" checked="true" label="Generate summary report"/>
+ − 106
+ − 107 </inputs>
+ − 108 <configfiles>
<!-- Writes the path of every selected quantification dataset, one per line. -->
+ − 109 <configfile name="annotatedQuantificationFilesList">## start comment
+ − 110 ## iterate over the selected files and store their names in the config file
+ − 111 #for $i, $s in enumerate( $annotatedQuantificationFiles )
+ − 112 ${s.annotatedQuantificationFile}
+ − 113 #end for
+ − 114 ## end comment</configfile>
+ − 115
<!-- Writes each identification dataset path followed by a line holding "apml" or "mzid", chosen by an isinstance check against the registered apml datatype class. -->
<!-- NOTE(review): the type check relies on $__app__ being exposed to the template; confirm this is available on the targeted Galaxy version. -->
+ − 116 <configfile name="identificationFilesList">## start comment
+ − 117 ## iterate over the selected files and store their names in the config file
+ − 118 #for $i, $s in enumerate( $identificationFiles )
+ − 119 ${s.identificationFile}
+ − 120 ## also print out the datatype in the next line, based on previously configured datatype
+ − 121 #if isinstance( $s.identificationFile.datatype, $__app__.datatypes_registry.get_datatype_by_extension('apml').__class__):
+ − 122 apml
+ − 123 #else:
+ − 124 mzid
+ − 125 #end if
+ − 126 #end for
+ − 127 ## end comment</configfile>
<!-- Dumps the statisticalMeasuresConfig text parameter verbatim into a file for the tool. -->
+ − 128 <configfile name="statisticalMeasuresConfigFile">## start comment
+ − 129 ${statisticalMeasuresConfig}
+ − 130 </configfile>
+ − 131 </configfiles>
<!-- Output datasets: two CSV files that are always declared, plus an HTML report and an annotation summary that are each guarded by a filter expression. -->
<!-- NOTE(review): the command section references $htmlReportFile and $outputSummaryAnnotationCSV unconditionally; confirm how they resolve when the corresponding output is filtered out. -->
+ − 132 <outputs>
+ − 133 <data name="outputCSV" format="csv" label="${tool.name} on ${on_string}: Proteins list (CSV)" />
+ − 134 <data name="outputInferenceLogCSV" format="csv" label="${tool.name} on ${on_string}: Inference log (CSV)"/>
+ − 135 <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - HTML report">
+ − 136 <!-- If the expression is false, the file is not created -->
+ − 137 <filter>( summaryReport == True )</filter>
+ − 138 </data>
+ − 139 <data name="outputSummaryAnnotationCSV" format="csv" label="${tool.name} on ${on_string} - Functional annotation summary (CSV)">
+ − 140 <!-- If the expression is false, the file is not created -->
+ − 141 <filter>( functionalAnnotationCSV != None )</filter>
+ − 142 </data>
+ − 143 </outputs>
+ − 144 <tests>
+ − 145 </tests>
+ − 146 <help>
+ − 147
+ − 148 .. class:: infomark
+ − 149
+ − 150 This tool takes Peptide Quantification patterns and uses this to do Protein Inference of both Primary Protein
+ − 151 identifications as well as Secondary Protein identifications. This last class of protein identifications
+ − 152 can not be done by traditional protein inference methods that look only at peptide identifications and
+ − 153 their quality parameters.
+ − 154
+ − 155
+ − 156 -----
+ − 157
+ − 158 **List of definitions**
+ − 159
+ − 160 Primary Protein identification: protein identification belonging to the minimum set of proteins needed
+ − 161 to account for the observed peptides.
+ − 162
+ − 163 Secondary Protein identification: extra protein identifications that do not belong to the minimum set
+ − 164 of proteins mentioned above.
+ − 165
+ − 166 raw intensities : is the intensity value resulting from the integration of the feature peak area
+ − 167
+ − 168 apex intensities: is the intensity value at the highest point of the feature peak
+ − 169
+ − 170 normalized intensities : is the intensity normalized by some means
+ − 171
+ − 172 -----
+ − 173
+ − 174 **Minimum correlation in a cluster**
+ − 175
+ − 176 TODO - add doc.
+ − 177
+ − 178 -----
+ − 179
+ − 180 **Output details**
+ − 181
+ − 182 *Proteins list (CSV)*
+ − 183
+ − 184 This is the list of primary and secondary proteins and their calculated inference score. Proteins
+ − 185 with exactly the same peptide hits are also grouped together and labeled as primary_group and secondary_group
+ − 186 instead of simply primary and secondary.
+ − 187
+ − 188
+ − 189 *Inference log (CSV)*
+ − 190
+ − 191 This CSV table shows all data, both inferred and ruled out proteins. This can be used by the user to
+ − 192 troubleshoot the inference process and understand why certain proteins might have been ruled out.
+ − 193 The CSV is provided in such a format that the data can easily be explored in a Cytoscape network.
+ − 194
+ − 195 The figure below shows an example of the data being explored in Cytoscape using also the
+ − 196 `Cytoscape chartplugin`_ to visualize the quantification data when selecting the peptide nodes.
+ − 197
+ − 198 .. image:: $PATH_TO_IMAGES/quantifere_cyto_out.png
+ − 199
4
+ − 200 .
0
+ − 201
+ − 202 .. _Cytoscape chartplugin: http://apps.cytoscape.org/apps/chartplugin
+ − 203
+ − 204
+ − 205
+ − 206 </help>
+ − 207 </tool>