annotate ExtractModificationSiteSequenceContext.xml @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
163892325845 Initial commit.
galaxyp
parents:
diff changeset
1 <!--
163892325845 Initial commit.
galaxyp
parents:
diff changeset
2 # =====================================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
3 # $Id: ExtractModificationSiteSequenceContext.xml 90 2011-01-19 13:20:31Z pieter.neerincx@gmail.com $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
4 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractModificationSiteSequenceContext.xml $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
5 # $LastChangedDate: 2011-01-19 07:20:31 -0600 (Wed, 19 Jan 2011) $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
6 # $LastChangedRevision: 90 $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
7 # $LastChangedBy: pieter.neerincx@gmail.com $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
8 # =====================================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
9 -->
163892325845 Initial commit.
galaxyp
parents:
diff changeset
10 <tool id="ExtractPeptideSequenceContext4" version="2.1" name="Extract Modification Site Context">
163892325845 Initial commit.
galaxyp
parents:
diff changeset
11 <description>by mapping modified amino acids in peptides back to proteins and fetching the sequence surrounding the modified sites.</description>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
12 <command interpreter="perl">ExtractPeptideSequenceContext.pl --db $db --dbf FASTA --f $fragments --icol $icol --pcol $pcol --s --modo $modo --ma '$ma' --n $n --c $c --pc '$pc' --ll ERROR</command>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
13 <inputs>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
14 <param name="fragments" type="data" format="tabular" label="Peptide sequences and their protein's identifiers"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
15 help="(in tab delimited format)"/>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
16 <param name="icol" type="data_column" value="1" data_ref="fragments" label="Protein identifier column"/>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
17 <param name="pcol" type="data_column" value="2" data_ref="fragments" label="Peptide sequence column"/>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
18 <!--
163892325845 Initial commit.
galaxyp
parents:
diff changeset
19 <param name="icol" type="integer" value="1" label="Protein identifier column"/>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
20 <param name="pcol" type="integer" value="2" label="Peptide sequence column"/>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
21 -->
163892325845 Initial commit.
galaxyp
parents:
diff changeset
22 <param name="db" type="data" format="fasta" label="Protein sequences"
163892325845 Initial commit.
galaxyp
parents:
diff changeset
23 help="(in FASTA format)"/>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
24 <param name="n" type="integer" value="5" label="N-terminal sequence context length"/>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
25 <param name="c" type="integer" value="5" label="C-terminal sequence context length"/>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
26 <param name="pc" type="select" help="to fill positions in the sequence context when the protein was too short for a full length context.">
163892325845 Initial commit.
galaxyp
parents:
diff changeset
27 <label>Padding character</label>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
28 <option value="-">dash</option>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
29 <option value=" ">space</option>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
30 <option value="">none</option>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
31 </param>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
32 <param name="ma" type="text" label="Modified amino acid"/>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
33 </inputs>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
34 <outputs>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
35 <data name="modo" format="tabular" label="Modification site sequence contexts for ${fragments.name}"/>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
36 </outputs>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
37 <!--
163892325845 Initial commit.
galaxyp
parents:
diff changeset
38 <tests>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
39 <test>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
40 <param name="input" value="*.fasta"/>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
41 <param name="identifiers" value="*.txt"/>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
42 <output name="output" file="*.fasta"/>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
43 </test>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
44 </tests>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
45 -->
163892325845 Initial commit.
galaxyp
parents:
diff changeset
46 <help>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
47
163892325845 Initial commit.
galaxyp
parents:
diff changeset
48 .. role:: raw-html(raw)
163892325845 Initial commit.
galaxyp
parents:
diff changeset
49 :format: html
163892325845 Initial commit.
galaxyp
parents:
diff changeset
50
163892325845 Initial commit.
galaxyp
parents:
diff changeset
51 .. class:: infomark
163892325845 Initial commit.
galaxyp
parents:
diff changeset
52
163892325845 Initial commit.
galaxyp
parents:
diff changeset
53 **What it does**
163892325845 Initial commit.
galaxyp
parents:
diff changeset
54
163892325845 Initial commit.
galaxyp
parents:
diff changeset
55 Map peptide sequences back to proteins and extract sequence contexts for modification sites.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
56
163892325845 Initial commit.
galaxyp
parents:
diff changeset
57 :raw-html:`&lt;object data="static/images/nbic_gmr/ExtractModificationSiteSequenceContext.svg" type="image/svg+xml" width="100%"/&gt;`
163892325845 Initial commit.
galaxyp
parents:
diff changeset
58
163892325845 Initial commit.
galaxyp
parents:
diff changeset
59
163892325845 Initial commit.
galaxyp
parents:
diff changeset
60 ===================================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
61 *Peptide sequences and their protein's identifiers*
163892325845 Initial commit.
galaxyp
parents:
diff changeset
62 ===================================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
63
163892325845 Initial commit.
galaxyp
parents:
diff changeset
64 This file must contain at least peptides and accession numbers or IDs of the proteins the peptides were derived from. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
65 The data must be in TAB delimited format and may contain other columns, which will be preserved in the output. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
66 If a sequence context was found, it will be appended in a new column to the right of the existing columns. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
67 When another sequence context was found for the same peptide, it will be appended as an extra row in the output.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
68 Protein accession numbers / IDs must be in the same format as was used in the FASTA file with protein sequences (database). \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
69 The only exception to this rule is that accession numbers / IDs may be optionally suffixed with the peptide's position in its protein between brackets. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
70 For example: CLH1_HUMAN[1612-1620] will be matched to CLH1_HUMAN in a FASTA file with protein sequences. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
71 Amino acids in the peptide sequences must be in uppercase.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
72
163892325845 Initial commit.
galaxyp
parents:
diff changeset
73 ===============================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
74 *Protein sequences*
163892325845 Initial commit.
galaxyp
parents:
diff changeset
75 ===============================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
76
163892325845 Initial commit.
galaxyp
parents:
diff changeset
77 Input file containing all protein sequences in FASTA format. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
78 This tool will look for any type of protein ID in the first part of FASTA sequence headers up until the first white space. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
79 Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;). \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
80 Optionally IDs may be prefixed with a database namespace and a colon (:). \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
81 For example the accession number P32234 as well as the ID 128UP_DROME would be recognized in both this sequence header:
163892325845 Initial commit.
galaxyp
parents:
diff changeset
82
163892325845 Initial commit.
galaxyp
parents:
diff changeset
83 >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)
163892325845 Initial commit.
galaxyp
parents:
diff changeset
84
163892325845 Initial commit.
galaxyp
parents:
diff changeset
85 and in this one:
163892325845 Initial commit.
galaxyp
parents:
diff changeset
86
163892325845 Initial commit.
galaxyp
parents:
diff changeset
87 >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)
163892325845 Initial commit.
galaxyp
parents:
diff changeset
88
163892325845 Initial commit.
galaxyp
parents:
diff changeset
89 ===================================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
90 *N-terminal and C-terminal sequence context length*
163892325845 Initial commit.
galaxyp
parents:
diff changeset
91 ===================================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
92
163892325845 Initial commit.
galaxyp
parents:
diff changeset
93 Integers specifying the length of the N-terminal and C-terminal sequence context to retrieve starting from the modification site. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
94 Note that the width of a modification site is 1 amino acid. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
95 When defaults are used for both the N-terminal and C-terminal sequence context lengths, \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
96 the total sequence context length for a modification site will be:
163892325845 Initial commit.
galaxyp
parents:
diff changeset
97 (N-terminal sequence context) + (modified amino acid) + (C-terminal sequence context) = 5 + 1 + 5 = 11.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
98
163892325845 Initial commit.
galaxyp
parents:
diff changeset
99 ===============================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
100 *Modified amino acid*
163892325845 Initial commit.
galaxyp
parents:
diff changeset
101 ===============================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
102
163892325845 Initial commit.
galaxyp
parents:
diff changeset
103 The amino acid must be specified in uppercase and the modification in lower case. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
104 The order is not important. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
105 Hence a phosphorylated serine in a peptide sequence can be indicated with either pS or Sp, \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
106 but you cannot mix both pS and Sp in a single peptide sequence file. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
107 You may provide an asterisk (*) instead of an upper case amino acid to retrieve sequence contexts \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
108 for the specified modification no matter what amino acid it was located on. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
109 A modification may be specified with more than one lower case character, \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
110 so for example phosphoS or Sphospho can also be used for a phosphorylated serine.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
111
163892325845 Initial commit.
galaxyp
parents:
diff changeset
112 ===============================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
113 *Padding character*
163892325845 Initial commit.
galaxyp
parents:
diff changeset
114 ===============================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
115
163892325845 Initial commit.
galaxyp
parents:
diff changeset
116 Optional padding character to fill N-terminal or C-terminal positions in the sequence context, \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
117 when the protein was too short to get a complete sequence context. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
118 Defaults to - a.k.a. dash or alignment gap character.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
119
163892325845 Initial commit.
galaxyp
parents:
diff changeset
120 -----
163892325845 Initial commit.
galaxyp
parents:
diff changeset
121
163892325845 Initial commit.
galaxyp
parents:
diff changeset
122 **Getting input data**
163892325845 Initial commit.
galaxyp
parents:
diff changeset
123
163892325845 Initial commit.
galaxyp
parents:
diff changeset
124 .. _my folder utility: http://mascotinternal.chem.uu.nl/mascot/cgi/uu_myfolder.pl
163892325845 Initial commit.
galaxyp
parents:
diff changeset
125
163892325845 Initial commit.
galaxyp
parents:
diff changeset
126 This tool requires \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
127 peptide sequences in TAB delimited format and \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
128 protein sequences from which the peptides were derived in FASTA format. \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
129 If your peptide sequences are not in TAB delimited format, you can convert from:
163892325845 Initial commit.
galaxyp
parents:
diff changeset
130
163892325845 Initial commit.
galaxyp
parents:
diff changeset
131 - FASTA format using *FASTA manipulation* -&gt; *FASTA-to-Tabular*
163892325845 Initial commit.
galaxyp
parents:
diff changeset
132 - A format using a different delimiter using *Text Manipulation* -&gt; *Convert*
163892325845 Initial commit.
galaxyp
parents:
diff changeset
133
163892325845 Initial commit.
galaxyp
parents:
diff changeset
134 When your peptides were derived from a mass spectrometry experiment and identified with a search engine like Mascot, Sequest, etc., \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
135 please make sure you provide the same FASTA database for this tool as the one used for your search.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
136 If you used Mascot hosted by the Biomolecular Mass Spectrometry and Proteomics Group @ Utrecht University, \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
137 you can use the `my folder utility`_ to download the FASTA databases from the Mascot server.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
138
163892325845 Initial commit.
galaxyp
parents:
diff changeset
139 -----
163892325845 Initial commit.
galaxyp
parents:
diff changeset
140
163892325845 Initial commit.
galaxyp
parents:
diff changeset
141 **Examples**
163892325845 Initial commit.
galaxyp
parents:
diff changeset
142
163892325845 Initial commit.
galaxyp
parents:
diff changeset
143 Example input for peptides identified with a Mascot search, \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
144 some with phosphorylated residues indicated by pS, pT or pY \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
145 and in TAB delimited format::
163892325845 Initial commit.
galaxyp
parents:
diff changeset
146
163892325845 Initial commit.
galaxyp
parents:
diff changeset
147 sequence score peptide mr mass delta (abs) mass delta (ppm) all protein matches
163892325845 Initial commit.
galaxyp
parents:
diff changeset
148 AGNAARDN 54.24 787.357254 -4.223E-5 -0.05334300253998803 H2A1B_HUMAN[67-74]; H2A1C_HUMAN[67-74]; H2A1D_HUMAN[67-74]
163892325845 Initial commit.
galaxyp
parents:
diff changeset
149 KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413]
163892325845 Initial commit.
galaxyp
parents:
diff changeset
150 RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36]
163892325845 Initial commit.
galaxyp
parents:
diff changeset
151 KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110]
163892325845 Initial commit.
galaxyp
parents:
diff changeset
152 KIKELQAF 11.87 975.575287 0.003907 4.004816493470687 MMP20_HUMAN[71-78]
163892325845 Initial commit.
galaxyp
parents:
diff changeset
153 KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689]
163892325845 Initial commit.
galaxyp
parents:
diff changeset
154 KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245]
163892325845 Initial commit.
galaxyp
parents:
diff changeset
155 KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620]
163892325845 Initial commit.
galaxyp
parents:
diff changeset
156
163892325845 Initial commit.
galaxyp
parents:
diff changeset
157 ===============================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
158 *Appending modification site sequence contexts*
163892325845 Initial commit.
galaxyp
parents:
diff changeset
159 ===============================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
160
163892325845 Initial commit.
galaxyp
parents:
diff changeset
161 With these options:
163892325845 Initial commit.
galaxyp
parents:
diff changeset
162
163892325845 Initial commit.
galaxyp
parents:
diff changeset
163 - p\* as *modified amino acid*
163892325845 Initial commit.
galaxyp
parents:
diff changeset
164 - c6 as *Protein identifier column*
163892325845 Initial commit.
galaxyp
parents:
diff changeset
165 - c1 as *Peptide sequence column*
163892325845 Initial commit.
galaxyp
parents:
diff changeset
166 - a suitable FASTA database with *Protein sequences*
163892325845 Initial commit.
galaxyp
parents:
diff changeset
167 - and everything else set to defaults
163892325845 Initial commit.
galaxyp
parents:
diff changeset
168
163892325845 Initial commit.
galaxyp
parents:
diff changeset
169 the example above will generate a result like this::
163892325845 Initial commit.
galaxyp
parents:
diff changeset
170
163892325845 Initial commit.
galaxyp
parents:
diff changeset
171 KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413] KIFKLSAAVVL
163892325845 Initial commit.
galaxyp
parents:
diff changeset
172 RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36] AGIKVTVAGLA
163892325845 Initial commit.
galaxyp
parents:
diff changeset
173 KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] EKEKISGTVNI
163892325845 Initial commit.
galaxyp
parents:
diff changeset
174 KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] EKISGTVNIRT
163892325845 Initial commit.
galaxyp
parents:
diff changeset
175 KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] LEYKLYEALKF
163892325845 Initial commit.
galaxyp
parents:
diff changeset
176 KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] DKLDASESLRK
163892325845 Initial commit.
galaxyp
parents:
diff changeset
177 KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] LDASESLRKEE
163892325845 Initial commit.
galaxyp
parents:
diff changeset
178
163892325845 Initial commit.
galaxyp
parents:
diff changeset
179 Note the header line was ignored, peptides like AGNAARDN without any modified amino acids are absent from the output \
163892325845 Initial commit.
galaxyp
parents:
diff changeset
180 and peptides like KLDApSEpSLR with more than one modified amino acid occur more than once in the output.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
181
163892325845 Initial commit.
galaxyp
parents:
diff changeset
182 </help>
163892325845 Initial commit.
galaxyp
parents:
diff changeset
183 </tool>