comparison shm_csr.xml @ 4:5ffd52fc35c4 draft

Uploaded
author davidvanzessen
date Mon, 12 Dec 2016 05:22:37 -0500
parents
children
comparison
equal deleted inserted replaced
3:beaa487ecf43 4:5ffd52fc35c4
1 <tool id="shm_csr" name="SHM &amp; CSR pipeline" version="1.0">
2 <description></description>
3 <command interpreter="bash">
4 wrapper.sh '$in_file' custom '$out_file' '$out_file.files_path' '${in_file.name}' "-" $functionality $unique $naive_output_cond.naive_output '$naive_output_ca' '$naive_output_cg' '$naive_output_cm' '$naive_output_ce' '$naive_output_all' $filter_uniques $class_filter_cond.class_filter $empty_region_filter $fast
5 </command> <!-- wrapper.sh is called with purely positional arguments. File paths and the dataset name are single-quoted so that whitespace in them cannot shift the later arguments; the select values are fixed tokens from the option lists below. -->
6 <inputs>
7 <param type="data" name="in_file" label="IMGT zip file to be analysed"/> <!-- Input dataset handed to wrapper.sh as its first argument; no format restriction is declared here. -->
8 <param name="empty_region_filter" type="select" label="Sequence starts at" help="" > <!-- Single-choice select, so exactly one option may carry selected="true". NOTE(review): FR1 is kept as the default; confirm that "leader" was not meant to be the default. -->
9 <option value="leader">Leader: include FR1, CDR1, FR2, CDR2, FR3 in filters</option>
10 <option value="FR1" selected="true">FR1: include CDR1,FR2,CDR2,FR3 in filters</option>
11 <option value="CDR1">CDR1: include FR2,CDR2,FR3 in filters</option>
12 <option value="FR2">FR2: include CDR2,FR3 in filters</option>
13 </param>
14 <param name="functionality" type="select" label="Functionality filter" help="" > <!-- Selects which functionality assignments are kept; the chosen value is passed to wrapper.sh as $functionality. -->
15 <option value="productive" selected="true">Productive (Productive and Productive see comment)</option>
16 <option value="unproductive">Unproductive (Unproductive and Unproductive see comment)</option>
17 <option value="remove_unknown">Productive and Unproductive (Productive, Productive see comment, Unproductive and Unproductive see comment)</option>
18 </param>
19 <param type="select" name="filter_uniques" label="Filter unique sequences" help="See below for an example."> <!-- Unique-sequence filter mode (remove / keep / no); passed to wrapper.sh as $filter_uniques. -->
20 <option selected="true" value="remove">Remove uniques (Based on nucleotide sequence + C)</option>
21 <option value="keep">Keep uniques (Based on nucleotide sequence + C)</option>
22 <option value="no">No</option>
23 </param>
24 <param type="select" name="unique" label="Remove duplicates based on" help=""> <!-- Each value is a comma-separated key list defining a clone (presumably column names; confirm against wrapper.sh); passed as $unique. -->
25 <option value="VGene,CDR3.IMGT.AA,best_match_class">Top.V.Gene, CDR3 (AA), C region</option>
26 <option value="VGene,CDR3.IMGT.AA">Top.V.Gene, CDR3 (AA)</option>
27 <option value="CDR3.IMGT.AA,best_match_class">CDR3 (AA), C region</option>
28 <option value="CDR3.IMGT.AA">CDR3 (AA)</option>
29
30 <option value="VGene,CDR3.IMGT.seq,best_match_class">Top.V.Gene, CDR3 (nt), C region</option>
31 <option value="VGene,CDR3.IMGT.seq">Top.V.Gene, CDR3 (nt)</option>
32 <option value="CDR3.IMGT.seq,best_match_class">CDR3 (nt), C region</option>
33 <option value="CDR3.IMGT.seq">CDR3 (nt)</option>
34 <option selected="true" value="Sequence.ID">Don't remove duplicates</option>
35 </param>
36 <conditional name="class_filter_cond"> <!-- Wraps the class filter so the output filters can read it as class_filter_cond['class_filter']; no when-blocks are declared. -->
37 <param type="select" name="class_filter" label="Human Class/Subclass filter" help="">
38 <option selected="true" value="70_70">&gt;70% class and &gt;70% subclass</option>
39 <option value="60_55">&gt;60% class and &gt;55% subclass</option>
40 <option value="70_0">&gt;70% class</option>
41 <option value="60_0">&gt;60% class</option>
42 <option value="101_101">Do not assign (sub)class</option>
43 </param>
44 </conditional>
45 <conditional name="naive_output_cond"> <!-- Wraps the switch that the output filters read as naive_output_cond['naive_output']; no when-blocks are declared. -->
46 <param type="select" name="naive_output" label="Output new IMGT archives per class into your history?">
47 <option value="yes">Yes</option>
48 <option selected="true" value="no">No</option>
49 </param>
50 </conditional>
51 <param type="select" name="fast" label="Fast" help="Skips generating the new ZIP files and Change-O/Baseline"> <!-- Passed to wrapper.sh as the last positional argument ($fast). -->
52 <option value="yes">Yes</option>
53 <option selected="true" value="no">No</option>
54 </param>
55 </inputs>
56 <outputs> <!-- out_file is always produced. The per-class archives (CA/CG/CM/CE) require naive_output == "yes" and a class filter other than "101_101"; the combined archive requires naive_output == "yes" and class filter "101_101". -->
57 <data name="out_file" format="html" label="SHM &amp; CSR on ${in_file.name}"/>
58 <data name="naive_output_ca" format="imgt_archive" label="Naive CA input data from ${in_file.name}">
59 <filter>naive_output_cond['naive_output'] == "yes"</filter>
60 <filter>class_filter_cond['class_filter'] != "101_101"</filter>
61 </data>
62 <data name="naive_output_cg" format="imgt_archive" label="Naive CG input data from ${in_file.name}">
63 <filter>naive_output_cond['naive_output'] == "yes"</filter>
64 <filter>class_filter_cond['class_filter'] != "101_101"</filter>
65 </data>
66 <data name="naive_output_cm" format="imgt_archive" label="Naive CM input data from ${in_file.name}">
67 <filter>naive_output_cond['naive_output'] == "yes"</filter>
68 <filter>class_filter_cond['class_filter'] != "101_101"</filter>
69 </data>
70 <data name="naive_output_ce" format="imgt_archive" label="Naive CE input data from ${in_file.name}">
71 <filter>naive_output_cond['naive_output'] == "yes"</filter>
72 <filter>class_filter_cond['class_filter'] != "101_101"</filter>
73 </data>
74 <data name="naive_output_all" format="imgt_archive" label="Naive input data from ${in_file.name}">
75 <filter>naive_output_cond['naive_output'] == "yes"</filter>
76 <filter>class_filter_cond['class_filter'] == "101_101"</filter>
77 </data>
78 </outputs>
79 <citations> <!-- DOIs of the two papers referenced in the help text below. -->
80 <citation type="doi">10.1093/nar/gks457</citation> <!-- Yaari, Uduman and Kleinstein (2012), Nucleic Acids Research -->
81 <citation type="doi">10.1093/bioinformatics/btv359</citation> <!-- Gupta et al. (2015), Change-O, Bioinformatics -->
82 </citations>
83 <help>
84 <![CDATA[
85 **References**
86
87 Yaari, G. and Uduman, M. and Kleinstein, S. H. (2012). Quantifying selection in high-throughput Immunoglobulin sequencing data sets. In *Nucleic Acids Research, 40 (17), pp. e134–e134.* [`doi:10.1093/nar/gks457`_]
88
89 .. _doi:10.1093/nar/gks457: http://dx.doi.org/10.1093/nar/gks457
90
91 Gupta, Namita T. and Vander Heiden, Jason A. and Uduman, Mohamed and Gadala-Maria, Daniel and Yaari, Gur and Kleinstein, Steven H. (2015). Change-O: a toolkit for analyzing large-scale B cell immunoglobulin repertoire sequencing data. In *Bioinformatics, 31 (20), pp. 3356–3358.* [`doi:10.1093/bioinformatics/btv359`_]
92
93 .. _doi:10.1093/bioinformatics/btv359: http://dx.doi.org/10.1093/bioinformatics/btv359
94
95 -----
96
97 **Input files**
98
99 IMGT/HighV-QUEST .zip and .txz are accepted as input files.
100
101 .. class:: infomark
102
103 Note: Files can be uploaded by using “get data” and “upload file” and selecting “IMGT archive” as a file type.
104
105 -----
106
107 **Sequence starts at**
108
109 Identifies the region which will be included in the analysis (analysed region)
110
111 - Sequences which are missing a gene region (FR1/CDR1 etc) in the analysed region are excluded
112 - Sequences containing an ambiguous base in the analysed region are excluded
113 - All other filtering/analysis is based on the analysed region
114
115 -----
116
117 **Functionality filter**
118
119 Allows filtering on productive rearrangements, unproductive rearrangements, or both, based on the assignment provided by IMGT.
120
121 **Filter unique sequences**
122
123 *Remove unique:*
124
125
126 This filter consists of two different steps.
127
128 Step 1: removes all sequences of which the nucleotide sequence in the “analysed region” (see sequence starts at filter) occurs only once. (Sub)classes are not taken into account in this filter step.
129
130 Step 2: removes all duplicate sequences (sequences with the exact same nucleotide sequence in the analysed region and the same (sub)class).
131
132 .. class:: infomark
133
134 Note: This means that sequences with the same nucleotide sequence but a different (sub)class will be included in the results of both (sub)classes.
135
136 *Keep unique:*
137
138 Removes all duplicate sequences (sequences with the exact same nucleotide sequence in the analysed region and the same (sub)class).
139
140 Example of the sequences that are included using either the “remove unique filter” or the “keep unique filter”
141
142 +--------------------------+
143 |      unique filter       |
144 +--------+--------+--------+
145 | values | remove | keep   |
146 +--------+--------+--------+
147 | A      | A      | A      |
148 +--------+--------+--------+
149 | A      | B      | B      |
150 +--------+--------+--------+
151 | B      | D      | C      |
152 +--------+--------+--------+
153 | B      |        | D      |
154 +--------+--------+--------+
155 | C      |        |        |
156 +--------+--------+--------+
157 | D      |        |        |
158 +--------+--------+--------+
159 | D      |        |        |
160 +--------+--------+--------+
161
162 -----
163
164 **Remove duplicates based on**
165
166 Allows the selection of a single sequence per clone. Different definitions of a clone can be chosen.
167
168 .. class:: infomark
169
170 Note: The first sequence (in the data set) of each clone is included in the analysis. When that first sequence is unmatched (no subclass assigned), the first matched sequence will be included instead. This means that altering the data order (for instance by sorting) can change the sequence which is included in the analysis and therefore slightly influence results.
171
172 -----
173
174 **Human Class/Subclass filter**
175
176 .. class:: warningmark
177
178 Note: This filter should only be applied when analysing human IGH data in which a (sub)class specific sequence is present. Otherwise please select the "do not assign (sub)class" option to prevent errors when running the pipeline.
179
180 The class percentage is based on the ‘chunk hit percentage’ (see below). The subclass percentage is based on the ‘nt hit percentage’ (see below).
181
182 The SHM & CSR pipeline identifies human Cµ, Cα, Cγ and Cε constant genes by dividing the reference sequences for the subclasses (NG_001019) into 8-nucleotide chunks which overlap by 4 nucleotides. These overlapping chunks are then individually aligned in the right order to each input sequence. This alignment is used to calculate the chunk hit percentage and the nt hit percentage.
183
184 *Chunk hit percentage*: the percentage of the chunks that is aligned
185
186 *Nt hit percentage*: the percentage of chunks covering the subclass-specific nucleotides that match with the different subclasses. The most stringent filter for the subclass is 70% ‘nt hit percentage’, which means that 5 out of 7 subclass-specific nucleotides for Cα or 6 out of 8 subclass-specific nucleotides for Cγ should match with the specific subclass.
187
188 -----
189
190 **Output new IMGT archives per class into your history?**
191
192 If yes is selected, additional output files (one for each class) will be added to the history which contain information of the sequences that passed the selected filtering criteria. These files are in the same format as the IMGT/HighV-QUEST output files and therefore are also compatible with many other analysis programs, such as IGGalaxy.
193
194 ]]>
195 </help>
196 </tool>