comparison crest.xml @ 0:acc8d8bfeb9a

Uploaded
author jjohnson
date Wed, 08 Feb 2012 16:59:24 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc8d8bfeb9a
1 <tool id="crest" name="CREST" version="1.0">
2 <description>Clipping reveals structural variations</description>
3 <requirements>
4 <requirement type="package">bioperl</requirement>
5 <requirement type="binary">gfServer</requirement>
6 <requirement type="binary">gfClient</requirement>
7 <requirement type="binary">cap</requirement>
8 </requirements>
9 <!-- Keep a copy of the generated script beside the log, then run it; mkdir -p avoids an error message if the directory already exists --> <command> mkdir -p $crest_log.extra_files_path; cat $shscript > $crest_log.extra_files_path/crest.sh; /bin/bash $shscript &#38;> $crest_log</command>
10 <inputs>
11 <param name="tumor_bam" type="data" format="bam" label="Tumor Sample"
12 help="BAM files must contain soft-clipping signatures at the breakpoints. If they do not, you will not get any results."/>
13 <param name="germline_bam" type="data" format="bam" optional="true" label="Germline Sample" help=""/>
14 <conditional name="refGenomeSource">
15 <param name="genomeSource" type="select" label="&lt;HR&gt;Will you select a reference genome from your history or use a built-in index" help="">
16 <option value="indexed">Use a built-in index</option>
17 <option value="history">Use one from the history</option>
18 </param>
19 <when value="indexed">
20 <param name="genome_fasta" type="select" label="Select a Reference Dataset" help="if your genome of interest is not listed - contact Galaxy team">
21 <options from_file="crest.loc">
22 <column name="dbkey" index="0"/>
23 <column name="name" index="1"/>
24 <column name="value" index="2"/>
25 <filter type="data_meta" ref="tumor_bam" key="dbkey" column="0" />
26 <validator type="no_options" message="No indexes are available" />
27 </options>
28 </param>
29 <param name="genome_2bit" type="select" optional="true" label="The 2bit index" help="">
30 <options from_file="crest.loc">
31 <column name="dbkey" index="0"/>
32 <column name="name" index="1"/>
33 <column name="value" index="3"/>
34 <filter type="data_meta" ref="tumor_bam" key="dbkey" column="0" />
35 </options>
36 </param>
37 </when>
38 <when value="history">
39 <param name="genome_fasta" type="data" format="fasta" label="Genome Reference Sequence" help="Should match your input Tumor Sample BAM file database">
40 <validator type="unspecified_build" message="Must assign a build"/>
41 </param>
42 <param name="genome_2bit" type="data" format="twobit" label="Genome Reference 2bit index (Choose same as Genome Reference Sequence)" help="">
43 <validator type="unspecified_build" message="Must assign a build"/>
44 </param>
45 </when> <!-- history -->
46 </conditional> <!-- refGenomeSource -->
47 <!-- Input Datasets -->
48 <conditional name="rnaseq">
49 <param name="mode" type="boolean" checked="false" truevalue="yes" falsevalue="no" label="&lt;HR&gt;RNAseq mode"
50 help="Requires a gene model file"/>
51 <when value="no">
52 </when>
53 <when value="yes">
54 <param name="gene_model" type="data" format="bed" label="Gene model file" help="currently only refFlat format (BED) is supported"/>
55 <param name="cluster_size" type="integer" value="" optional="true" label="Cluster Size"
56 help="The soft-clipped reads within cluster_size will be considered together, default is 3">
57 <validator type="in_range" message="Must be greater than 0" min="1"/>
58 </param>
59 </when>
60 </conditional>
61 <!-- options -->
62 <param name="paired" type="boolean" checked="true" truevalue="" falsevalue="--nopaired" label="&lt;HR&gt;Paired Reads?"/>
63 <param name="read_len" type="integer" value="" optional="true" label="Read length of the sequencing data"
64 help="The read length of the sequencing data, default 100">
65 <validator type="in_range" message="Must be greater than 0" min="1"/>
66 </param>
67 <param name="sensitive" type="boolean" checked="false" truevalue="--sensitive" falsevalue="" label="&lt;HR&gt;Sensitive"
68 help="The program will generate more SVs with higher false positive rate."/>
69 <param name="range" type="text" value="" optional="true" label="Limit Genome range where SV will be detected,"
70 help="The range where SV will be detected, using chr1:100-200 format">
71 <validator type="regex" message="format: chr1:100-200">^\w+(:\d+-\d+)?$</validator>
72 </param>
73 <conditional name="hit">
74 <param name="mode" type="select" label="&lt;HR&gt;Adjust Hit Detection">
75 <option value="no">Use defaults</option>
76 <option value="yes">Adjust Settings</option>
77 </param>
78 <when value="yes">
79 <param name="max_score_diff" type="integer" value="" optional="true" label="maximum score difference"
80 help="The maximum score difference when stopping select hit, default 10.">
81 <validator type="in_range" message="Must be greater than 0" min="1"/>
82 </param>
83 <param name="min_sclip_reads" type="integer" value="" optional="true" label="Minimum number of soft clipping reads"
84 help="Minimum number of soft clipping reads to trigger the procedure, default 3 (10 for RNASeq)">
85 <validator type="in_range" message="Must be greater than 0" min="1"/>
86 </param>
87 <param name="max_rep_cover" type="integer" value="" optional="true" label="Repetitive coverage threshold"
88 help="The minimum coverage at which a position is called repetitive and does not trigger the procedure, default 500 (5000 for RNASeq)">
89 <validator type="in_range" message="Must be greater than 0" min="1"/>
90 </param>
91 <param name="min_sclip_len" type="integer" value="" optional="true" label="Soft clipping detection"
92 help="The min length of soft clipping part at a position to trigger the detection, default 20.">
93 <validator type="in_range" message="Must be greater than 0" min="1"/>
94 </param>
95 <param name="min_hit_len" type="integer" value="" optional="true" label="Minimum length of a hit for genome mapping"
96 help="">
97 <validator type="in_range" message="Must be greater than 0" min="1"/>
98 </param>
99 <param name="min_hit_reads" type="integer" value="" optional="true" label="Minimum read hits"
100 help="Minimum number of reads in a hit. default 3 (10 for RNASeq)">
101 <validator type="in_range" message="Must be greater than 0" min="1"/>
102 </param>
103 <param name="min_dist_diff" type="integer" value="" optional="true" label="Min distance between the mapped position and the soft clipping position"
104 help="Min distance between the mapped position and the soft clipping position, default 20.">
105 <validator type="in_range" message="Must be greater than 0" min="1"/>
106 </param>
107 </when>
108 <when value="no"/>
109 </conditional>
110
111 <conditional name="softclip">
112 <param name="mode" type="select" label="&lt;HR&gt;Adjust Soft Clipping">
113 <option value="no">Use defaults</option>
114 <option value="yes">Adjust Settings</option>
115 </param>
116 <when value="yes">
117 <param name="min_percent_id" type="integer" value="" optional="true" label="Identity threshold for soft clipping read mapping"
118 help="Min percentage of identity of soft clipping read mapping, default 90">
119 <validator type="in_range" message="Must be greater than 0" min="1"/>
120 </param>
121 <param name="min_percent_hq" type="integer" value="" optional="true" label="High quality bases threshold for soft clipping"
122 help="Min percentage of high quality base in soft clipping reads, default 80">
123 <validator type="in_range" message="Must be greater than 0" min="1"/>
124 </param>
125 <param name="lowqual_cutoff" type="integer" value="" optional="true" label="Low quality cutoff"
126 help="Low quality cutoff value, default 20.">
127 <validator type="in_range" message="Must be greater than 0" min="1"/>
128 </param>
129 </when>
130 <when value="no"/>
131 </conditional>
132
133 <conditional name="sv_filter"> <!-- Optional overrides for the SV filtering thresholds; a field left blank is not passed to CREST -->
134 <param name="mode" type="select" label="&lt;HR&gt;Adjust Structural Variant Filtering">
135 <option value="no">Use defaults</option>
136 <option value="yes">Adjust Settings</option>
137 </param>
138 <when value="yes">
139 <param name="min_percent_cons_of_read" type="float" value="" optional="true" label="Relative consensus length threshold"
140 help="Minimum percent of consensus length of read length, default 0.75">
141 <validator type="in_range" message="Must be between 0 and 1" min="0" max="1"/>
142 </param>
143 <param name="max_bp_dist" type="integer" value="" optional="true" label="Maximum distance between break points"
144 help="Maximum distance in base pairs between two identified break points, default 15">
145 <validator type="in_range" message="Must be greater than 0" min="1"/>
146 </param>
147 <param name="germline_seq_width" type="integer" value="" optional="true" label="Germline SV filtering window"
148 help="Half window width of genomic sequence around break point for germline SV filtering, default 100">
149 <validator type="in_range" message="Must be greater than 0" min="1"/>
150 </param>
151 <param name="germline_search_width" type="integer" value="" optional="true" label="Soft Clip Germline SV filtering window"
152 help="Half window width for searching soft-clipped reads around breakpoint for germline SV filtering, default 50">
153 <validator type="in_range" message="Must be greater than 0" min="1"/>
154 </param>
155 </when>
156 <when value="no"/>
157 </conditional>
158
159 <conditional name="rescue">
160 <param name="mode" type="select" label="&lt;HR&gt;Rescue mode"
161 help="a SV with only 1 side with enough soft-clipped reads is considered as a valid one, default is ON.">
162 <option value="no">Rescue mode Off</option>
163 <option value="default" selected="true">Rescue On with default Setting</option>
164 <option value="yes">Adjust Rescue Settings</option>
165 </param>
166 <when value="yes">
167 <param name="min_one_side_reads" type="integer" value="" optional="true" label="Minimum number of soft-clipped reads on one side"
168 help="the minimum number of soft-clipped reads on one side, default 5">
169 <validator type="in_range" message="Must be greater than 0" min="1"/>
170 </param>
171 </when>
172 <when value="no"/>
173 <when value="default"/>
174 </conditional>
175
176 <conditional name="tandem_repeat"> <!-- 'default' adds no options, 'yes' passes only the fields that are filled in, 'no' adds the norm_tandem_repeat flag -->
177 <param name="mode" type="select" label="&lt;HR&gt;Tandem Repeats"
178 help="Remove tandem repeat caused SV events, default is ON.">
179 <option value="default" selected="true">Remove Tandem Repeats using default Setting</option>
180 <option value="yes">Remove Tandem Repeats with Adjusted Settings</option>
181 <option value="no">Keep Tandem Repeats</option>
182 </param>
183 <when value="yes">
184 <param name="tr_max_indel_size" type="integer" value="" optional="true" label="Maximum INDEL events"
185 help="Maximum tandem repeat mediated INDEL events, default 100">
186 <validator type="in_range" message="Must be greater than 0" min="1"/>
187 </param>
188 <param name="tr_min_size" type="integer" value="" optional="true" label="Minimum tandem repeat size"
189 help="Minimum tandem repeat size, default 2">
190 <validator type="in_range" message="Must be greater than 0" min="1"/>
191 </param>
192 <param name="tr_max_size" type="integer" value="" optional="true" label="Maximum tandem repeat size"
193 help="Maximum tandem repeat size, default 8">
194 <validator type="in_range" message="Must be greater than 0" min="1"/>
195 </param>
196 <param name="tr_min_num" type="integer" value="" optional="true" label="Minimum tandem repeat number"
197 help="Minimum tandem repeat number, default 4">
198 <validator type="in_range" message="Must be greater than 0" min="1"/>
199 </param>
200 <param name="hetero_factor" type="float" value="" optional="true" label="Heterogeneity and heterozygosity factor"
201 help="The factor about the SV's heterogeneity and heterozygosity, default 0.4">
202 <validator type="in_range" message="Must be between 0 and 1" min="0" max="1"/>
203 </param>
204 <param name="triger_p_value" type="float" value="" optional="true" label="P-value threshold to trigger SV detection"
205 help="The p-value that will trigger the SV detection when number of soft-clipped reads is small, default 0.05">
206 <validator type="in_range" message="Must be between 0 and 1" min="0" max="1"/>
207 </param> <!-- the param name keeps the spelling of the triger_p_value option that the script passes to CREST -->
208 </when>
209 <when value="default"/>
210 <when value="no"/>
211 </conditional>
212 </inputs>
213 <outputs>
214 <data format="txt" name="crest_log" label="${tool.name} on ${on_string}: crest.log" />
215 <data format="tabular" name="tumor_cover" label="${tool.name} on ${on_string}: tumor.cover" from_work_dir="tumor.bam.cover"/>
216 <data format="tabular" name="tumor_sclip" label="${tool.name} on ${on_string}: tumor.sclip.txt" from_work_dir="tumor.bam.sclip.txt"/>
217 <data format="tabular" name="germline_cover" label="${tool.name} on ${on_string}: germline.cover" from_work_dir="germline.bam.cover">
218 <filter>germline_bam != None</filter>
219 </data>
220 <data format="tabular" name="germline_sclip" label="${tool.name} on ${on_string}: germline.sclip.txt" from_work_dir="germline.bam.sclip.txt">
221 <filter>germline_bam != None</filter>
222 </data>
223 <data format="tabular" name="predSV" label="${tool.name} on ${on_string}: tumor.predSV.txt" from_work_dir="tumor.bam.predSV.txt"/>
224 <data format="html" name="predSV_html" label="${tool.name} on ${on_string}: tumor.bam.predSV.html" />
225 </outputs>
226 <configfiles>
227 <configfile name="shscript"> #slurp
228 #!/bin/bash
229 ## define some things for cheetah processing and to avoid problems with xml parsing of this tool_config
230 #set $amp = chr(38)
231 #set $ds = chr(36)
232 #set $gt = chr(62)
233 #set $lt = chr(60)
234 #set $echo_cmd = 'echo'
235 ## Find the CREST.pl in the galaxy tool path
236 #import Cheetah.FileUtils
237 #set $toolpath = '/'.join([$__root_dir__,'tools','crest'])
238 #set $crest = $Cheetah.FileUtils.findFiles($toolpath,['CREST.pl'],[],['example','Tree'])[0]
239 #set $extractSClip = $Cheetah.FileUtils.findFiles($toolpath,['extractSClip.pl'],[],['example','Tree'])[0]
240 #set $countDiff = $Cheetah.FileUtils.findFiles($toolpath,['countDiff.pl'],[],['example','Tree'])[0]
241 #set $bam2html = $Cheetah.FileUtils.findFiles($toolpath,['bam2html.pl'],[],['example','Tree'])[0]
242 ##
243 ## Need ptrfinder on path
244 export PATH=${ds}PATH:$toolpath
245 #raw
246 ## Set temp directory
247 export TMPDIR=`pwd`/tmp
248 mkdir -p $TMPDIR
249 #end raw
250 ## check for the genome reference 2bit
251 if [ ! -f $refGenomeSource.genome_2bit ]; then exit 1; fi
252 ## get the dbkey and use that in link name
253 #set $dbkey = $tumor_bam.metadata.dbkey
254 #set $ref_fa = '.'.join([$dbkey,'fa'])
255 #set $ref_2bit = '.'.join([$dbkey,'2bit'])
256 ref_fa=$ref_fa
257 ref_2bit=$ref_2bit
258 ln -s $refGenomeSource.genome_fasta $ref_fa
259 ln -s $refGenomeSource.genome_2bit $ref_2bit
260 target_genome=`pwd`/$ref_2bit
261 ## Problem - gfServer doesn't reserve the port until it's done reading genome, so another might try to open the same port
262 #raw
263 ## start a local gfServer with the selected genome reference
264 ## find an open port on which to start a blat server via gfServer
265 for (( bp = 50000 + $$ % 1000; bp &lt; 60000; bp += 7 ))
266 do
267 if ! netstat -an | grep $bp > /dev/null; then blatport=$bp; break; fi
268 done
269 if [ -z "$blatport" ]; then echo "No open port found for gfServer"; exit 1; fi
270 echo "Starting gfServer on port " $blatport
271 #end raw
272 ( gfServer -canStop -log=gfServer.log start localhost ${ds}blatport ${ds}target_genome 2${gt} /dev/null ) ${amp}
273 #raw
274 (
275 ## symbolic link the tumor input bam and bai files in our working directory
276 #end raw
277 ln -s $tumor_bam tumor.bam
278 ln -s $tumor_bam.metadata.bam_index tumor.bam.bai
279 ## String value of an Optional DataToolParameter input is 'None' when not set
280 #if $germline_bam.__str__ != 'None':
281 #raw
282 ## symbolic link the germline input bam and bai files in our working directory
283 #end raw
284 ln -s $germline_bam germline.bam
285 ln -s $germline_bam.metadata.bam_index germline.bam.bai
286 #end if
287 #raw
288 ## Get soft-clipping positions.
289 #end raw
290 $echo_cmd perl -I $toolpath $extractSClip -i tumor.bam --ref_genome $ref_fa
291 perl -I $toolpath $extractSClip -i tumor.bam --ref_genome $ref_fa
292 ##
293 ## If there is a germline input, extract its soft-clipping positions as well (countDiff below reads germline.bam.cover)
294 #if $germline_bam.__str__ != 'None':
295 $echo_cmd perl -I $toolpath $extractSClip -i germline.bam --ref_genome $ref_fa
296 perl -I $toolpath $extractSClip -i germline.bam --ref_genome $ref_fa
297 #raw
298 ## Remove germline events (optional)
299 #end raw
300 $echo_cmd perl -I $toolpath $countDiff -d tumor.bam.cover -g germline.bam.cover to soft_clip.dist.txt
301 perl -I $toolpath $countDiff -d tumor.bam.cover -g germline.bam.cover $gt soft_clip.dist.txt
302 #end if
303 )
304 ## Running the SV detection script.
305 ## Determine the CREST options
306 #set $crest_args = ["-f tumor.bam.cover -d tumor.bam"]
307 ##
308 #if $germline_bam.__str__ != 'None':
309 #set $crest_args = $crest_args + ["-g germline.bam"]
310 #end if
311 #set $crest_args = $crest_args + ["--ref_genome",$ref_fa]
312 ##
313 #if $rnaseq.mode
314 #set $crest_args = $crest_args + ["--RNASeq","--genemodel",$rnaseq.gene_model.__str__]
315 #if $rnaseq.cluster_size.__str__ != '':
316 #set $crest_args = $crest_args + ["--cluster_size",$rnaseq.cluster_size.__str__]
317 #end if
318 #end if
319 ##
320 #if $paired.__str__ != '':
321 #set $crest_args = $crest_args + [$paired.__str__]
322 #end if
323 #if $sensitive.__str__ != '':
324 #set $crest_args = $crest_args + [$sensitive.__str__]
325 #end if
326 #if $range.__str__ != '':
327 #set $crest_args = $crest_args + ["-r",$range.__str__]
328 #end if
329 #if $read_len.__str__ != '':
330 #set $crest_args = $crest_args + ["-l",$read_len.__str__]
331 #end if
332 ##
333 #if $hit.mode.__str__ == 'yes':
334 #if $hit.max_score_diff.__str__ != '':
335 #set $crest_args = $crest_args + ["--max_score_diff", $hit.max_score_diff.__str__]
336 #end if
337 #if $hit.min_sclip_reads.__str__ != '':
338 #set $crest_args = $crest_args + ["--min_sclip_reads",$hit.min_sclip_reads.__str__]
339 #end if
340 #if $hit.max_rep_cover.__str__ != '':
341 #set $crest_args = $crest_args + ["--max_rep_cover",$hit.max_rep_cover.__str__]
342 #end if
343 #if $hit.min_sclip_len.__str__ != '':
344 #set $crest_args = $crest_args + ["--min_sclip_len",$hit.min_sclip_len.__str__]
345 #end if
346 #if $hit.min_hit_len.__str__ != '':
347 #set $crest_args = $crest_args + ["--min_hit_len",$hit.min_hit_len.__str__]
348 #end if
349 #if $hit.min_hit_reads.__str__ != '':
350 #set $crest_args = $crest_args + ["--min_hit_reads",$hit.min_hit_reads.__str__]
351 #end if
352 #if $hit.min_dist_diff.__str__ != '':
353 #set $crest_args = $crest_args + ["--min_dist_diff",$hit.min_dist_diff.__str__]
354 #end if
355 #end if
356 ##
357 #if $softclip.mode.__str__ == 'yes':
358 #if $softclip.min_percent_id.__str__ != '':
359 #set $crest_args = $crest_args + ["--min_percent_id",$softclip.min_percent_id.__str__]
360 #end if
361 #if $softclip.min_percent_hq.__str__ != '':
362 #set $crest_args = $crest_args + ["--min_percent_hq",$softclip.min_percent_hq.__str__]
363 #end if
364 #if $softclip.lowqual_cutoff.__str__ != '':
365 #set $crest_args = $crest_args + ["--lowqual_cutoff",$softclip.lowqual_cutoff.__str__]
366 #end if
367 #end if
368 ##
369 #if $sv_filter.mode.__str__ == 'yes':
370 #if $sv_filter.min_percent_cons_of_read.__str__ != '':
371 #set $crest_args = $crest_args + ["--min_percent_cons_of_read",$sv_filter.min_percent_cons_of_read.__str__]
372 #end if
373 #if $sv_filter.max_bp_dist.__str__ != '':
374 #set $crest_args = $crest_args + ["--max_bp_dist",$sv_filter.max_bp_dist.__str__]
375 #end if
376 #if $sv_filter.germline_seq_width.__str__ != '':
377 #set $crest_args = $crest_args + ["--germline_seq_width",$sv_filter.germline_seq_width.__str__]
378 #end if
379 #if $sv_filter.germline_search_width.__str__ != '':
380 #set $crest_args = $crest_args + ["--germline_search_width",$sv_filter.germline_search_width.__str__]
381 #end if
382 #end if
383 ##
384 #if $rescue.mode.__str__ == 'yes':
385 #if $rescue.min_one_side_reads.__str__ != '':
386 #set $crest_args = $crest_args + ["--min_one_side_reads",$rescue.min_one_side_reads.__str__]
387 #end if
388 #elif $rescue.mode.__str__ == 'no':
389 #set $crest_args = $crest_args + ["--norescue"]
390 #end if
391 ##
392 #if $tandem_repeat.mode.__str__ == 'yes':
393 #if $tandem_repeat.tr_max_indel_size.__str__ != '':
394 #set $crest_args = $crest_args + ["--tr_max_indel_size",$tandem_repeat.tr_max_indel_size.__str__]
395 #end if
396 #if $tandem_repeat.tr_min_size.__str__ != '':
397 #set $crest_args = $crest_args + ["--tr_min_size",$tandem_repeat.tr_min_size.__str__]
398 #end if
399 #if $tandem_repeat.tr_max_size.__str__ != '':
400 #set $crest_args = $crest_args + ["--tr_max_size",$tandem_repeat.tr_max_size.__str__]
401 #end if
402 #if $tandem_repeat.tr_min_num.__str__ != '':
403 #set $crest_args = $crest_args + ["--tr_min_num",$tandem_repeat.tr_min_num.__str__]
404 #end if
405 #if $tandem_repeat.hetero_factor.__str__ != '':
406 #set $crest_args = $crest_args + ["--hetero_factor",$tandem_repeat.hetero_factor.__str__]
407 #end if
408 #if $tandem_repeat.triger_p_value.__str__ != '':
409 #set $crest_args = $crest_args + ["--triger_p_value",$tandem_repeat.triger_p_value.__str__]
410 #end if
411 #elif $tandem_repeat.mode.__str__ == 'no':
412 #set $crest_args = $crest_args + ["--norm_tandem_repeat"]
413 #end if
414 #raw
415 ## check if gfServer is ready
416 echo "Waiting for gfServer"
417 for (( tries = 0; tries &lt; 30; tries += 1 ))
418 do
419 if ! netstat -an | grep $blatport > /dev/null &#38;&#38; ps -f | grep gfServer | grep $blatport > /dev/null; then sleep 60; else break; fi
420 done
421 #end raw
422 #raw
423 ## Run CREST
424 #end raw
425 (
426 ## perl -I ~/src/crest ../CREST.pl -f tumor.bam.cover -d tumor.bam -g germline.bam --ref_genome hg18.fa --2bitdir=/home/msi/jj/src/crest/example --target_genome=hg18.2bit --blatserver localhost --blatport 50000
427 $echo_cmd perl -I $toolpath $crest #echo ' '.join($crest_args)# --target_genome ${ds}target_genome --blatserver localhost --blatport ${ds}blatport
428 perl -I $toolpath $crest #echo ' '.join($crest_args)# --target_genome ${ds}target_genome --blatserver localhost --blatport ${ds}blatport
429 #raw
430 ## Visualization of the detailed alignment at breakpoint (optional)
431 ## The bam2html.pl script builds an html view of the multiple alignment for the breakpoint, so you can manually check the soft-clipping and other things.
432 #end raw
433 ## bam2html.pl -r hg18.fa -d tumor.bam -g germline.bam -o predSV.html -f predSV.txt
434 if [ -e tumor.bam.predSV.txt ]
435 then
436 #if $germline_bam.__str__ != 'None':
437 perl -I $toolpath $bam2html -d tumor.bam -g germline.bam -f tumor.bam.predSV.txt --ref_genome $ref_fa -o $predSV_html
438 #else
439 perl -I $toolpath $bam2html -d tumor.bam -f tumor.bam.predSV.txt --ref_genome $ref_fa -o $predSV_html
440 #end if
441 fi
442 )
443 #raw
444 ## shut down the blat server
445 echo "shutting down gfServer on port " $blatport
446 gfServer stop localhost $blatport
447 #end raw
448 </configfile>
449 </configfiles>
450 <tests>
451 </tests>
452 <help>
453 **CREST**
454
455 CREST_ is an algorithm for detecting genomic structural variations at base-pair resolution using next-generation sequencing data.
456
457 CREST uses pieces of DNA called soft clips to find structural variations. Soft clips are the DNA segments produced during sequencing that fail to properly align to the reference genome as the sample genome is reassembled. CREST uses the soft clips to precisely identify sites of chromosomal rearrangement or where pieces of DNA are inserted or deleted.
458
459 Please cite the following article:
460
461 Wang J, Mullighan CG, Easton J, Roberts S, Heatley SL, Ma J, Rusch MC, Chen K, Harris CC, Ding L, Holmfeldt L, Payne-Turner D, Fan X, Wei L, Zhao D, Obenauer JC, Naeve C, Mardis ER, Wilson RK, Downing JR and Zhang J. CREST maps somatic structural variation in cancer genomes with base-pair resolution (2011). Nature_Methods_.
462
463 .. _Nature_Methods: http://www.nature.com/nmeth/journal/v8/n8/pdf/nmeth.1628.pdf
464 .. _CREST: http://www.stjuderesearch.org/site/lab/zhang
465
466
467 ----
468
469 **Input formats**
470
471 BAM files that must contain soft-clipping signatures at the breakpoints. If
472 they do not, you will not get any results.
473
474 CREST uses soft-clipping signatures to identify breakpoints. Soft-clipping is
475 indicated by "S" elements in the CIGAR for SAM/BAM records. Soft-clipping may
476 not occur, depending on the mapping algorithm and parameters and sometimes even
477 the library preparation.
478
479 With bwa sampe:
480
481 One mapping method that will soft-clip reads is bwa sampe (BWA for paired-end
482 reads). When BWA successfully maps one read in a pair but is not able to map
483 the other, it will attempt a more permissive Smith-Waterman alignment of the
484 unmapped read in the neighborhood of the mapped mate. If it is only able to
485 align part of the read, then it will soft-clip the portion on the end that it
486 could not align. Often this occurs at the breakpoints of structural
487 variations.
488
489 In some cases when the insert sizes approach the read length, BWA will not
490 perform Smith-Waterman alignment. Reads from inserts smaller than the read
491 length will contain primer and/or adapter and will often not map. When the
492 insert size is close to the read length, this creates a skewed distribution
493 of inferred insert sizes which may cause BWA to not attempt Smith-Waterman
494 realignment. This is indicated by the error message "weird pairing". Often
495 in these cases there are also unusually low mapping rates.
496
497 One way to fix this problem is to remap unmapped reads with bwasw. To do this,
498 extract the unmapped reads as FASTQ files (this may be done with a combination
499 of samtools view -f 4 and Picard's SamToFastq). Realign using bwa bwasw and
500 build a BAM file. Then, re-run CREST on this new BAM file, and you may pick
501 up events that would have been missed otherwise.
502
503
504
505
506 ------
507
508 **Outputs**
509
510 The output
511 file *.predSV.txt has the following tab-delimited columns: left_chr, left_pos,
512 left_strand, # of left soft-clipped reads, right_chr, right_pos, right_strand,
513 # right soft-clipped reads, SV type, coverage at left_pos, coverage at
514 right_pos, assembled length at left_pos, assembled length at right_pos,
515 average percent identity at left_pos, percent of non-unique mapping reads at
516 left_pos, average percent identity at right_pos, percent of non-unique mapping
517 reads at right_pos, start position of consensus mapping to genome,
518 starting chromosome of consensus mapping, position of the genomic mapping of
519 consensus starting position, end position of consensus mapping to genome,
520 ending chromosome of consensus mapping, position of genomic mapping of
521 consensus ending position, and consensus sequences. For inversion(INV), the
522 last 7 fields will be repeated to reflect the fact two different breakpoints
523 are needed to identify an INV event.
524
525 Example of the tumor.predSV.txt file:
526
527 4 125893227 + 5 10 66301858 - 4 CTX 29 14 83 71 0.895173453996983 0.230769230769231 0.735384615384615 0.5 1 4 125893135 176 10 66301773 TTATGAATTTTGAAATATATATCATATTTTGAAATATATATCATATTCTAAATTATGAAAAGAGAATATGATTCTCTTTTCAGTAGCTGTCACCTCCTGGGTTCAAGTGATTCTCCTGCCTCTACCTCCCGAGTAGCTGGGATTACAGGTGCCCACCACCATGCCTGGCTAATTTT
528 5 7052198 - 0 10 66301865 + 8 CTX 0 22 0 81 0.761379310344828 0.482758620689655 0 0 1 5 7052278 164 10 66301947 AGCCATGGACCTTGTGGTGGGTTCTTAACAATGGTGAGTCCGGAGTTCTTAACGATGGTGAGTCCGTAGTTTGTTCCTTCAGGAGTGAGCCAAGATCATGCCACTGCACTCTAGCCTGGGCAACAGAGGAAGACTCCACCTCAAAAAAAAAAAGTGGGAAGAGG
529 10 66301858 + 4 4 125893225 - 1 CTX 15 28 71 81 0.735384615384615 0.5 0.889507154213037 0.243243243243243 1 10 66301777 153 4 125893154 TTAGCCAGGCATGGTGGTGGGCACCTGTAATCCCAGCTACTCGGGAGGTAGAGGCAGGAGAATCACTTGAACCCAGGAGGTGACAGCTACTGAAAAGAGAATCATATTCTCTTTTCATAATTTAGAATATGATATATATTTCAAAATATGATA
530
531 If there are no or very few results, there may be a lack of soft-clipping.
532
533
534
535 </help>
536 </tool>