comparison defuse/defuse.xml @ 0:efddb7a0b3db

Uploaded
author jjohnson
date Fri, 16 Sep 2011 13:07:35 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:efddb7a0b3db
1 <tool id="defuse" name="DeFuse" version="1.0">
2 <description>identify fusion transcripts</description>
<!-- External dependencies of the tool.
     NOTE(review): the single "binary" requirement has no name, so no executable is actually declared here; confirm which one was intended. -->
3 <requirements>
4 <requirement type="binary"></requirement>
5 </requirements>
<!-- Command line: runs scripts/defuse.pl under the perl interpreter.
     -c : the backticks copy the generated defuse_config file to the config_txt output dataset and echo the config path, which becomes the option value.
     -d : the backticks create data_dir, symlink the two read datasets into it as reads_1.fastq and reads_2.fastq, and echo the directory name.
     -o : results go under output_dir; the outputs section collects them from there via from_work_dir.
     -p : passed a fixed value of 8. NOTE(review): presumably the number of parallel jobs; confirm against defuse.pl. -->
6 <command interpreter="perl">
7 scripts/defuse.pl
8 -c `cp $defuse_config $config_txt; echo $defuse_config`
9 -d `mkdir -p data_dir; ln -s $left_pairendreads data_dir/reads_1.fastq; ln -s $right_pairendreads data_dir/reads_2.fastq; echo data_dir`
10 -o output_dir -p 8
11 </command>
<!-- Inputs: two FASTQ datasets (left and right mates) plus a conditional that selects where the DeFuse reference configuration comes from. -->
12 <inputs>
13 <param name="left_pairendreads" type="data" format="fastq" label="left part of read pairs" help="The left and right reads pairs must be in the same order, and not have any unpaired reads. (FASTQ interlacer will pair reads and remove the unpaired. FASTQ de-interlacer will separate the result into left and right reads.)"/>
14 <param name="right_pairendreads" type="data" format="fastq" label="right part of read pairs" help="In the same order as the left reads"/>
<!-- refGenomeSource.genomeSource is tested by the configfile template: "history" includes the user's config verbatim, anything else builds the config from the selected reference entry. -->
15 <conditional name="refGenomeSource">
16 <param name="genomeSource" type="select" label="Will you select a built-in DeFuse Reference Dataset, or supply a configuration from your history" help="">
17 <option value="indexed">Use a built-in DeFuse Reference Dataset</option>
18 <option value="history">Use a configuration from your history that specifies the DeFuse Reference Dataset</option>
19 </param>
20 <when value="indexed">
<!-- Reference choices are read from defuse.loc; the template parses the selected value with ast.literal_eval into a dictionary of config settings. -->
21 <param name="index" type="select" label="Select a Reference Dataset" help="if your genome of interest is not listed - contact Galaxy team">
22 <options from_file="defuse.loc">
23 <column name="name" index="1"/>
24 <column name="value" index="2"/>
<!-- NOTE(review): sorts on column 0 although name and value are columns 1 and 2; confirm this is intended. -->
25 <filter type="sort_by" column="0" />
26 <validator type="no_options" message="No indexes are available" />
27 </options>
28 </param>
<!-- Optional overrides: with settings = "full", each non-empty value below replaces the corresponding reference or default value in the generated config. -->
29 <conditional name="defuse_param">
30 <param name="settings" type="select" label="Defuse parameter settings" help="">
31 <option value="preSet">Default settings</option>
32 <option value="full">Full parameter list</option>
33 </param>
34 <when value="preSet" />
35 <when value="full">
36 <param name="max_insert_size" type="integer" value="500" optional="true" label="Bowtie max_insert_size" />
37 <param name="dna_concordant_length" type="integer" value="2000" optional="true" label="Minimum gene fusion range dna_concordant_length" />
38 <param name="discord_read_trim" type="integer" value="50" optional="true" label="Trim length for discordant reads discord_read_trim" help="(split reads are not trimmed)" />
39 <param name="clustering_precision" type="float" value=".95" optional="true" label="Filter clustering_precision">
40 <validator type="in_range" message="Choose a value between .1 and 1.0" min=".1" max="1"/>
41 </param>
42 <param name="span_count_threshold" type="integer" value="5" optional="true" label="Filter span_count_threshold" />
43 <param name="split_count_threshold" type="integer" value="3" optional="true" label="Filter split_count_threshold" />
44 <param name="percent_identity_threshold" type="float" value=".90" optional="true" label="Filter percent_identity_threshold">
45 <validator type="in_range" message="Choose a value between .1 and 1.0" min=".1" max="1"/>
46 </param>
47 <param name="max_dist_pos" type="integer" value="600" optional="true" label="Filter max_dist_pos" />
48 <param name="num_dist_genes" type="integer" value="500" optional="true" label="Filter num_dist_genes" />
49 <param name="split_min_anchor" type="integer" value="4" optional="true" label="Filter split_min_anchor" />
50 <param name="max_concordant_ratio" type="float" value="0.1" optional="true" label="Filter max_concordant_ratio">
51 <validator type="in_range" message="Choose a value between 0.0 and 1.0" min="0" max="1"/>
52 </param>
53 <param name="splice_bias" type="integer" value="10" optional="true" label="Filter splice_bias" />
54 <param name="probability_threshold" type="float" value="0.50" optional="true" label="Filter probability_threshold">
55 <validator type="in_range" message="Choose a value between 0.0 and 1.0" min="0" max="1"/>
56 </param>
57 <param name="covariance_sampling_density" type="float" value="0.01" optional="true" label="covariance_sampling_density">
58 <help>Position density when calculating covariance</help>
59 <validator type="in_range" message="Choose a value between 0.0 and 1.0" min="0" max="1"/>
60 </param>
<!-- The empty "Use Default" value makes the template fall back to the reference entry or to "no". -->
61 <param name="denovo_assembly" type="select" label="denovo_assembly" help="">
62 <option value="">Use Default</option>
63 <option value="no">no</option>
64 <option value="yes">yes</option>
65 </param>
66 <!--
67 <param name="positive_controls" type="data" format="txt" optional=true label="Defuse positive_controls" help=""/>
68 -->
69 </when> <!-- full -->
70 </conditional> <!-- defuse_param -->
71 </when>
<!-- History source: the chosen dataset is a complete DeFuse config file; the template emits it unchanged via "#include raw". -->
72 <when value="history">
73 <param name="config" type="data" format="txt" label="Defuse Config file" help=""/>
74 </when> <!-- history -->
75 </conditional> <!-- refGenomeSource -->
76 </inputs>
77 <configfiles>
78 <configfile name="defuse_config">
79 #import ast
80 #if $refGenomeSource.genomeSource == "history":
81 #include raw $refGenomeSource.config.__str__
82 #else
83 #set $ref_dict = dict($ast.literal_eval($refGenomeSource.index.value))
84 #
85 # Configuration file for defuse
86 #
87 # At a minimum, change all values enclosed by []
88 #
89 # Gene/Transcript id pattern
## Each pattern is written as one "key = value" line: the value comes from the reference entry when it defines the key, otherwise from the default in the except branch.
## The transcript_id_pattern key must sit outside the gene_id_pattern try body; inside it, a missing gene_id_pattern skips the key and leaves the transcript value on a line without a key.
90 gene_id_pattern = #slurp
91 #try
92 $ref_dict['gene_id_pattern']
93 #except
94 ENSG\d+
95 #end try
96 transcript_id_pattern = #slurp
97 #try
98 $ref_dict['transcript_id_pattern']
99 #except
100 ENST\d+
101 #end try
102
103 # Directory where the defuse code was unpacked
104 ## Default location in the tool/defuse directory
105 # source_directory = ${__root_dir__}/tools/defuse
106 source_directory = #slurp
107 #try
108 $ref_dict['source_directory']
109 #except
110 ${__root_dir__}/tools/defuse
111 #end try
112
113 # Directory where you want your dataset
114 dataset_directory = #slurp
115 #try
116 $ref_dict['dataset_directory']
117 #except
118 /project/db/genomes/Hsapiens/hg19/defuse
119 #end try
120
121 # Input genome and gene models
122 gene_models = #slurp
123 #try
124 $ref_dict['gene_models']
125 #except
126 \$(dataset_directory)/Homo_sapiens.GRCh37.62.gtf
127 #end try
128 genome_fasta = #slurp
129 #try
130 $ref_dict['genome_fasta']
131 #except
132 \$(dataset_directory)/Homo_sapiens.GRCh37.62.dna.chromosome.fa
133 #end try
134
135 # Repeat table from ucsc genome browser
136 repeats_filename = #slurp
137 #try
138 $ref_dict['repeats_filename']
139 #except
140 \$(dataset_directory)/rmsk.txt
141 #end try
142
143 # EST info downloaded from ucsc genome browser
144 est_fasta = #slurp
145 #try
146 $ref_dict['est_fasta']
147 #except
148 \$(dataset_directory)/est.fa
149 #end try
150 est_alignments = #slurp
151 #try
152 $ref_dict['est_alignments']
153 #except
154 \$(dataset_directory)/intronEst.txt
155 #end try
156
157 # Unigene clusters downloaded from ncbi
158 unigene_fasta = #slurp
159 #try
160 $ref_dict['unigene_fasta']
161 #except
162 \$(dataset_directory)/Hs.seq.uniq
163 #end try
164
165 # Paths to external tools
166 bowtie_bin = #slurp
167 #try
168 $ref_dict['bowtie_bin']
169 #except
170 /soft/bowtie/0.12.7/bowtie
171 #end try
172 bowtie_build_bin = #slurp
173 #try
174 $ref_dict['bowtie_build_bin']
175 #except
176 /soft/bowtie/0.12.7/bowtie-build
177 #end try
178 blat_bin = #slurp
179 #try
180 $ref_dict['blat_bin']
181 #except
182 /soft/blat/34/bin/blat
183 #end try
184 fatotwobit_bin = #slurp
185 #try
186 $ref_dict['fatotwobit_bin']
187 #except
188 /soft/blat/34/bin/faToTwoBit
189 #end try
190 r_bin = #slurp
191 #try
192 $ref_dict['r_bin']
193 #except
194 /project/sdml-sles11-weblocal/R-2.12.1/bin/R
195 #end try
196 rscript_bin = #slurp
197 #try
198 $ref_dict['rscript_bin']
199 #except
200 /project/sdml-sles11-weblocal/R-2.12.1/bin/Rscript
201 #end try
202
203 #raw
204 # Dataset files
205 dataset_prefix = $(dataset_directory)/defuse
206 chromosome_prefix = $(dataset_prefix).dna.chromosomes
207 exons_fasta = $(dataset_prefix).exons.fa
208 cds_fasta = $(dataset_prefix).cds.fa
209 cdna_regions = $(dataset_prefix).cdna.regions
210 cdna_fasta = $(dataset_prefix).cdna.fa
211 reference_fasta = $(dataset_prefix).reference.fa
212 rrna_fasta = $(dataset_prefix).rrna.fa
213 ig_gene_list = $(dataset_prefix).ig.gene.list
214 repeats_regions = $(dataset_directory)/repeats.regions
215 est_split_fasta1 = $(dataset_directory)/est.1.fa
216 est_split_fasta2 = $(dataset_directory)/est.2.fa
217 est_split_fasta3 = $(dataset_directory)/est.3.fa
218 est_split_fasta4 = $(dataset_directory)/est.4.fa
219 est_split_fasta5 = $(dataset_directory)/est.5.fa
220 est_split_fasta6 = $(dataset_directory)/est.6.fa
221 est_split_fasta7 = $(dataset_directory)/est.7.fa
222 est_split_fasta8 = $(dataset_directory)/est.8.fa
223 est_split_fasta9 = $(dataset_directory)/est.9.fa
224
225 # Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs
226 prefilter1 = $(unigene_fasta)
227
228 # deFuse scripts and tools
229 scripts_directory = $(source_directory)/scripts
230 tools_directory = $(source_directory)/tools
231 data_directory = $(source_directory)/data
232 #end raw
233
234 # Path to samtools, 0.1.8 is compiled for you, use other versions at your own risk
235 samtools_bin = #slurp
236 #try
237 $ref_dict['samtools_bin']
238 #except
239 \$(source_directory)/external/samtools-0.1.8/samtools
240 #end try
241
242 # Bowtie parameters
243 bowtie_threads = #slurp
244 #try
245 $ref_dict['bowtie_threads']
246 #except
247 1
248 #end try
249 bowtie_quals = #slurp
250 #try
251 $ref_dict['bowtie_quals']
252 #except
253 --phred33-quals
254 #end try
255 max_insert_size = #slurp
256 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_insert_size.__str__ != "":
257 $refGenomeSource.defuse_param.max_insert_size
258 #else
259 #try
260 $ref_dict['max_insert_size']
261 #except
262 500
263 #end try
264 #end if
265
266 # Parameters for building the dataset
## Chromosome list for building the dataset: taken from the reference entry when it defines "chromosomes", otherwise the default list in the except branch.
## Uses the same subscript lookup as every other reference setting in this template.
267 chromosomes = #slurp
268 #try
269 $ref_dict['chromosomes']
270 #except
271 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT
272 #end try
273 mt_chromosome = #slurp
274 #try
275 $ref_dict['mt_chromosome']
276 #except
277 MT
278 #end try
279 gene_sources = #slurp
280 #try
281 $ref_dict['gene_sources']
282 #except
283 IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding
284 #end try
285 ig_gene_sources = #slurp
286 #try
287 $ref_dict['ig_gene_sources']
288 #except
289 IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene
290 #end try
291 rrna_gene_sources = #slurp
292 #try
293 $ref_dict['rrna_gene_sources']
294 #except
295 Mt_rRNA,rRNA,rRNA_pseudogene
296 #end try
297
298 # Blat sequences per job
299 num_blat_sequences = #slurp
300 #try
301 $ref_dict['num_blat_sequences']
302 #except
303 10000
304 #end try
305
306 # Minimum gene fusion range
307 dna_concordant_length = #slurp
308 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.dna_concordant_length.__str__ != "":
309 $refGenomeSource.defuse_param.dna_concordant_length
310 #else
311 #try
312 $ref_dict['dna_concordant_length']
313 #except
314 2000
315 #end try
316 #end if
317
318 # Trim length for discordant reads (split reads are not trimmed)
319 discord_read_trim = #slurp
320 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.discord_read_trim.__str__ != "":
321 $refGenomeSource.defuse_param.discord_read_trim
322 #else
323 #try
324 $ref_dict['discord_read_trim']
325 #except
326 50
327 #end try
328 #end if
329
330 # Filtering parameters
331 clustering_precision = #slurp
332 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.clustering_precision.__str__ != ""
333 $refGenomeSource.defuse_param.clustering_precision
334 #else
335 #try
336 $ref_dict['clustering_precision']
337 #except
338 0.95
339 #end try
340 #end if
341 span_count_threshold = #slurp
342 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.span_count_threshold.__str__ != ""
343 $refGenomeSource.defuse_param.span_count_threshold
344 #else
345 #try
346 $ref_dict['span_count_threshold']
347 #except
348 5
349 #end try
350 #end if
351 split_count_threshold = #slurp
352 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_count_threshold.__str__ != ""
353 $refGenomeSource.defuse_param.split_count_threshold
354 #else
355 #try
356 $ref_dict['split_count_threshold']
357 #except
358 3
359 #end try
360 #end if
361 percent_identity_threshold = #slurp
362 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.percent_identity_threshold.__str__ != ""
363 $refGenomeSource.defuse_param.percent_identity_threshold
364 #else
365 #try
366 $ref_dict['percent_identity_threshold']
367 #except
368 0.90
369 #end try
370 #end if
371 max_dist_pos = #slurp
372 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_dist_pos.__str__ != ""
373 $refGenomeSource.defuse_param.max_dist_pos
374 #else
375 #try
376 $ref_dict['max_dist_pos']
377 #except
378 600
379 #end try
380 #end if
381 num_dist_genes = #slurp
382 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.num_dist_genes.__str__ != ""
383 $refGenomeSource.defuse_param.num_dist_genes
384 #else
385 #try
386 $ref_dict['num_dist_genes']
387 #except
388 500
389 #end try
390 #end if
391 split_min_anchor = #slurp
392 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_min_anchor.__str__ != ""
393 $refGenomeSource.defuse_param.split_min_anchor
394 #else
395 #try
396 $ref_dict['split_min_anchor']
397 #except
398 4
399 #end try
400 #end if
401 max_concordant_ratio = #slurp
402 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_concordant_ratio.__str__ != ""
403 $refGenomeSource.defuse_param.max_concordant_ratio
404 #else
405 #try
406 $ref_dict['max_concordant_ratio']
407 #except
408 0.1
409 #end try
410 #end if
411 splice_bias = #slurp
412 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.splice_bias.__str__ != ""
413 $refGenomeSource.defuse_param.splice_bias
414 #else
415 #try
416 $ref_dict['splice_bias']
417 #except
418 10
419 #end try
420 #end if
421 denovo_assembly = #slurp
422 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.denovo_assembly.__str__ != ""
423 $refGenomeSource.defuse_param.denovo_assembly
424 #else
425 #try
426 $ref_dict['denovo_assembly']
427 #except
428 no
429 #end try
430 #end if
431 probability_threshold = #slurp
432 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.probability_threshold.__str__ != ""
433 $refGenomeSource.defuse_param.probability_threshold
434 #else
435 #try
436 $ref_dict['probability_threshold']
437 #except
438 0.50
439 #end try
440 #end if
441 positive_controls = \$(data_directory)/controls.txt
442
443 # Position density when calculating covariance
444 covariance_sampling_density = #slurp
445 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.covariance_sampling_density.__str__ != ""
446 $refGenomeSource.defuse_param.covariance_sampling_density
447 #else
448 #try
449 $ref_dict['covariance_sampling_density']
450 #except
451 0.01
452 #end try
453 #end if
454
455
456 # Number of reads for each job in split
457 reads_per_job = 1000000
458
459 # Number of regions for each breakpoint sequence job in split
460 regions_per_job = 20
461
462 #raw
463 # If you have command line 'mail' and wish to be notified
464 # mailto = andrew.mcpherson@gmail.com
465
466 # Remove temp files
467 remove_job_files = yes
468 remove_job_temp_files = yes
469
470 # Converting to fastq
471 # Fastq converter config format 1 for reads stored in separate files for each end
472 # data_lane_regex_N is a perl regex which stores the lane id in $1
473 # data_end_regex_N is a perl regex which stores the end, 1 or 2, in $1
474 # data_compress_regex_N is a perl regex which stores the compression extension in $1
475 # data_convert_N is the associated conversion utility that takes data at stdin and outputs fastq at stdout
476 # Fastq converter config format 2 for reads stored in a single file containing both ends
477 # data_lane_regex_N is a perl regex which stores the lane id in $1
478 # data_compress_regex_N is a perl regex which stores the compression extension in $1
479 # data_end1_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 1 at stdout
480 # data_end2_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 2 at stdout
481
482 data_lane_regex_1 = ^(.+)_[12]_export\.txt.*$
483 data_end_regex_1 = ^.+_([12])_export\.txt.*$
484 data_compress_regex_1 = ^.+_[12]_export\.txt(.*)$
485 data_converter_1 = $(scripts_directory)/fq_all2std.pl export2std
486
487 data_lane_regex_2 = ^(.+)_[12]_concat_qseq\.txt.*$
488 data_end_regex_2 = ^.+_([12])_concat_qseq\.txt.*$
489 data_compress_regex_2 = ^.+_[12]_concat_qseq\.txt(.*)$
490 data_converter_2 = $(scripts_directory)/qseq2fastq.pl
491
492 data_lane_regex_3 = ^(.+)\.bam.*$
493 data_compress_regex_3 = ^.+\.bam(.*)$
494 data_end1_converter_3 = samtools view - | filter_sam_mate.pl 1 | sam_to_fastq.pl
495 data_end2_converter_3 = samtools view - | filter_sam_mate.pl 2 | sam_to_fastq.pl
496
497 data_lane_regex_4 = ^(.+).[12].fastq.*$
498 data_end_regex_4 = ^.+.([12]).fastq.*$
499 data_compress_regex_4 = ^.+.[12].fastq(.*)$
500 data_converter_4 = cat
501 #end raw
502
503 #end if
504
505 </configfile>
506 </configfiles>
<!-- Outputs: config_txt receives a copy of the generated config (written by the cp in the command line).
     The other four datasets are collected from the named files under output_dir via from_work_dir. -->
507 <outputs>
508 <data format="txt" name="config_txt" label="${tool.name} on ${on_string}: config.txt"/>
509 <data format="txt" name="defuse_log" label="${tool.name} on ${on_string}: defuse.log" from_work_dir="output_dir/log/defuse.log"/>
510 <data format="tabular" name="results_tsv" label="${tool.name} on ${on_string}: results.tsv" from_work_dir="output_dir/results.tsv"/>
511 <data format="tabular" name="results_filtered_tsv" label="${tool.name} on ${on_string}: results.filtered.tsv" from_work_dir="output_dir/results.filtered.tsv"/>
512 <data format="tabular" name="results_classify_tsv" label="${tool.name} on ${on_string}: results.classify.tsv" from_work_dir="output_dir/results.classify.tsv"/>
513 </outputs>
514 <tests>
515 </tests>
516 <help>
517 **DeFuse**
518
519 DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion.
520
521 Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138
522
523 .. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
524
525 ------
526
527 **Inputs**
528
529 DeFuse requires 2 fastq files for paired reads, one with the left mate of the paired reads, and a second fastq with the right mate of the paired reads (**with reads in the same order as in the first fastq dataset**).
530
531 If your fastq files have reads in different orders or include unpaired reads, you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq.
532
533 DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_:
534 - genome_fasta from Ensembl
535 - gene_models from Ensembl
536 - repeats_filename from UCSC RepeatMasker rmsk.txt
537 - est_fasta from UCSC
538 - est_alignments from UCSC intronEst.txt
539 - unigene_fasta from NCBI
540
541 .. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2
542
543 ------
544
545 **Outputs**
546
547 The galaxy history will contain 5 outputs: the config.txt file that provides DeFuse with its parameters, the defuse.log which details what DeFuse has done and can be useful in determining any errors, and the 3 results files that defuse generates.
548
549 DeFuse generates 3 results files: results.tsv, results.filtered.tsv, and results.classify.tsv. All three files have the same format, though results.classify.tsv has a probability column from the application of the classifier to results.tsv, and results.filtered.tsv has been filtered according to the threshold probability as set in config.txt.
550
551 The file format is tab delimited with one prediction per line, and the following fields per prediction (not necessarily in this order):
552
553 - **Identification**
554 - cluster_id : random identifier assigned to each prediction
555 - library_name : library name given on the command line of defuse
556 - gene1 : ensembl id of gene 1
557 - gene2 : ensembl id of gene 2
558 - gene_name1 : name of gene 1
559 - gene_name2 : name of gene 2
560 - **Evidence**
561 - break_predict : breakpoint prediction method, denovo or splitr, that is considered most reliable
562 - concordant_ratio : proportion of spanning reads considered concordant by blat
563 - denovo_min_count : minimum kmer count across denovo assembled sequence
564 - denovo_sequence : fusion sequence predicted by debruijn based denovo sequence assembly
565 - denovo_span_pvalue : p-value, lower values are evidence the prediction is a false positive
566 - gene_align_strand1 : alignment strand for spanning read alignments to gene 1
567 - gene_align_strand2 : alignment strand for spanning read alignments to gene 2
568 - min_map_count : minimum of the number of genomic mappings for each spanning read
569 - max_map_count : maximum of the number of genomic mappings for each spanning read
570 - mean_map_count : average of the number of genomic mappings for each spanning read
571 - num_multi_map : number of spanning reads that map to more than one genomic location
572 - span_count : number of spanning reads supporting the fusion
573 - span_coverage1 : coverage of spanning reads aligned to gene 1 as a proportion of expected coverage
574 - span_coverage2 : coverage of spanning reads aligned to gene 2 as a proportion of expected coverage
575 - span_coverage_min : minimum of span_coverage1 and span_coverage2
576 - span_coverage_max : maximum of span_coverage1 and span_coverage2
577 - splitr_count : number of split reads supporting the prediction
578 - splitr_min_pvalue : p-value, lower values are evidence the prediction is a false positive
579 - splitr_pos_pvalue : p-value, lower values are evidence the prediction is a false positive
580 - splitr_sequence : fusion sequence predicted by split reads
581 - splitr_span_pvalue : p-value, lower values are evidence the prediction is a false positive
582 - **Annotation**
583 - adjacent : fusion between adjacent genes
584 - altsplice : fusion likely the product of alternative splicing between adjacent genes
585 - break_adj_entropy1 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 1
586 - break_adj_entropy2 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 2
587 - break_adj_entropy_min : minimum of break_adj_entropy1 and break_adj_entropy2
588 - breakpoint_homology : number of nucleotides at the fusion splice that align equally well to gene 1 or gene 2
589 - breakseqs_estislands_percident : maximum percent identity of fusion sequence alignments to est islands
590 - cdna_breakseqs_percident : maximum percent identity of fusion sequence alignments to cdna
591 - deletion : fusion produced by a genomic deletion
592 - est_breakseqs_percident : maximum percent identity of fusion sequence alignments to est
593 - eversion : fusion produced by a genomic eversion
594 - exonboundaries : fusion splice at exon boundaries
595 - expression1 : expression of gene 1 as number of concordant pairs aligned to exons
596 - expression2 : expression of gene 2 as number of concordant pairs aligned to exons
597 - gene_chromosome1 : chromosome of gene 1
598 - gene_chromosome2 : chromosome of gene 2
599 - gene_end1 : end position for gene 1
600 - gene_end2 : end position for gene 2
601 - gene_location1 : location of breakpoint in gene 1
602 - gene_location2 : location of breakpoint in gene 2
603 - gene_start1 : start of gene 1
604 - gene_start2 : start of gene 2
605 - gene_strand1 : strand of gene 1
606 - gene_strand2 : strand of gene 2
607 - genome_breakseqs_percident : maximum percent identity of fusion sequence alignments to genome
608 - genomic_break_pos1 : genomic position in gene 1 of fusion splice / breakpoint
609 - genomic_break_pos2 : genomic position in gene 2 of fusion splice / breakpoint
610 - genomic_strand1 : genomic strand in gene 1 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
611 - genomic_strand2 : genomic strand in gene 2 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
612 - interchromosomal : fusion produced by an interchromosomal translocation
613 - interrupted_index1 : ratio of coverage before and after the fusion splice / breakpoint in gene 1
614 - interrupted_index2 : ratio of coverage before and after the fusion splice / breakpoint in gene 2
615 - inversion : fusion produced by genomic inversion
616 - orf : fusion combines genes in a way that preserves a reading frame
617 - probability : probability produced by classification using adaboost and example positives/negatives (only given in results.classify.tsv)
618 - read_through : fusion involving adjacent genes, potentially resulting from co-transcription rather than genome rearrangement
619 - repeat_proportion1 : proportion of the spanning reads in gene 1 that span a repeat region
620 - repeat_proportion2 : proportion of the spanning reads in gene 2 that span a repeat region
621 - max_repeat_proportion : max of repeat_proportion1 and repeat_proportion2
622 - splice_score : number of nucleotides similar to GTAG at fusion splice
623 - num_splice_variants : number of potential splice variants for this gene pair
624 - splicing_index1 : number of concordant pairs in gene 1 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 2
625 - splicing_index2 : number of concordant pairs in gene 2 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 1
626
627
628 **Example**
629
630 results.tsv::
631
632 cluster_id splitr_sequence splitr_count splitr_span_pvalue splitr_pos_pvalue splitr_min_pvalue adjacent altsplice break_adj_entropy1 break_adj_entropy2 break_adj_entropy_min break_predict breakpoint_homology breakseqs_estislands_percident cdna_breakseqs_percident concordant_ratio deletion est_breakseqs_percident eversion exonboundaries expression1 expression2 gene1 gene2 gene_align_strand1 gene_align_strand2 gene_chromosome1 gene_chromosome2 gene_end1 gene_end2 gene_location1 gene_location2 gene_name1 gene_name2 gene_start1 gene_start2 gene_strand1 gene_strand2 genome_breakseqs_percident genomic_break_pos1 genomic_break_pos2 genomic_strand1 genomic_strand2 interchromosomal interrupted_index1 interrupted_index2 inversion library_name max_map_count max_repeat_proportion mean_map_count min_map_count num_multi_map num_splice_variants orf read_through repeat_proportion1 repeat_proportion2 span_count span_coverage1 span_coverage2 span_coverage_max span_coverage_min splice_score splicing_index1 splicing_index2
633 1169 GCTTACTGTATGCCAGGCCCCAGAGGGGCAACCACCCTCTAAAGAGAGCGGCTCCTGCCTCCCAGAAAGCTCACAGACTGTGGGAGGGAAACAGGCAGCAGGTGAAGATGCCAAATGCCAGGATATCTGCCCTGTCCTTGCTTGATGCAGCTGCTGGCTCCCACGTTCTCCCCAGAATCCCCTCACACTCCTGCTGTTTTCTCTGCAGGTTGGCAGAGCCCCATGAGGGCAGGGCAGCCACTTTGTTCTTGGGCGGCAAACCTCCCTGGGCGGCACGGAAACCACGGTGAGAAGGGGGCAGGTCGGGCACGTGCAGGGACCACGCTGCAGG|TGTACCCAACAGCTCCGAAGAGACAGCGACCATCGAGAACGGGCCATGATGACGATGGCGGTTTTGTCGAAAAGAAAAGGGGGAAATGTGGGGAAAAGCAAGAGAGATCAGATTGTTACTGTGTCTGTGTAGAAAGAAGTAGACATGGGAGACTCCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGAGATTCGGTGACCCCACCCCCAACCCCGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTGAATGGATTAAGGGCGGTGCGAGACGTGCTTT 2 0.000436307890680442 0.110748295953850 0.0880671602973091 N Y 3.19872427442695 3.48337348351473 3.19872427442695 splitr 0 0 0 0 Y 0 N N 0 0 ENSG00000105549 ENSG00000213753 + - 19 19 376013 59111168 intron upstream THEG AC016629.2 361750 59084870 - + 0 375099 386594 + - N 8.34107429512245 - N output_dir 82 0.677852348993289 40.6666666666667 1 11 1 N N 0.361271676300578 0.677852348993289 12 0.758602776578432 0.569678713445872 0.758602776578432 0.569678713445872 2 0.416666666666667 -
634 3596 TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG 250 7.00711162298275e-72 0.00912124762512338 0.00684237452309549 N N 3.31745197152461 3.47233119514066 3.31745197152461 splitr 7 0.0157657657657656 0 0 N 0.0135135135135136 N N 0 0 ENSG00000156860 ENSG00000212932 - + 16 21 30682131 48111157 coding upstream FBRS RPL23AP4 30670289 48110676 + + 0.0157657657657656 30680678 9827473 - + Y - - N output_dir 2 1 1.11111111111111 1 1 1 N N 0 1 9 0.325530693397641 0.296465452915709 0.325530693397641 0.296465452915709 2 - -
635
636 </help>
637 </tool>