# HG changeset patch # User Jim Johnson # Date 1316194897 18000 # Node ID 36306d8086fa559cf1d39c18337d50dcdd910862 # Parent efddb7a0b3dbe8eb8f6035e93f31ec4e19fc4d3c Remove defuse dir diff -r efddb7a0b3db -r 36306d8086fa README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Fri Sep 16 12:41:37 2011 -0500 @@ -0,0 +1,33 @@ +The DeFuse galaxy tool is based on DeFuse_Version_0.4.2 + http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page + +DeFuse is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. + + +Manual: + http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2 + + +The included defuse source code is from: http://sourceforge.net/projects/defuse/files/defuse/0.4/defuse-0.4.2.tar.gz/download +(without the defuse-0.4.2 dir level) +tar zxf defuse-0.4.2.tar.gz +cd tools +make +cd .. 
+ +To use with non human genome references: +tar zxf modified_scripts.tgz +Defuse source was modified to include 2 extra parameters for non human references: gene_id_pattern and transcript_id_pattern + scripts/alignjob.pl + scripts/annotate_fusions.pl + scripts/calculate_expression_simple.pl + scripts/filter_bulk_fusion_reads.pl + scripts/filter_sam_genes.pl + scripts/find_concordant_ensembl.pl + scripts/find_gene_clusters.pl + + +The defuse.xml galaxy tool wrapper will generate a defuse config.txt using values from tool-data/defuse.loc +and call scripts/defuse.pl + + diff -r efddb7a0b3db -r 36306d8086fa defuse-0.4.2.tar.gz Binary file defuse-0.4.2.tar.gz has changed diff -r efddb7a0b3db -r 36306d8086fa defuse.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/defuse.xml Fri Sep 16 12:41:37 2011 -0500 @@ -0,0 +1,638 @@ + + identify fusion transcripts + + + + + scripts/defuse.pl + -c `cp $defuse_config $config_txt; echo $defuse_config` + -d `mkdir -p data_dir; ln -s $left_pairendreads data_dir/reads_1.fastq; ln -s $right_pairendreads data_dir/reads_2.fastq; echo data_dir` + -o output_dir -p 8 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Position density when calculating covariance + + + + + + + + + + + + + + + + + + +#import ast +#if $refGenomeSource.genomeSource == "history": +#include raw $refGenomeSource.config.__str__ +#else +#set $ref_dict = dict($ast.literal_eval($refGenomeSource.index.value)) +# +# Configuration file for defuse +# +# At a minimum, change all values enclused by [] +# +# Gene/Transcript id pattern +gene_id_pattern = #slurp +#try +$ref_dict['gene_id_pattern'] +#except +ENSG\d+ +#end try +## transcript_id_pattern is emitted outside the gene_id_pattern #try so the key is still written when the gene_id_pattern fallback is taken +transcript_id_pattern = #slurp +#try +$ref_dict['transcript_id_pattern'] +#except +ENST\d+ +#end try + +# Directory where the defuse code was unpacked +## Default location in the tool/defuse directory +# source_directory = ${__root_dir__}/tools/defuse +source_directory = #slurp +#try 
+$ref_dict['source_directory'] +#except +${__root_dir__}/tools/defuse +#end try + +# Directory where you want your dataset +dataset_directory = #slurp +#try +$ref_dict['dataset_directory'] +#except +/project/db/genomes/Hsapiens/hg19/defuse +#end try + +# Input genome and gene models +gene_models = #slurp +#try +$ref_dict['gene_models'] +#except +\$(dataset_directory)/Homo_sapiens.GRCh37.62.gtf +#end try +genome_fasta = #slurp +#try +$ref_dict['genome_fasta'] +#except +\$(dataset_directory)/Homo_sapiens.GRCh37.62.dna.chromosome.fa +#end try + +# Repeat table from ucsc genome browser +repeats_filename = #slurp +#try +$ref_dict['repeats_filename'] +#except +\$(dataset_directory)/rmsk.txt +#end try + +# EST info downloaded from ucsc genome browser +est_fasta = #slurp +#try +$ref_dict['est_fasta'] +#except +\$(dataset_directory)/est.fa +#end try +est_alignments = #slurp +#try +$ref_dict['est_alignments'] +#except +\$(dataset_directory)/intronEst.txt +#end try + +# Unigene clusters downloaded from ncbi +unigene_fasta = #slurp +#try +$ref_dict['unigene_fasta'] +#except +\$(dataset_directory)/Hs.seq.uniq +#end try + +# Paths to external tools +bowtie_bin = #slurp +#try +$ref_dict['bowtie_bin'] +#except +/soft/bowtie/0.12.7/bowtie +#end try +bowtie_build_bin = #slurp +#try +$ref_dict['bowtie_build_bin'] +#except +/soft/bowtie/0.12.7/bowtie-build +#end try +blat_bin = #slurp +#try +$ref_dict['blat_bin'] +#except +/soft/blat/34/bin/blat +#end try +fatotwobit_bin = #slurp +#try +$ref_dict['fatotwobit_bin'] +#except +/soft/blat/34/bin/faToTwoBit +#end try +r_bin = #slurp +#try +$ref_dict['r_bin'] +#except +/project/sdml-sles11-weblocal/R-2.12.1/bin/R +#end try +rscript_bin = #slurp +#try +$ref_dict['rscript_bin'] +#except +/project/sdml-sles11-weblocal/R-2.12.1/bin/Rscript +#end try + +#raw +# Dataset files +dataset_prefix = $(dataset_directory)/defuse +chromosome_prefix = $(dataset_prefix).dna.chromosomes +exons_fasta = $(dataset_prefix).exons.fa +cds_fasta = 
$(dataset_prefix).cds.fa +cdna_regions = $(dataset_prefix).cdna.regions +cdna_fasta = $(dataset_prefix).cdna.fa +reference_fasta = $(dataset_prefix).reference.fa +rrna_fasta = $(dataset_prefix).rrna.fa +ig_gene_list = $(dataset_prefix).ig.gene.list +repeats_regions = $(dataset_directory)/repeats.regions +est_split_fasta1 = $(dataset_directory)/est.1.fa +est_split_fasta2 = $(dataset_directory)/est.2.fa +est_split_fasta3 = $(dataset_directory)/est.3.fa +est_split_fasta4 = $(dataset_directory)/est.4.fa +est_split_fasta5 = $(dataset_directory)/est.5.fa +est_split_fasta6 = $(dataset_directory)/est.6.fa +est_split_fasta7 = $(dataset_directory)/est.7.fa +est_split_fasta8 = $(dataset_directory)/est.8.fa +est_split_fasta9 = $(dataset_directory)/est.9.fa + +# Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs +prefilter1 = $(unigene_fasta) + +# deFuse scripts and tools +scripts_directory = $(source_directory)/scripts +tools_directory = $(source_directory)/tools +data_directory = $(source_directory)/data +#end raw + +# Path to samtools, 0.1.8 is compiled for you, use other versions at your own risk +samtools_bin = #slurp +#try +$ref_dict['samtools_bin'] +#except +\$(source_directory)/external/samtools-0.1.8/samtools +#end try + +# Bowtie parameters +bowtie_threads = #slurp +#try +$ref_dict['bowtie_threads'] +#except +1 +#end try +bowtie_quals = #slurp +#try +$ref_dict['bowtie_quals'] +#except +--phred33-quals +#end try +max_insert_size = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_insert_size.__str__ != "": +$refGenomeSource.defuse_param.max_insert_size +#else +#try +$ref_dict['max_insert_size'] +#except +500 +#end try +#end if + +# Parameters for building the dataset +chromosomes = #slurp +#try +$ref_dict.chromosomes +#except +1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT +#end try +mt_chromosome = #slurp +#try +$ref_dict['mt_chromosome'] +#except +MT +#end try 
+gene_sources = #slurp +#try +$ref_dict['gene_sources'] +#except +IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding +#end try +ig_gene_sources = #slurp +#try +$ref_dict['ig_gene_sources'] +#except +IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene +#end try +rrna_gene_sources = #slurp +#try +$ref_dict['rrna_gene_sources'] +#except +Mt_rRNA,rRNA,rRNA_pseudogene +#end try + +# Blat sequences per job +num_blat_sequences = #slurp +#try +$ref_dict['num_blat_sequences'] +#except +10000 +#end try + +# Minimum gene fusion range +dna_concordant_length = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.dna_concordant_length.__str__ != "": +$refGenomeSource.defuse_param.dna_concordant_length +#else +#try +$ref_dict['dna_concordant_length'] +#except +2000 +#end try +#end if + +# Trim length for discordant reads (split reads are not trimmed) +discord_read_trim = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.discord_read_trim.__str__ != "": +$refGenomeSource.defuse_param.discord_read_trim +#else +#try +$ref_dict['discord_read_trim'] +#except +50 +#end try +#end if + +# Filtering parameters +clustering_precision = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.clustering_precision.__str__ != "" +$refGenomeSource.defuse_param.clustering_precision +#else +#try +$ref_dict['clustering_precision'] +#except +0.95 +#end try +#end if +span_count_threshold = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.span_count_threshold.__str__ != "" +$refGenomeSource.defuse_param.span_count_threshold +#else +#try +$ref_dict['span_count_threshold'] +#except +5 +#end try +#end if +split_count_threshold = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_count_threshold.__str__ != "" +$refGenomeSource.defuse_param.split_count_threshold 
+#else +#try +$ref_dict['split_count_threshold'] +#except +3 +#end try +#end if +percent_identity_threshold = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.percent_identity_threshold.__str__ != "" +$refGenomeSource.defuse_param.percent_identity_threshold +#else +#try +$ref_dict['percent_identity_threshold'] +#except +0.90 +#end try +#end if +max_dist_pos = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_dist_pos.__str__ != "" +$refGenomeSource.defuse_param.max_dist_pos +#else +#try +$ref_dict['max_dist_pos'] +#except +600 +#end try +#end if +num_dist_genes = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.num_dist_genes.__str__ != "" +$refGenomeSource.defuse_param.num_dist_genes +#else +#try +$ref_dict['num_dist_genes'] +#except +500 +#end try +#end if +split_min_anchor = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_min_anchor.__str__ != "" +$refGenomeSource.defuse_param.split_min_anchor +#else +#try +$ref_dict['split_min_anchor'] +#except +4 +#end try +#end if +max_concordant_ratio = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_concordant_ratio.__str__ != "" +$refGenomeSource.defuse_param.max_concordant_ratio +#else +#try +$ref_dict['max_concordant_ratio'] +#except +0.1 +#end try +#end if +splice_bias = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.splice_bias.__str__ != "" +$refGenomeSource.defuse_param.splice_bias +#else +#try +$ref_dict['splice_bias'] +#except +10 +#end try +#end if +denovo_assembly = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.denovo_assembly.__str__ != "" +$refGenomeSource.defuse_param.denovo_assembly +#else +#try +$ref_dict['denovo_assembly'] +#except +no +#end try +#end if 
+probability_threshold = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.probability_threshold.__str__ != "" +$refGenomeSource.defuse_param.probability_threshold +#else +#try +$ref_dict['probability_threshold'] +#except +0.50 +#end try +#end if +positive_controls = \$(data_directory)/controls.txt + +# Position density when calculating covariance +covariance_sampling_density = #slurp +#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.covariance_sampling_density.__str__ != "" +$refGenomeSource.defuse_param.covariance_sampling_density +#else +#try +$ref_dict['covariance_sampling_density'] +#except +0.01 +#end try +#end if + + +# Number of reads for each job in split +reads_per_job = 1000000 + +# Number of regions for each breakpoint sequence job in split +regions_per_job = 20 + +#raw +# If you have command line 'mail' and wish to be notified +# mailto = andrew.mcpherson@gmail.com + +# Remove temp files +remove_job_files = yes +remove_job_temp_files = yes + +# Converting to fastq +# Fastq converter config format 1 for reads stored in separate files for each end +# data_lane_rexex_N is a perl regex which stores the lane id in $1 +# data_end_regex_N is a perl regex which stores the end, 1 or 2, in $1 +# data_compress_regex_N is a perl regex which stores the compression extension in $1 +# data_convert_N is the associated conversion utility that takes data at stdin and outputs fastq at stdout +# Fastq converter config format 2 for reads stored in separate files for each end +# data_lane_regex_N is a perl regex which stores the lane id in $1 +# data_compress_regex_N is a perl regex which stores the compression extension in $1 +# data_end1_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 1 at stdout +# data_end2_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 2 at stdout + +data_lane_regex_1 
= ^(.+)_[12]_export\.txt.*$ +data_end_regex_1 = ^.+_([12])_export\.txt.*$ +data_compress_regex_1 = ^.+_[12]_export\.txt(.*)$ +data_converter_1 = $(scripts_directory)/fq_all2std.pl export2std + +data_lane_regex_2 = ^(.+)_[12]_concat_qseq\.txt.*$ +data_end_regex_2 = ^.+_([12])_concat_qseq\.txt.*$ +data_compress_regex_2 = ^.+_[12]_concat_qseq\.txt(.*)$ +data_converter_2 = $(scripts_directory)/qseq2fastq.pl + +data_lane_regex_3 = ^(.+)\.bam.*$ +data_compress_regex_3 = ^.+\.bam(.*)$ +data_end1_converter_3 = samtools view - | filter_sam_mate.pl 1 | sam_to_fastq.pl +data_end2_converter_3 = samtools view - | filter_sam_mate.pl 2 | sam_to_fastq.pl + +data_lane_regex_4 = ^(.+).[12].fastq.*$ +data_end_regex_4 = ^.+.([12]).fastq.*$ +data_compress_regex_4 = ^.+.[12].fastq(.*)$ +data_converter_4 = cat +#end raw + +#end if + + + + + + + + + + + + + +**DeFuse** + +DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. + +Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138 + +.. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page + +------ + +**Inputs** + +DeFuse requires 2 fastq files for paired reads, one with the left mate of the paired reads, and a second fastq with the right mate of the paired reads (**with reads in the same order as in the first fastq dataset**). + +If your fastq files have reads in different orders or include unpaired reads, you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq. 
+ +DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_: + - genome_fasta from Ensembl + - gene_models from Ensembl + - repeats_filename from UCSC RepeatMasker rmsk.txt + - est_fasta from UCSC + - est_alignments from UCSC intronEst.txt + - unigene_fasta from NCBI + +.. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2 + +------ + +**Outputs** + +The galaxy history will contain 5 outputs: the config.txt file that provides DeFuse with its parameters, the defuse.log which details what DeFuse has done and can be useful in determining any errors, and the 3 results files that defuse generates. + +DeFuse generates 3 results files: results.txt, results.filtered.txt, and results.classify.txt. All three files have the same format, though results.classify.txt has a probability column from the application of the classifier to results.txt, and results.filtered.txt has been filtered according to the threshold probability as set in config.txt. 
+ +The file format is tab delimited with one prediction per line, and the following fields per prediction (not necessarily in this order): + + - **Identification** + - cluster_id : random identifier assigned to each prediction + - library_name : library name given on the command line of defuse + - gene1 : ensembl id of gene 1 + - gene2 : ensembl id of gene 2 + - gene_name1 : name of gene 1 + - gene_name2 : name of gene 2 + - **Evidence** + - break_predict : breakpoint prediction method, denovo or splitr, that is considered most reliable + - concordant_ratio : proportion of spanning reads considered concordant by blat + - denovo_min_count : minimum kmer count across denovo assembled sequence + - denovo_sequence : fusion sequence predicted by debruijn based denovo sequence assembly + - denovo_span_pvalue : p-value, lower values are evidence the prediction is a false positive + - gene_align_strand1 : alignment strand for spanning read alignments to gene 1 + - gene_align_strand2 : alignment strand for spanning read alignments to gene 2 + - min_map_count : minimum of the number of genomic mappings for each spanning read + - max_map_count : maximum of the number of genomic mappings for each spanning read + - mean_map_count : average of the number of genomic mappings for each spanning read + - num_multi_map : number of spanning reads that map to more than one genomic location + - span_count : number of spanning reads supporting the fusion + - span_coverage1 : coverage of spanning reads aligned to gene 1 as a proportion of expected coverage + - span_coverage2 : coverage of spanning reads aligned to gene 2 as a proportion of expected coverage + - span_coverage_min : minimum of span_coverage1 and span_coverage2 + - span_coverage_max : maximum of span_coverage1 and span_coverage2 + - splitr_count : number of split reads supporting the prediction + - splitr_min_pvalue : p-value, lower values are evidence the prediction is a false positive + - splitr_pos_pvalue : p-value, lower 
values are evidence the prediction is a false positive + - splitr_sequence : fusion sequence predicted by split reads + - splitr_span_pvalue : p-value, lower values are evidence the prediction is a false positive + - **Annotation** + - adjacent : fusion between adjacent genes + - altsplice : fusion likely the product of alternative splicing between adjacent genes + - break_adj_entropy1 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 1 + - break_adj_entropy2 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 2 + - break_adj_entropy_min : minimum of break_adj_entropy1 and break_adj_entropy2 + - breakpoint_homology : number of nucleotides at the fusion splice that align equally well to gene 1 or gene 2 + - breakseqs_estislands_percident : maximum percent identity of fusion sequence alignments to est islands + - cdna_breakseqs_percident : maximum percent identity of fusion sequence alignments to cdna + - deletion : fusion produced by a genomic deletion + - est_breakseqs_percident : maximum percent identity of fusion sequence alignments to est + - eversion : fusion produced by a genomic eversion + - exonboundaries : fusion splice at exon boundaries + - expression1 : expression of gene 1 as number of concordant pairs aligned to exons + - expression2 : expression of gene 2 as number of concordant pairs aligned to exons + - gene_chromosome1 : chromosome of gene 1 + - gene_chromosome2 : chromosome of gene 2 + - gene_end1 : end position for gene 1 + - gene_end2 : end position for gene 2 + - gene_location1 : location of breakpoint in gene 1 + - gene_location2 : location of breakpoint in gene 2 + - gene_start1 : start of gene 1 + - gene_start2 : start of gene 2 + - gene_strand1 : strand of gene 1 + - gene_strand2 : strand of gene 2 + - genome_breakseqs_percident : maximum percent identity of fusion sequence alignments to genome + - genomic_break_pos1 : genomic position in gene 1 of fusion splice / breakpoint 
+ - genomic_break_pos2 : genomic position in gene 2 of fusion splice / breakpoint + - genomic_strand1 : genomic strand in gene 1 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream + - genomic_strand2 : genomic strand in gene 2 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream + - interchromosomal : fusion produced by an interchromosomal translocation + - interrupted_index1 : ratio of coverage before and after the fusion splice / breakpoint in gene 1 + - interrupted_index2 : ratio of coverage before and after the fusion splice / breakpoint in gene 2 + - inversion : fusion produced by genomic inversion + - orf : fusion combines genes in a way that preserves a reading frame + - probability : probability produced by classification using adaboost and example positives/negatives (only given in results.classify.txt) + - read_through : fusion involving adjacent genes potentially resulting from co-transcription rather than genome rearrangement + - repeat_proportion1 : proportion of the spanning reads in gene 1 that span a repeat region + - repeat_proportion2 : proportion of the spanning reads in gene 2 that span a repeat region + - max_repeat_proportion : max of repeat_proportion1 and repeat_proportion2 + - splice_score : number of nucleotides similar to GTAG at fusion splice + - num_splice_variants : number of potential splice variants for this gene pair + - splicing_index1 : number of concordant pairs in gene 1 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 2 + - splicing_index2 : number of concordant pairs in gene 2 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 1 + + +**Example** + +results.tsv:: + + cluster_id splitr_sequence splitr_count splitr_span_pvalue splitr_pos_pvalue splitr_min_pvalue adjacent altsplice break_adj_entropy1 break_adj_entropy2 
break_adj_entropy_min break_predict breakpoint_homology breakseqs_estislands_percident cdna_breakseqs_percident concordant_ratio deletion est_breakseqs_percident eversion exonboundaries expression1 expression2 gene1 gene2 gene_align_strand1 gene_align_strand2 gene_chromosome1 gene_chromosome2 gene_end1 gene_end2 gene_location1 gene_location2 gene_name1 gene_name2 gene_start1 gene_start2 gene_strand1 gene_strand2 genome_breakseqs_percident genomic_break_pos1 genomic_break_pos2 genomic_strand1 genomic_strand2 interchromosomal interrupted_index1 interrupted_index2 inversion library_name max_map_count max_repeat_proportion mean_map_count min_map_count num_multi_map num_splice_variants orf read_through repeat_proportion1 repeat_proportion2 span_count span_coverage1 span_coverage2 span_coverage_max span_coverage_min splice_score splicing_index1 splicing_index2 + 1169 GCTTACTGTATGCCAGGCCCCAGAGGGGCAACCACCCTCTAAAGAGAGCGGCTCCTGCCTCCCAGAAAGCTCACAGACTGTGGGAGGGAAACAGGCAGCAGGTGAAGATGCCAAATGCCAGGATATCTGCCCTGTCCTTGCTTGATGCAGCTGCTGGCTCCCACGTTCTCCCCAGAATCCCCTCACACTCCTGCTGTTTTCTCTGCAGGTTGGCAGAGCCCCATGAGGGCAGGGCAGCCACTTTGTTCTTGGGCGGCAAACCTCCCTGGGCGGCACGGAAACCACGGTGAGAAGGGGGCAGGTCGGGCACGTGCAGGGACCACGCTGCAGG|TGTACCCAACAGCTCCGAAGAGACAGCGACCATCGAGAACGGGCCATGATGACGATGGCGGTTTTGTCGAAAAGAAAAGGGGGAAATGTGGGGAAAAGCAAGAGAGATCAGATTGTTACTGTGTCTGTGTAGAAAGAAGTAGACATGGGAGACTCCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGAGATTCGGTGACCCCACCCCCAACCCCGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTGAATGGATTAAGGGCGGTGCGAGACGTGCTTT 2 0.000436307890680442 0.110748295953850 0.0880671602973091 N Y 3.19872427442695 3.48337348351473 3.19872427442695 splitr 0 0 0 0 Y 0 N N 0 0 ENSG00000105549 ENSG00000213753 + - 19 19 376013 59111168 intron upstream THEG AC016629.2 361750 59084870 - + 0 375099 386594 + - N 8.34107429512245 - N output_dir 82 0.677852348993289 40.6666666666667 1 11 1 N N 0.361271676300578 0.677852348993289 12 0.758602776578432 0.569678713445872 0.758602776578432 0.569678713445872 2 0.416666666666667 - + 3596 
TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG 250 7.00711162298275e-72 0.00912124762512338 0.00684237452309549 N N 3.31745197152461 3.47233119514066 3.31745197152461 splitr 7 0.0157657657657656 0 0 N 0.0135135135135136 N N 0 0 ENSG00000156860 ENSG00000212932 - + 16 21 30682131 48111157 coding upstream FBRS RPL23AP4 30670289 48110676 + + 0.0157657657657656 30680678 9827473 - + Y - - N output_dir 2 1 1.11111111111111 1 1 1 N N 0 1 9 0.325530693397641 0.296465452915709 0.325530693397641 0.296465452915709 2 - - + + + diff -r efddb7a0b3db -r 36306d8086fa defuse/README --- a/defuse/README Fri Sep 16 13:07:35 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -The DeFuse galaxy tool is based on DeFuse_Version_0.4.2 - http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page - -DeFuse is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. 
The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. - - -Manual: - http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2 - - -The included defuse source code is from: http://sourceforge.net/projects/defuse/files/defuse/0.4/defuse-0.4.2.tar.gz/download -(without the defuse-0.4.2 dir level) -tar zxf defuse-0.4.2.tar.gz -cd tool -make -cd .. - -To use with non human genome references: -tar zxf modified_scripts.tgz -Defuse source was modified to include 2 extra parameters for non human references: gene_id_pattern and transcript_id_pattern - scripts/alignjob.pl - scripts/annotate_fusions.pl - scripts/calculate_expression_simple.pl - scripts/filter_bulk_fusion_reads.pl - scripts/filter_sam_genes.pl - scripts/find_concordant_ensembl.pl - scripts/find_gene_clusters.pl - - -The defuse.xml galaxy tool wrapper will generate a defuse config.txt using values from tool-data/defuse.loc -and call scripts/defuse.pl - - diff -r efddb7a0b3db -r 36306d8086fa defuse/defuse-0.4.2.tar.gz Binary file defuse/defuse-0.4.2.tar.gz has changed diff -r efddb7a0b3db -r 36306d8086fa defuse/defuse.xml --- a/defuse/defuse.xml Fri Sep 16 13:07:35 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,637 +0,0 @@ - - identify fusion transcripts - - - - - scripts/defuse.pl - -c `cp $defuse_config $config_txt; echo $defuse_config` - -d `mkdir -p data_dir; ln -s $left_pairendreads data_dir/reads_1.fastq; ln -s $right_pairendreads data_dir/reads_2.fastq; echo data_dir` - -o output_dir -p 8 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Position density when calculating covariance - - - - - - - - - - - - - - - - - - -#import ast -#if $refGenomeSource.genomeSource == "history": -#include raw $refGenomeSource.config.__str__ -#else -#set $ref_dict = dict($ast.literal_eval($refGenomeSource.index.value)) -# 
-# Configuration file for defuse -# -# At a minimum, change all values enclused by [] -# -# Gene/Transcript id pattern -gene_id_pattern = #slurp -#try -$ref_dict['gene_id_pattern'] -transcript_id_pattern = #slurp -#except -ENSG\d+ -#end try -#try -$ref_dict['transcript_id_pattern'] -#except -ENST\d+ -#end try - -# Directory where the defuse code was unpacked -## Default location in the tool/defuse directory -# source_directory = ${__root_dir__}/tools/defuse -source_directory = #slurp -#try -$ref_dict['source_directory'] -#except -${__root_dir__}/tools/defuse -#end try - -# Directory where you want your dataset -dataset_directory = #slurp -#try -$ref_dict['dataset_directory'] -#except -/project/db/genomes/Hsapiens/hg19/defuse -#end try - -# Input genome and gene models -gene_models = #slurp -#try -$ref_dict['gene_models'] -#except -\$(dataset_directory)/Homo_sapiens.GRCh37.62.gtf -#end try -genome_fasta = #slurp -#try -$ref_dict['genome_fasta'] -#except -\$(dataset_directory)/Homo_sapiens.GRCh37.62.dna.chromosome.fa -#end try - -# Repeat table from ucsc genome browser -repeats_filename = #slurp -#try -$ref_dict['repeats_filename'] -#except -\$(dataset_directory)/rmsk.txt -#end try - -# EST info downloaded from ucsc genome browser -est_fasta = #slurp -#try -$ref_dict['est_fasta'] -#except -\$(dataset_directory)/est.fa -#end try -est_alignments = #slurp -#try -$ref_dict['est_alignments'] -#except -\$(dataset_directory)/intronEst.txt -#end try - -# Unigene clusters downloaded from ncbi -unigene_fasta = #slurp -#try -$ref_dict['unigene_fasta'] -#except -\$(dataset_directory)/Hs.seq.uniq -#end try - -# Paths to external tools -bowtie_bin = #slurp -#try -$ref_dict['bowtie_bin'] -#except -/soft/bowtie/0.12.7/bowtie -#end try -bowtie_build_bin = #slurp -#try -$ref_dict['bowtie_build_bin'] -#except -/soft/bowtie/0.12.7/bowtie-build -#end try -blat_bin = #slurp -#try -$ref_dict['blat_bin'] -#except -/soft/blat/34/bin/blat -#end try -fatotwobit_bin = #slurp -#try 
-$ref_dict['fatotwobit_bin'] -#except -/soft/blat/34/bin/faToTwoBit -#end try -r_bin = #slurp -#try -$ref_dict['r_bin'] -#except -/project/sdml-sles11-weblocal/R-2.12.1/bin/R -#end try -rscript_bin = #slurp -#try -$ref_dict['rscript_bin'] -#except -/project/sdml-sles11-weblocal/R-2.12.1/bin/Rscript -#end try - -#raw -# Dataset files -dataset_prefix = $(dataset_directory)/defuse -chromosome_prefix = $(dataset_prefix).dna.chromosomes -exons_fasta = $(dataset_prefix).exons.fa -cds_fasta = $(dataset_prefix).cds.fa -cdna_regions = $(dataset_prefix).cdna.regions -cdna_fasta = $(dataset_prefix).cdna.fa -reference_fasta = $(dataset_prefix).reference.fa -rrna_fasta = $(dataset_prefix).rrna.fa -ig_gene_list = $(dataset_prefix).ig.gene.list -repeats_regions = $(dataset_directory)/repeats.regions -est_split_fasta1 = $(dataset_directory)/est.1.fa -est_split_fasta2 = $(dataset_directory)/est.2.fa -est_split_fasta3 = $(dataset_directory)/est.3.fa -est_split_fasta4 = $(dataset_directory)/est.4.fa -est_split_fasta5 = $(dataset_directory)/est.5.fa -est_split_fasta6 = $(dataset_directory)/est.6.fa -est_split_fasta7 = $(dataset_directory)/est.7.fa -est_split_fasta8 = $(dataset_directory)/est.8.fa -est_split_fasta9 = $(dataset_directory)/est.9.fa - -# Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs -prefilter1 = $(unigene_fasta) - -# deFuse scripts and tools -scripts_directory = $(source_directory)/scripts -tools_directory = $(source_directory)/tools -data_directory = $(source_directory)/data -#end raw - -# Path to samtools, 0.1.8 is compiled for you, use other versions at your own risk -samtools_bin = #slurp -#try -$ref_dict['samtools_bin'] -#except -\$(source_directory)/external/samtools-0.1.8/samtools -#end try - -# Bowtie parameters -bowtie_threads = #slurp -#try -$ref_dict['bowtie_threads'] -#except -1 -#end try -bowtie_quals = #slurp -#try -$ref_dict['bowtie_quals'] -#except ---phred33-quals -#end try -max_insert_size = #slurp -#if 
$refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_insert_size.__str__ != "": -$refGenomeSource.defuse_param.max_insert_size -#else -#try -$ref_dict['max_insert_size'] -#except -500 -#end try -#end if - -# Parameters for building the dataset -chromosomes = #slurp -#try -$ref_dict.chromosomes -#except -1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT -#end try -mt_chromosome = #slurp -#try -$ref_dict['mt_chromosome'] -#except -MT -#end try -gene_sources = #slurp -#try -$ref_dict['gene_sources'] -#except -IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding -#end try -ig_gene_sources = #slurp -#try -$ref_dict['ig_gene_sources'] -#except -IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene -#end try -rrna_gene_sources = #slurp -#try -$ref_dict['rrna_gene_sources'] -#except -Mt_rRNA,rRNA,rRNA_pseudogene -#end try - -# Blat sequences per job -num_blat_sequences = #slurp -#try -$ref_dict['num_blat_sequences'] -#except -10000 -#end try - -# Minimum gene fusion range -dna_concordant_length = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.dna_concordant_length.__str__ != "": -$refGenomeSource.defuse_param.dna_concordant_length -#else -#try -$ref_dict['dna_concordant_length'] -#except -2000 -#end try -#end if - -# Trim length for discordant reads (split reads are not trimmed) -discord_read_trim = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.discord_read_trim.__str__ != "": -$refGenomeSource.defuse_param.discord_read_trim -#else -#try -$ref_dict['discord_read_trim'] -#except -50 -#end try -#end if - -# Filtering parameters -clustering_precision = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.clustering_precision.__str__ != "" -$refGenomeSource.defuse_param.clustering_precision -#else -#try -$ref_dict['clustering_precision'] -#except -0.95 -#end try -#end if 
-span_count_threshold = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.span_count_threshold.__str__ != "" -$refGenomeSource.defuse_param.span_count_threshold -#else -#try -$ref_dict['span_count_threshold'] -#except -5 -#end try -#end if -split_count_threshold = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_count_threshold.__str__ != "" -$refGenomeSource.defuse_param.split_count_threshold -#else -#try -$ref_dict['split_count_threshold'] -#except -3 -#end try -#end if -percent_identity_threshold = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.percent_identity_threshold.__str__ != "" -$refGenomeSource.defuse_param.percent_identity_threshold -#else -#try -$ref_dict['percent_identity_threshold'] -#except -0.90 -#end try -#end if -max_dist_pos = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_dist_pos.__str__ != "" -$refGenomeSource.defuse_param.max_dist_pos -#else -#try -$ref_dict['max_dist_pos'] -#except -600 -#end try -#end if -num_dist_genes = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.num_dist_genes.__str__ != "" -$refGenomeSource.defuse_param.num_dist_genes -#else -#try -$ref_dict['num_dist_genes'] -#except -500 -#end try -#end if -split_min_anchor = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_min_anchor.__str__ != "" -$refGenomeSource.defuse_param.split_min_anchor -#else -#try -$ref_dict['split_min_anchor'] -#except -4 -#end try -#end if -max_concordant_ratio = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_concordant_ratio.__str__ != "" -$refGenomeSource.defuse_param.max_concordant_ratio -#else -#try -$ref_dict['max_concordant_ratio'] -#except -0.1 -#end try -#end if -splice_bias = #slurp -#if 
$refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.splice_bias.__str__ != "" -$refGenomeSource.defuse_param.splice_bias -#else -#try -$ref_dict['splice_bias'] -#except -10 -#end try -#end if -denovo_assembly = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.denovo_assembly.__str__ != "" -$refGenomeSource.defuse_param.denovo_assembly -#else -#try -$ref_dict['denovo_assembly'] -#except -no -#end try -#end if -probability_threshold = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.probability_threshold.__str__ != "" -$refGenomeSource.defuse_param.probability_threshold -#else -#try -$ref_dict['probability_threshold'] -#except -0.50 -#end try -#end if -positive_controls = \$(data_directory)/controls.txt - -# Position density when calculating covariance -covariance_sampling_density = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.covariance_sampling_density.__str__ != "" -$refGenomeSource.defuse_param.covariance_sampling_density -#else -#try -$ref_dict['covariance_sampling_density'] -#except -0.01 -#end try -#end if - - -# Number of reads for each job in split -reads_per_job = 1000000 - -# Number of regions for each breakpoint sequence job in split -regions_per_job = 20 - -#raw -# If you have command line 'mail' and wish to be notified -# mailto = andrew.mcpherson@gmail.com - -# Remove temp files -remove_job_files = yes -remove_job_temp_files = yes - -# Converting to fastq -# Fastq converter config format 1 for reads stored in separate files for each end -# data_lane_rexex_N is a perl regex which stores the lane id in $1 -# data_end_regex_N is a perl regex which stores the end, 1 or 2, in $1 -# data_compress_regex_N is a perl regex which stores the compression extension in $1 -# data_convert_N is the associated conversion utility that takes data at stdin and outputs fastq at stdout -# Fastq converter 
config format 2 for reads stored in separate files for each end -# data_lane_regex_N is a perl regex which stores the lane id in $1 -# data_compress_regex_N is a perl regex which stores the compression extension in $1 -# data_end1_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 1 at stdout -# data_end2_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 2 at stdout - -data_lane_regex_1 = ^(.+)_[12]_export\.txt.*$ -data_end_regex_1 = ^.+_([12])_export\.txt.*$ -data_compress_regex_1 = ^.+_[12]_export\.txt(.*)$ -data_converter_1 = $(scripts_directory)/fq_all2std.pl export2std - -data_lane_regex_2 = ^(.+)_[12]_concat_qseq\.txt.*$ -data_end_regex_2 = ^.+_([12])_concat_qseq\.txt.*$ -data_compress_regex_2 = ^.+_[12]_concat_qseq\.txt(.*)$ -data_converter_2 = $(scripts_directory)/qseq2fastq.pl - -data_lane_regex_3 = ^(.+)\.bam.*$ -data_compress_regex_3 = ^.+\.bam(.*)$ -data_end1_converter_3 = samtools view - | filter_sam_mate.pl 1 | sam_to_fastq.pl -data_end2_converter_3 = samtools view - | filter_sam_mate.pl 2 | sam_to_fastq.pl - -data_lane_regex_4 = ^(.+).[12].fastq.*$ -data_end_regex_4 = ^.+.([12]).fastq.*$ -data_compress_regex_4 = ^.+.[12].fastq(.*)$ -data_converter_4 = cat -#end raw - -#end if - - - - - - - - - - - - - -**DeFuse** - -DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. - -Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138 - -.. 
_DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page - ------- - -**Inputs** - -DeFuse requires 2 fastq files for paried reads, one with the left mate of the paired reads, and a second fastq with the the right mate of the paired reads (**with reads in the same order as in the first fastq dataset**). - -If your fastq files have reads in different orders or include unpaired reads, you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq. - -DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_: - - genome_fasta from Ensembl - - gene_models from Ensembl - - repeats_filename from UCSC RepeatMasker rmsk.txt - - est_fasta from UCSC - - est_alignments from UCSC intronEst.txt - - unigene_fasta from NCBI - -.. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2 - ------- - -**Outputs** - -The galaxy history will contain 5 outputs: the config.txt file that provides DeFuse with its parameters, the defuse.log which details what DeFuse has done and can be useful in determining any errors, and the 3 results files that defuse generates. - -DeFuse generates 3 results files: results.txt, results.filtered.txt, and results.classify.txt. All three files have the same format, though results.classify.txt has a probability column from the application of the classifier to results.txt, and results.filtered.txt has been filtered according to the threshold probability as set in config.txt. 
- -The file format is tab delimited with one prediction per line, and the following fields per prediction (not necessarily in this order): - - - **Identification** - - cluster_id : random identifier assigned to each prediction - - library_name : library name given on the command line of defuse - - gene1 : ensembl id of gene 1 - - gene2 : ensembl id of gene 2 - - gene_name1 : name of gene 1 - - gene_name2 : name of gene 2 - - **Evidence** - - break_predict : breakpoint prediction method, denovo or splitr, that is considered most reliable - - concordant_ratio : proportion of spanning reads considered concordant by blat - - denovo_min_count : minimum kmer count across denovo assembled sequence - - denovo_sequence : fusion sequence predicted by debruijn based denovo sequence assembly - - denovo_span_pvalue : p-value, lower values are evidence the prediction is a false positive - - gene_align_strand1 : alignment strand for spanning read alignments to gene 1 - - gene_align_strand2 : alignment strand for spanning read alignments to gene 2 - - min_map_count : minimum of the number of genomic mappings for each spanning read - - max_map_count : maximum of the number of genomic mappings for each spanning read - - mean_map_count : average of the number of genomic mappings for each spanning read - - num_multi_map : number of spanning reads that map to more than one genomic location - - span_count : number of spanning reads supporting the fusion - - span_coverage1 : coverage of spanning reads aligned to gene 1 as a proportion of expected coverage - - span_coverage2 : coverage of spanning reads aligned to gene 2 as a proportion of expected coverage - - span_coverage_min : minimum of span_coverage1 and span_coverage2 - - span_coverage_max : maximum of span_coverage1 and span_coverage2 - - splitr_count : number of split reads supporting the prediction - - splitr_min_pvalue : p-value, lower values are evidence the prediction is a false positive - - splitr_pos_pvalue : p-value, lower 
values are evidence the prediction is a false positive - - splitr_sequence : fusion sequence predicted by split reads - - splitr_span_pvalue : p-value, lower values are evidence the prediction is a false positive - - **Annotation** - - adjacent : fusion between adjacent genes - - altsplice : fusion likely the product of alternative splicing between adjacent genes - - break_adj_entropy1 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 1 - - break_adj_entropy2 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 2 - - break_adj_entropy_min : minimum of break_adj_entropy1 and break_adj_entropy2 - - breakpoint_homology : number of nucleotides at the fusion splice that align equally well to gene 1 or gene 2 - - breakseqs_estislands_percident : maximum percent identity of fusion sequence alignments to est islands - - cdna_breakseqs_percident : maximum percent identity of fusion sequence alignments to cdna - - deletion : fusion produced by a genomic deletion - - est_breakseqs_percident : maximum percent identity of fusion sequence alignments to est - - eversion : fusion produced by a genomic eversion - - exonboundaries : fusion splice at exon boundaries - - expression1 : expression of gene 1 as number of concordant pairs aligned to exons - - expression2 : expression of gene 2 as number of concordant pairs aligned to exons - - gene_chromosome1 : chromosome of gene 1 - - gene_chromosome2 : chromosome of gene 2 - - gene_end1 : end position for gene 1 - - gene_end2 : end position for gene 2 - - gene_location1 : location of breakpoint in gene 1 - - gene_location2 : location of breakpoint in gene 2 - - gene_start1 : start of gene 1 - - gene_start2 : start of gene 2 - - gene_strand1 : strand of gene 1 - - gene_strand2 : strand of gene 2 - - genome_breakseqs_percident : maximum percent identity of fusion sequence alignments to genome - - genomic_break_pos1 : genomic position in gene 1 of fusion splice / breakpoint 
- - genomic_break_pos2 : genomic position in gene 2 of fusion splice / breakpoint - - genomic_strand1 : genomic strand in gene 1 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream - - genomic_strand2 : genomic strand in gene 2 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream - - interchromosomal : fusion produced by an interchromosomal translocation - - interrupted_index1 : ratio of coverage before and after the fusion splice / breakpoint in gene 1 - - interrupted_index2 : ratio of coverage before and after the fusion splice / breakpoint in gene 2 - - inversion : fusion produced by genomic inversion - - orf : fusion combines genes in a way that preserves a reading frame - - probability : probability produced by classification using adaboost and example positives/negatives (only given in results.classified.txt) - - read_through : fusion involving adjacent potentially resulting from co-transcription rather than genome rearrangement - - repeat_proportion1 : proportion of the spanning reads in gene 1 that span a repeat region - - repeat_proportion2 : proportion of the spanning reads in gene 2 that span a repeat region - - max_repeat_proportion : max of repeat_proportion1 and repeat_proportion2 - - splice_score : number of nucleotides similar to GTAG at fusion splice - - num_splice_variants : number of potential splice variants for this gene pair - - splicing_index1 : number of concordant pairs in gene 1 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 2 - - splicing_index2 : number of concordant pairs in gene 2 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 1 - - -**Example** - -results.tsv:: - - cluster_id splitr_sequence splitr_count splitr_span_pvalue splitr_pos_pvalue splitr_min_pvalue adjacent altsplice break_adj_entropy1 break_adj_entropy2 
break_adj_entropy_min break_predict breakpoint_homology breakseqs_estislands_percident cdna_breakseqs_percident concordant_ratio deletion est_breakseqs_percident eversion exonboundaries expression1 expression2 gene1 gene2 gene_align_strand1 gene_align_strand2 gene_chromosome1 gene_chromosome2 gene_end1 gene_end2 gene_location1 gene_location2 gene_name1 gene_name2 gene_start1 gene_start2 gene_strand1 gene_strand2 genome_breakseqs_percident genomic_break_pos1 genomic_break_pos2 genomic_strand1 genomic_strand2 interchromosomal interrupted_index1 interrupted_index2 inversion library_name max_map_count max_repeat_proportion mean_map_count min_map_count num_multi_map num_splice_variants orf read_through repeat_proportion1 repeat_proportion2 span_count span_coverage1 span_coverage2 span_coverage_max span_coverage_min splice_score splicing_index1 splicing_index2 - 1169 GCTTACTGTATGCCAGGCCCCAGAGGGGCAACCACCCTCTAAAGAGAGCGGCTCCTGCCTCCCAGAAAGCTCACAGACTGTGGGAGGGAAACAGGCAGCAGGTGAAGATGCCAAATGCCAGGATATCTGCCCTGTCCTTGCTTGATGCAGCTGCTGGCTCCCACGTTCTCCCCAGAATCCCCTCACACTCCTGCTGTTTTCTCTGCAGGTTGGCAGAGCCCCATGAGGGCAGGGCAGCCACTTTGTTCTTGGGCGGCAAACCTCCCTGGGCGGCACGGAAACCACGGTGAGAAGGGGGCAGGTCGGGCACGTGCAGGGACCACGCTGCAGG|TGTACCCAACAGCTCCGAAGAGACAGCGACCATCGAGAACGGGCCATGATGACGATGGCGGTTTTGTCGAAAAGAAAAGGGGGAAATGTGGGGAAAAGCAAGAGAGATCAGATTGTTACTGTGTCTGTGTAGAAAGAAGTAGACATGGGAGACTCCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGAGATTCGGTGACCCCACCCCCAACCCCGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTGAATGGATTAAGGGCGGTGCGAGACGTGCTTT 2 0.000436307890680442 0.110748295953850 0.0880671602973091 N Y 3.19872427442695 3.48337348351473 3.19872427442695 splitr 0 0 0 0 Y 0 N N 0 0 ENSG00000105549 ENSG00000213753 + - 19 19 376013 59111168 intron upstream THEG AC016629.2 361750 59084870 - + 0 375099 386594 + - N 8.34107429512245 - N output_dir 82 0.677852348993289 40.6666666666667 1 11 1 N N 0.361271676300578 0.677852348993289 12 0.758602776578432 0.569678713445872 0.758602776578432 0.569678713445872 2 0.416666666666667 - - 3596 
TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG 250 7.00711162298275e-72 0.00912124762512338 0.00684237452309549 N N 3.31745197152461 3.47233119514066 3.31745197152461 splitr 7 0.0157657657657656 0 0 N 0.0135135135135136 N N 0 0 ENSG00000156860 ENSG00000212932 - + 16 21 30682131 48111157 coding upstream FBRS RPL23AP4 30670289 48110676 + + 0.0157657657657656 30680678 9827473 - + Y - - N output_dir 2 1 1.11111111111111 1 1 1 N N 0 1 9 0.325530693397641 0.296465452915709 0.325530693397641 0.296465452915709 2 - - - - - diff -r efddb7a0b3db -r 36306d8086fa defuse/modified_scripts.tgz Binary file defuse/modified_scripts.tgz has changed diff -r efddb7a0b3db -r 36306d8086fa defuse/tool-data/defuse.loc.sample --- a/defuse/tool-data/defuse.loc.sample Fri Sep 16 13:07:35 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -## Configurstion info for prepared data references for DeFuse -## http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2 -## 3 columns separated by the TAB character -## The 3rd column has dictionary values that will be substituted in the config file for defuse -## It 
should likely contain keys: dataset_directory gene_models genome_fasta repeats_filename est_fasta est_alignments unigene_fasta -## If this is not a Homo_sapiens reference also need keys: gene_id_pattern transcript_id_pattern chromosomes - -#db_key name {'config_key':'config_value'} -hg19 GRCh37(hg19) {'samtools_bin':'/soft/samtools/0.1.12a/bin/samtools', 'gene_id_pattern':'ENSG\d+', 'transcript_id_pattern':'ENST\d+', 'dataset_directory':'/project/db/genomes/Hsapiens/hg19/defuse', 'gene_models':'$(dataset_directory)/Homo_sapiens.GRCh37.62.gtf', 'genome_fasta':'$(dataset_directory)/Homo_sapiens.GRCh37.62.dna.chromosome.fa', 'repeats_filename':'$(dataset_directory)/rmsk.txt', 'est_fasta':'$(dataset_directory)/est.fa', 'est_alignments':'$(dataset_directory)/intronEst.txt', 'unigene_fasta':'$(dataset_directory)/Hs.seq.uniq', 'chromosomes':'1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT', 'mt_chromosome':'MT', 'gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding', 'ig_gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene', 'rrna_gene_sources':'Mt_rRNA,rRNA,rRNA_pseudogene'} -mm9 NCBIM37(mm9) {'samtools_bin':'/soft/samtools/0.1.12a/bin/samtools', 'gene_id_pattern':'ENSMUSG\d+', 'transcript_id_pattern':'ENSMUST\d+', 'dataset_directory':'/project/db/genomes/Mmusculus/mm9/defuse', 'gene_models':'$(dataset_directory)/Mus_musculus.NCBIM37.63.gtf', 'genome_fasta':'$(dataset_directory)/Mus_musculus.NCBIM37.63.dna.chromosome.fa', 'repeats_filename':'$(dataset_directory)/rmsk.txt', 'est_fasta':'$(dataset_directory)/est.fa', 'est_alignments':'$(dataset_directory)/intronEst.txt', 'unigene_fasta':'$(dataset_directory)/Mm.seq.uniq', 'chromosomes':'1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT', 'mt_chromosome':'MT', 'gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding', 'ig_gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene', 
'rrna_gene_sources':'Mt_rRNA,rRNA,rRNA_pseudogene'} diff -r efddb7a0b3db -r 36306d8086fa modified_scripts.tgz Binary file modified_scripts.tgz has changed diff -r efddb7a0b3db -r 36306d8086fa tool-data/defuse.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/defuse.loc.sample Fri Sep 16 12:41:37 2011 -0500 @@ -0,0 +1,10 @@ +## Configuration info for prepared data references for DeFuse +## http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2 +## 3 columns separated by the TAB character +## The 3rd column has dictionary values that will be substituted in the config file for defuse +## It should likely contain keys: dataset_directory gene_models genome_fasta repeats_filename est_fasta est_alignments unigene_fasta +## If this is not a Homo_sapiens reference also need keys: gene_id_pattern transcript_id_pattern chromosomes + +#db_key name {'config_key':'config_value'} +hg19 GRCh37(hg19) {'samtools_bin':'/soft/samtools/0.1.12a/bin/samtools', 'gene_id_pattern':'ENSG\d+', 'transcript_id_pattern':'ENST\d+', 'dataset_directory':'/project/db/genomes/Hsapiens/hg19/defuse', 'gene_models':'$(dataset_directory)/Homo_sapiens.GRCh37.62.gtf', 'genome_fasta':'$(dataset_directory)/Homo_sapiens.GRCh37.62.dna.chromosome.fa', 'repeats_filename':'$(dataset_directory)/rmsk.txt', 'est_fasta':'$(dataset_directory)/est.fa', 'est_alignments':'$(dataset_directory)/intronEst.txt', 'unigene_fasta':'$(dataset_directory)/Hs.seq.uniq', 'chromosomes':'1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT', 'mt_chromosome':'MT', 'gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding', 'ig_gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene', 'rrna_gene_sources':'Mt_rRNA,rRNA,rRNA_pseudogene'} +mm9 NCBIM37(mm9) {'samtools_bin':'/soft/samtools/0.1.12a/bin/samtools', 'gene_id_pattern':'ENSMUSG\d+', 'transcript_id_pattern':'ENSMUST\d+',
'dataset_directory':'/project/db/genomes/Mmusculus/mm9/defuse', 'gene_models':'$(dataset_directory)/Mus_musculus.NCBIM37.63.gtf', 'genome_fasta':'$(dataset_directory)/Mus_musculus.NCBIM37.63.dna.chromosome.fa', 'repeats_filename':'$(dataset_directory)/rmsk.txt', 'est_fasta':'$(dataset_directory)/est.fa', 'est_alignments':'$(dataset_directory)/intronEst.txt', 'unigene_fasta':'$(dataset_directory)/Mm.seq.uniq', 'chromosomes':'1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT', 'mt_chromosome':'MT', 'gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding', 'ig_gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene', 'rrna_gene_sources':'Mt_rRNA,rRNA,rRNA_pseudogene'}