# HG changeset patch # User jjohnson # Date 1453057866 18000 # Node ID b22f8634ff847f8e5889d46ad7b6a2c432392cc3 # Parent f65857c1b92e0010dec64abffd7faaa3aa5baa2a planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/defuse commit 23b94b5747c6956360cd2eca0a07a669929ea141-dirty diff -r f65857c1b92e -r b22f8634ff84 README --- a/README Mon Jan 14 12:24:28 2013 -0600 +++ b/README Sun Jan 17 14:11:06 2016 -0500 @@ -1,11 +1,12 @@ -The DeFuse galaxy tool is based on DeFuse_Version_0.6.0 +The DeFuse galaxy tool is based on DeFuse_Version_0.6.2 http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page +https://bitbucket.org/dranew/defuse DeFuse is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. Manual: -http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.0 +http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.2 The included tool_dependencies.xml will download and install the defuse code. It will set the environment variable: "DEFUSE_PATH" to the location of the defuse install. @@ -34,8 +35,13 @@ These datasets should be referenced in the tool-data/defuse.loc file. +The create_reference_dataset will run the create_reference_dataset.pl script to generate deFuse genome reference data in a galaxy dataset. +This should be made available in the future as a Galaxy DataManager. 
-External Tools ( http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.0 ) + +Galaxy will try to auto-install dependencies: + +External Tools ( http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.2 ) deFuse relies on other publically available tools as part of its pipeline. Some of these tools are not included with the deFuse download. Obtain these tools as detailed below. Download samtools The latest version of samtools can be downloaded from sourceforge: https://sourceforge.net/projects/samtools/files/samtools. diff -r f65857c1b92e -r b22f8634ff84 create_reference_dataset.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/create_reference_dataset.xml Sun Jan 17 14:11:06 2016 -0500 @@ -0,0 +1,308 @@ + + create a defuse reference from Ensembl and UCSC sources + + macros.xml + + + + + + /bin/bash $defuse_script + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Examples: homo_sapiens, mus_musculus, rattus_norvegicus + ftp://ftp.ensembl.org/pub/release-$ensembl_version/fasta/$ensembl_organism/dna/$ensembl_prefix.$ensembl_genome_version.$ensembl_version.dna.chromosome.$chromosome.fa.gz + + + + + + + + + + Examples: + Homo_sapiens: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT + Mus_musculus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT + Rattus_norvegicus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT + ( ftp://ftp.ensembl.org/pub/release-71/fasta/homo_sapiens/dna/ ) + + + + + + + + + + + + + + + + + + + +# +# Configuration file for defuse +# +# Variables that desiganate the PATH to an application, e.g. 
__SAMTOOLS_BIN__ +# will be set by the runtime script using the ENV PATH +# + +# Directory where the defuse code was unpacked +source_directory = __DEFUSE_PATH__ + +# Organism IDs +ensembl_organism = $genome.ensembl_organism +ensembl_prefix = $genome.ensembl_prefix +ensembl_version = $genome.ensembl_version +ensembl_genome_version = $genome.ensembl_genome_version +ucsc_genome_version = $genome.ucsc_genome_version +ncbi_organism = $genome.ncbi_organism +ncbi_prefix = $genome.ncbi_prefix + +# Directory where you want your dataset +dataset_directory = $config_txt.dataset.extra_files_path + +#raw +# Input genome and gene models +gene_models = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).gtf +genome_fasta = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa + +# Repeat table from ucsc genome browser +repeats_filename = $(dataset_directory)/repeats.txt + +# EST info downloaded from ucsc genome browser +est_fasta = $(dataset_directory)/est.fa +est_alignments = $(dataset_directory)/intronEst.txt + +# Unigene clusters downloaded from ncbi +unigene_fasta = $(dataset_directory)/$(ncbi_prefix).seq.uniq +#end raw + +# Paths to external tools +samtools_bin = __SAMTOOLS_BIN__ +bowtie_bin = __BOWTIE_BIN__ +bowtie_build_bin = __BOWTIE_BUILD_BIN__ +blat_bin = __BLAT_BIN__ +fatotwobit_bin = __FATOTWOBIT_BIN__ +gmap_bin = __GMAP_BIN__ +gmap_setup_bin = __GMAP_SETUP_BIN__ +r_bin = __R_BIN__ +rscript_bin = __RSCRIPT_BIN__ + +#raw +# Directory where you want your dataset +gmap_index_directory = $(dataset_directory)/gmap +#end raw + +#raw +# Dataset files +dataset_prefix = $(dataset_directory)/defuse +chromosome_prefix = $(dataset_prefix).dna.chromosomes +exons_fasta = $(dataset_prefix).exons.fa +cds_fasta = $(dataset_prefix).cds.fa +cdna_regions = $(dataset_prefix).cdna.regions +cdna_fasta = $(dataset_prefix).cdna.fa +reference_fasta = $(dataset_prefix).reference.fa +rrna_fasta = 
$(dataset_prefix).rrna.fa +ig_gene_list = $(dataset_prefix).ig.gene.list +repeats_regions = $(dataset_directory)/repeats.regions +est_split_fasta1 = $(dataset_directory)/est.1.fa +est_split_fasta2 = $(dataset_directory)/est.2.fa +est_split_fasta3 = $(dataset_directory)/est.3.fa +est_split_fasta4 = $(dataset_directory)/est.4.fa +est_split_fasta5 = $(dataset_directory)/est.5.fa +est_split_fasta6 = $(dataset_directory)/est.6.fa +est_split_fasta7 = $(dataset_directory)/est.7.fa +est_split_fasta8 = $(dataset_directory)/est.8.fa +est_split_fasta9 = $(dataset_directory)/est.9.fa + +# Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs +prefilter1 = $(unigene_fasta) + +# deFuse scripts and tools +scripts_directory = $(source_directory)/scripts +tools_directory = $(source_directory)/tools +data_directory = $(source_directory)/data +#end raw + +# Parameters for building the dataset +chromosomes = $genome.chromosomes +mt_chromosome = $genome.mt_chromosome +gene_sources = $genome.gene_sources +ig_gene_sources = $genome.ig_gene_sources +rrna_gene_sources = $genome.rrna_gene_sources +gene_biotypes = $genome.gene_sources +ig_gene_biotypes = $genome.ig_gene_sources +rrna_gene_biotypes = $genome.rrna_gene_sources + +#raw +# Remove temp files +remove_job_files = yes +remove_job_temp_files = yes +#end raw + + +#!/bin/bash +## define some things for cheetah proccessing +#set $amp = chr(38) +#set $gt = chr(62) +## substitute pathnames into config file +if `grep __DEFUSE_PATH__ $defuse_config ${gt} /dev/null`;then sed -i'.tmp' "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" $defuse_config; fi +if `grep __SAMTOOLS_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} SAMTOOLS_BIN=`which samtools`;then sed -i'.tmp' "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" $defuse_config; fi +if `grep __BOWTIE_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BIN=`which bowtie`;then sed -i'.tmp' "s#__BOWTIE_BIN__#\${BOWTIE_BIN}#" $defuse_config; fi +if `grep __BOWTIE_BUILD_BIN__ 
$defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BUILD_BIN=`which bowtie-build`;then sed -i'.tmp' "s#__BOWTIE_BUILD_BIN__#\${BOWTIE_BUILD_BIN}#" $defuse_config; fi +if `grep __BLAT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BLAT_BIN=`which blat`;then sed -i'.tmp' "s#__BLAT_BIN__#\${BLAT_BIN}#" $defuse_config; fi +if `grep __FATOTWOBIT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} FATOTWOBIT_BIN=`which faToTwoBit`;then sed -i'.tmp' "s#__FATOTWOBIT_BIN__#\${FATOTWOBIT_BIN}#" $defuse_config; fi +if `grep __GMAP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_BIN=`which gmap`;then sed -i'.tmp' "s#__GMAP_BIN__#\${GMAP_BIN}#" $defuse_config; fi +if `grep __GMAP_SETUP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_SETUP_BIN=`which gmap_setup`;then sed -i'.tmp' "s#__GMAP_SETUP_BIN__#\${GMAP_SETUP_BIN}#" $defuse_config; fi +if `grep __GMAP_INDEX_DIR__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_INDEX_DIR=`pwd`/gmap;then sed -i'.tmp' "s#__GMAP_INDEX_DIR__#\${GMAP_INDEX_DIR}#" $defuse_config; fi +if `grep __R_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} R_BIN=`which R`;then sed -i'.tmp' "s#__R_BIN__#\${R_BIN}#" $defuse_config; fi +if `grep __RSCRIPT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} RSCRIPT_BIN=`which Rscript`;then sed -i'.tmp' "s#__RSCRIPT_BIN__#\${RSCRIPT_BIN}#" $defuse_config; fi +## copy config to output +cp $defuse_config $config_txt +## make a data_dir and ln -s the input fastq +mkdir -p $config_txt.dataset.extra_files_path +## create_reference_dataset.pl +perl \${DEFUSE_PATH}/scripts/create_reference_dataset.pl -c $defuse_config + + + + + + +**DeFuse** + +DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. 
The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. See the DeFuse_Version_0.6_ manual for details. + +DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.6_: + - genome_fasta from Ensembl + - gene_models from Ensembl + - repeats_filename from UCSC RepeatMasker rmsk.txt + - est_fasta from UCSC + - est_alignments from UCSC intronEst.txt + - unigene_fasta from NCBI + +The create_defuse_reference Galaxy tool downloads the reference genome and other source files, and builds any derivative files including bowtie indices, gmap indices, and 2bit files. Expect this step to take at least 12 hours. + + +It will generate a config.txt file that can be input into the deFuse Galaxy tool. + +Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138 + +.. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page + +.. _DeFuse_Version_0.6: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.1 + +------ + +**Outputs** + +The galaxy history will contain: the config.txt file that provides DeFuse with the reference data paths. 
+ + + + diff -r f65857c1b92e -r b22f8634ff84 data_manager_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Sun Jan 17 14:11:06 2016 -0500 @@ -0,0 +1,25 @@ + + + + + + + + + + + + ${value}/defuse + + + ${GALAXY_DATA_MANAGER_DATA_PATH}/${value}/defuse/${value}.config + abspath + + + + + + + diff -r f65857c1b92e -r b22f8634ff84 datamanager_create_reference.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datamanager_create_reference.py Sun Jan 17 14:11:06 2016 -0500 @@ -0,0 +1,118 @@ +#!/usr/bin/env python + +import sys +import os +import re +import tempfile +import subprocess +import fileinput +import shutil +import optparse +import urllib2 +from ftplib import FTP +import tarfile + +from galaxy.util.json import from_json_string, to_json_string + + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit(1) + +def get_config_dict(config,dataset_directory=None): + keys = ['dataset_directory','ensembl_organism','ensembl_prefix','ensembl_version','ensembl_genome_version','ucsc_genome_version','ncbi_organism','ncbi_prefix','chromosomes','mt_chromosome','gene_sources','ig_gene_sources','rrna_gene_sources'] + pat = '^([^=]+?)\s*=\s*(.*)$' + config_dict = {} + try: + fh = open(config) + for i,l in enumerate(fh): + line = l.strip() + if line.startswith('#'): + continue + m = re.match(pat,line) + if m and len(m.groups()) == 2: + (k,v) = m.groups() + if k in keys: + config_dict[k] = v + except Exception, e: + stop_err( 'Error parsing %s %s\n' % (config,str( e )) ) + else: + fh.close() + if dataset_directory: + config_dict['dataset_directory'] = dataset_directory + return config_dict + +def run_defuse_script(data_manager_dict, params, target_directory, dbkey, description, config, script): + if not os.path.isdir(target_directory): + os.makedirs(target_directory) + ## Name the config consistently with data_manager_conf.xml + # copy the config file to the target_directory + # when DataManager moves files to there tool-data location, the config 
will get moved as well, + # and the value_translation in data_manager_conf.xml will tell us the new location + # defuse.xml will use the path to this config file to set the dataset_directory + config_name = '%s.config' % dbkey + defuse_config = os.path.join( target_directory, config_name) + shutil.copyfile(config,defuse_config) + cmd = "/bin/bash %s %s" % (script,target_directory) + # Run + try: + tmp_out = tempfile.NamedTemporaryFile().name + tmp_stdout = open( tmp_out, 'wb' ) + tmp_err = tempfile.NamedTemporaryFile().name + tmp_stderr = open( tmp_err, 'wb' ) + proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp_err, 'rb' ) + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stdout.close() + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + + # TODO: look for errors in program output. 
+ except Exception, e: + stop_err( 'Error creating defuse reference:\n' + str( e ) ) + config_dict = get_config_dict(config, dataset_directory=target_directory) + data_table_entry = dict(value=dbkey, dbkey=dbkey, name=description, path=config_name) + _add_data_table_entry( data_manager_dict, data_table_entry ) +def _add_data_table_entry( data_manager_dict, data_table_entry ): + data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) + data_manager_dict['data_tables']['defuse_reference'] = data_manager_dict['data_tables'].get( 'defuse_reference', [] ) + data_manager_dict['data_tables']['defuse_reference'].append( data_table_entry ) + return data_manager_dict + +def main(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option( '-k', '--dbkey', dest='dbkey', action='store', type="string", default=None, help='dbkey' ) + parser.add_option( '-d', '--description', dest='description', action='store', type="string", default=None, help='description' ) + parser.add_option( '-c', '--defuse_config', dest='defuse_config', action='store', type="string", default=None, help='defuse_config' ) + parser.add_option( '-s', '--defuse_script', dest='defuse_script', action='store', type="string", default=None, help='defuse_script' ) + (options, args) = parser.parse_args() + + filename = args[0] + + params = from_json_string( open( filename ).read() ) + target_directory = params[ 'output_data' ][0]['extra_files_path'] + os.mkdir( target_directory ) + data_manager_dict = {} + + + #Create Defuse Reference Data + run_defuse_script( data_manager_dict, params, target_directory, options.dbkey, options.description,options.defuse_config,options.defuse_script) + + #save info to json file + open( filename, 'wb' ).write( to_json_string( data_manager_dict ) ) + +if __name__ == "__main__": main() + diff -r f65857c1b92e -r b22f8634ff84 datamanager_create_reference.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datamanager_create_reference.xml Sun Jan 
17 14:11:06 2016 -0500 @@ -0,0 +1,307 @@ + + create a defuse reference from Ensembl and UCSC sources + + defuse + samtools + bowtie + gmap + kent + + datamanager_create_reference.py + --dbkey $genome.ensembl_genome_version + --description "$genome.ensembl_prefix $genome.ensembl_genome_version ($genome.ucsc_genome_version)" + --defuse_config $defuse_config + --defuse_script $defuse_script + $out_file + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Examples: + Homo_sapiens: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT + Mus_musculus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT + Rattus_norvegicus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT + ( ftp://ftp.ensembl.org/pub/release-71/fasta/homo_sapiens/dna/ ) + + + + + + + + + + + + + + + + + + + +# +# Configuration file for defuse +# +# Variables that desiganate the PATH to an application, e.g. 
__SAMTOOLS_BIN__ +# will be set by the runtime script using the ENV PATH +# + +# Directory where the defuse code was unpacked +source_directory = __DEFUSE_PATH__ + +# Organism IDs +ensembl_organism = $genome.ensembl_organism +ensembl_prefix = $genome.ensembl_prefix +ensembl_version = $genome.ensembl_version +ensembl_genome_version = $genome.ensembl_genome_version +ucsc_genome_version = $genome.ucsc_genome_version +ncbi_organism = $genome.ncbi_organism +ncbi_prefix = $genome.ncbi_prefix + +# Directory where you want your dataset +dataset_directory = __DATASET_DIRECTORY__ + +#raw +# Input genome and gene models +gene_models = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).gtf +genome_fasta = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa + +# Repeat table from ucsc genome browser +repeats_filename = $(dataset_directory)/repeats.txt + +# EST info downloaded from ucsc genome browser +est_fasta = $(dataset_directory)/est.fa +est_alignments = $(dataset_directory)/intronEst.txt + +# Unigene clusters downloaded from ncbi +unigene_fasta = $(dataset_directory)/$(ncbi_prefix).seq.uniq +#end raw + +# Paths to external tools +samtools_bin = __SAMTOOLS_BIN__ +bowtie_bin = __BOWTIE_BIN__ +bowtie_build_bin = __BOWTIE_BUILD_BIN__ +blat_bin = __BLAT_BIN__ +fatotwobit_bin = __FATOTWOBIT_BIN__ +gmap_bin = __GMAP_BIN__ +gmap_setup_bin = __GMAP_SETUP_BIN__ +r_bin = __R_BIN__ +rscript_bin = __RSCRIPT_BIN__ + +#raw +# Directory where you want your dataset +gmap_index_directory = $(dataset_directory)/gmap +#end raw + +#raw +# Dataset files +dataset_prefix = $(dataset_directory)/defuse +chromosome_prefix = $(dataset_prefix).dna.chromosomes +exons_fasta = $(dataset_prefix).exons.fa +cds_fasta = $(dataset_prefix).cds.fa +cdna_regions = $(dataset_prefix).cdna.regions +cdna_fasta = $(dataset_prefix).cdna.fa +reference_fasta = $(dataset_prefix).reference.fa +rrna_fasta = $(dataset_prefix).rrna.fa 
+ig_gene_list = $(dataset_prefix).ig.gene.list +repeats_regions = $(dataset_directory)/repeats.regions +est_split_fasta1 = $(dataset_directory)/est.1.fa +est_split_fasta2 = $(dataset_directory)/est.2.fa +est_split_fasta3 = $(dataset_directory)/est.3.fa +est_split_fasta4 = $(dataset_directory)/est.4.fa +est_split_fasta5 = $(dataset_directory)/est.5.fa +est_split_fasta6 = $(dataset_directory)/est.6.fa +est_split_fasta7 = $(dataset_directory)/est.7.fa +est_split_fasta8 = $(dataset_directory)/est.8.fa +est_split_fasta9 = $(dataset_directory)/est.9.fa + +# Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs +prefilter1 = $(unigene_fasta) + +# deFuse scripts and tools +scripts_directory = $(source_directory)/scripts +tools_directory = $(source_directory)/tools +data_directory = $(source_directory)/data +#end raw + +# Parameters for building the dataset +chromosomes = $genome.chromosomes +mt_chromosome = $genome.mt_chromosome +gene_sources = $genome.gene_sources +ig_gene_sources = $genome.ig_gene_sources +rrna_gene_sources = $genome.rrna_gene_sources +gene_biotypes = $genome.gene_sources +ig_gene_biotypes = $genome.ig_gene_sources +rrna_gene_biotypes = $genome.rrna_gene_sources + +#raw +# Remove temp files +remove_job_files = yes +remove_job_temp_files = yes +#end raw + + #slurp +#!/bin/bash +## define some things for cheetah proccessing +#set $amp = chr(38) +#set $gt = chr(62) +## substitute pathnames into config file +if `grep __DATASET_DIRECTORY__ $defuse_config ${gt} /dev/null`;then sed -i'.tmp' "s#__DATASET_DIRECTORY__#\$1#" $defuse_config; fi +if `grep __DEFUSE_PATH__ $defuse_config ${gt} /dev/null`;then sed -i'.tmp' "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" $defuse_config; fi +if `grep __SAMTOOLS_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} SAMTOOLS_BIN=`which samtools`;then sed -i'.tmp' "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" $defuse_config; fi +if `grep __BOWTIE_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BIN=`which 
bowtie`;then sed -i'.tmp' "s#__BOWTIE_BIN__#\${BOWTIE_BIN}#" $defuse_config; fi +if `grep __BOWTIE_BUILD_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BUILD_BIN=`which bowtie-build`;then sed -i'.tmp' "s#__BOWTIE_BUILD_BIN__#\${BOWTIE_BUILD_BIN}#" $defuse_config; fi +if `grep __BLAT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BLAT_BIN=`which blat`;then sed -i'.tmp' "s#__BLAT_BIN__#\${BLAT_BIN}#" $defuse_config; fi +if `grep __FATOTWOBIT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} FATOTWOBIT_BIN=`which faToTwoBit`;then sed -i'.tmp' "s#__FATOTWOBIT_BIN__#\${FATOTWOBIT_BIN}#" $defuse_config; fi +if `grep __GMAP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_BIN=`which gmap`;then sed -i'.tmp' "s#__GMAP_BIN__#\${GMAP_BIN}#" $defuse_config; fi +if `grep __GMAP_SETUP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_SETUP_BIN=`which gmap_setup`;then sed -i'.tmp' "s#__GMAP_SETUP_BIN__#\${GMAP_SETUP_BIN}#" $defuse_config; fi +if `grep __GMAP_INDEX_DIR__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_INDEX_DIR=`pwd`/gmap;then sed -i'.tmp' "s#__GMAP_INDEX_DIR__#\${GMAP_INDEX_DIR}#" $defuse_config; fi +if `grep __R_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} R_BIN=`which R`;then sed -i'.tmp' "s#__R_BIN__#\${R_BIN}#" $defuse_config; fi +if `grep __RSCRIPT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} RSCRIPT_BIN=`which Rscript`;then sed -i'.tmp' "s#__RSCRIPT_BIN__#\${RSCRIPT_BIN}#" $defuse_config; fi +## copy config to output +cp $defuse_config \$1/defuse_config.txt +## Run the create_reference_dataset.pl +perl \${DEFUSE_PATH}/scripts/create_reference_dataset.pl -c $defuse_config + + + + + + +**DeFuse** + +DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. 
The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. See the DeFuse_Version_0.6_ manual for details. + +DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.6_: + - genome_fasta from Ensembl + - gene_models from Ensembl + - repeats_filename from UCSC RepeatMasker rmsk.txt + - est_fasta from UCSC + - est_alignments from UCSC intronEst.txt + - unigene_fasta from NCBI + +The create_defuse_reference Galaxy tool downloads the reference genome and other source files, and builds any derivative files including bowtie indices, gmap indices, and 2bit files. Expect this step to take at least 12 hours. + + +It will generate the reference data for the deFuse Galaxy tool. + +Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138 + +.. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page + +.. _DeFuse_Version_0.6: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.1 + +------ + +**Outputs** + +The galaxy history will contain: the config.txt file that provides DeFuse with the reference data paths. 
+ + + diff -r f65857c1b92e -r b22f8634ff84 datatypes_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes_conf.xml Sun Jan 17 14:11:06 2016 -0500 @@ -0,0 +1,7 @@ + + + + + + + diff -r f65857c1b92e -r b22f8634ff84 defuse.xml --- a/defuse.xml Mon Jan 14 12:24:28 2013 -0600 +++ b/defuse.xml Sun Jan 17 14:11:06 2016 -0500 @@ -1,103 +1,150 @@ - - identify fusion transcripts - - defuse - samtools - bowtie - gmap - blat - fatotwobit - + + identify fusion transcripts + + macros.xml + + + + + + /bin/bash $shscript + + + - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Position density when calculating covariance - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + Position density when calculating covariance + + + + Maximum number of alignments for a read pair, Pairs with more alignments are filtered, default is 10 + + + + + + + + + + + + - + + + + keep_output == True - - - + + do_get_reads == True + + breakpoints_bam == True + + -#import ast +#import re +#set $ds = chr(36) #if $refGenomeSource.genomeSource == "history": -#include raw $refGenomeSource.config.__str__ +#set config_file = $refGenomeSource.config.__str__ #else -#set $ref_dict = dict($ast.literal_eval($refGenomeSource.index.value)) +#set config_file = $refGenomeSource.index.value +#end if +#set pat = '^\s*([^#=][^=]*?)\s*=\s*(.*?)\s*$' +#set fh = open($config_file) +#set keys = ['dataset_directory','ensembl_organism','ensembl_prefix','ensembl_version','ensembl_genome_version','ucsc_genome_version','ncbi_organism','ncbi_prefix','chromosomes','mt_chromosome','gene_sources','ig_gene_sources','rrna_gene_sources'] +#set kv = [] +#for $line in $fh: + #set m = $re.match($pat,$line) + #if $m and len($m.groups()) == 2: + ## #echo $line + #if $m.groups()[0] in keys: + #set k = $m.groups()[0] + #if k == 'dataset_directory' and $refGenomeSource.genomeSource == "indexed": + ## The DataManager is 
conifgured to place the config file in the same directory as the defuse_data: dataset_directory + #set v = $os.path.dirname($config_file) + #else: + #set v = $m.groups()[1] + #end if + #set kv = $kv + [[$k, $v]] + #end if + #end if +#end for +## #echo $kv +#set ref_dict = dict($kv) +## #echo $ref_dict +## include raw $refGenomeSource.config.__str__ # # Configuration file for defuse # @@ -107,12 +154,7 @@ # Directory where the defuse code was unpacked ## Default location in the tool/defuse directory # source_directory = ${__root_dir__}/tools/defuse -source_directory = #slurp -#try -$ref_dict['source_directory'] -#except -__DEFUSE_PATH__ -#end try +source_directory = __DEFUSE_PATH__ # Directory where you want your dataset dataset_directory = #slurp @@ -122,18 +164,68 @@ /project/db/genomes/Hsapiens/hg19/defuse #end try +# Organism IDs +ensembl_organism = #slurp +#try +$ref_dict['ensembl_organism'] +#except +homo_sapiens +#end try + +ensembl_prefix = #slurp +#try +$ref_dict['ensembl_prefix'] +#except +Homo_sapiens +#end try + +ensembl_version = #slurp +#try +$ref_dict['ensembl_version'] +#except +71 +#end try + +ensembl_genome_version = #slurp +#try +$ref_dict['ensembl_genome_version'] +#except +GRCh37 +#end try + +ucsc_genome_version = #slurp +#try +$ref_dict['ucsc_genome_version'] +#except +hg19 +#end try + +ncbi_organism = #slurp +#try +$ref_dict['ncbi_organism'] +#except +Homo_sapiens +#end try + +ncbi_prefix = #slurp +#try +$ref_dict['ncbi_prefix'] +#except +Hs +#end try + # Input genome and gene models gene_models = #slurp #try $ref_dict['gene_models'] #except -\$(dataset_directory)/Homo_sapiens.GRCh37.62.gtf +\$(dataset_directory)/\$(ensembl_prefix).\$(ensembl_genome_version).\$(ensembl_version).gtf #end try genome_fasta = #slurp #try $ref_dict['genome_fasta'] #except -\$(dataset_directory)/Homo_sapiens.GRCh37.62.dna.chromosome.fa +\$(dataset_directory)/\$(ensembl_prefix).\$(ensembl_genome_version).\$(ensembl_version).dna.chromosomes.fa #end try # Repeat table 
from ucsc genome browser @@ -163,71 +255,28 @@ #try $ref_dict['unigene_fasta'] #except -\$(dataset_directory)/Hs.seq.uniq +\$(dataset_directory)/\$(ncbi_prefix).seq.uniq #end try # Paths to external tools -bowtie_bin = #slurp -#try -$ref_dict['bowtie_bin'] -#except -__BOWTIE_BIN__ -#end try -bowtie_build_bin = #slurp -#try -$ref_dict['bowtie_build_bin'] -#except -__BOWTIE_BUILD_BIN__ -#end try -blat_bin = #slurp -#try -$ref_dict['blat_bin'] -#except -__BLAT_BIN__ -#end try -fatotwobit_bin = #slurp -#try -$ref_dict['fatotwobit_bin'] -#except -__FATOTWOBIT_BIN__ -#end try -gmap_bin = #slurp -#try -$ref_dict['gmap_bin'] -#except -__GMAP_BIN__ -#end try -gmap_bin = #slurp -#try -$ref_dict['gmap_bin'] -#except -__GMAP_BIN__ -#end try -gmap_setup_bin = #slurp -#try -$ref_dict['gmap_setup_bin'] -#except -__GMAP_SETUP_BIN__ -#end try -r_bin = #slurp -#try -$ref_dict['r_bin'] -#except -__R_BIN__ -#end try -rscript_bin = #slurp -#try -$ref_dict['rscript_bin'] -#except -__RSCRIPT_BIN__ -#end try +bowtie_bin = __BOWTIE_BIN__ +bowtie_build_bin = __BOWTIE_BUILD_BIN__ +blat_bin = __BLAT_BIN__ +fatotwobit_bin = __FATOTWOBIT_BIN__ +gmap_bin = __GMAP_BIN__ +gmap_bin = __GMAP_BIN__ +gmap_setup_bin = __GMAP_SETUP_BIN__ +r_bin = __R_BIN__ +rscript_bin = __RSCRIPT_BIN__ # Directory where you want your dataset gmap_index_directory = #slurp #try $ref_dict['gmap_index_directory'] #except -\$(dataset_directory)/gmap +#raw +$(dataset_directory)/gmap +#end raw #end try #raw @@ -282,9 +331,15 @@ #except --phred33-quals #end try +bowtie_params = #slurp +#try +$ref_dict['bowtie_params'] +#except +--chunkmbs 200 +#end try max_insert_size = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_insert_size.__str__ != "": -$refGenomeSource.defuse_param.max_insert_size +#if $defuse_param.settings == "full" and $defuse_param.max_insert_size.__str__ != "": +$defuse_param.max_insert_size #else #try $ref_dict['max_insert_size'] @@ -335,8 +390,8 @@ # Minimum 
gene fusion range dna_concordant_length = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.dna_concordant_length.__str__ != "": -$refGenomeSource.defuse_param.dna_concordant_length +#if $defuse_param.settings == "full" and $defuse_param.dna_concordant_length.__str__ != "": +$defuse_param.dna_concordant_length #else #try $ref_dict['dna_concordant_length'] @@ -347,8 +402,8 @@ # Trim length for discordant reads (split reads are not trimmed) discord_read_trim = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.discord_read_trim.__str__ != "": -$refGenomeSource.defuse_param.discord_read_trim +#if $defuse_param.settings == "full" and $defuse_param.discord_read_trim.__str__ != "": +$defuse_param.discord_read_trim #else #try $ref_dict['discord_read_trim'] @@ -356,11 +411,21 @@ 50 #end try #end if - +# Calculate extra annotations, fusion splice index and interrupted index +calculate_extra_annotations = #slurp +#if $defuse_param.settings == "full" and $defuse_param.calculate_extra_annotations.__str__ != "": +$defuse_param.calculate_extra_annotations +#else +#try +$ref_dict['calculate_extra_annotations'] +#except +no +#end try +#end if # Filtering parameters clustering_precision = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.clustering_precision.__str__ != "" -$refGenomeSource.defuse_param.clustering_precision +#if $defuse_param.settings == "full" and $defuse_param.clustering_precision.__str__ != "" +$defuse_param.clustering_precision #else #try $ref_dict['clustering_precision'] @@ -369,8 +434,8 @@ #end try #end if span_count_threshold = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.span_count_threshold.__str__ != "" -$refGenomeSource.defuse_param.span_count_threshold +#if $defuse_param.settings == "full" and $defuse_param.span_count_threshold.__str__ != "" 
+$defuse_param.span_count_threshold #else #try $ref_dict['span_count_threshold'] @@ -378,19 +443,9 @@ 5 #end try #end if -split_count_threshold = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_count_threshold.__str__ != "" -$refGenomeSource.defuse_param.split_count_threshold -#else -#try -$ref_dict['split_count_threshold'] -#except -3 -#end try -#end if percent_identity_threshold = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.percent_identity_threshold.__str__ != "" -$refGenomeSource.defuse_param.percent_identity_threshold +#if $defuse_param.settings == "full" and $defuse_param.percent_identity_threshold.__str__ != "" +$defuse_param.percent_identity_threshold #else #try $ref_dict['percent_identity_threshold'] @@ -398,29 +453,9 @@ 0.90 #end try #end if -max_dist_pos = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_dist_pos.__str__ != "" -$refGenomeSource.defuse_param.max_dist_pos -#else -#try -$ref_dict['max_dist_pos'] -#except -600 -#end try -#end if -num_dist_genes = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.num_dist_genes.__str__ != "" -$refGenomeSource.defuse_param.num_dist_genes -#else -#try -$ref_dict['num_dist_genes'] -#except -500 -#end try -#end if split_min_anchor = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_min_anchor.__str__ != "" -$refGenomeSource.defuse_param.split_min_anchor +#if $defuse_param.settings == "full" and $defuse_param.split_min_anchor.__str__ != "" +$defuse_param.split_min_anchor #else #try $ref_dict['split_min_anchor'] @@ -428,19 +463,9 @@ 4 #end try #end if -max_concordant_ratio = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_concordant_ratio.__str__ != "" -$refGenomeSource.defuse_param.max_concordant_ratio -#else -#try 
-$ref_dict['max_concordant_ratio'] -#except -0.1 -#end try -#end if splice_bias = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.splice_bias.__str__ != "" -$refGenomeSource.defuse_param.splice_bias +#if $defuse_param.settings == "full" and $defuse_param.splice_bias.__str__ != "" +$defuse_param.splice_bias #else #try $ref_dict['splice_bias'] @@ -449,8 +474,8 @@ #end try #end if denovo_assembly = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.denovo_assembly.__str__ != "" -$refGenomeSource.defuse_param.denovo_assembly +#if $defuse_param.settings == "full" and $defuse_param.denovo_assembly.__str__ != "" +$defuse_param.denovo_assembly #else #try $ref_dict['denovo_assembly'] @@ -459,8 +484,8 @@ #end try #end if probability_threshold = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.probability_threshold.__str__ != "" -$refGenomeSource.defuse_param.probability_threshold +#if $defuse_param.settings == "full" and $defuse_param.probability_threshold.__str__ != "" +$defuse_param.probability_threshold #else #try $ref_dict['probability_threshold'] @@ -470,10 +495,23 @@ #end if positive_controls = \$(data_directory)/controls.txt +# Use multiple exon transcripts for stats calculations (yes/no) +# should be enabled for very small libraries +multi_exon_transcripts_stats = #slurp +#if $defuse_param.settings == "full" and $defuse_param.multi_exon_transcripts_stats.__str__ != "" +$defuse_param.multi_exon_transcripts_stats +#else +#try +$ref_dict['multi_exon_transcripts_stats'] +#except +no +#end try +#end if + # Position density when calculating covariance covariance_sampling_density = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.covariance_sampling_density.__str__ != "" -$refGenomeSource.defuse_param.covariance_sampling_density +#if $defuse_param.settings == "full" and 
$defuse_param.covariance_sampling_density.__str__ != "" +$defuse_param.covariance_sampling_density #else #try $ref_dict['covariance_sampling_density'] @@ -482,12 +520,30 @@ #end try #end if +# Maximum number of alignments for a read pair +# Pairs with more alignments are filtered +max_paired_alignments = #slurp +#if $defuse_param.settings == "full" and $defuse_param.max_paired_alignments.__str__ != "" +$defuse_param.max_paired_alignments +#else +#try +$ref_dict['max_paired_alignments'] +#except +10 +#end try +#end if # Number of reads for each job in split -reads_per_job = 1000000 - -# Number of regions for each breakpoint sequence job in split -regions_per_job = 20 +reads_per_job = #slurp +#if $defuse_param.settings == "full" and $defuse_param.reads_per_job.__str__ != "" +$defuse_param.reads_per_job +#else +#try +$ref_dict['reads_per_job'] +#except +1000000 +#end try +#end if #raw # If you have command line 'mail' and wish to be notified @@ -497,40 +553,10 @@ remove_job_files = yes remove_job_temp_files = yes -# Converting to fastq -# Fastq converter config format 1 for reads stored in separate files for each end -# data_lane_rexex_N is a perl regex which stores the lane id in $1 -# data_end_regex_N is a perl regex which stores the end, 1 or 2, in $1 -# data_compress_regex_N is a perl regex which stores the compression extension in $1 -# data_convert_N is the associated conversion utility that takes data at stdin and outputs fastq at stdout -# Fastq converter config format 2 for reads stored in separate files for each end -# data_lane_regex_N is a perl regex which stores the lane id in $1 -# data_compress_regex_N is a perl regex which stores the compression extension in $1 -# data_end1_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 1 at stdout -# data_end2_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 2 at stdout +qsub_params = "" -data_lane_regex_1 = 
^(.+)_[12]_export\.txt.*$ -data_end_regex_1 = ^.+_([12])_export\.txt.*$ -data_compress_regex_1 = ^.+_[12]_export\.txt(.*)$ -data_converter_1 = $(scripts_directory)/fq_all2std.pl export2std - -data_lane_regex_2 = ^(.+)_[12]_concat_qseq\.txt.*$ -data_end_regex_2 = ^.+_([12])_concat_qseq\.txt.*$ -data_compress_regex_2 = ^.+_[12]_concat_qseq\.txt(.*)$ -data_converter_2 = $(scripts_directory)/qseq2fastq.pl - -data_lane_regex_3 = ^(.+)\.bam.*$ -data_compress_regex_3 = ^.+\.bam(.*)$ -data_end1_converter_3 = samtools view - | filter_sam_mate.pl 1 | sam_to_fastq.pl -data_end2_converter_3 = samtools view - | filter_sam_mate.pl 2 | sam_to_fastq.pl - -data_lane_regex_4 = ^(.+).[12].fastq.*$ -data_end_regex_4 = ^.+.([12]).fastq.*$ -data_compress_regex_4 = ^.+.[12].fastq(.*)$ -data_converter_4 = cat #end raw -#end if @@ -588,29 +614,42 @@ cp $defuse_config $config_txt ## make a data_dir and ln -s the input fastq mkdir -p data_dir -ln -s $left_pairendreads data_dir/reads_1.fastq -ln -s $right_pairendreads data_dir/reads_2.fastq +## ln -s "$left_pairendreads" data_dir/reads_1.fastq +## ln -s "$right_pairendreads" data_dir/reads_2.fastq +cp "$left_pairendreads" data_dir/reads_1.fastq +cp "$right_pairendreads" data_dir/reads_2.fastq ## ln to output_dir in from_work_dir #if $defuse_out.__str__ != 'None': -mkdir -p $defuse_out.extra_files_path -ln -s $defuse_out.extra_files_path output_dir +mkdir -p $defuse_out.dataset.extra_files_path +ln -s $defuse_out.dataset.extra_files_path output_dir #else mkdir -p output_dir #end if ## run defuse.pl -perl \${DEFUSE_PATH}/scripts/defuse.pl -c $defuse_config -d data_dir -o output_dir -p 8 +perl \${DEFUSE_PATH}/scripts/defuse.pl -name "$library_name" -c $defuse_config -1 data_dir/reads_1.fastq -2 data_dir/reads_2.fastq -o output_dir -p \$GALAXY_SLOTS ## copy primary results to output datasets if [ -e output_dir/log/defuse.log ]; then cp output_dir/log/defuse.log $defuse_log; fi -if [ -e output_dir/results.tsv ]; then cp output_dir/results.tsv 
$results_tsv; fi +## if [ -e output_dir/results.tsv ]; then cp output_dir/results.tsv $results_tsv; fi if [ -e output_dir/results.filtered.tsv ]; then cp output_dir/results.filtered.tsv $results_filtered_tsv; fi if [ -e output_dir/results.classify.tsv ]; then cp output_dir/results.classify.tsv $results_classify_tsv; fi +#if $breakpoints_bam: +if [ -e output_dir/results.filtered.tsv ] ${amp}${amp} [ -e output_dir/breakpoints.genome.psl ] +then + awk "\\$10 ~ /^(`awk '\\$1 ~ /[0-9]+/{print \\$1}' output_dir/results.filtered.tsv | tr '\n' '|'`)\\$/{print \\$0}" output_dir/breakpoints.genome.psl > breakpoints.genome.filtered.psl ${amp}${amp} + psl2sam.pl breakpoints.genome.filtered.psl > breakpoints.genome.filtered.sam ${amp}${amp} + samtools view -b -T /panfs/roc/rissdb/galaxy/genomes/NCBIM37/defuse/defuse.reference.fa -o breakpoints.genome.filtered.bam breakpoints.genome.filtered.sam ${amp}${amp} + samtools sort breakpoints.genome.filtered.bam breakpoints ${amp}${amp} + ## samtools index breakpoints.bam + cp breakpoints.bam $fusions_bam +fi +#end if ## create html with links for output_dir #if $defuse_out.__str__ != 'None': if [ -e $defuse_out ] then echo '${lt}html${gt}${lt}head${gt}${lt}title${gt}Defuse Output${lt}/title${gt}${lt}/head${gt}${lt}body${gt}' ${gt} $defuse_out echo '${lt}h2${gt}Defuse Output Files${lt}/h2${gt}${lt}ul${gt}' ${gt}${gt} $defuse_out - pushd $defuse_out.extra_files_path + pushd $defuse_out.dataset.extra_files_path for f in `find -L . 
-maxdepth 1 -type f`; do fn=`basename ${ds}f`; echo '${lt}li${gt}${lt}a href="'${ds}fn'"${gt}'${ds}fn'${lt}/a${gt}${lt}/li${gt}' ${gt}${gt} $defuse_out; done @@ -623,8 +662,8 @@ #if $fusion_reads.__str__ != 'None': if [ -e output_dir/results.filtered.tsv -a -e $fusion_reads ] then - mkdir -p $fusion_reads.extra_files_path - results2html output_dir/results.filtered.tsv $fusion_reads $fusion_reads.extra_files_path + mkdir -p $fusion_reads.dataset.extra_files_path + results2html output_dir/results.filtered.tsv $fusion_reads $fusion_reads.dataset.extra_files_path fi #end if @@ -753,4 +792,5 @@ 3596 TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG 250 7.00711162298275e-72 0.00912124762512338 0.00684237452309549 N N 3.31745197152461 3.47233119514066 3.31745197152461 splitr 7 0.0157657657657656 0 0 N 0.0135135135135136 N N 0 0 ENSG00000156860 ENSG00000212932 - + 16 21 30682131 48111157 coding upstream FBRS RPL23AP4 30670289 48110676 + + 0.0157657657657656 30680678 9827473 - + Y - - N output_dir 2 1 1.11111111111111 1 1 1 N N 0 1 9 0.325530693397641 0.296465452915709 0.325530693397641 0.296465452915709 2 - - + diff -r f65857c1b92e -r 
b22f8634ff84 defuse_bamfastq.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/defuse_bamfastq.xml Sun Jan 17 14:11:06 2016 -0500 @@ -0,0 +1,63 @@ + + + converts a bam file to fastq files. + + macros.xml + + + + + bamfastq + #if $pair == True : + $pair + #end if + #if $multiple == True : + $multiple + #end if + #if $rename == True : + $rename + #end if + -b $bamfile + -1 $fastq1 + -2 $fastq2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + bamfastq converts a bam file input into a pair of fastq files that can be used as input to deFuse. + + + diff -r f65857c1b92e -r b22f8634ff84 defuse_results_to_vcf.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/defuse_results_to_vcf.py Sun Jan 17 14:11:06 2016 -0500 @@ -0,0 +1,273 @@ +#!/usr/bin/env python +""" +# +#------------------------------------------------------------------------------ +# University of Minnesota +# Copyright 2012, Regents of the University of Minnesota +#------------------------------------------------------------------------------ +# Author: +# +# James E Johnson +# Jesse Erdmann +# +#------------------------------------------------------------------------------ +""" + + +""" +This tool takes the defuse results.tsv tab-delimited file as input and creates a Variant Call Format file as output. +""" + +import sys,re,os.path +import optparse +from optparse import OptionParser + +""" +http://www.1000genomes.org/wiki/analysis/variant-call-format/vcf-variant-call-format-version-42 + +5. INFO keys used for structural variants +When the INFO keys reserved for encoding structural variants are used for imprecise variants, the values should be best estimates. When a key reflects a property of a single alt allele (e.g. SVLEN), then when there are multiple alt alleles there will be multiple values for the key corresponding to each alelle (e.g. SVLEN=-100,-110 for a deletion with two distinct alt alleles). +The following INFO keys are reserved for encoding structural variants. 
In general, when these keys are used by imprecise variants, the values should be best estimates. When a key reflects a property of a single alt allele (e.g. SVLEN), then when there are multiple alt alleles there will be multiple values for the key corresponding to each alelle (e.g. SVLEN=-100,-110 for a deletion with two distinct alt alleles). +##INFO= +##INFO= +##INFO= +For precise variants, END is POS + length of REF allele - 1, and the for imprecise variants the corresponding best estimate. +##INFO= +Value should be one of DEL, INS, DUP, INV, CNV, BND. This key can be derived from the REF/ALT fields but is useful for filtering. +##INFO= +One value for each ALT allele. Longer ALT alleles (e.g. insertions) have positive values, shorter ALT alleles (e.g. deletions) have negative values. +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +For precise variants, the consensus sequence the alternate allele assembly is derivable from the REF and ALT fields. However, the alternate allele assembly file may contain additional information about the characteristics of the alt allele contigs. +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +6. FORMAT keys used for structural variants +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +These keys are analogous to GT/GQ/GL and are provided for genotyping imprecise events by copy number (either because there is an unknown number of alternate alleles or because the haplotypes cannot be determined). CN specifies the integer copy number of the variant in this sample. CNQ is encoded as a phred quality -10log_10p(copy number genotype call is wrong). CNL specifies a list of log10 likelihoods for each potential copy number, starting from zero. When possible, GT/GQ/GL should be used instead of (or in addition to) these keys. 
+ +Specifying Complex Rearrangements with Breakends +An arbitrary rearrangement event can be summarized as a set of novel adjacencies. +Each adjacency ties together 2 breakends. The two breakends at either end of a novel adjacency are called mates. +There is one line of VCF (i.e. one record) for each of the two breakends in a novel adjacency. A breakend record is identified with the tag "SVTYPE=BND" in the INFO field. The REF field of a breakend record indicates a base or sequence s of bases beginning at position POS, as in all VCF records. The ALT field of a breakend record indicates a replacement for s. This "breakend replacement" has three parts: +the string t that replaces s. The string t may be an extended version of s if some novel bases are inserted during the formation of the novel adjacency. +The position p of the mate breakend, indicated by a string of the form "chr:pos". This is the location of the first mapped base in the piece being joined at this novel adjacency. +The direction that the joined sequence continues in, starting from p. This is indicated by the orientation of square brackets surrounding p. +These 3 elements are combined in 4 possible ways to create the ALT. In each of the 4 cases, the assertion is that s is replaced with t, and then some piece starting at position p is joined to t. 
The cases are: +REF ALT Meaning +s t[p[ piece extending to the right of p is joined after t +s t]p] reverse comp piece extending left of p is joined after t +s ]p]t piece extending to the left of p is joined before t +s [p[t reverse comp piece extending right of p is joined before t + +Examples: +#CHROM POS ID REF ALT QUAL FILT INFO +2 321681 bnd_W G G]17:198982] 6 PASS SVTYPE=BND;MATEID=bnd_Y +2 321682 bnd_V T ]13:123456]T 6 PASS SVTYPE=BND;MATEID=bnd_U +13 123456 bnd_U C C[2:321682[ 6 PASS SVTYPE=BND;MATEID=bnd_V +13 123457 bnd_X A [17:198983[A 6 PASS SVTYPE=BND;MATEID=bnd_Z +17 198982 bnd_Y A A]2:321681] 6 PASS SVTYPE=BND;MATEID=bnd_W +17 198983 bnd_Z C [13:123457[C 6 PASS SVTYPE=BND;MATEID=bnd_X +""" + +vcf_header = """\ +##fileformat=VCFv4.1 +##source=defuse +##reference=%s +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO\ +""" + +def cmp_alphanumeric(s1,s2): + if s1 == s2: + return 0 + a1 = re.findall("\d+|[a-zA-Z]+",s1) + a2 = re.findall("\d+|[a-zA-Z]+",s2) + for i in range(min(len(a1),len(a2))): + if a1[i] == a2[i]: + continue + if a1[i].isdigit() and a2[i].isdigit(): + return int(a1[i]) - int(a2[i]) + return 1 if a1[i] > a2[i] else -1 + return len(a1) - len(a2) + +def __main__(): + # VCF functions + chr_dict = dict() + def add_vcf_line(chr,pos,id,line): + if chr not in chr_dict: + pos_dict = dict() + chr_dict[chr] = pos_dict + if pos not in chr_dict[chr]: + id_dict = dict() + chr_dict[chr][pos] = id_dict + chr_dict[chr][pos][id] = line + + def write_vcf(): + print >> outputFile, vcf_header % (refname) + for chr in sorted(chr_dict.keys(),cmp=cmp_alphanumeric): + for pos in sorted(chr_dict[chr].keys()): + for id in chr_dict[chr][pos]: + print >> outputFile, chr_dict[chr][pos][id] + #Parse Command Line + parser = optparse.OptionParser() + # files + 
parser.add_option( '-i', '--input', dest='input', help='The input defuse results.tsv file (else read from stdin)' ) + parser.add_option( '-o', '--output', dest='output', help='The output vcf file (else write to stdout)' ) + parser.add_option( '-r', '--reference', dest='reference', default=None, help='The genomic reference id' ) + (options, args) = parser.parse_args() + + # results.tsv input + if options.input != None: + try: + inputPath = os.path.abspath(options.input) + inputFile = open(inputPath, 'r') + except Exception, e: + print >> sys.stderr, "failed: %s" % e + exit(2) + else: + inputFile = sys.stdin + # vcf output + if options.output != None: + try: + outputPath = os.path.abspath(options.output) + outputFile = open(outputPath, 'w') + except Exception, e: + print >> sys.stderr, "failed: %s" % e + exit(3) + else: + outputFile = sys.stdout + + refname = options.reference if options.reference else 'unknown' + + svtype = 'SVTYPE=BND' + filt = 'PASS' + columns = [] + try: + for linenum,line in enumerate(inputFile): + ## print >> sys.stderr, "%d: %s\n" % (linenum,line) + fields = line.strip().split('\t') + if line.startswith('cluster_id'): + columns = fields + ## print >> sys.stderr, "columns: %s\n" % columns + continue + cluster_id = fields[columns.index('cluster_id')] + gene_chromosome1 = fields[columns.index('gene_chromosome1')] + gene_chromosome2 = fields[columns.index('gene_chromosome2')] + genomic_strand1 = fields[columns.index('genomic_strand1')] + genomic_strand2 = fields[columns.index('genomic_strand2')] + gene1 = fields[columns.index('gene1')] + gene2 = fields[columns.index('gene2')] + gene_info = 'GENEID=%s,%s' % (gene1,gene2) + gene_name1 = fields[columns.index('gene_name1')] + gene_name2 = fields[columns.index('gene_name2')] + gene_name_info = 'GENE=%s,%s' % (gene_name1,gene_name2) + gene_location1 = fields[columns.index('gene_location1')] + gene_location2 = fields[columns.index('gene_location2')] + gene_loc = 'GENELOC=%s,%s' % 
(gene_location1,gene_location2) + expression1 = int(fields[columns.index('expression1')]) + expression2 = int(fields[columns.index('expression2')]) + expr = 'EXPR=%d,%d' % (expression1,expression2) + genomic_break_pos1 = int(fields[columns.index('genomic_break_pos1')]) + genomic_break_pos2 = int(fields[columns.index('genomic_break_pos2')]) + breakpoint_homology = int(fields[columns.index('breakpoint_homology')]) + homlen = 'HOMLEN=%s' % breakpoint_homology + orf = fields[columns.index('orf')] == 'Y' + exonboundaries = fields[columns.index('exonboundaries')] == 'Y' + read_through = fields[columns.index('read_through')] == 'Y' + interchromosomal = fields[columns.index('interchromosomal')] == 'Y' + adjacent = fields[columns.index('adjacent')] == 'Y' + altsplice = fields[columns.index('altsplice')] == 'Y' + deletion = fields[columns.index('deletion')] == 'Y' + eversion = fields[columns.index('eversion')] == 'Y' + inversion = fields[columns.index('inversion')] == 'Y' + span_count = int(fields[columns.index('span_count')]) + splitr_count = int(fields[columns.index('splitr_count')]) + splice_score = int(fields[columns.index('splice_score')]) + probability = fields[columns.index('probability')] if columns.index('probability') else '.' 
+ splitr_sequence = fields[columns.index('splitr_sequence')] + split_seqs = splitr_sequence.split('|') + mate_id1 = "bnd_%s_1" % cluster_id + mate_id2 = "bnd_%s_2" % cluster_id + ref1 = split_seqs[0][-1] + ref2 = split_seqs[1][0] + b1 = '[' if genomic_strand1 == '+' else ']' + b2 = '[' if genomic_strand2 == '+' else ']' + alt1 = "%s%s%s:%d%s" % (ref1,b2,gene_chromosome2,genomic_break_pos2,b2) + alt2 = "%s%s:%d%s%s" % (b1,gene_chromosome1,genomic_break_pos1,b1,ref2) + #TODO evaluate what should be included in the INFO field + info = ['DP=%d' % (span_count + splitr_count),'SPLITCNT=%d' % splitr_count,'SPANCNT=%d' % span_count,gene_name_info,gene_info,gene_loc,expr,homlen,'SPLICESCORE=%d' % splice_score] + if orf: + info.append('ORF') + if exonboundaries: + info.append('EXONBND') + if interchromosomal: + info.append('INTERCHROM') + if read_through: + info.append('READTHROUGH') + if adjacent: + info.append('ADJACENT') + if altsplice: + info.append('ALTSPLICE') + if deletion: + info.append('DELETION') + if eversion: + info.append('EVERSION') + if inversion: + info.append('INVERSION') + info1 = [svtype,'MATEID=%s;MATELOC=%s:%d' % (mate_id2,gene_chromosome2,genomic_break_pos2)] + info + info2 = [svtype,'MATEID=%s;MATELOC=%s:%d' % (mate_id1,gene_chromosome1,genomic_break_pos1)] + info + qual = int(float(fields[columns.index('probability')]) * 255) if columns.index('probability') else '.' 
+ vcf1 = '%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s'% (gene_chromosome1,genomic_break_pos1, mate_id1, ref1, alt1, qual, filt, ';'.join(info1) ) + vcf2 = '%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s'% (gene_chromosome2,genomic_break_pos2, mate_id2, ref2, alt2, qual, filt, ';'.join(info2) ) + add_vcf_line(gene_chromosome1,genomic_break_pos1,mate_id1,vcf1) + add_vcf_line(gene_chromosome2,genomic_break_pos2,mate_id2,vcf2) + write_vcf() + except Exception, e: + print >> sys.stderr, "failed: %s" % e + sys.exit(1) + +if __name__ == "__main__" : __main__() + diff -r f65857c1b92e -r b22f8634ff84 defuse_results_to_vcf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/defuse_results_to_vcf.xml Sun Jan 17 14:11:06 2016 -0500 @@ -0,0 +1,34 @@ + + + generate a VCF from a DeFuse Results file + + defuse + + defuse_results_to_vcf.py --input $defuse_results --reference ${defuse_results.metadata.dbkey} --output $vcf + + + + + + + + + + + + + + + + + +**Defuse Results to VCF** + +Generates a VCF_ Variant Call Format file from a DeFuse_ results.tsv file. + +This program relies on the header line of the results.tsv to determine which columns to use for generating the VCF file. + +.. _VCF: http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41 +.. 
_DeFuse: http://sourceforge.net/apps/mediawiki/defuse + + diff -r f65857c1b92e -r b22f8634ff84 defuse_trinity_analysis.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/defuse_trinity_analysis.py Sun Jan 17 14:11:06 2016 -0500 @@ -0,0 +1,466 @@ +#!/usr/bin/env python +""" +# +#------------------------------------------------------------------------------ +# University of Minnesota +# Copyright 2014, Regents of the University of Minnesota +#------------------------------------------------------------------------------ +# Author: +# +# James E Johnson +# +#------------------------------------------------------------------------------ +""" + + +""" +This tool takes the defuse results.tsv tab-delimited file, trinity +and creates a tabular report + +Would it be possible to create 2 additional files from the deFuse-Trinity comparison program. +One containing all the Trinity records matched to deFuse records (with the deFuse ID number), +and the other with the ORFs records matching back to the Trinity records in the first files? 
+ +M045_Report.csv +"","deFuse_subset.count","deFuse.gene_name1","deFuse.gene_name2","deFuse.span_count","deFuse.probability","deFuse.gene_chromosome1","deFuse.gene_location1","deFuse.gene_chromosome2","deFuse.gene_location2","deFuse_subset.type" +"1",1,"Rps6","Dennd4c",7,0.814853504,"4","coding","4","coding","TIC " + + + +OS03_Matched_Rev.csv +"count","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","ID1","protein" + +"","deFuse.splitr_sequence","deFuse.gene_chromosome1","deFuse.gene_chromosome2","deFuse.gene_location1","deFuse.gene_location2","deFuse.gene_name1","deFuse.gene_name2","deFuse.span_count","deFuse.probability","word1","word2","fusion_part_1","fusion_part_2","fusion_point","fusion_point_rc","count","transcript" + +""" + +import sys,re,os.path,math +import textwrap +import optparse +from optparse import OptionParser + +revcompl = lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A','a':'t','c':'g','g':'c','t':'a','N':'N','n':'n'}[B] for B in x][::-1]) + +codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L", + "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S", + "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*", + "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W", + "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L", + "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P", + "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q", + "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R", + "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M", + "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T", + "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K", + "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R", + "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V", + "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A", + "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E", + "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",} + +def translate(seq) : + rna = seq.upper().replace('T','U') + aa = [] + for i in range(0,len(rna) - 2, 3): + codon = rna[i:i+3] + aa.append(codon_map[codon] if codon in codon_map else 'X') + return ''.join(aa) + +def 
get_stop_codons(seq) : + rna = seq.upper().replace('T','U') + stop_codons = [] + for i in range(0,len(rna) - 2, 3): + codon = rna[i:i+3] + aa = codon_map[codon] if codon in codon_map else 'X' + if aa == '*': + stop_codons.append(codon) + return stop_codons + +def read_fasta(fp): + name, seq = None, [] + for line in fp: + line = line.rstrip() + if line.startswith(">"): + if name: yield (name, ''.join(seq)) + name, seq = line, [] + else: + seq.append(line) + if name: yield (name, ''.join(seq)) + + +def test_rcomplement(seq, target): + try: + comp = revcompl(seq) + return comp in target + except: + pass + return False + +def test_reverse(seq,target): + return options.test_reverse and seq and seq[::-1] in target + +def cmp_alphanumeric(s1,s2): + if s1 == s2: + return 0 + a1 = re.findall("\d+|[a-zA-Z]+",s1) + a2 = re.findall("\d+|[a-zA-Z]+",s2) + for i in range(min(len(a1),len(a2))): + if a1[i] == a2[i]: + continue + if a1[i].isdigit() and a2[i].isdigit(): + return int(a1[i]) - int(a2[i]) + return 1 if a1[i] > a2[i] else -1 + return len(a1) - len(a2) + +def parse_defuse_results(inputFile): + defuse_results = [] + columns = [] + coltype_int = ['expression1', 'expression2', 'gene_start1', 'gene_start2', 'gene_end1', 'gene_end2', 'genomic_break_pos1', 'genomic_break_pos2', 'breakpoint_homology', 'span_count', 'splitr_count', 'splice_score'] + coltype_float = ['probability'] + coltype_yn = [ 'orf', 'exonboundaries', 'read_through', 'interchromosomal', 'adjacent', 'altsplice', 'deletion', 'eversion', 'inversion'] + try: + for linenum,line in enumerate(inputFile): + ## print >> sys.stderr, "%d: %s\n" % (linenum,line) + fields = line.strip().split('\t') + if line.startswith('cluster_id'): + columns = fields + ## print >> sys.stderr, "columns: %s\n" % columns + continue + elif fields and len(fields) == len(columns): + cluster_id = fields[columns.index('cluster_id')] + cluster = dict() + flags = [] + defuse_results.append(cluster) + for i,v in enumerate(columns): + if v in 
coltype_int: + cluster[v] = int(fields[i]) + elif v in coltype_float: + cluster[v] = float(fields[i]) + elif v in coltype_yn: + cluster[v] = fields[i] == 'Y' + if cluster[v]: + flags.append(columns[i]) + else: + cluster[v] = fields[i] + cluster['flags'] = ','.join(flags) + except Exception, e: + print >> sys.stderr, "failed to read cluster_dict: %s" % e + exit(1) + return defuse_results + +## deFuse params to the mapping application? + +def __main__(): + #Parse Command Line + parser = optparse.OptionParser() + # files + parser.add_option( '-i', '--input', dest='input', default=None, help='The input defuse results.tsv file (else read from stdin)' ) + parser.add_option( '-t', '--transcripts', dest='transcripts', default=None, help='Trinity transcripts' ) + parser.add_option( '-p', '--peptides', dest='peptides', default=None, help='Trinity ORFs' ) + parser.add_option( '-o', '--output', dest='output', default=None, help='The output report (else write to stdout)' ) + parser.add_option( '-m', '--matched', dest='matched', default=None, help='The output matched report' ) + parser.add_option( '-a', '--transcript_alignment', dest='transcript_alignment', default=None, help='The output alignment file' ) + parser.add_option( '-A', '--orf_alignment', dest='orf_alignment', default=None, help='The output ORF alignment file' ) + parser.add_option( '-N', '--nbases', dest='nbases', type='int', default=12, help='Number of bases on either side of the fusion to compare' ) + parser.add_option( '-L', '--min_pep_len', dest='min_pep_len', type='int', default=100, help='Minimum length of peptide to report' ) + parser.add_option( '-T', '--ticdist', dest='ticdist', type='int', default=1000000, help='Maximum intrachromosomal distance to be classified a Transcription-induced chimera (TIC)' ) + parser.add_option( '-P', '--prior_aa', dest='prior_aa', type='int', default=11, help='Number of protein AAs to show preceeding fusion point' ) + parser.add_option( '-I', '--incomplete_orfs', 
dest='incomplete_orfs', action='store_true', default=False, help='Count incomplete ORFs' ) + parser.add_option( '-O', '--orf_type', dest='orf_type', action='append', default=['complete','5prime_partial'], choices=['complete','5prime_partial','3prime_partial','internal'], help='ORF types to report' ) + parser.add_option( '-r', '--readthrough', dest='readthrough', type='int', default=3, help='Number of stop_codons to read through' ) + # min_orf_len + # split_na_len + # tic_len = 1000000 + # prior + # deFuse direction reversed + # in frame ? + # contain known protein elements + # what protein change + # trinity provides full transctipt, defuse doesn't show full + #parser.add_option( '-r', '--reference', dest='reference', default=None, help='The genomic reference fasta' ) + #parser.add_option( '-g', '--gtf', dest='gtf', default=None, help='The genomic reference gtf feature file') + (options, args) = parser.parse_args() + + # results.tsv input + if options.input != None: + try: + inputPath = os.path.abspath(options.input) + inputFile = open(inputPath, 'r') + except Exception, e: + print >> sys.stderr, "failed: %s" % e + exit(2) + else: + inputFile = sys.stdin + # vcf output + if options.output != None: + try: + outputPath = os.path.abspath(options.output) + outputFile = open(outputPath, 'w') + except Exception, e: + print >> sys.stderr, "failed: %s" % e + exit(3) + else: + outputFile = sys.stdout + outputTxFile = None + outputOrfFile = None + if options.transcript_alignment: + try: + outputTxFile = open(options.transcript_alignment,'w') + except Exception, e: + print >> sys.stderr, "failed: %s" % e + exit(3) + if options.orf_alignment: + try: + outputOrfFile = open(options.orf_alignment,'w') + except Exception, e: + print >> sys.stderr, "failed: %s" % e + exit(3) + # Add percent match after transcript + report_fields = 
['gene_name1','gene_name2','span_count','probability','gene_chromosome1','gene_location1','gene_chromosome2','gene_location2','fusion_type','Transcript','coverage','Protein','flags','alignments1','alignments2'] + report_fields = ['cluster_id','gene_name1','gene_name2','span_count','probability','genomic_bkpt1','gene_location1','genomic_bkpt2','gene_location2','fusion_type','Transcript','coverage','Protein','flags','alignments1','alignments2'] + report_colnames = {'gene_name1':'Gene 1','gene_name2':'Gene 2','span_count':'Span cnt','probability':'Probability','gene_chromosome1':'From Chr','gene_location1':'Fusion point','gene_chromosome2':'To Chr','gene_location2':'Fusion point', 'cluster_id':'cluster_id', 'splitr_sequence':'splitr_sequence', 'splitr_count':'splitr_count', 'splitr_span_pvalue':'splitr_span_pvalue', 'splitr_pos_pvalue':'splitr_pos_pvalue', 'splitr_min_pvalue':'splitr_min_pvalue', 'adjacent':'adjacent', 'altsplice':'altsplice', 'break_adj_entropy1':'break_adj_entropy1', 'break_adj_entropy2':'break_adj_entropy2', 'break_adj_entropy_min':'break_adj_entropy_min', 'breakpoint_homology':'breakpoint_homology', 'breakseqs_estislands_percident':'breakseqs_estislands_percident', 'cdna_breakseqs_percident':'cdna_breakseqs_percident', 'deletion':'deletion', 'est_breakseqs_percident':'est_breakseqs_percident', 'eversion':'eversion', 'exonboundaries':'exonboundaries', 'expression1':'expression1', 'expression2':'expression2', 'gene1':'gene1', 'gene2':'gene2', 'gene_align_strand1':'gene_align_strand1', 'gene_align_strand2':'gene_align_strand2', 'gene_end1':'gene_end1', 'gene_end2':'gene_end2', 'gene_start1':'gene_start1', 'gene_start2':'gene_start2', 'gene_strand1':'gene_strand1', 'gene_strand2':'gene_strand2', 'genome_breakseqs_percident':'genome_breakseqs_percident', 'genomic_break_pos1':'genomic_break_pos1', 'genomic_break_pos2':'genomic_break_pos2', 'genomic_strand1':'genomic_strand1', 'genomic_strand2':'genomic_strand2', 'interchromosomal':'interchromosomal', 
'interrupted_index1':'interrupted_index1', 'interrupted_index2':'interrupted_index2', 'inversion':'inversion', 'library_name':'library_name', 'max_map_count':'max_map_count', 'max_repeat_proportion':'max_repeat_proportion', 'mean_map_count':'mean_map_count', 'min_map_count':'min_map_count', 'num_multi_map':'num_multi_map', 'num_splice_variants':'num_splice_variants', 'orf':'orf', 'read_through':'read_through', 'repeat_proportion1':'repeat_proportion1', 'repeat_proportion2':'repeat_proportion2', 'span_coverage1':'span_coverage1', 'span_coverage2':'span_coverage2', 'span_coverage_max':'span_coverage_max', 'span_coverage_min':'span_coverage_min', 'splice_score':'splice_score', 'splicing_index1':'splicing_index1', 'splicing_index2':'splicing_index2', 'fusion_type':'Type', 'coverage':'fusion%','Transcript':'Transcript?','Protein':'Protein?','flags':'descriptions','fwd_seq':'fusion','alignments1':'alignments1','alignments2':'alignments2','genomic_bkpt1':'From Chr', 'genomic_bkpt2':'To Chr'} + + ## Read defuse results + fusions = parse_defuse_results(inputFile) + ## Create a field with the 12 nt before and after the fusion point. + ## Create a field with the reverse complement of the 24 nt fusion point field. 
+ ## Add fusion type filed (INTER, INTRA, TIC) + for i,fusion in enumerate(fusions): + fusion['ordinal'] = i + 1 + fusion['genomic_bkpt1'] = "%s:%d" % (fusion['gene_chromosome1'], fusion['genomic_break_pos1']) + fusion['genomic_bkpt2'] = "%s:%d" % (fusion['gene_chromosome2'], fusion['genomic_break_pos2']) + fusion['alignments1'] = "%s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1']) + fusion['alignments2'] = "%s%s%s" % (fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2']) + split_seqs = fusion['splitr_sequence'].split('|') + fusion['split_seqs'] = split_seqs + fusion['split_seqs'] = split_seqs + fusion['split_seq_lens'] = [len(split_seqs[0]),len(split_seqs[1])] + fusion['split_max_lens'] = [len(split_seqs[0]),len(split_seqs[1])] + fwd_off = min(abs(options.nbases),len(split_seqs[0])) + rev_off = min(abs(options.nbases),len(split_seqs[1])) + fusion['fwd_off'] = fwd_off + fusion['rev_off'] = rev_off + fwd_seq = split_seqs[0][-fwd_off:] + split_seqs[1][:rev_off] + rev_seq = revcompl(fwd_seq) + fusion['fwd_seq'] = fwd_seq + fusion['rev_seq'] = rev_seq + fusion_type = 'inter' if fusion['gene_chromosome1'] != fusion['gene_chromosome2'] else 'intra' if abs(fusion['genomic_break_pos1'] - fusion['genomic_break_pos2']) > options.ticdist else 'TIC' + fusion['fusion_type'] = fusion_type + fusion['transcripts'] = dict() + fusion['Transcript'] = 'No' + fusion['coverage'] = 0 + fusion['Protein'] = 'No' + # print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fwd_seq,rev_seq,fusion_type,fusion['gene_name1'],fusion['gene_name2']) + inputFile.close() + + ## Process Trinity data and compare to deFuse + matched_transcripts = dict() + matched_orfs = dict() + transcript_orfs = dict() + fusions_with_transcripts = set() + fusions_with_orfs = set() + ## fusion['transcripts'][tx_id] { revcompl:?, bkpt:n, seq1: , seq2: , match1:n, match2:n} + n = 0 + if options.transcripts: + with 
open(options.transcripts) as fp: + for tx_full_id, seq in read_fasta(fp): + n += 1 + for i,fusion in enumerate(fusions): + if fusion['fwd_seq'] in seq or fusion['rev_seq'] in seq: + fusions_with_transcripts.add(i) + fusion['Transcript'] = 'Yes' + tx_id = tx_full_id.lstrip('>').split()[0] + matched_transcripts[tx_full_id] = seq + fusion['transcripts'][tx_id] = dict() + fusion['transcripts'][tx_id]['seq'] = seq + fusion['transcripts'][tx_id]['full_id'] = tx_full_id + pos = seq.find(fusion['fwd_seq']) + if pos >= 0: + tx_bkpt = pos + fusion['fwd_off'] + # fusion['transcripts'][tx_full_id] = tx_bkpt + if tx_bkpt > fusion['split_max_lens'][0]: + fusion['split_max_lens'][0] = tx_bkpt + len2 = len(seq) - tx_bkpt + if len2 > fusion['split_max_lens'][1]: + fusion['split_max_lens'][1] = len2 + fusion['transcripts'][tx_id]['bkpt'] = tx_bkpt + fusion['transcripts'][tx_id]['revcompl'] = False + fusion['transcripts'][tx_id]['seq1'] = seq[:tx_bkpt] + fusion['transcripts'][tx_id]['seq2'] = seq[tx_bkpt:] + else: + pos = seq.find(fusion['rev_seq']) + tx_bkpt = pos + fusion['rev_off'] + # fusion['transcripts'][tx_full_id] = -tx_bkpt + if tx_bkpt > fusion['split_max_lens'][1]: + fusion['split_max_lens'][1] = tx_bkpt + len2 = len(seq) - tx_bkpt + if len2 > fusion['split_max_lens'][0]: + fusion['split_max_lens'][0] = len2 + rseq = revcompl(seq) + pos = rseq.find(fusion['fwd_seq']) + tx_bkpt = pos + fusion['fwd_off'] + fusion['transcripts'][tx_id]['bkpt'] = tx_bkpt + fusion['transcripts'][tx_id]['revcompl'] = True + fusion['transcripts'][tx_id]['seq1'] = rseq[:tx_bkpt] + fusion['transcripts'][tx_id]['seq2'] = rseq[tx_bkpt:] + fseq = fusion['split_seqs'][0] + tseq = fusion['transcripts'][tx_id]['seq1'] + mlen = min(len(fseq),len(tseq)) + fusion['transcripts'][tx_id]['match1'] = mlen + for j in range(1,mlen+1): + if fseq[-j] != tseq[-j]: + fusion['transcripts'][tx_id]['match1'] = j - 1 + break + fseq = fusion['split_seqs'][1] + tseq = fusion['transcripts'][tx_id]['seq2'] + mlen = 
min(len(fseq),len(tseq)) + fusion['transcripts'][tx_id]['match2'] = mlen + for j in range(mlen): + if fseq[j] != tseq[j]: + fusion['transcripts'][tx_id]['match2'] = j + break + # coverage = math.floor(float(fusion['transcripts'][tx_id]['match1'] + fusion['transcripts'][tx_id]['match2']) * 100. / len(fusion['split_seqs'][0]+fusion['split_seqs'][1])) + coverage = int((fusion['transcripts'][tx_id]['match1'] + fusion['transcripts'][tx_id]['match2']) * 1000. / len(fusion['split_seqs'][0]+fusion['split_seqs'][1])) * .1 + # print >> sys.stderr, "%s\t%d\t%d\t%d\%s\t\t%d\t%d\t%d\t%d" % (tx_id,fusion['transcripts'][tx_id]['match1'],fusion['transcripts'][tx_id]['match2'],len(fusion['split_seqs'][0]+fusion['split_seqs'][1]),coverage,len( fusion['split_seqs'][0]),len(fusion['transcripts'][tx_id]['seq1']),len(fusion['split_seqs'][1]),len(fusion['transcripts'][tx_id]['seq2'])) + fusion['coverage'] = max(coverage,fusion['coverage']) + print >> sys.stdout, "fusions_with_transcripts: %d %s\n matched_transcripts: %d" % (len(fusions_with_transcripts),fusions_with_transcripts,len(matched_transcripts)) + ##for i,fusion in enumerate(fusions): + ## print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fusion['fwd_seq'],fusion['rev_seq'],fusion['fusion_type'],fusion['gene_name1'],fusion['gene_name2'], fusion['transcripts']) + ## Process ORFs and compare to matched deFuse and Trinity data. + ## Proteins must be at least 100 aa long, starting at the first "M" and must end with an "*". 
+ if options.peptides: + with open(options.peptides) as fp: + for orf_full_id, seq in read_fasta(fp): + n += 1 + if len(seq) < options.min_pep_len: + continue + orf_type = re.match('^.* type:(\S+) .*$',orf_full_id).groups()[0] + ## if not seq[-1] == '*' and not options.incomplete_orfs: + ## if not orf_type 'complete' and not options.incomplete_orfs: + if orf_type not in options.orf_type: + continue + for i,fusion in enumerate(fusions): + if len(fusion['transcripts']) > 0: + for tx_id in fusion['transcripts']: + ## >m.196252 g.196252 ORF g.196252 m.196252 type:complete len:237 (+) comp100000_c5_seq2:315-1025(+) + ## >m.134565 g.134565 ORF g.134565 m.134565 type:5prime_partial len:126 (-) comp98702_c1_seq21:52-429(-) + if tx_id+':' not in orf_full_id: + continue + m = re.match("^.*%s:(\d+)-(\d+)[(]([+-])[)].*" % re.sub('([|.{}()$?^])','[\\1]',tx_id),orf_full_id) + if m: + if not m.groups() or len(m.groups()) < 3 or m.groups()[0] == None: + print >> sys.stderr, "Error:\n%s\n%s\n" % (tx_id,orf_full_id) + orf_id = orf_full_id.lstrip('>').split()[0] + if not tx_id in transcript_orfs: + transcript_orfs[tx_id] = [] + alignments = "%s%s%s %s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1'], fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2']) + # print >> sys.stdout, "%d %s bkpt:%d %s rc:%s (%s) %s" % (fusion['ordinal'], tx_id, int(fusion['transcripts'][tx_id]['bkpt']), str(m.groups()), str(fusion['transcripts'][tx_id]['revcompl']), alignments, orf_full_id) + start = seq.find('M') + pep_len = len(seq) + if pep_len - start < options.min_pep_len: + continue + orf_dict = dict() + transcript_orfs[tx_id].append(orf_dict) + fusions_with_orfs.add(i) + matched_orfs[orf_full_id] = seq + fusion['Protein'] = 'Yes' + tx_start = int(m.groups()[0]) + tx_end = int(m.groups()[1]) + tx_strand = m.groups()[2] + tx_bkpt = fusion['transcripts'][tx_id]['bkpt'] + orf_dict['orf_id'] = orf_id + orf_dict['tx_start'] = tx_start + 
orf_dict['tx_end'] = tx_end + orf_dict['tx_strand'] = tx_strand + orf_dict['tx_bkpt'] = tx_bkpt + orf_dict['seq'] = seq[:start].lower() + seq[start:] if start > 0 else seq + ## >m.208656 g.208656 ORF g.208656 m.208656 type:5prime_partial len:303 (+) comp100185_c2_seq9:2-910(+) + ## translate(tx34[1:910]) + ## translate(tx34[1:2048]) + ## comp99273_c1_seq1 len=3146 (-2772) + ## >m.158338 g.158338 ORF g.158338 m.158338 type:complete len:785 (-) comp99273_c1_seq1:404-2758(-) + ## translate(tx[-2758:-403]) + ## comp100185_c2_seq9 len=2048 (904) + ## novel protein sequence + ## find first novel AA + ## get prior n AAs + ## get novel AA seq thru n stop codons + ### tx_seq = matched_transcripts[tx_full_id] if tx_bkpt >= 0 else revcompl(tx_seq) + tx_seq = fusion['transcripts'][tx_id]['seq'] + orf_dict['tx_seq'] = tx_seq + novel_tx_seq = tx_seq[tx_start - 1:] if tx_strand == '+' else revcompl(tx_seq[:tx_end]) + read_thru_pep = translate(novel_tx_seq) + # fusion['transcripts'][tx_id]['revcompl'] = True + # tx_bkpt = fusion['transcripts'][tx_id]['bkpt'] + # bkpt_aa_pos = tx_bkpt - tx_start - 1 + # bkpt_aa_pos = (tx_bkpt - tx_start - 1) / 3 if tx_strand == '+' else tx_end + # print >> sys.stdout, "%s\n%s" % (seq,read_thru_pep) + stop_codons = get_stop_codons(novel_tx_seq) + if options.readthrough: + readthrough = options.readthrough + 1 + read_thru_pep = '*'.join(read_thru_pep.split('*')[:readthrough]) + stop_codons = stop_codons[:readthrough] + orf_dict['read_thru_pep'] = read_thru_pep + orf_dict['stop_codons'] = ','.join(stop_codons) + print >> sys.stdout, "fusions_with_orfs: %d %s\n matched_orfs: %d" % (len(fusions_with_orfs),fusions_with_orfs,len(matched_orfs)) + ## Alignments 3 columns, seq columns padded out to longest seq, UPPERCASE_match diffs lowercase + ### defuse_id pre_split_seq post_split_seq + ### trinity_id pre_split_seq post_split_seq + ## Transcripts alignment output + ## Peptide alignment output + ## Write reports + ## OS03_Matched_Rev.csv + ## 
"count","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","ID1","protein" + if options.transcripts and options.matched: + #match_fields = ['ordinal','gene_name1','gene_name2','fwd_seq'] + outputMatchFile = open(options.matched,'w') + #print >> outputMatchFile, '\t'.join(["#fusion_id","cluster_id","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","Trinity_ORF_Transcript","Trinity_ORF_ID","protein","read_through","stop_codons"]) + print >> outputMatchFile, '\t'.join(["#fusion_id","cluster_id","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","Trinity_ORF_Transcript","Trinity_ORF_ID","protein","stop_codons"]) + for i,fusion in enumerate(fusions): + if len(fusion['transcripts']) > 0: + for tx_id in fusion['transcripts'].keys(): + if tx_id in transcript_orfs: + for orf_dict in transcript_orfs[tx_id]: + if 'tx_seq' not in orf_dict: + print >> sys.stderr, "orf_dict %s" % orf_dict + #fields = [str(fusion['ordinal']),str(fusion['cluster_id']),fusion['gene_name1'],fusion['gene_name2'],fusion['fwd_seq'],fusion['splitr_sequence'],tx_id, fusion['transcripts'][tx_id]['seq1']+'|'+fusion['transcripts'][tx_id]['seq2'],orf_dict['tx_seq'],orf_dict['orf_id'],orf_dict['seq'],orf_dict['read_thru_pep'],orf_dict['stop_codons']] + fields = [str(fusion['ordinal']),str(fusion['cluster_id']),fusion['gene_name1'],fusion['gene_name2'],fusion['fwd_seq'],fusion['splitr_sequence'],tx_id, fusion['transcripts'][tx_id]['seq1']+'|'+fusion['transcripts'][tx_id]['seq2'],orf_dict['tx_seq'],orf_dict['orf_id'],orf_dict['read_thru_pep'],orf_dict['stop_codons']] + print >> outputMatchFile, '\t'.join(fields) + outputMatchFile.close() + if options.transcripts and options.transcript_alignment: + if outputTxFile: + id_fields = 
['gene_name1','alignments1','gene_name2','alignments2','span_count','probability','gene_chromosome1','gene_location1','gene_chromosome2','gene_location2','fusion_type','Transcript','Protein','flags'] + fa_width = 80 + for i,fusion in enumerate(fusions): + if len(fusion['transcripts']) > 0: + alignments1 = "%s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1']) + alignments2 = "%s%s%s" % (fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2']) + alignments = "%s%s%s %s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1'], fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2']) + fusion_id = "%s (%s) %s" % (i + 1,alignments,' '.join([str(fusion[x]) for x in report_fields])) + for tx_id in fusion['transcripts'].keys(): + m1 = fusion['transcripts'][tx_id]['match1'] + f_seq1 = fusion['split_seqs'][0][:-m1].lower() + fusion['split_seqs'][0][-m1:] + t_seq1 = fusion['transcripts'][tx_id]['seq1'][:-m1].lower() + fusion['transcripts'][tx_id]['seq1'][-m1:] + if len(f_seq1) > len(t_seq1): + t_seq1 = t_seq1.rjust(len(f_seq1),'.') + elif len(f_seq1) < len(t_seq1): + f_seq1 = f_seq1.rjust(len(t_seq1),'.') + m2 = fusion['transcripts'][tx_id]['match2'] + f_seq2 = fusion['split_seqs'][1][:m2] + fusion['split_seqs'][1][m2:].lower() + t_seq2 = fusion['transcripts'][tx_id]['seq2'][:m2] + fusion['transcripts'][tx_id]['seq2'][m2:].lower() + if len(f_seq2) > len(t_seq2): + t_seq2 = t_seq2.ljust(len(f_seq2),'.') + elif len(f_seq2) < len(t_seq2): + f_seq2 = f_seq2.ljust(len(t_seq2),'.') + print >> outputTxFile, ">%s\n%s\n%s" % (fusion_id,'\n'.join(textwrap.wrap(f_seq1,fa_width)),'\n'.join(textwrap.wrap(f_seq2,fa_width))) + print >> outputTxFile, "%s bkpt:%d rev_compl:%s\n%s\n%s" % 
(fusion['transcripts'][tx_id]['full_id'],fusion['transcripts'][tx_id]['bkpt'],str(fusion['transcripts'][tx_id]['revcompl']),'\n'.join(textwrap.wrap(t_seq1,fa_width)),'\n'.join(textwrap.wrap(t_seq2,fa_width))) + """ + if options.peptides and options.orf_alignment: + pass + """ + print >> outputFile,"%s\t%s" % ('#','\t'.join([report_colnames[x] for x in report_fields])) + for i,fusion in enumerate(fusions): + print >> outputFile,"%s\t%s" % (i + 1,'\t'.join([str(fusion[x]) for x in report_fields])) + +if __name__ == "__main__" : __main__() + diff -r f65857c1b92e -r b22f8634ff84 defuse_trinity_analysis.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/defuse_trinity_analysis.xml Sun Jan 17 14:11:06 2016 -0500 @@ -0,0 +1,55 @@ + + + verify fusions with trinity + + + + defuse_trinity_analysis.py --input $defuse_results --transcripts $trinity_transcripts --peptides $trinity_orfs + --nbases $nbases --min_pep_len $min_pep_len --ticdist $ticdist --readthrough=$readthrough + #if 'matched' in str($outputs).split(','): + --matched="$matched_output" + #end if + #if 'aligned' in str($outputs).split(','): + --transcript_alignment="$aligned_output" + #end if + --output $output + + + + + + + + + + + + + + + + + (outputs and 'matched' in outputs) + + + (outputs and 'aligned' in outputs) + + + + + + + + + + +**Defuse Results** + +Verifies DeFuse_ fusion predictions in results.tsv with TrinityRNAseq_ assembled transcripts and ORFs. + +This program relies on the header line of the results.tsv to determine which columns to use for analysis. + +.. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse +.. 
_TrinityRNAseq: http://trinityrnaseq.github.io/ + + diff -r f65857c1b92e -r b22f8634ff84 macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Sun Jan 17 14:11:06 2016 -0500 @@ -0,0 +1,28 @@ + + 0.6.2 + + defuse + + + samtools + bowtie + gmap + blat + + + R + ada + + + + + + + + + + 10.1371/journal.pcbi.1001138 + + + + diff -r f65857c1b92e -r b22f8634ff84 test-data/mm10_results.filtered.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mm10_results.filtered.tsv Sun Jan 17 14:11:06 2016 -0500 @@ -0,0 +1,46 @@ +cluster_id splitr_sequence splitr_count splitr_span_pvalue splitr_pos_pvalue splitr_min_pvalue adjacent altsplice break_adj_entropy1 break_adj_entropy2 break_adj_entropy_min breakpoint_homology breakseqs_estislands_percident cdna_breakseqs_percident deletion est_breakseqs_percident eversion exonboundaries expression1 expression2 gene1 gene2 gene_align_strand1 gene_align_strand2 gene_chromosome1 gene_chromosome2 gene_end1 gene_end2 gene_location1 gene_location2 gene_name1 gene_name2 gene_start1 gene_start2 gene_strand1 gene_strand2 genome_breakseqs_percident genomic_break_pos1 genomic_break_pos2 genomic_strand1 genomic_strand2 interchromosomal interrupted_index1 interrupted_index2 inversion library_name max_map_count max_repeat_proportion mean_map_count min_map_count num_multi_map num_splice_variants orf read_through repeat_proportion1 repeat_proportion2 span_count span_coverage1 span_coverage2 span_coverage_max span_coverage_min splice_score splicing_index1 splicing_index2 probability +8647 GTGCTCCTGCTGCCAGGCGCAGCTGGGCGACATTGGCACGTCCTGTTACACCAAGAGCGGCATGATCCTTTGCAGAAATGACTACATTAGGT|GAAGATTGTAAAAAATTGACATCAGAAATATTTACAGAAATAGATACCTGTTTGAATAAAGTTAGAGATGAAATTTTTGCTAAACTTCAACCGAAGCTTAGATGCACATTAGGTGACATGGAAAGTCCTGTGTTTGCACTTCCTG 4 0.849232794977309 0.875860929877954 0.794775556794258 N N 3.72551845106187 3.02448896101185 3.02448896101185 1 0.0366733649981732 0 N 0 N Y 2482 2085 ENSMUSG00000028266 ENSMUSG00000041264 + - 3 5 
144205220 149215434 coding coding Lmo4 Uspl1 144188530 149184350 - + 0 144201813 149198645 - - Y - - N dataset_6344_files 1 0 1 1 0 1 N N 0 0 5 0.831776131589327 0.982288003019777 0.982288003019777 0.831776131589327 4 - - 0.950339354539546 +12095 CTTCCAGGGTCCCCCGAGCCTAATGGATGCCGAGACAGACGAGGGCATGGACTATACAGGCTGTAGCCCTGGAGCGGCGTCCTCAGAGTCTTCCACCATGGACCGTAGCTGTTCCAGCACCC|CTGGCCCTTGACATCTAGCACCCCTTCACCCTCTTCCTGGGGACCCAGCAGGTGGTATGTGGCCGTGGAGCCCTCCGGGCTGTGGCTGTCCTTCCCAGGAGAGGATGACGTAGACTCGTTGCTGACAGGGGAGATGTCACTGCTGC 6 0.869976758916331 0.907802910133282 0.774396849342807 Y Y 3.64400585547602 3.28189243439864 3.28189243439864 0 0.983667499542934 0 Y 0 N N 0 1453 ENSMUSG00000086606 ENSMUSG00000028975 - + 4 4 148948914 149099876 intron coding Gm13205 Pex14 148947492 148960535 - - 0 148948907 148961548 + - N - - N dataset_6344_files 1 0 1 1 0 1 N Y 0 0 5 0.760481034595956 1.02189639023832 1.02189639023832 0.760481034595956 4 - - 0.976704594819493 +4068 CGACACGCCGGGGCTGGCTGAGGAAAACAAAACGAAGCCCCTGGAGACCCGGCTTATCTCAGAGACCACAGCTATTTGCAAACCAGAGCAGGTGGCCAAACAAATTGTCAAAGATGCCATA|TACCGTTCCCCAGCTGAAGAGTTCTGAATCCACGCCGGATCCTTCTCAACAGTCTGTTTTACGGGAACTTTTATTAACCACTCCTTCCCCGTGATGCAGTTCTGAATCCTCCCTGTAGCAGGGGGTCTTCACTCATGCCTGAAGATGTTTCTTTTCC 8 1 0.541521196503195 0.26082452496772 N Y 3.46128890676658 3.81976875220098 3.46128890676658 2 0 0 N 0.794128973014604 N N 775 35409 ENSMUSG00000009905 ENSMUSG00000022816 + - 1 16 106759742 37836514 coding intron Kdsr Fstl1 106720410 37776873 - + 0.00358422939068104 106734547 37799068 - - Y - - N dataset_6344_files 4 1 1.81818181818182 1 4 1 N N 0 1 11 0.974366325576069 1.17240826166877 1.17240826166877 0.974366325576069 4 - - 0.913877182950088 +12868 GCGGTCTCGGCTCCAGCGGCAGTAGCAGCGGCGCCGGTCCCGTGTGCAGGAGCTCCTTTGCGGCCCAGTTTCTTGGCCATCGCCTGCTCTCCCCACAGCGCCAGGACGAGTCCCGTGCGCGTCCGTCCGCGGAGGTCTTTCTCATCTCGCTCGGCTGCGGGAAATCGGGCTGAAGCGACTGAGTCCGCGATGGAGA|AAACTTTAGAAACTGTTCCTTTGGAGAGGAAAAAGAGAGAAAAGGAAAACTT 69 1 0.439895614069599 0.332872660216425 N Y 
3.30124852419771 2.96787791690762 2.96787791690762 0 0.0997837503861599 0.00401606425702794 N 0.540315106580167 N N 41631 0 ENSMUSG00000004980 ENSMUSG00000085456 + + 6 10 51469894 73327027 coding intron Hnrnpa2b1 Gm15398 51460434 73201399 - - 0.0997837503861599 51467295 73201702 - - Y - - N dataset_6344_files 1 0 1 1 0 1 N N 0 0 41 1.14864322933764 0.720872647377417 1.14864322933764 0.720872647377417 1 - - 0.520098627306064 +5160 TACGGATTCATTCAGTGTTCAGAACGGCAAGCTAGACTTTTCTTCCACTGTTCACAATATAATGGCAACCTCCAAGACTTAAAAGTAGGAGATGATGTTGAATTTGAAGTATCATCTGACCGGAGGACTGGGAAACCTATTGCTATTAAATTGGTGAAGATAAAACCAGAAATACATCCTGAAGAACGAATGAACGGAC|AAGAAGTATTTTATCTGACTTACACCCCTGAAGATGTGGAAGGGAAAGTTCAGC 33 1 0.767323324767952 0.471361574015707 N Y 3.3022815939676 3.55553920790569 3.3022815939676 91 0.944663167104112 0.00393700787401585 N 0.944663167104112 N N 10681 0 ENSMUSG00000068823 ENSMUSG00000087940 + - 3 18 103058189 28309891 coding upstream Csde1 SNORA17 103020546 28309760 + + 0 103040040 28188917 + - Y - - N dataset_6344_files 2 0 1.03333333333333 1 1 1 N N 0 0 30 1.1011131646754 0.67334258271517 1.1011131646754 0.67334258271517 3 - - 0.835626866413202 +8748 GGGGTAGATCACCTTCCGAGGGTCTCCATGGGTCCAGGCTATGATGCCCACAGCCACATAGCCTACGATGGCCAGGAAGAGCAACACACAGCAAATGACATCTGTGCATCCCCTGTTGTAAATGGGTCCTTTGAAGGTGGGGTCGTATTTCTGAGGCGTCCC|ATAGACTGCGTCCTTCCGATCGTCCTCCATGGCCTCAACCGAGGAGAGCTGAGTCCGAAGCCAGCGCGACCCCAACCCAAGCGGGCGGGAGACACCGCGCGCTGCTGGCCCCGCC 51 1 0.844316325604616 0.871821260113135 Y Y 3.29447165365238 3.73336824417043 3.29447165365238 2 0 0 Y 0.991335627150454 N N 6343 2 ENSMUSG00000057193 ENSMUSG00000003309 - - 9 9 21355025 21312333 coding upstream Slc44a2 Ap1m2 21337828 21295457 + - 0 21337962 21320850 - + N - - N dataset_6344_files 1 0 1 1 0 1 N Y 0 0 52 1.3070767782118 0.958522970688653 1.3070767782118 0.958522970688653 2 - - 0.74403558965543 +11169 
CCACATCTGACAGAACTTGCCACTGTGCCTGCAACCTTGTCTGAGAGGAACCCTTCTCTG|AGGATGGACACTTCTCACACTACAAAGTCCTGTTTGCTGATTCTTCTTGTGGCCCTACTGTGTGCAGAAAGAGCTCAGGGACTGGAGTGTTACCAGTGCTATGGAGTCCCATTTGAGACTTCTTGCCCATCAATTACCTGCCCCTACCCTGATGGAGTCTGTGTTACTCAGGAGGCAGCAGTTATTGTGGATTCTCAAACAAGGAAAGTAAAGAACAATCTTTGCTTACCC 410 1 0.87136611677351 0.978590093310325 Y Y 3.45293525254824 3.55976477294109 3.45293525254824 0 0 0.152910958904109 Y 0.966780821917808 N Y 1416 5049 ENSMUSG00000079018 ENSMUSG00000075602 + - 15 15 75048837 74997634 utr5p utr5p Ly6c1 Ly6a 75045017 74994878 - - 0 75048442 74996568 - + N - - N dataset_6344_files 2 0 1.53082191780822 1 155 2 Y Y 0 0 292 0.728794324821125 1.49719703686079 1.49719703686079 0.728794324821125 4 - - 0.93076564160702 +12600 CTAAAATCGCCAAGCCTGTCAAGTTTGAGCTTTCTGGCTGCACCAGTGTGAAGACATACAGGGCTAAGTTCTGCGGGGTGTGCACAGACGGCCGCTGCTGCACACCGCACAGAACCACCACTCTGCCAGTGGAGTTCAAATGCCCCGATGGCGAGATCATGAAAAAGAATATGATGTTCATCAAG|CCCTGCCCCTGCCATTACCACTGTTCTGGGGTCCCTGGCATGCTCCCATTAC 1 1 0.820229055260519 0.873212995139997 N N 3.28548615724827 3.17936818462789 3.17936818462789 0 0.482950872656755 0.482950872656755 N 0.482950872656755 N N 7298 2735 ENSMUSG00000019997 ENSMUSG00000015133 + + 10 7 24598683 66388350 coding intron Ctgf Lrrk1 24595442 66226912 + - 0.482950872656755 24597524 66355922 + - Y - - N dataset_6344_files 1 0 1 1 0 1 N N 0 0 5 1.02981806768202 0.41192722707281 1.02981806768202 0.41192722707281 3 - - 0.58275607356487 +1855 CCTGGTCACACAGCTCTCCTAGGAAGCGCGTCTCCAATGACTCGTTTGTCAACCCCAGGTGTCTTCCACGTTGTGGTTTCAACTTCATAATTCTCTGAAGTCTTATTCAT|CTCTTTGTCAAGCCCACGGCGTCAGCCTCTGACACGAGGGCGGGCGTCCTCGCCTCCGGGGTGAAAGAGGGGCGCAGCGGGCCGCCCCTCCCCCCCGCCCCCCTACGACACGCGGGGCCCTGCCTCGGGCGACCTGTTGGCAGGGCGCGTCACGTGACGCGGCGGGCCGCGCCGTCCCC 4 0.769054178023969 0.159736089157399 0.42467594723228 Y Y 3.40528677690917 3.51935717616377 3.40528677690917 0 0.981880877742947 0 Y 0.0487460815047023 N N 0 8476 ENSMUSG00000097162 ENSMUSG00000039361 - - 7 7 90129474 90209447 intron upstream AC130210.1 
Picalm 90124858 90130232 - + 0 90125032 90129872 + - N - - N dataset_6344_files 1 0.23489932885906 1 1 0 1 N Y 0 0.23489932885906 9 0.887227873695282 1.18032993911247 1.18032993911247 0.887227873695282 4 - - 0.949188379753168 +1870 CTGGGTCAAGAGCCGGAGGGACAGGACCAGAGCACCCCTTACGCCAGAACTAGCTCTCCTTGTTCCTACTGGGTGACCTCATCTCGCCACGCCTCCTCAGGTGAACACCCGGGCTGGTAACGTCACTTCCTGC|CAGGGTTTCACTATGTAGACCCTGGTCGGCCTGGAACTCTATAGACCAGACTGGCCTCGAGCTCAGATCCGTCCCCCTCTGCTGTCCCAGCACGGGGATTAAGAACGCGCCACCACTACAGCTGACCGGA 2 0.788660184540219 0.550050944563454 0.182990959521483 Y Y 3.63552422018169 3.63193049733206 3.63193049733206 4 0 0 Y 0 N N 1543 697 ENSMUSG00000044786 ENSMUSG00000003444 - + 7 7 28379255 28392708 downstream intron Zfp36 Med29 28376784 28386146 - - 0 28376683 28392166 + - N - - N dataset_6344_files 1 0.869158878504673 1 1 0 1 N Y 0 0.869158878504673 8 1.08526980978798 0.847619486476743 1.08526980978798 0.847619486476743 1 - - 0.841318537051835 +3153 CAGAGCTATGTAGAAAGACCCTGTCTGGTAAGTAAATAAAAACATAGCCAGGCATGGTGGCAATCAGCAGGTAGATTGGAGTTTGAGGTCATCCTGGTCTGGAGAGTGAGTTCCAGGAGAGCCAAGATTACACAAACCC|TGTCTTTTTCTTTCTTTCTGTTGTTGTTGTTGCTGTTCCTGCTGCTGCTGCTGTTTTGCTTTTCATGACAGGGTTTCTCTGTGTCTCTGTGCAACTTTGGCTATCCTGGAATTCACACTGTAGACCAGGCTGGCCTTGAATTCACACAGATCCATCTG 16 1 0.656277930324132 0.671834466829468 Y Y 3.40740634961144 2.41484814640267 2.41484814640267 0 0.9928298971561 0 Y 0.985659794312201 N N 0 30 ENSMUSG00000097379 ENSMUSG00000047786 - - 17 17 17395303 17459387 intron intron AC154200.1 Lix1 17389943 17402672 - + 0 17395298 17411818 + - N - - N dataset_6344_files 1 0.709459459459459 1 1 0 1 N Y 0.685314685314685 0.709459459459459 25 1.13279987445023 1.17240826166877 1.17240826166877 1.13279987445023 1 - - 0.81041504882724 +11596 
GCCGGAGCAGATCAGGCTGAAAGTTGGTGGTGTGGACCCAAAGCAGCTAGCCGTCTATGAAGAGTTTGCACGAAATGTGCCTGGCTTCTTACCTACAAATGACTTAAGTCAGCCTACAGGGTTTTTAGCTCAGCCCATGAA|GTTTCTGGATCAAGATGTGAGCTTTCAGCTGTTGCTTGAGCATCATGCCTGCCTGCCTGCCACCACGCTCTCGGCCAGGATAGTGATGGATTCCTTCCCTCGGACTATAAGCCCCAAATGAACCCTTCCTTCTATGAGTTGCCT 8 0.974101244860941 0.684001003266195 0.530048089286902 N Y 3.49640736351508 3.32783191248938 3.32783191248938 0 0 0 Y 0 N N 6004 544 ENSMUSG00000036550 ENSMUSG00000036564 + + 8 8 95807462 95715119 coding intron Cnot1 Ndrg4 95719451 95676980 - + 0 95739809 95682960 - + N - - N dataset_6344_files 1 1 1 1 0 1 N N 0 1 16 1.12487819700652 1.17240826166877 1.17240826166877 1.12487819700652 1 - - 0.873952352979217 +3783 CTCTGTTCGTGCATCCCTGGGATATGCAGATGGATGGACGAATGGCCAAATACTGGCTGCCTTGGCTCGTAGGTTTGTGCTGGACTAGGGTTGGGAACACAGCACCAACTCTTGGGTTTTTTCTGTTATCCATGGAATTCTGTTCT|TTTCTTTCCGAGTGACCCCCACTTTACGGCGTCTCCCAATGACAAGAGGCGCAGAGCATTGTGGTCCCGCGGGTCCGAGA 2 0.867750583125795 0.425502304916607 0.692249690893423 Y Y 3.22804973003685 3.42048008240514 3.22804973003685 0 0 0 Y 0.987555066079295 N N 0 752 ENSMUSG00000026348 ENSMUSG00000026349 + - 1 1 127767564 127808061 intron upstream Acmsd Ccnt2 127729413 127774164 + + 0 127753955 127773930 + - N - - N dataset_6344_files 1 0 1 1 0 1 N Y 0 0 6 1.02189639023832 0.649577550384046 1.02189639023832 0.649577550384046 3 - - 0.711778687837589 +4912 TAACAGAACAAGAGCAATGTGCTAGAATAGAAGACCAGAAAATGAAATGGTGGAGTTTGA|GGCTGGATAGACAGTTTGAAAGGTAAGTATTGAAAAACACTTGAATTTGGGTCAGTACAAAGGGACATGCAGAGACTTTGAATCATCAAAACTCCAGCATGCATTGTCTTACGGATGTTTAGATAGGTGTGTTTTGGACAACACTCTGGGTTCTTGTAATGATGTTGATCAAATGTCTGAG 15 1 0.895469718833858 0.922600375157455 N Y 3.29868787111796 3.36447795254773 3.29868787111796 0 0.933608815426997 0 N 0.933608815426997 N N 0 0 ENSMUSG00000088422 ENSMUSG00000053332 + - 5 1 79639109 161038539 downstream intron 7SK Gas5 79638815 161034422 - + 0.933608815426997 79638354 161036831 - - Y - - N dataset_6344_files 1 0 1 1 0 1 N N 0 0 5 0.459457291735057 
0.808011099258204 0.808011099258204 0.459457291735057 1 - - 0.592482164323654 +5141 TCTTATCTCTGTGTTGGTGTGCTTCTCTGTTTGATGACAGAGCAGGGTCTTGCTGAACTGCACCTACTGGAGTAGGCAACTGTTTACCAATCCAGTCATATTCATAATCAAACATATACCCTTTTCGATCAAACAAGTCAGTAAAAAGCTTTCTTAGGTAATCATAGTCTGGCTTTTCAAAAAAATCCAGCCTTCGTACGTAACGGAGAT|ACGTTGCCATTTCTGGGAAGTTCTCACACAACACTTCTATTGGTGTGGCTCGTTT 9 1 0.853164221793603 0.997416520640376 N Y 3.60831833727406 3.59847695253922 3.59847695253922 266 0.981886534518113 0.981886534518113 N 0.981886534518113 N N 2227 0 ENSMUSG00000073563 ENSMUSG00000083798 - + 18 X 53955684 53418967 coding intron Csnk1g3 Gm14584 53862113 53418331 + + 0 53932206 53418862 - + Y - - N dataset_6344_files 1 0 1 1 0 1 N N 0 0 17 1.2040949714436 0.990209680463485 1.2040949714436 0.990209680463485 2 - - 0.788432447586541 +1886 CGCGGGGCGCCCGCCGGCTCACACTCCCGCGCTCTCTCTCCGGGTTTGGCGGCCGCCAGGAGGAGGAAGAGGAGGAGGAGGAGAAGAAGAAGGAGGAGGAGTGGAGCGAGCGCAGAAG|TAGCTGCTGCTGCTGGTGGTGACAATGTCAAATAACGGCGTAGACATCCAGGACAAACCCCCAGCCCCTCCGATGAGAAACACCAGCACTATGATTGGAGCCGGCAGCAAAGACACTGGAACCCTAAACCACGGCTCCAAACCTCTGCCTCCAAACCCAGAGGAG 18 1 0.708948431161152 0.783431002546196 Y Y 2.65454686157692 3.52101274054962 2.65454686157692 0 0.983110527572213 0 Y 0.983110527572213 N N 16 2476 ENSMUSG00000042797 ENSMUSG00000030774 - - 7 7 97738247 97912381 upstream utr5p Aqp11 Pak1 97726379 97842935 - + 0 97788974 97854436 + - N - - N dataset_6344_files 1 0.366666666666667 1 1 0 1 N Y 0.366666666666667 0 18 0.950601293244945 1.29915510076809 1.29915510076809 0.950601293244945 4 - - 0.970255256454431 +9297 GTGCTGGGCAGGAAGTCCCGGGCCAGGCAGCCCATGGCCACCAGATTCTTATCAGACAGGGGGCTCTCGCAGGAGACGAGGGGGAAGACATTTGGGAAGGACTGACTCT|CATTTGCGGTGCCTGGTTTCGGAGAGGTCCAGAGTCTTTGTGTGGAATTGTTCCTTCAAAGCCACCGAGGCTGGCTGGTCCATGAGCAGCCAGGTGGATGGGTGGCAGAAGCC 20 0.318065745640931 0.64026046146514 0.56228459372928 N Y 3.05166433869594 3.43614916339873 3.05166433869594 0 0 0 Y 0.990866828485621 N N 1488 0 ENSMUSG00000076617 ENSMUSG00000092748 - + 12 12 113422730 113425227 
coding upstream Ighm AC073553.3 113418950 113425150 - - 0 113422730 113426655 + - N - - N dataset_6344_files 1 0 1 1 0 1 N N 0 0 6 0.665420905271462 0.879306196251575 0.879306196251575 0.665420905271462 4 - - 0.831715391282697 +5326 GTACACTGTAGCTGTCTTCAGACACACCAGAAGAGGGAGTCAGATCTTGTTACGGATGGTTGTGAGCCACCATGTGGTTGCTGGGATTTGAACTCTGGACCTTCGGAAGAGCAGTCGGGTGCTCTTACCCACTGAGCCATCTCACCAGCCCC|GAGTTACACAGTTTTAATGACTTCCTTACTTCTCATGATCTCTATACTGTTTAACTTTCCTCGGTGTGTTTTTAAGTCTTTGCTATCTTGTTCTAGTTTTTTGCAAGAATCACGAAAATGATTCTTGGATTTCCAGACTCTTCCTTTTGAAAGCTG 7 1 0.607242467201396 0.755062684986217 N Y 3.49744043328497 3.49528100778935 3.49528100778935 9 0.960654062340317 0 N 0.967211718616931 N N 2963 0 ENSMUSG00000039671 ENSMUSG00000073647 + + 2 18 165899016 4198969 utr3p downstream Zmynd8 Gm10557 165784152 4198216 - - 0.00979390223130638 165827729 4198107 - - Y - - N dataset_6344_files 476 0.993464052287582 298.461538461538 168 13 1 N N 0.993464052287582 0 13 1.21201664888731 1.34668516543034 1.34668516543034 1.21201664888731 1 - - 0.536685629613134 +1721 CGTACCTCCCTCCCAGCAACCGGCCTGGCGGCAGCGCGGCTACAAAACTGAGGAGGCGGAGCCGAGACGGAGTCGGTACTGCGCTCTGACTCCTAGACCAGGTTTAAGTTTTTGAAGTTGAAGTAGGTCTACACAGTAGGAACCCATGTCTTTTCTTGTAAGTAAACCAGAGCGCATTA|GGGCCAATGAGGCGAGCTCAGAGTCCATAGCATTGTTCTCCAAACCA 8 0.903915350299452 0.697638501162383 0.785699237375327 N Y 3.65384724021086 3.68527868815389 3.65384724021086 142 0.936451401255975 0.343331146311745 N 0.936451401255975 N N 800 0 ENSMUSG00000069631 ENSMUSG00000087034 + + 11 5 106202168 60232198 coding upstream Strada Cbfa2t2-ps1 106163330 60231482 - + 0 106187103 60176541 - + Y - - N dataset_6344_files 1 0 1 1 0 1 N N 0 0 5 0.879306196251575 0.475300646622473 0.879306196251575 0.475300646622473 1 - - 0.738431070020728 +1027 
GCGGCAGCCGGTTCGGGCGGGCGGCATCATGGACGAGAAGTTGTTCACCAAGGAGCTGGACCAGTGGATCGAGCAGCTGAACGAGTGCAAGCAGCTCTCCGAGTCCCAGGTCAAGAGCCTCTGCGAGAAG|TGCTGCCTTTGACAGAGATGACATGCTTATACCATGCGGGTGGCACGAAGCTGTGAAGTGGTGATGACGGGGATGAGCTTGGACATCCTACGAGAGTGGCAAAGGTGAAGCAAGCCCAGGTGCTGGCTGTGCAAGG 2 1 0.148625497007208 0.21082086276253 N N 3.33524062522047 3.43039601505348 3.33524062522047 3 0 0 N 0 N N 10063 1785 ENSMUSG00000020349 ENSMUSG00000035021 + + 11 12 52127778 55014348 coding intron Ppp2ca Baz1a 52098681 54892989 + - 0 52099150 54980379 + - Y - - N dataset_6344_files 1 0 1 1 0 1 N N 0 0 10 0.942679615801238 1.07734813234427 1.07734813234427 0.942679615801238 4 - - 0.83438414122479 +7746 AAAGGGAGTCGAGACTGCCTTCTGCGCGCGCCCGGCTTTGCGCGCCTCCGCCACCAGATGTGGGGGGATGGGAGGCCCCCTCCGCGGCCCCTTCCCCACCCAGCCCAGAAAGCTGAACTGGCAAG|AGGCAATTTGAAACAAGCCACTCCAACCTCTTTTTCAAAGTTATAGGAGGTTCCCTGTCTTGAAGTCTCCTGCCTTGGATTTTCTGAGGTGCTGCTATTCTGGGCAACTATCAAAATCCTACCTGTTAAAACATGATGGATTAGAGAAAAAAAACAACCCAC 26 1 0.707412664952061 0.861438628736582 Y Y 3.14574680596571 3.22961716614005 3.14574680596571 0 0 0 Y 0 N N 0 0 ENSMUSG00000087658 ENSMUSG00000087626 + - 6 6 52162119 52174059 intron intron Gm15051 Gm15050 52156902 52172881 + + 0 52158684 52173634 + - N - - N dataset_6344_files 1 0 1 1 0 1 N Y 0 0 20 1.02189639023832 1.23578168121843 1.23578168121843 1.02189639023832 2 - - 0.746044848941682 +8762 GGGGGCGGGGCTGAGCGCGGCCGCAGCCATTTTGGTGGAAGAGAAACAATAGGACGGAAGCGTCGCGGGACTGGGCTGTGGCCGCAG|AGTGTCCTGGCCTGCACAAACCGAGGAGCTGAGATCAAACAGGTGGCTGTAAGGACAGACAGTGAACGGAGGGCAAGCCGGCCTCTGGACCCCGCTGCCTCCCCTTTCTCCCTGCTGCTCGTGTCCAGAGGATGAGCCCAGCCTTCAGGACCATGGACGTGGAGCCCCGCACCAAGGGCATCC 28 0.498252441818171 0.890426951441059 0.30351938174741 Y Y 3.55041790174621 3.58424216889964 3.55041790174621 0 0.977096322687365 0.0380455528693218 Y 0.977096322687365 N N 0 714 ENSMUSG00000088626 ENSMUSG00000032599 - - 9 9 108795388 108806333 downstream utr5p SNORA28 Ip6k2 108795243 108795994 - + 0 108783870 108796064 + - N - - N dataset_6344_files 1 0 1 1 
0 1 N Y 0 0 8 0.705029292490001 1.27539006843697 1.27539006843697 0.705029292490001 4 - - 0.953988239876003 +13800 GATTGTTAGGGATGGGTCCTTGGTCAGCTGTCCAGAATGCTAGAGCTTCGTCCTCCTGGGAGATGGTTTCAGTCCTTACCCCCAGGACTCTGATGAGATCCTGAGGGAATCCACCCTCTGCTGATGGCCCAGTGAAAGCC|AGGCTCCGTGGCTTTACAGTGGATGGGATCTGGGAGAGTAGAGGGAAGGTTCTAGAACCCTGAAACCAGACCACTCTATTAGCGAACTCACAGCTGCCTTGTGGTGTAGAC 3 0.978409273957491 0.969552944230377 0.898574822628826 N Y 3.329991337985 3.51679652308403 3.329991337985 0 0 0 Y 0 N N 575 443 ENSMUSG00000053799 ENSMUSG00000024987 + - 19 19 37683245 37701528 intron upstream Exoc6 Cyp26a1 37550418 37697808 + + 0 37678397 37696729 + - N - - N dataset_6344_files 1 0 1 1 0 1 N N 0 0 12 0.839697809033035 0.855541163920451 0.855541163920451 0.839697809033035 1 - - 0.80116231284735 +5983 CCATGTCAAACCACCATCCACGGCTGTCGTCCTCGAAGTGCAGGTAGTCCATCCTGTGAGCGCCGTGCGGGAACTGCCTCCGTGTCACTGGGGCGGCGCGCCTGTGGAAT|CCTCCGAAGAGATGGAATCCTTTCCTGCAGCTCGGCAAGGGCCACTTCGCAGAGCTGGATTTCTGAAAGCTTTGCTTGATTTTCAAATATTCTTTAGTAAAGAATGTCTTTGTGGCATTGTTC 1 1 1 0.945022483912395 Y Y 3.49015970162987 3.55334927718654 3.49015970162987 0 0.990947940947941 0.00427350427350426 Y 0.990947940947941 N N 0 60 ENSMUSG00000021718 ENSMUSG00000021720 + + 13 13 105121782 105271039 downstream intron 4933425L06Rik Rnf180 105082122 105149352 + - 0 105131562 105167527 + - N - - N dataset_6344_files 1 0 1 1 0 1 N Y 0 0 5 0.78424606692708 0.974366325576069 0.974366325576069 0.78424606692708 1 - - 0.813708699275449 +12092 TGCCCAATGAGCTGCTGGCACTCAGCACAGGTGTTGGCGAAGGTGTTGTCATAGCACGGAACGCAGTAGGGGCCACTGTCTGTCTGGATGTATTTGCGGCCGTACAAGGACTCGTTGCATTTTGCACAGTCAAATGCCTCGCTCATGGTGGCGGTGCCCAGTGAGC|CCTCAAACTCAAGAAGCCCCATCTCAGTCGGTCTTCTTACTTTGCAAGAGTTTTCAAAGGACTGCGCTGGGTCTCTTCTGCCAAGCGGCTCATGGCTTCCTCTGGGTGCTGAATCATTCTCTGGTGCCTGCAACCACAAGACCTCCTTCC 2 1 0.16313785091306 0.240986946418107 Y Y 3.43784458057793 3.48805370929674 3.43784458057793 0 0.99335436382755 0.00315457413249232 Y 0 N N 2072 0 ENSMUSG00000032643 ENSMUSG00000088067 - - 4 4 
124708611 124697607 utr5p downstream Fhl3 U6 124700701 124697505 + - 0 124705614 124696071 - + N - - N dataset_6344_files 1 0 1 1 0 1 N Y 0 0 9 1.21993832633101 1.18825161655618 1.21993832633101 1.18825161655618 1 - - 0.736022708835765 +11605 CTCCACACCACAGCGTCACCGATGTCCAGCGCTTTCGAGCCGGCACACAGCTGTTGCAGAAGACTACAGGGGTGGGATCCGAGCTGGGAATGTAGAAGGAGGAACAGCGGCCAAAACAGAGATGATTAAGGACCCGGGCACTTGTGCAACCAGGCCTGGAGATCACCTGC|GCCGTACGGACTAAAGCAGCGCGGCGCTCCTCCGCTCCCCGGCCGGAGGCCCCCGGTGTTTCCGCCGCGCAGGCAGCGCCGTAGCCAGCCCCGCTGCCGCGAGGACCCACAGCCAAG 11 0.982340211125991 0.568397069242315 0.586452494205497 Y Y 3.57167229721571 3.28292550416853 3.28292550416853 0 0 0 Y 0.770032051282051 N N 167 2428 ENSMUSG00000053226 ENSMUSG00000033751 - - 8 8 84822823 84835482 intron utr5p Dand5 Gadd45gip1 84815405 84831522 - + 0 84816537 84832174 + - N - - N dataset_6344_files 1 0 1 1 0 1 N Y 0 0 12 1.26746839099326 0.990209680463485 1.26746839099326 0.990209680463485 2 - - 0.758045795414889 +7456 AAATTCTGGAGTCAGTTCTGGAGACATAGATAGCTCCCAAATTATAACCAACCCTCTTCCTCCCGTGGCCTCCCCTCCTCCTGCATCTAAAGCAAAGGAAGTTTCCGATGGGGAAAATCTCGAGCAAGATCTATGTACGTTCTTGATATCAAGAGCCTGTAAGAACTCAACACTGGCTAATTATTTATACTGG|TCACTTTTACTCTGCTATTCCCCTATCTATGCTCCGAAAATCATTCTGACTCCTGCAATGACCCCTATTACCCAGATTTTGAAGATACTCCCTGCTATCTACTCAGCTCTGAAGACTTG 9 1 0.674019218743678 0.827309111200933 N N 3.64400585547602 3.3012485241977 3.3012485241977 0 0 0 N 0 N N 1791 0 ENSMUSG00000033628 ENSMUSG00000056822 + - 18 16 30348126 19487830 coding downstream Pik3c3 Olfr166 30272747 19486793 + + 0 30311220 19493776 + - Y - - N dataset_6344_files 1 1 1 1 0 1 N N 0 1 10 1.37837187520517 0.942679615801238 1.37837187520517 0.942679615801238 1 - - 0.745013975183866 +2385 CGGGACGGGGCGGGGCCGGCGAACTTCTGTGCCTCACTGTCCCCGGACACTGAGGGACACCGGGCAGGCAGCTGGCACCATGAAGATCTGGACTTCGGAGCACGTCTTTGAC|TACATCCGGTTTTCCCAGATCTGTGCAAAAGCAGTGAGGGATGCCCTGAAGACCGAGTTCAAAGCGAACGCTGAGAAGACTTCGGGCAGCAGCATAAAAATTGTGAAAGTCTCGAAGAAGGAGTAGCTGAATCTGAAGCCTGAAGTGCTGAGTCTTGAA 7 1 0.168340481582645 
0.162692865850249 Y Y 3.69784855983781 3.60831833727406 3.60831833727406 0 0 0 Y 0 N Y 3085 3617 ENSMUSG00000016257 ENSMUSG00000016252 + - 2 2 174473081 174464105 coding coding Slmo2 Atp5e 174465067 174461072 - - 0 174472832 174462629 - + N - - N dataset_6344_files 2 0.190909090909091 1.2 1 1 1 Y Y 0.190909090909091 0 5 0.871384518807867 1.29915510076809 1.29915510076809 0.871384518807867 4 - - 0.7622459420951 +13923 AGACTGTTGAGAAGGATTCAACTGCCGAATTCAGAACTCATCAGCTGGGGAACGACGGTGATAAAGGTTCCCGTAAAGCAGACTGTTGAGAAGGATTCAACTGCCGAATTCAGAACTCATCAG|CCAGAGTCGGCGCTCTCCGGCGAGCTATCCCCTTCTCACCACACTCTGAGAACGGAGCTTGGTGCCGGCTCGGCCGCCTCCGCCAATTCCGGGTCCCTCTTCA 4 0.0503617839630236 0.900541946891923 0.647988756014038 N Y 3.6988816296077 3.47080361183081 3.47080361183081 0 0.980668063812497 0 N 0.980668063812497 N N 590 367 ENSMUSG00000046079 ENSMUSG00000051671 - - 5 8 105832436 126425434 intron upstream Lrrc8d 1810063B05Rik 105699969 126422501 + + 0 105728723 126422269 - - Y - - N dataset_6344_files 10 0.968503937007874 8 4 7 1 N N 0.968503937007874 0 7 0.974366325576069 0.617890840609215 0.974366325576069 0.617890840609215 4 - - 0.812231785796686 +11177 ATCTGACAGAACTTGCCACTGTGCCTGCAACCTTGTCTGAGAGGAA|CCCTTCTCTGAGGATGGACACTTCTCACACTACAAAGTCCTGTTTGCTGATTCTTCTTGTGGCCCTACTGTGTGCAGAAAGAGCTCAGGGACTGGAGTGTTACCAGTGCTATGGAGTCCCATTTGAGACTTCTTGCCCATCAATTACCTGCCCCTACCCTGATGGAGTCTGTGTTACTCAGGAGGCAGCAGTTATTGTGGATTCTCAAACAAGGA 434 0.873359569934578 0.121885088610831 0.552181129473186 Y Y 3.55081912933034 3.43784458057793 3.43784458057793 0 0 0.00381679389312994 Y 0.956687686691006 N Y 1416 5049 ENSMUSG00000079018 ENSMUSG00000075602 + - 15 15 75048837 74997634 utr5p utr5p Ly6c1 Ly6a 75045017 74994878 - - 0 75048442 74996568 - + N - - N dataset_6344_files 2 0 1.65079365079365 1 41 2 Y Y 0 0 63 0.665420905271462 1.36252852031776 1.36252852031776 0.665420905271462 4 - - 0.566839236883877 +3839 
GTCACAGCCACCAATGTGTCAGCCCATGGAAGCCAAGCTAACTCGCCCTCTACTCCCAACTCAGCGGGTGGATACCCTTCGCCATGTTATCAGCCAGACAGGAGGATACAGTGACGGACTCGCAGCCAGTCAGATGTACAGTCCGCAGGGCATCAGT|GATATGTCAAGAACCTACTGATCCTCACAAGAACCTACTGTCTCTCTTCTCTTGACTGAAAACAAAAGTCTTCTTCTACCTGACCCGTGGCCTGACTTCTGGAAGAATGCATATGGACTTTCGAAGAAGTCAGAGGATATCTGCTGGCCACTTGTATCAGACAAAACAAGGCTGGTGAGCA 13 1 0.804385830976137 0.0961220231684031 Y Y 3.42779550918039 3.49697642496855 3.42779550918039 0 0 0 Y 0.993649362117881 N N 3895 0 ENSMUSG00000052534 ENSMUSG00000093538 + + 1 1 168432270 168122465 coding intron Pbx1 Gm20711 168153527 168115244 - + 0 168183530 168121480 - + N - - N dataset_6344_files 1 0.745098039215686 1 1 0 1 N Y 0 0.745098039215686 16 1.29915510076809 1.21201664888731 1.29915510076809 1.21201664888731 4 - - 0.97679698937665 +66 CTGAGAACGAGGAGCAGGAAGAACACACCAGCATGGGCGCGTTCAACGATCCGTTCCTGGCTCAGCCCCCCGATGAAGATTCACATTCCAGTTTTCCTGATGGTGAACAAATAGACCCTGAAAATCTCCACTTCAACCCTGATGAAGGAGGTGGA|AGACTGCTTGTTCTTGGAACCCAGCAGCCATACTGTGAGGAAGTCCAAACCAGCCAACCTGGAGAGACGGCATGCACAGGGTCCCACGGATA 13 0.719595391393931 0.399734668051015 0.808813953308524 Y Y 3.39487633072086 3.67849247003875 3.39487633072086 0 0 0 Y 0.989174263674614 N N 1514 627 ENSMUSG00000018548 ENSMUSG00000046442 + + 11 11 87220683 87359023 coding intron Trim37 Ppm1e 87127077 87226906 + - 0 87218328 87250589 + - N - - N dataset_6344_files 1 1 1 1 0 1 N Y 0 1 11 0.863462841364159 0.689185937602586 0.863462841364159 0.689185937602586 1 - - 0.615916282680748 +11601 GGCACTGGGAACCCGAGCGCAGCTTGGACAAGTGGACTGCACGCAGCGCCTGCCTGGTCTTTCACACTTCTCTGGGGACCTCAGGAGAGGAAGGAAGACTTTTGGATATGG|GGTGCTTCGAGTGCTGCATTAAATGCCTGGGAGGTATTCCCTATGCTTCTCTGATTGCAACCATCCTGCTGTATGCAGGCGTTGCCCTGTTCTGTGGCTGTGGCCATGAAGCCCTTTCTGGAACAGTCAACATTCTGCAGACCTACTTTGAGTTGGCAAGGACTGCTG 45 1 0.903984782676282 0.890308208832683 Y Y 3.25993583872649 3.64553343878587 3.25993583872649 0 0 0 Y 0.991023166023166 N N 195 2438 ENSMUSG00000039375 ENSMUSG00000031517 - - 8 8 54887184 55060877 intron coding Wdr17 Gpm6a 
54629055 54954728 - + 0 54779650 55037328 + - N - - N dataset_6344_files 1 0 1 1 0 1 N Y 0 0 33 0.887227873695282 1.25162503610584 1.25162503610584 0.887227873695282 4 - - 0.961094875436964 +11202 CAAACACGTCCTGGAGCAAGGCCTCCACAGCCACGGGGTCCCCATCCAGGGTCCTGAGTGGTTTTGAGTCAGCCATCAGGATGAAATCTTCCAGCTCAGGAGGGCTCAGAGAAGGATCAGACGAGCTGGTGACAAGGTACACAATGGACTTTTCAGAATAGGG|CTGGACTGCTTATTCCTTTAGAAGCAAGGTCTCTTCCCCAACCTGGGGCTCTAATTTTCTCAGTTAAGGTGGAATCCAGCAAGTCCAGGGGATTCTCCAGTCTCCACCTTTGTCAGAGCAGGCTTGTTAACC 21 1 0.362314658679907 0.288186846563607 Y Y 3.57167229721571 3.6035982586987 3.57167229721571 0 0 0 Y 0.984899672399672 N N 1306 3566 ENSMUSG00000023044 ENSMUSG00000046897 - - 15 15 102189043 102215606 utr5p upstream Csad Zfp740 102176998 102203648 - + 0 102188962 102202444 + - N - - N dataset_6344_files 1 0.961240310077519 1 1 0 3 N Y 0 0.961240310077519 10 1.21993832633101 1.02189639023832 1.21993832633101 1.02189639023832 4 - - 0.973142113883806 +9292 GGAGGTATCAAAGGACTTTTCAAAGGCGGTGATATGTCTAAGAATGTGAGTCAGTCACAGATGGCAAAATTAAACCAACAAATGGCCAAAATGATGGACCCACGAGTTCTTCATCACATG|GGAGGAGGAGGAGGAAGAAGAAAATAGGATGTCAGAAGAAGCAGAAAGACAATACCAACAAAACAAGCTGCAGGCCGATTCCATTGTACAGACAGACCAACCAGAAACAGTGTCGTCCAGCTTTGTAAATATTAATTTTGAAATGGAGGAAGACTGTGAAGCAATTAAG 13 0.627587070077396 0.568913321228002 0.464690965079815 N Y 3.57233464462502 2.71992636785275 2.71992636785275 0 0 0 Y 0.368850574712644 N Y 1463 391 ENSMUSG00000073079 ENSMUSG00000094103 + - 12 12 55112891 55214076 coding intron Srp54c 1700047I17Rik2 55089202 55199533 + + 0 55111181 55213840 + - N - - N dataset_6344_files 9 0 9 9 9 1 N N 0 0 9 0.93475793835753 1.17240826166877 1.17240826166877 0.93475793835753 4 - - 0.529926984288339 +8690 CTGGAGGAAAGCACCGCAGGTCTGAGCAGCCCTGAGCCGGGCAGGGTGGGGGCAGTGGCTAAGGCCTAGCTGGGGACGATTTAAAGGTATCGCGCCACCCAGCCACACCCCACAGGCCAGGCGAGGGTGCCACCCCCGGAGATCAGAGGTCATTGCTGGCGTTCAGA|GCCTAGGAAGTGGGCTGCGTTTCAGGGGGAAGTCCATGATCACCACGTGGCAACATGCAAGCGGGTGCTG 4 0.685704457350918 0.684620189710687 0.966832507460471 N Y 
3.58424216889964 3.53768019619294 3.53768019619294 39 0.98577430972389 0.260264105642257 N 0.260264105642257 N N 4781 556 ENSMUSG00000032000 ENSMUSG00000016409 + - 9 X 7873186 37150746 utr5p intron Birc3 Nkap 7848699 37126795 - + 0 7872842 37143984 - - Y - - N dataset_6344_files 1 0 1 1 0 1 N N 0 0 5 0.831776131589327 0.712950969933709 0.831776131589327 0.712950969933709 1 - - 0.520824161374586 +11497 CCTTTCCAGCGAGGTTCCAAGTTCTTAGTCTGGTGCCGGCGTACCCACACGGCGTCACCGACACGGAAGGGGTGTGGTATCACTGGCTGATCTAGCTGGTCCTGATAAGCAGCGGCCAGCGGCTTCCAGACCTCTCGTTGTACTGCTTGGAGGGCCTGTAAGTGAG|CTTCTCTTCTGGAAGTCGGACCAATTCACCTCAAGCACCAGAGCTTGAATTCATGATCATCCTGGACACAGCACTCATCAGGACCGCGCCGCGGCTGA 67 0.883499713768307 0.731092666993054 0.679758623906517 N Y 3.66336194527509 3.6508225788147 3.6508225788147 0 0.0444358875625721 0 N 0.94917212167886 N N 0 1604 ENSMUSG00000096832 ENSMUSG00000039007 - + 5 15 23711061 33594552 downstream intron SNORD93 Pgcp 23710991 33083129 - + 0 23703360 33221484 + + Y - - N dataset_6344_files 76 1 38.7730061349693 5 163 1 N N 1 0 163 1.34668516543034 0.760481034595956 1.34668516543034 0.760481034595956 4 - - 0.903135577741999 +8726 GAGCAAGCTGACAGCTGAGCAGAAGCTGAGCATGGACACCTTCAGATCCAACTCAGCGGACATCATTCTTTCTGCAGGGCGGCAAGAGCTCAAGAGCAAGCCAAGGCTGATAAGCATGAAGAGGATGGAGATGAGGAACAAGGCTCTGGAGTACTGGAGATAAT|CATGTCAGTTCGGCCGCGCAGGCGGGCTGCGTCGTCCTCGACGAGGCCTTTCGACGCTACCGTAACCTGCTCTTCGGTTCCGGCTCTTGGCCCCGACCCAGCTTCTCAAGTGAGTCACTGCCC 3 0.70117526311829 0.321035960603558 0.664396670389762 Y Y 3.42949092635959 3.29652844562235 3.29652844562235 0 0 0 Y 0 N N 44 8769 ENSMUSG00000049526 ENSMUSG00000025232 - - 9 9 59525501 59565105 coding coding Tmem202 Hexa 59518686 59539667 - + 0 59520238 59539868 + - N - - N dataset_6344_files 1 0 1 1 0 1 N Y 0 0 5 1.02981806768202 0.744637679708541 1.02981806768202 0.744637679708541 4 - - 0.931447658206354 +5020 
CAAACCATCATTTGTATTTTTCAAACTGTCTATCGAGCCTCAAACTCC|ACCATTTCATTTTCTGGTCTTCTATTCTAGCACCTACAACAAACAAATAGTTAAAGCTGCCCGGTTAGGCCCCTGACTCAGCACTTAAGATGGGGGAACCAGAGTCCACCAAATAGGGGGTAAATGCGTTAACGACCACTAGCTCCCACATATCAAAAACCAGCCTCCTCAGATACGCAGAAACAATGCCCTGACTTCAGAC 27 0.772639018637575 0.542772921402035 0.769655490407199 N Y 3.43152237077921 3.41592783769912 3.41592783769912 0 0.89624833997344 0 N 0.584993359893758 N N 0 0 ENSMUSG00000088422 ENSMUSG00000053332 - + 5 1 79639109 161038539 downstream intron 7SK Gas5 79638815 161034422 - + 0.89624833997344 79638362 161036581 + + Y - - N dataset_6344_files 1 0 1 1 0 3 N N 0 0 9 0.52283071128472 1.06942645490056 1.06942645490056 0.52283071128472 1 - - 0.818668251983994 +3184 CATTGGAGCTGTGGTGGCTTTTGTGATGAAGAGAAGGAGAAACACAGGTGGAAAAGGAGGGGACTATGCTCTGGCTCCAG|AGAACAGCGCCTGATGTTCCCTGTGAGCCTATGGGCTCAATGTGAAGAATTGTGGAGCCCAGCCTTCGCCTACACACCAGGACCCTGTCTCTGCATTGCCCTGTGTTCCCTTCCACCGCCAACCTTCCGGGTCTGCAGTGGAAACTAAGGGTTCTTTGGAAAGTCGG 40 1 0.843266171585359 0.786568876550746 N Y 3.49424793801946 3.684245618384 3.49424793801946 0 0 0.551814516129032 Y 0.950201612903226 N Y 23545 2145 ENSMUSG00000073411 ENSMUSG00000035929 + - 17 17 35267499 35385290 coding utr3p H2-D1 H2-Q4 35262730 35379617 + + 0 35266514 35383995 + - N - - N dataset_6344_files 4 0.22972972972973 2.57142857142857 2 7 1 N N 0 0.22972972972973 7 0.562439098503259 1.03773974512573 1.03773974512573 0.562439098503259 4 - - 0.85501289117385 +9158 CCGAATTTCAACCTCCTTATCAACAGTGGGATCTTCAAAGAGTTGTACCCTGAAGTTGCTCTTTCTCAGTGGAGTCCCACACTCAGGACAGTTTCCAGCTCCTCTTACAAACAGTAAGTCCACACAACTCTCACAC|CTTCCCCACCAGCCTGGTCCGGCTGCCCACCTCTCCCCGCCCCCCACCTCGCTTCCCTACCGGGGTGGTAGGGGGGACGACGGTGGCAACGAGCGGGCGGGGGATCCTCCC 5 1 0.770112168741176 0.964592797110167 Y Y 3.1490579347374 3.11232376639641 3.11232376639641 0 0 0 Y 0 N N 1882 677 ENSMUSG00000021103 ENSMUSG00000034460 - - 12 12 73273988 73114037 coding intron Mnat1 Six4 73123717 73099609 + - 0 73168000 73112254 - + N - - N dataset_6344_files 1 0 1 1 0 1 N Y 0 0 5 
1.0060530353509 0.78424606692708 1.0060530353509 0.78424606692708 4 - - 0.934413835802699 +1851 GCTGCAGGTGGGCTTATTCTACCATTGCTACTGTTCTGTCTTTGAAGATGTTCTTTATAGCATACTGAACACATGCCATTTGTACGAGGGTTTCCATAAAATCCGCAGCCAGTGGAGCAAAGCATAGGTGCTTGGCTGTGATTAGTTTCTTGAGCCATGTTCTTCTGTTGCA|CACCTGGCGGCGGCCCTCTCCGCCGGACGCGCTGCCGCCGCCGCCTCTCGCCGCCGCTGAGAGTGAGGACAGGTGAGGCCGCCAAACCCCCACTCGCTCCCGGCCCGCCGCCGCCGGCCCTCCGTCCGC 21 0.826443478467522 0.547250352165575 0.543160934834641 Y Y 3.38908332958226 2.95732075160099 2.95732075160099 0 0 0 Y 0.992273730684327 N N 1944 1 ENSMUSG00000030629 ENSMUSG00000085236 - - 7 7 84679361 84776549 utr5p intron Zfand6 2610206C17Rik 84615054 84689640 - + 0 84634406 84689743 + - N - - N dataset_6344_files 2 0.159420289855072 1.04878048780488 1 2 1 N Y 0 0.159420289855072 41 1.14864322933764 1.09319148723169 1.14864322933764 1.09319148723169 1 - - 0.715005473321084 +9962 CTGCGGCCCGCCGGGTCCCGGAGCCCACTGCCCCAGCACCCCGCGCTCGGCGCCCGCAGACGGCGCGGACCTCAGCGCGCACTTATGGGCTCGTTACCAGGACATGCGGAGGCTGGTGCACG|ACCTTCTGCCCCCTGAGGTCTGCAGCCTCCTAAACCCAGCAGCTATTTATGCCAACAATGAGATCAGCCTGAGTGACGTCGAAGTCTATGGCTTTGACTACGACTACACGCTGGCCCAGTATGCGGATGCACTGCACCCTGAGATCTTCAATGCTGCCCGGGACATCTTGATAGAGC 93 0.0971360740885943 0.213119196903586 0.467950871740144 Y Y 3.71720464963688 3.27448372166755 3.27448372166755 0 0.983661202185792 0 Y 0.991830601092896 N N 211 4122 ENSMUSG00000058351 ENSMUSG00000071547 - - 14 14 31128930 31139121 upstream upstream 2010107H07Rik Nt5dc2 31088869 31134853 - + 0 31131376 31134739 + - N - - N dataset_6344_files 1 0 1 1 0 1 N Y 0 0 33 1.1011131646754 0.808011099258204 1.1011131646754 0.808011099258204 4 - - 0.743419539119423 +12549 TTGGAGATGCCAGTACCATGAGATGACCACCAAGAGCAGCAGCAGCAGTGGAGTACAGGCATCTGGAGCCTAGAGGATGACACATGTGCTACAAAGGGCCTGGCTGGAGAAGTGACCCAAGCCCTTGGAGGAGCCCAGAAGATC|GTCCATCCTGATAAAAATCACCATCCCCGGGCTGAGGAGGCCTTCAAAATTTTGCGGGCAGCTTGGGACATTGTCAGCAACCCAGAGAGGCGGAAGGAATATGAGATGAAACGGAT 18 1 0.96278392593137 0.394484707065412 N Y 3.2808195118354 3.46448140203209 
3.2808195118354 1 0.965649359228432 0 N 0.974237019421324 N Y 1732 2437 ENSMUSG00000039307 ENSMUSG00000025354 + - 11 10 121222655 128819446 utr5p coding Hexdc Dnajc14 121204433 128805676 + + 0 121206748 128814038 + - Y - - N dataset_6344_files 3 0.993103448275862 1.72222222222222 1 11 1 N N 0.993103448275862 0 18 1.14864322933764 0.879306196251575 1.14864322933764 0.879306196251575 4 - - 0.927135541379163 +3179 CTGGAGCAGTCCCCGTGACGCCGGGTGGCGACTGGCTCCCGGGTCTGAGGGGCTTCTGCTTGTCAGGTTCT|AGATATGTGCTGACTAGCAGGCTCACGTGCACAGTGTGGAGGATAAGCTATATCTTACAAAATGGGATTTGGGAGTGACCTGAAGAACTCACAGGAAGCTGTGTTAAAGTTGCAAGACTGGGAACTACGGTTGCTGGAGACAGTGAAGAAATTTATGGCTCTGAG 10 1 0.557528435226891 0.364482194613623 Y Y 3.15741158895574 3.54759612884129 3.15741158895574 0 0.985974921257503 0 Y 0.873774291317525 N N 10 1325 ENSMUSG00000045506 ENSMUSG00000000127 - - 17 17 63863791 64139494 upstream upstream A930002H24Rik Fer 63863300 63896018 - + 0 63864053 63896016 + - N - - N dataset_6344_files 1 0 1 1 0 1 N Y 0 0 7 0.499065678953596 1.12487819700652 1.12487819700652 0.499065678953596 3 - - 0.767822892174251 diff -r f65857c1b92e -r b22f8634ff84 test-data/mm10_results.filtered.vcf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mm10_results.filtered.vcf Sun Jan 17 14:11:06 2016 -0500 @@ -0,0 +1,115 @@ +##fileformat=VCFv4.1 +##source=defuse +##reference=mm10 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 106734547 bnd_4068_1 A A]16:37799068] 233 PASS SVTYPE=BND;MATEID=bnd_4068_2;DP=19;SPLITCNT=8;SPANCNT=11;GENE=Kdsr,Fstl1;GENEID=ENSMUSG00000009905,ENSMUSG00000022816;GENELOC=coding,intron;EXPR=775,35409;HOMLEN=2;SPLICESCORE=4;INTERCHROM;ALTSPLICE +1 127753955 bnd_3783_1 T T]1:127773930] 181 PASS 
SVTYPE=BND;MATEID=bnd_3783_2;DP=8;SPLITCNT=2;SPANCNT=6;GENE=Acmsd,Ccnt2;GENEID=ENSMUSG00000026348,ENSMUSG00000026349;GENELOC=intron,upstream;EXPR=0,752;HOMLEN=0;SPLICESCORE=3;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +1 127773930 bnd_3783_2 T [1:127753955[T 181 PASS SVTYPE=BND;MATEID=bnd_3783_1;DP=8;SPLITCNT=2;SPANCNT=6;GENE=Acmsd,Ccnt2;GENEID=ENSMUSG00000026348,ENSMUSG00000026349;GENELOC=intron,upstream;EXPR=0,752;HOMLEN=0;SPLICESCORE=3;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +1 161036581 bnd_5020_2 A [5:79638362[A 208 PASS SVTYPE=BND;MATEID=bnd_5020_1;DP=36;SPLITCNT=27;SPANCNT=9;GENE=7SK,Gas5;GENEID=ENSMUSG00000088422,ENSMUSG00000053332;GENELOC=downstream,intron;EXPR=0,0;HOMLEN=0;SPLICESCORE=1;INTERCHROM;ALTSPLICE +1 161036831 bnd_4912_2 G ]5:79638354]G 151 PASS SVTYPE=BND;MATEID=bnd_4912_1;DP=20;SPLITCNT=15;SPANCNT=5;GENE=7SK,Gas5;GENEID=ENSMUSG00000088422,ENSMUSG00000053332;GENELOC=downstream,intron;EXPR=0,0;HOMLEN=0;SPLICESCORE=1;INTERCHROM;ALTSPLICE +1 168121480 bnd_3839_2 G ]1:168183530]G 249 PASS SVTYPE=BND;MATEID=bnd_3839_1;DP=29;SPLITCNT=13;SPANCNT=16;GENE=Pbx1,Gm20711;GENEID=ENSMUSG00000052534,ENSMUSG00000093538;GENELOC=coding,intron;EXPR=3895,0;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +1 168183530 bnd_3839_1 T T[1:168121480[ 249 PASS SVTYPE=BND;MATEID=bnd_3839_2;DP=29;SPLITCNT=13;SPANCNT=16;GENE=Pbx1,Gm20711;GENEID=ENSMUSG00000052534,ENSMUSG00000093538;GENELOC=coding,intron;EXPR=3895,0;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +2 165827729 bnd_5326_1 C C]18:4198107] 136 PASS SVTYPE=BND;MATEID=bnd_5326_2;DP=20;SPLITCNT=7;SPANCNT=13;GENE=Zmynd8,Gm10557;GENEID=ENSMUSG00000039671,ENSMUSG00000073647;GENELOC=utr3p,downstream;EXPR=2963,0;HOMLEN=9;SPLICESCORE=1;INTERCHROM;ALTSPLICE +2 174462629 bnd_2385_2 T ]2:174472832]T 194 PASS 
SVTYPE=BND;MATEID=bnd_2385_1;DP=12;SPLITCNT=7;SPANCNT=5;GENE=Slmo2,Atp5e;GENEID=ENSMUSG00000016257,ENSMUSG00000016252;GENELOC=coding,coding;EXPR=3085,3617;HOMLEN=0;SPLICESCORE=4;ORF;EXONBND;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +2 174472832 bnd_2385_1 C C[2:174462629[ 194 PASS SVTYPE=BND;MATEID=bnd_2385_2;DP=12;SPLITCNT=7;SPANCNT=5;GENE=Slmo2,Atp5e;GENEID=ENSMUSG00000016257,ENSMUSG00000016252;GENELOC=coding,coding;EXPR=3085,3617;HOMLEN=0;SPLICESCORE=4;ORF;EXONBND;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +3 103040040 bnd_5160_1 C C]18:28188917] 213 PASS SVTYPE=BND;MATEID=bnd_5160_2;DP=63;SPLITCNT=33;SPANCNT=30;GENE=Csde1,SNORA17;GENEID=ENSMUSG00000068823,ENSMUSG00000087940;GENELOC=coding,upstream;EXPR=10681,0;HOMLEN=91;SPLICESCORE=3;INTERCHROM;ALTSPLICE +3 144201813 bnd_8647_1 T T]5:149198645] 242 PASS SVTYPE=BND;MATEID=bnd_8647_2;DP=9;SPLITCNT=4;SPANCNT=5;GENE=Lmo4,Uspl1;GENEID=ENSMUSG00000028266,ENSMUSG00000041264;GENELOC=coding,coding;EXPR=2482,2085;HOMLEN=1;SPLICESCORE=4;EXONBND;INTERCHROM +4 124696071 bnd_12092_2 C ]4:124705614]C 187 PASS SVTYPE=BND;MATEID=bnd_12092_1;DP=11;SPLITCNT=2;SPANCNT=9;GENE=Fhl3,U6;GENEID=ENSMUSG00000032643,ENSMUSG00000088067;GENELOC=utr5p,downstream;EXPR=2072,0;HOMLEN=0;SPLICESCORE=1;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +4 124705614 bnd_12092_1 C C[4:124696071[ 187 PASS SVTYPE=BND;MATEID=bnd_12092_2;DP=11;SPLITCNT=2;SPANCNT=9;GENE=Fhl3,U6;GENEID=ENSMUSG00000032643,ENSMUSG00000088067;GENELOC=utr5p,downstream;EXPR=2072,0;HOMLEN=0;SPLICESCORE=1;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +4 148948907 bnd_12095_1 C C]4:148961548] 249 PASS SVTYPE=BND;MATEID=bnd_12095_2;DP=11;SPLITCNT=6;SPANCNT=5;GENE=Gm13205,Pex14;GENEID=ENSMUSG00000086606,ENSMUSG00000028975;GENELOC=intron,coding;EXPR=0,1453;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +4 148961548 bnd_12095_2 C [4:148948907[C 249 PASS 
SVTYPE=BND;MATEID=bnd_12095_1;DP=11;SPLITCNT=6;SPANCNT=5;GENE=Gm13205,Pex14;GENEID=ENSMUSG00000086606,ENSMUSG00000028975;GENELOC=intron,coding;EXPR=0,1453;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +5 23703360 bnd_11497_1 G G[15:33221484[ 230 PASS SVTYPE=BND;MATEID=bnd_11497_2;DP=230;SPLITCNT=67;SPANCNT=163;GENE=SNORD93,Pgcp;GENEID=ENSMUSG00000096832,ENSMUSG00000039007;GENELOC=downstream,intron;EXPR=0,1604;HOMLEN=0;SPLICESCORE=4;INTERCHROM;ALTSPLICE +5 60176541 bnd_1721_2 G ]11:106187103]G 188 PASS SVTYPE=BND;MATEID=bnd_1721_1;DP=13;SPLITCNT=8;SPANCNT=5;GENE=Strada,Cbfa2t2-ps1;GENEID=ENSMUSG00000069631,ENSMUSG00000087034;GENELOC=coding,upstream;EXPR=800,0;HOMLEN=142;SPLICESCORE=1;INTERCHROM;ALTSPLICE +5 79638354 bnd_4912_1 A A]1:161036831] 151 PASS SVTYPE=BND;MATEID=bnd_4912_2;DP=20;SPLITCNT=15;SPANCNT=5;GENE=7SK,Gas5;GENEID=ENSMUSG00000088422,ENSMUSG00000053332;GENELOC=downstream,intron;EXPR=0,0;HOMLEN=0;SPLICESCORE=1;INTERCHROM;ALTSPLICE +5 79638362 bnd_5020_1 C C[1:161036581[ 208 PASS SVTYPE=BND;MATEID=bnd_5020_2;DP=36;SPLITCNT=27;SPANCNT=9;GENE=7SK,Gas5;GENEID=ENSMUSG00000088422,ENSMUSG00000053332;GENELOC=downstream,intron;EXPR=0,0;HOMLEN=0;SPLICESCORE=1;INTERCHROM;ALTSPLICE +5 105728723 bnd_13923_1 G G]8:126422269] 207 PASS SVTYPE=BND;MATEID=bnd_13923_2;DP=11;SPLITCNT=4;SPANCNT=7;GENE=Lrrc8d,1810063B05Rik;GENEID=ENSMUSG00000046079,ENSMUSG00000051671;GENELOC=intron,upstream;EXPR=590,367;HOMLEN=0;SPLICESCORE=4;INTERCHROM;ALTSPLICE +5 149198645 bnd_8647_2 G ]3:144201813]G 242 PASS SVTYPE=BND;MATEID=bnd_8647_1;DP=9;SPLITCNT=4;SPANCNT=5;GENE=Lmo4,Uspl1;GENEID=ENSMUSG00000028266,ENSMUSG00000041264;GENELOC=coding,coding;EXPR=2482,2085;HOMLEN=1;SPLICESCORE=4;EXONBND;INTERCHROM +6 51467295 bnd_12868_1 A A]10:73201702] 132 PASS SVTYPE=BND;MATEID=bnd_12868_2;DP=110;SPLITCNT=69;SPANCNT=41;GENE=Hnrnpa2b1,Gm15398;GENEID=ENSMUSG00000004980,ENSMUSG00000085456;GENELOC=coding,intron;EXPR=41631,0;HOMLEN=0;SPLICESCORE=1;INTERCHROM;ALTSPLICE +6 52158684 
bnd_7746_1 G G]6:52173634] 190 PASS SVTYPE=BND;MATEID=bnd_7746_2;DP=46;SPLITCNT=26;SPANCNT=20;GENE=Gm15051,Gm15050;GENEID=ENSMUSG00000087658,ENSMUSG00000087626;GENELOC=intron,intron;EXPR=0,0;HOMLEN=0;SPLICESCORE=2;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +6 52173634 bnd_7746_2 A [6:52158684[A 190 PASS SVTYPE=BND;MATEID=bnd_7746_1;DP=46;SPLITCNT=26;SPANCNT=20;GENE=Gm15051,Gm15050;GENEID=ENSMUSG00000087658,ENSMUSG00000087626;GENELOC=intron,intron;EXPR=0,0;HOMLEN=0;SPLICESCORE=2;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +7 28376683 bnd_1870_1 C C]7:28392166] 214 PASS SVTYPE=BND;MATEID=bnd_1870_2;DP=10;SPLITCNT=2;SPANCNT=8;GENE=Zfp36,Med29;GENEID=ENSMUSG00000044786,ENSMUSG00000003444;GENELOC=downstream,intron;EXPR=1543,697;HOMLEN=4;SPLICESCORE=1;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +7 28392166 bnd_1870_2 C [7:28376683[C 214 PASS SVTYPE=BND;MATEID=bnd_1870_1;DP=10;SPLITCNT=2;SPANCNT=8;GENE=Zfp36,Med29;GENEID=ENSMUSG00000044786,ENSMUSG00000003444;GENELOC=downstream,intron;EXPR=1543,697;HOMLEN=4;SPLICESCORE=1;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +7 66355922 bnd_12600_2 C [10:24597524[C 148 PASS SVTYPE=BND;MATEID=bnd_12600_1;DP=6;SPLITCNT=1;SPANCNT=5;GENE=Ctgf,Lrrk1;GENEID=ENSMUSG00000019997,ENSMUSG00000015133;GENELOC=coding,intron;EXPR=7298,2735;HOMLEN=0;SPLICESCORE=3;INTERCHROM +7 84634406 bnd_1851_1 A A]7:84689743] 182 PASS SVTYPE=BND;MATEID=bnd_1851_2;DP=62;SPLITCNT=21;SPANCNT=41;GENE=Zfand6,2610206C17Rik;GENEID=ENSMUSG00000030629,ENSMUSG00000085236;GENELOC=utr5p,intron;EXPR=1944,1;HOMLEN=0;SPLICESCORE=1;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +7 84689743 bnd_1851_2 C [7:84634406[C 182 PASS SVTYPE=BND;MATEID=bnd_1851_1;DP=62;SPLITCNT=21;SPANCNT=41;GENE=Zfand6,2610206C17Rik;GENEID=ENSMUSG00000030629,ENSMUSG00000085236;GENELOC=utr5p,intron;EXPR=1944,1;HOMLEN=0;SPLICESCORE=1;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +7 90125032 bnd_1855_1 T T]7:90129872] 242 PASS 
SVTYPE=BND;MATEID=bnd_1855_2;DP=13;SPLITCNT=4;SPANCNT=9;GENE=AC130210.1,Picalm;GENEID=ENSMUSG00000097162,ENSMUSG00000039361;GENELOC=intron,upstream;EXPR=0,8476;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +7 90129872 bnd_1855_2 C [7:90125032[C 242 PASS SVTYPE=BND;MATEID=bnd_1855_1;DP=13;SPLITCNT=4;SPANCNT=9;GENE=AC130210.1,Picalm;GENEID=ENSMUSG00000097162,ENSMUSG00000039361;GENELOC=intron,upstream;EXPR=0,8476;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +7 97788974 bnd_1886_1 G G]7:97854436] 247 PASS SVTYPE=BND;MATEID=bnd_1886_2;DP=36;SPLITCNT=18;SPANCNT=18;GENE=Aqp11,Pak1;GENEID=ENSMUSG00000042797,ENSMUSG00000030774;GENELOC=upstream,utr5p;EXPR=16,2476;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +7 97854436 bnd_1886_2 T [7:97788974[T 247 PASS SVTYPE=BND;MATEID=bnd_1886_1;DP=36;SPLITCNT=18;SPANCNT=18;GENE=Aqp11,Pak1;GENEID=ENSMUSG00000042797,ENSMUSG00000030774;GENELOC=upstream,utr5p;EXPR=16,2476;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +8 54779650 bnd_11601_1 G G]8:55037328] 245 PASS SVTYPE=BND;MATEID=bnd_11601_2;DP=78;SPLITCNT=45;SPANCNT=33;GENE=Wdr17,Gpm6a;GENEID=ENSMUSG00000039375,ENSMUSG00000031517;GENELOC=intron,coding;EXPR=195,2438;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +8 55037328 bnd_11601_2 G [8:54779650[G 245 PASS SVTYPE=BND;MATEID=bnd_11601_1;DP=78;SPLITCNT=45;SPANCNT=33;GENE=Wdr17,Gpm6a;GENEID=ENSMUSG00000039375,ENSMUSG00000031517;GENELOC=intron,coding;EXPR=195,2438;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +8 84816537 bnd_11605_1 C C]8:84832174] 193 PASS SVTYPE=BND;MATEID=bnd_11605_2;DP=23;SPLITCNT=11;SPANCNT=12;GENE=Dand5,Gadd45gip1;GENEID=ENSMUSG00000053226,ENSMUSG00000033751;GENELOC=intron,utr5p;EXPR=167,2428;HOMLEN=0;SPLICESCORE=2;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +8 84832174 bnd_11605_2 G [8:84816537[G 193 PASS 
SVTYPE=BND;MATEID=bnd_11605_1;DP=23;SPLITCNT=11;SPANCNT=12;GENE=Dand5,Gadd45gip1;GENEID=ENSMUSG00000053226,ENSMUSG00000033751;GENELOC=intron,utr5p;EXPR=167,2428;HOMLEN=0;SPLICESCORE=2;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +8 95682960 bnd_11596_2 G ]8:95739809]G 222 PASS SVTYPE=BND;MATEID=bnd_11596_1;DP=24;SPLITCNT=8;SPANCNT=16;GENE=Cnot1,Ndrg4;GENEID=ENSMUSG00000036550,ENSMUSG00000036564;GENELOC=coding,intron;EXPR=6004,544;HOMLEN=0;SPLICESCORE=1;ALTSPLICE;DELETION +8 95739809 bnd_11596_1 A A[8:95682960[ 222 PASS SVTYPE=BND;MATEID=bnd_11596_2;DP=24;SPLITCNT=8;SPANCNT=16;GENE=Cnot1,Ndrg4;GENEID=ENSMUSG00000036550,ENSMUSG00000036564;GENELOC=coding,intron;EXPR=6004,544;HOMLEN=0;SPLICESCORE=1;ALTSPLICE;DELETION +8 126422269 bnd_13923_2 C ]5:105728723]C 207 PASS SVTYPE=BND;MATEID=bnd_13923_1;DP=11;SPLITCNT=4;SPANCNT=7;GENE=Lrrc8d,1810063B05Rik;GENEID=ENSMUSG00000046079,ENSMUSG00000051671;GENELOC=intron,upstream;EXPR=590,367;HOMLEN=0;SPLICESCORE=4;INTERCHROM;ALTSPLICE +9 7872842 bnd_8690_1 A A]X:37143984] 132 PASS SVTYPE=BND;MATEID=bnd_8690_2;DP=9;SPLITCNT=4;SPANCNT=5;GENE=Birc3,Nkap;GENEID=ENSMUSG00000032000,ENSMUSG00000016409;GENELOC=utr5p,intron;EXPR=4781,556;HOMLEN=39;SPLICESCORE=1;INTERCHROM;ALTSPLICE +9 21320850 bnd_8748_2 A ]9:21337962]A 189 PASS SVTYPE=BND;MATEID=bnd_8748_1;DP=103;SPLITCNT=51;SPANCNT=52;GENE=Slc44a2,Ap1m2;GENEID=ENSMUSG00000057193,ENSMUSG00000003309;GENELOC=coding,upstream;EXPR=6343,2;HOMLEN=2;SPLICESCORE=2;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +9 21337962 bnd_8748_1 C C[9:21320850[ 189 PASS SVTYPE=BND;MATEID=bnd_8748_2;DP=103;SPLITCNT=51;SPANCNT=52;GENE=Slc44a2,Ap1m2;GENEID=ENSMUSG00000057193,ENSMUSG00000003309;GENELOC=coding,upstream;EXPR=6343,2;HOMLEN=2;SPLICESCORE=2;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +9 59520238 bnd_8726_1 T T]9:59539868] 237 PASS 
SVTYPE=BND;MATEID=bnd_8726_2;DP=8;SPLITCNT=3;SPANCNT=5;GENE=Tmem202,Hexa;GENEID=ENSMUSG00000049526,ENSMUSG00000025232;GENELOC=coding,coding;EXPR=44,8769;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +9 59539868 bnd_8726_2 C [9:59520238[C 237 PASS SVTYPE=BND;MATEID=bnd_8726_1;DP=8;SPLITCNT=3;SPANCNT=5;GENE=Tmem202,Hexa;GENEID=ENSMUSG00000049526,ENSMUSG00000025232;GENELOC=coding,coding;EXPR=44,8769;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +9 108783870 bnd_8762_1 G G]9:108796064] 243 PASS SVTYPE=BND;MATEID=bnd_8762_2;DP=36;SPLITCNT=28;SPANCNT=8;GENE=SNORA28,Ip6k2;GENEID=ENSMUSG00000088626,ENSMUSG00000032599;GENELOC=downstream,utr5p;EXPR=0,714;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +9 108796064 bnd_8762_2 A [9:108783870[A 243 PASS SVTYPE=BND;MATEID=bnd_8762_1;DP=36;SPLITCNT=28;SPANCNT=8;GENE=SNORA28,Ip6k2;GENEID=ENSMUSG00000088626,ENSMUSG00000032599;GENELOC=downstream,utr5p;EXPR=0,714;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +10 24597524 bnd_12600_1 G G]7:66355922] 148 PASS SVTYPE=BND;MATEID=bnd_12600_2;DP=6;SPLITCNT=1;SPANCNT=5;GENE=Ctgf,Lrrk1;GENEID=ENSMUSG00000019997,ENSMUSG00000015133;GENELOC=coding,intron;EXPR=7298,2735;HOMLEN=0;SPLICESCORE=3;INTERCHROM +10 73201702 bnd_12868_2 A ]6:51467295]A 132 PASS SVTYPE=BND;MATEID=bnd_12868_1;DP=110;SPLITCNT=69;SPANCNT=41;GENE=Hnrnpa2b1,Gm15398;GENEID=ENSMUSG00000004980,ENSMUSG00000085456;GENELOC=coding,intron;EXPR=41631,0;HOMLEN=0;SPLICESCORE=1;INTERCHROM;ALTSPLICE +10 128814038 bnd_12549_2 G [11:121206748[G 236 PASS SVTYPE=BND;MATEID=bnd_12549_1;DP=36;SPLITCNT=18;SPANCNT=18;GENE=Hexdc,Dnajc14;GENEID=ENSMUSG00000039307,ENSMUSG00000025354;GENELOC=utr5p,coding;EXPR=1732,2437;HOMLEN=1;SPLICESCORE=4;EXONBND;INTERCHROM;ALTSPLICE +11 52099150 bnd_1027_1 G G]12:54980379] 212 PASS 
SVTYPE=BND;MATEID=bnd_1027_2;DP=12;SPLITCNT=2;SPANCNT=10;GENE=Ppp2ca,Baz1a;GENEID=ENSMUSG00000020349,ENSMUSG00000035021;GENELOC=coding,intron;EXPR=10063,1785;HOMLEN=3;SPLICESCORE=4;INTERCHROM +11 87218328 bnd_66_1 A A]11:87250589] 157 PASS SVTYPE=BND;MATEID=bnd_66_2;DP=24;SPLITCNT=13;SPANCNT=11;GENE=Trim37,Ppm1e;GENEID=ENSMUSG00000018548,ENSMUSG00000046442;GENELOC=coding,intron;EXPR=1514,627;HOMLEN=0;SPLICESCORE=1;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +11 87250589 bnd_66_2 A [11:87218328[A 157 PASS SVTYPE=BND;MATEID=bnd_66_1;DP=24;SPLITCNT=13;SPANCNT=11;GENE=Trim37,Ppm1e;GENEID=ENSMUSG00000018548,ENSMUSG00000046442;GENELOC=coding,intron;EXPR=1514,627;HOMLEN=0;SPLICESCORE=1;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +11 106187103 bnd_1721_1 A A[5:60176541[ 188 PASS SVTYPE=BND;MATEID=bnd_1721_2;DP=13;SPLITCNT=8;SPANCNT=5;GENE=Strada,Cbfa2t2-ps1;GENEID=ENSMUSG00000069631,ENSMUSG00000087034;GENELOC=coding,upstream;EXPR=800,0;HOMLEN=142;SPLICESCORE=1;INTERCHROM;ALTSPLICE +11 121206748 bnd_12549_1 C C]10:128814038] 236 PASS SVTYPE=BND;MATEID=bnd_12549_2;DP=36;SPLITCNT=18;SPANCNT=18;GENE=Hexdc,Dnajc14;GENEID=ENSMUSG00000039307,ENSMUSG00000025354;GENELOC=utr5p,coding;EXPR=1732,2437;HOMLEN=1;SPLICESCORE=4;EXONBND;INTERCHROM;ALTSPLICE +12 54980379 bnd_1027_2 T [11:52099150[T 212 PASS SVTYPE=BND;MATEID=bnd_1027_1;DP=12;SPLITCNT=2;SPANCNT=10;GENE=Ppp2ca,Baz1a;GENEID=ENSMUSG00000020349,ENSMUSG00000035021;GENELOC=coding,intron;EXPR=10063,1785;HOMLEN=3;SPLICESCORE=4;INTERCHROM +12 55111181 bnd_9292_1 G G]12:55213840] 135 PASS SVTYPE=BND;MATEID=bnd_9292_2;DP=22;SPLITCNT=13;SPANCNT=9;GENE=Srp54c,1700047I17Rik2;GENEID=ENSMUSG00000073079,ENSMUSG00000094103;GENELOC=coding,intron;EXPR=1463,391;HOMLEN=0;SPLICESCORE=4;EXONBND;ALTSPLICE;DELETION +12 55213840 bnd_9292_2 G [12:55111181[G 135 PASS 
SVTYPE=BND;MATEID=bnd_9292_1;DP=22;SPLITCNT=13;SPANCNT=9;GENE=Srp54c,1700047I17Rik2;GENEID=ENSMUSG00000073079,ENSMUSG00000094103;GENELOC=coding,intron;EXPR=1463,391;HOMLEN=0;SPLICESCORE=4;EXONBND;ALTSPLICE;DELETION +12 73112254 bnd_9158_2 C ]12:73168000]C 238 PASS SVTYPE=BND;MATEID=bnd_9158_1;DP=10;SPLITCNT=5;SPANCNT=5;GENE=Mnat1,Six4;GENEID=ENSMUSG00000021103,ENSMUSG00000034460;GENELOC=coding,intron;EXPR=1882,677;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +12 73168000 bnd_9158_1 C C[12:73112254[ 238 PASS SVTYPE=BND;MATEID=bnd_9158_2;DP=10;SPLITCNT=5;SPANCNT=5;GENE=Mnat1,Six4;GENEID=ENSMUSG00000021103,ENSMUSG00000034460;GENELOC=coding,intron;EXPR=1882,677;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +12 113422730 bnd_9297_1 T T]12:113426655] 212 PASS SVTYPE=BND;MATEID=bnd_9297_2;DP=26;SPLITCNT=20;SPANCNT=6;GENE=Ighm,AC073553.3;GENEID=ENSMUSG00000076617,ENSMUSG00000092748;GENELOC=coding,upstream;EXPR=1488,0;HOMLEN=0;SPLICESCORE=4;ALTSPLICE;DELETION +12 113426655 bnd_9297_2 C [12:113422730[C 212 PASS SVTYPE=BND;MATEID=bnd_9297_1;DP=26;SPLITCNT=20;SPANCNT=6;GENE=Ighm,AC073553.3;GENEID=ENSMUSG00000076617,ENSMUSG00000092748;GENELOC=coding,upstream;EXPR=1488,0;HOMLEN=0;SPLICESCORE=4;ALTSPLICE;DELETION +13 105131562 bnd_5983_1 T T]13:105167527] 207 PASS SVTYPE=BND;MATEID=bnd_5983_2;DP=6;SPLITCNT=1;SPANCNT=5;GENE=4933425L06Rik,Rnf180;GENEID=ENSMUSG00000021718,ENSMUSG00000021720;GENELOC=downstream,intron;EXPR=0,60;HOMLEN=0;SPLICESCORE=1;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +13 105167527 bnd_5983_2 C [13:105131562[C 207 PASS SVTYPE=BND;MATEID=bnd_5983_1;DP=6;SPLITCNT=1;SPANCNT=5;GENE=4933425L06Rik,Rnf180;GENEID=ENSMUSG00000021718,ENSMUSG00000021720;GENELOC=downstream,intron;EXPR=0,60;HOMLEN=0;SPLICESCORE=1;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +14 31131376 bnd_9962_1 G G]14:31134739] 189 PASS 
SVTYPE=BND;MATEID=bnd_9962_2;DP=126;SPLITCNT=93;SPANCNT=33;GENE=2010107H07Rik,Nt5dc2;GENEID=ENSMUSG00000058351,ENSMUSG00000071547;GENELOC=upstream,upstream;EXPR=211,4122;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +14 31134739 bnd_9962_2 A [14:31131376[A 189 PASS SVTYPE=BND;MATEID=bnd_9962_1;DP=126;SPLITCNT=93;SPANCNT=33;GENE=2010107H07Rik,Nt5dc2;GENEID=ENSMUSG00000058351,ENSMUSG00000071547;GENELOC=upstream,upstream;EXPR=211,4122;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +15 33221484 bnd_11497_2 C [5:23703360[C 230 PASS SVTYPE=BND;MATEID=bnd_11497_1;DP=230;SPLITCNT=67;SPANCNT=163;GENE=SNORD93,Pgcp;GENEID=ENSMUSG00000096832,ENSMUSG00000039007;GENELOC=downstream,intron;EXPR=0,1604;HOMLEN=0;SPLICESCORE=4;INTERCHROM;ALTSPLICE +15 74996568 bnd_11177_2 C ]15:75048442]C 144 PASS SVTYPE=BND;MATEID=bnd_11177_1;DP=497;SPLITCNT=434;SPANCNT=63;GENE=Ly6c1,Ly6a;GENEID=ENSMUSG00000079018,ENSMUSG00000075602;GENELOC=utr5p,utr5p;EXPR=1416,5049;HOMLEN=0;SPLICESCORE=4;ORF;EXONBND;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +15 74996568 bnd_11169_2 A ]15:75048442]A 237 PASS SVTYPE=BND;MATEID=bnd_11169_1;DP=702;SPLITCNT=410;SPANCNT=292;GENE=Ly6c1,Ly6a;GENEID=ENSMUSG00000079018,ENSMUSG00000075602;GENELOC=utr5p,utr5p;EXPR=1416,5049;HOMLEN=0;SPLICESCORE=4;ORF;EXONBND;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +15 75048442 bnd_11177_1 A A[15:74996568[ 144 PASS SVTYPE=BND;MATEID=bnd_11177_2;DP=497;SPLITCNT=434;SPANCNT=63;GENE=Ly6c1,Ly6a;GENEID=ENSMUSG00000079018,ENSMUSG00000075602;GENELOC=utr5p,utr5p;EXPR=1416,5049;HOMLEN=0;SPLICESCORE=4;ORF;EXONBND;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +15 75048442 bnd_11169_1 G G[15:74996568[ 237 PASS SVTYPE=BND;MATEID=bnd_11169_2;DP=702;SPLITCNT=410;SPANCNT=292;GENE=Ly6c1,Ly6a;GENEID=ENSMUSG00000079018,ENSMUSG00000075602;GENELOC=utr5p,utr5p;EXPR=1416,5049;HOMLEN=0;SPLICESCORE=4;ORF;EXONBND;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +15 102188962 bnd_11202_1 G G]15:102202444] 248 PASS 
SVTYPE=BND;MATEID=bnd_11202_2;DP=31;SPLITCNT=21;SPANCNT=10;GENE=Csad,Zfp740;GENEID=ENSMUSG00000023044,ENSMUSG00000046897;GENELOC=utr5p,upstream;EXPR=1306,3566;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +15 102202444 bnd_11202_2 C [15:102188962[C 248 PASS SVTYPE=BND;MATEID=bnd_11202_1;DP=31;SPLITCNT=21;SPANCNT=10;GENE=Csad,Zfp740;GENEID=ENSMUSG00000023044,ENSMUSG00000046897;GENELOC=utr5p,upstream;EXPR=1306,3566;HOMLEN=0;SPLICESCORE=4;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +16 19493776 bnd_7456_2 T [18:30311220[T 189 PASS SVTYPE=BND;MATEID=bnd_7456_1;DP=19;SPLITCNT=9;SPANCNT=10;GENE=Pik3c3,Olfr166;GENEID=ENSMUSG00000033628,ENSMUSG00000056822;GENELOC=coding,downstream;EXPR=1791,0;HOMLEN=0;SPLICESCORE=1;INTERCHROM +16 37799068 bnd_4068_2 T ]1:106734547]T 233 PASS SVTYPE=BND;MATEID=bnd_4068_1;DP=19;SPLITCNT=8;SPANCNT=11;GENE=Kdsr,Fstl1;GENEID=ENSMUSG00000009905,ENSMUSG00000022816;GENELOC=coding,intron;EXPR=775,35409;HOMLEN=2;SPLICESCORE=4;INTERCHROM;ALTSPLICE +17 17395298 bnd_3153_1 C C]17:17411818] 206 PASS SVTYPE=BND;MATEID=bnd_3153_2;DP=41;SPLITCNT=16;SPANCNT=25;GENE=AC154200.1,Lix1;GENEID=ENSMUSG00000097379,ENSMUSG00000047786;GENELOC=intron,intron;EXPR=0,30;HOMLEN=0;SPLICESCORE=1;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +17 17411818 bnd_3153_2 T [17:17395298[T 206 PASS SVTYPE=BND;MATEID=bnd_3153_1;DP=41;SPLITCNT=16;SPANCNT=25;GENE=AC154200.1,Lix1;GENEID=ENSMUSG00000097379,ENSMUSG00000047786;GENELOC=intron,intron;EXPR=0,30;HOMLEN=0;SPLICESCORE=1;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +17 35266514 bnd_3184_1 G G]17:35383995] 218 PASS SVTYPE=BND;MATEID=bnd_3184_2;DP=47;SPLITCNT=40;SPANCNT=7;GENE=H2-D1,H2-Q4;GENEID=ENSMUSG00000073411,ENSMUSG00000035929;GENELOC=coding,utr3p;EXPR=23545,2145;HOMLEN=0;SPLICESCORE=4;EXONBND;ALTSPLICE;DELETION +17 35383995 bnd_3184_2 A [17:35266514[A 218 PASS 
SVTYPE=BND;MATEID=bnd_3184_1;DP=47;SPLITCNT=40;SPANCNT=7;GENE=H2-D1,H2-Q4;GENEID=ENSMUSG00000073411,ENSMUSG00000035929;GENELOC=coding,utr3p;EXPR=23545,2145;HOMLEN=0;SPLICESCORE=4;EXONBND;ALTSPLICE;DELETION +17 63864053 bnd_3179_1 T T]17:63896016] 195 PASS SVTYPE=BND;MATEID=bnd_3179_2;DP=17;SPLITCNT=10;SPANCNT=7;GENE=A930002H24Rik,Fer;GENEID=ENSMUSG00000045506,ENSMUSG00000000127;GENELOC=upstream,upstream;EXPR=10,1325;HOMLEN=0;SPLICESCORE=3;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +17 63896016 bnd_3179_2 A [17:63864053[A 195 PASS SVTYPE=BND;MATEID=bnd_3179_1;DP=17;SPLITCNT=10;SPANCNT=7;GENE=A930002H24Rik,Fer;GENEID=ENSMUSG00000045506,ENSMUSG00000000127;GENELOC=upstream,upstream;EXPR=10,1325;HOMLEN=0;SPLICESCORE=3;READTHROUGH;ADJACENT;ALTSPLICE;DELETION +18 4198107 bnd_5326_2 G ]2:165827729]G 136 PASS SVTYPE=BND;MATEID=bnd_5326_1;DP=20;SPLITCNT=7;SPANCNT=13;GENE=Zmynd8,Gm10557;GENEID=ENSMUSG00000039671,ENSMUSG00000073647;GENELOC=utr3p,downstream;EXPR=2963,0;HOMLEN=9;SPLICESCORE=1;INTERCHROM;ALTSPLICE +18 28188917 bnd_5160_2 A [3:103040040[A 213 PASS SVTYPE=BND;MATEID=bnd_5160_1;DP=63;SPLITCNT=33;SPANCNT=30;GENE=Csde1,SNORA17;GENEID=ENSMUSG00000068823,ENSMUSG00000087940;GENELOC=coding,upstream;EXPR=10681,0;HOMLEN=91;SPLICESCORE=3;INTERCHROM;ALTSPLICE +18 30311220 bnd_7456_1 G G]16:19493776] 189 PASS SVTYPE=BND;MATEID=bnd_7456_2;DP=19;SPLITCNT=9;SPANCNT=10;GENE=Pik3c3,Olfr166;GENEID=ENSMUSG00000033628,ENSMUSG00000056822;GENELOC=coding,downstream;EXPR=1791,0;HOMLEN=0;SPLICESCORE=1;INTERCHROM +18 53932206 bnd_5141_1 T T[X:53418862[ 201 PASS SVTYPE=BND;MATEID=bnd_5141_2;DP=26;SPLITCNT=9;SPANCNT=17;GENE=Csnk1g3,Gm14584;GENEID=ENSMUSG00000073563,ENSMUSG00000083798;GENELOC=coding,intron;EXPR=2227,0;HOMLEN=266;SPLICESCORE=2;INTERCHROM;ALTSPLICE +19 37678397 bnd_13800_1 C C]19:37696729] 204 PASS 
SVTYPE=BND;MATEID=bnd_13800_2;DP=15;SPLITCNT=3;SPANCNT=12;GENE=Exoc6,Cyp26a1;GENEID=ENSMUSG00000053799,ENSMUSG00000024987;GENELOC=intron,upstream;EXPR=575,443;HOMLEN=0;SPLICESCORE=1;ALTSPLICE;DELETION +19 37696729 bnd_13800_2 A [19:37678397[A 204 PASS SVTYPE=BND;MATEID=bnd_13800_1;DP=15;SPLITCNT=3;SPANCNT=12;GENE=Exoc6,Cyp26a1;GENEID=ENSMUSG00000053799,ENSMUSG00000024987;GENELOC=intron,upstream;EXPR=575,443;HOMLEN=0;SPLICESCORE=1;ALTSPLICE;DELETION +X 37143984 bnd_8690_2 G ]9:7872842]G 132 PASS SVTYPE=BND;MATEID=bnd_8690_1;DP=9;SPLITCNT=4;SPANCNT=5;GENE=Birc3,Nkap;GENEID=ENSMUSG00000032000,ENSMUSG00000016409;GENELOC=utr5p,intron;EXPR=4781,556;HOMLEN=39;SPLICESCORE=1;INTERCHROM;ALTSPLICE +X 53418862 bnd_5141_2 A ]18:53932206]A 201 PASS SVTYPE=BND;MATEID=bnd_5141_1;DP=26;SPLITCNT=9;SPANCNT=17;GENE=Csnk1g3,Gm14584;GENEID=ENSMUSG00000073563,ENSMUSG00000083798;GENELOC=coding,intron;EXPR=2227,0;HOMLEN=266;SPLICESCORE=2;INTERCHROM;ALTSPLICE diff -r f65857c1b92e -r b22f8634ff84 test-data/tophat_out2h.bam Binary file test-data/tophat_out2h.bam has changed diff -r f65857c1b92e -r b22f8634ff84 tool-data/defuse.loc.sample --- a/tool-data/defuse.loc.sample Mon Jan 14 12:24:28 2013 -0600 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -## Configurstion info for prepared data references for DeFuse -## http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2 -## 3 columns separated by the TAB character -## The 3rd column has dictionary values that will be substituted in the config file for defuse -## It should likely contain keys: dataset_directory gene_models genome_fasta repeats_filename est_fasta est_alignments unigene_fasta -## If this is not a Homo_sapiens reference also need keys: gene_id_pattern transcript_id_pattern chromosomes - -#db_key name {'config_key':'config_value'} -#hg19 GRCh37(hg19) {'gene_id_pattern':'ENSG\d+', 'transcript_id_pattern':'ENST\d+', 'dataset_directory':'/data/genomes/Hsapiens/hg19/defuse', 
'gene_models':'$(dataset_directory)/Homo_sapiens.GRCh37.62.gtf', 'genome_fasta':'$(dataset_directory)/Homo_sapiens.GRCh37.62.dna.chromosome.fa', 'repeats_filename':'$(dataset_directory)/rmsk.txt', 'est_fasta':'$(dataset_directory)/est.fa', 'est_alignments':'$(dataset_directory)/intronEst.txt', 'unigene_fasta':'$(dataset_directory)/Hs.seq.uniq', 'chromosomes':'1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT', 'mt_chromosome':'MT', 'gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding', 'ig_gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene', 'rrna_gene_sources':'Mt_rRNA,rRNA,rRNA_pseudogene'} -#mm9 NCBIM37(mm9) {'gene_id_pattern':'ENSMUSG\d+', 'transcript_id_pattern':'ENSMUST\d+', 'dataset_directory':'/data/genomes/Mmusculus/mm9/defuse', 'gene_models':'$(dataset_directory)/Mus_musculus.NCBIM37.63.gtf', 'genome_fasta':'$(dataset_directory)/Mus_musculus.NCBIM37.63.dna.chromosome.fa', 'repeats_filename':'$(dataset_directory)/rmsk.txt', 'est_fasta':'$(dataset_directory)/est.fa', 'est_alignments':'$(dataset_directory)/intronEst.txt', 'unigene_fasta':'$(dataset_directory)/Mm.seq.uniq', 'chromosomes':'1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT', 'mt_chromosome':'MT', 'gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding', 'ig_gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene', 'rrna_gene_sources':'Mt_rRNA,rRNA,rRNA_pseudogene'} -#mm8 NCBIM36(mm8) {'gene_id_pattern':'ENSMUSG\d+', 'transcript_id_pattern':'ENSMUST\d+', 'dataset_directory':'/data/genomes/Mmusculus/mm9/defuse', 'gene_models':'$(dataset_directory)/Mus_musculus.NCBIM36.46.gtf', 'genome_fasta':'$(dataset_directory)/Mus_musculus.NCBIM36.46.dna.chromosome.fa', 'repeats_filename':'$(dataset_directory)/rmsk.txt', 'est_fasta':'$(dataset_directory)/est.fa', 'est_alignments':'$(dataset_directory)/intronEst.txt', 'unigene_fasta':'$(dataset_directory)/Mm.seq.uniq', 
'mt_chromosome':'MT', 'gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding', 'ig_gene_sources':'IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene', 'rrna_gene_sources':'Mt_rRNA,rRNA,rRNA_pseudogene'} diff -r f65857c1b92e -r b22f8634ff84 tool-data/defuse_reference.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/defuse_reference.loc.sample Sun Jan 17 14:11:06 2016 -0500 @@ -0,0 +1,7 @@ +## Configurstion info for prepared data references for DeFuse +## http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.0 +## 4 columns separated by the TAB character +## The 4th column has the path to the defuse config.txt file, it needs to have the dataset_directory set the directory path where the defuse reference data resides. +## The defuse galaxy tool will substitute the directory path of config.txt if the dataset_directory property is not set '__DATASET_DIRECTORY__' +# +GRCh37 GRCh37 Human GRCh37 (hg19) /depot/GRCh37/defuse/GRCh37.config diff -r f65857c1b92e -r b22f8634ff84 tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Sun Jan 17 14:11:06 2016 -0500 @@ -0,0 +1,7 @@ + + + + value, dbkey, name, path + +
+
diff -r f65857c1b92e -r b22f8634ff84 tool_dependencies.xml --- a/tool_dependencies.xml Mon Jan 14 12:24:28 2013 -0600 +++ b/tool_dependencies.xml Sun Jan 17 14:11:06 2016 -0500 @@ -1,180 +1,24 @@ - - - - http://sourceforge.net/projects/defuse/files/defuse/0.6/defuse-0.6.0.tar.gz - cd tools && make - - . - $INSTALL_DIR - - - $INSTALL_DIR - - - - -deFuse code -To build the deFuse toolset you must have the boost c++ development libraries installed. If they are not installed on your system you can download them from the boost website. A full install of boost is not required. The easiest thing to do is to download the latest boost source tar.gz, extract it, then add the extracted path to the CPLUS_INCLUDE_PATH environment variable (in bash, `export CPLUS_INCLUDE_PATH=/boost/directory/:$CPLUS_INCLUDE_PATH`) - + + - - - - - http://sourceforge.net/projects/samtools/files/samtools/0.1.18/samtools-0.1.18.tar.bz2 - sed -i.bak -e 's/-lcurses/-lncurses/g' Makefile - make - - samtools - $INSTALL_DIR/bin - - - misc/maq2sam-long - $INSTALL_DIR/bin - - - $INSTALL_DIR/bin - - - - -Compiling SAMtools requires the ncurses and zlib development libraries. - + + - - - - - - http://downloads.sourceforge.net/project/bowtie-bio/bowtie/0.12.7/bowtie-0.12.7-src.zip - make - - bowtie - $INSTALL_DIR/bin - - - bowtie-build - $INSTALL_DIR/bin - - - bowtie-inspect - $INSTALL_DIR/bin - - - $INSTALL_DIR/bin - - - - - Compiling Bowtie requires libpthread to be present on your system. 
- + + - - - - - http://research-pub.gene.com/gmap/src/gmap-gsnap-2012-07-20.v2.tar.gz - ./configure - make - - src/gmap - $INSTALL_DIR/bin - - - src/gmapindex - $INSTALL_DIR/bin - - - src/gsnap - $INSTALL_DIR/bin - - - src/uniqscan - $INSTALL_DIR/bin - - - src/iit_store - $INSTALL_DIR/bin - - - src/iit_get - $INSTALL_DIR/bin - - - src/atoiindex - $INSTALL_DIR/bin - - - src/snpindex - $INSTALL_DIR/bin - - - src/cmetindex - $INSTALL_DIR/bin - - - src/get-genome - $INSTALL_DIR/bin - - - util - $INSTALL_DIR/bin - - - $INSTALL_DIR/bin - - - - - + + + + + - - - - - http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/blat/blat - chmod 755 blat - - blat - $INSTALL_DIR/bin - - - $INSTALL_DIR/bin - - - - -This only handles blat for a non-commercial linux system. - -Please note that the Blat source and executables are freely available for -academic, nonprofit and personal use. Commercial licensing information is -available on the Kent Informatics website (http://www.kentinformatics.com/). - + + - - - - - http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/faToTwoBit - chmod 755 faToTwoBit - - faToTwoBit - $INSTALL_DIR/bin - - - $INSTALL_DIR/bin - - - - -This only handles faToTwoBit for a non-commercial linux system. - -Please note that the source and executables are freely available for -academic, nonprofit and personal use. Commercial licensing information is -available on the Kent Informatics website (http://www.kentinformatics.com/). - + + -