Galaxy |

Changeset 0:3b33da018e74 (2014-05-19)

Commit message:
Imported from capsule None

added:
annotation_profiler.xml
annotation_profiler_for_interval.py
scripts/README.txt
scripts/build_profile_indexes.py
test-data/3.bed
test-data/4.bed
test-data/annotation_profiler_1.out
test-data/annotation_profiler_2.out
tool-data/annotation_profiler_options.xml.sample
tool-data/annotation_profiler_valid_builds.txt.sample
tool_dependencies.xml

diff -r 000000000000 -r 3b33da018e74 annotation_profiler.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_profiler.xml Mon May 19 12:33:42 2014 -0400

b'@@ -0,0 +1,147 @@\n+<tool id="Annotation_Profiler_0" name="Profile Annotations" version="1.0.0">\r\n+ <description>for a set of genomic intervals</description>\r\n+ <requirements>\r\n+ <requirement type="package" version="0.7.1">bx-python</requirement>\r\n+ </requirements>\r\n+ <command interpreter="python">annotation_profiler_for_interval.py -i $input1 -c ${input1.metadata.chromCol} -s ${input1.metadata.startCol} -e ${input1.metadata.endCol} -o $out_file1 $keep_empty -p ${GALAXY_DATA_INDEX_DIR}/annotation_profiler/$dbkey $summary -b 3 -t $table_names</command>\r\n+ <inputs>\r\n+ <param format="interval" name="input1" type="data" label="Choose Intervals">\r\n+ <validator type="dataset_metadata_in_file" filename="annotation_profiler_valid_builds.txt" metadata_name="dbkey" metadata_column="0" message="Profiling is not currently available for this species."/>\r\n+ </param>\r\n+ <param name="keep_empty" type="select" label="Keep Region/Table Pairs with 0 Coverage">\r\n+ <option value="-k">Keep</option>\r\n+ <option value="" selected="true">Discard</option>\r\n+ </param>\r\n+ <param name="summary" type="select" label="Output per Region/Summary">\r\n+ <option value="-S">Summary</option>\r\n+ <option value="" selected="true">Per Region</option>\r\n+ </param>\r\n+ <param name="table_names" type="drill_down" display="checkbox" hierarchy="recurse" multiple="true" label="Choose Tables to Use" help="Selecting no tables will result in using all tables." 
from_file="annotation_profiler_options.xml"/>\r\n+ </inputs>\r\n+ <outputs>\r\n+ <data format="input" name="out_file1">\r\n+ <change_format>\r\n+ <when input="summary" value="-S" format="tabular" />\r\n+ </change_format>\r\n+ </data>\r\n+ </outputs>\r\n+ <tests>\r\n+ <test>\r\n+ <param name="input1" value="4.bed" dbkey="hg18"/>\r\n+ <param name="keep_empty" value=""/>\r\n+ <param name="summary" value=""/>\r\n+ <param name="table_names" value="acembly,affyGnf1h,knownAlt,knownGene,mrna,multiz17way,multiz28way,refGene,snp126"/>\r\n+ <output name="out_file1" file="annotation_profiler_1.out" />\r\n+ </test>\r\n+ <test>\r\n+ <param name="input1" value="3.bed" dbkey="hg18"/>\r\n+ <param name="keep_empty" value=""/>\r\n+ <param name="summary" value="Summary"/>\r\n+ <param name="table_names" value="acembly,affyGnf1h,knownAlt,knownGene,mrna,multiz17way,multiz28way,refGene,snp126"/>\r\n+ <output name="out_file1" file="annotation_profiler_2.out" />\r\n+ </test>\r\n+ </tests>\r\n+ <help>\r\n+**What it does**\r\n+\r\n+Takes an input set of intervals and for each interval determines the base coverage of the interval by a set of features (tables) available from UCSC. Genomic regions from the input feature data have been merged by overlap / direct adjacency (e.g. a table having ranges of: 1-10, 6-12, 12-20 and 25-28 results in two merged ranges of: 1-20 and 25-28).\r\n+\r\n+By default, this tool will check the coverage of your intervals against all available features; you may, however, choose to select only those tables that you want to include. 
Selecting a section heading will effectively cause all of its children to be selected.\r\n+\r\n+You may alternatively choose to receive a summary across all of the intervals that you provide.\r\n+\r\n+-----\r\n+\r\n+**Example**\r\n+\r\n+Using the interval below and selecting several tables::\r\n+\r\n+ chr1 4558 14764 uc001aab.1 0 -\r\n+\r\n+results in::\r\n+\r\n+ chr1 4558 14764 uc001aab.1 0 - snp126Exceptions 151 142\r\n+ chr1 4558 14764 uc001aab.1 0 - genomicSuperDups 10206 1\r\n+ chr1 4558 14764 uc001aab.1 0 - chainOryLat1 3718 1\r\n+ chr1 4558 14764 uc001aab.1 0 - multiz28way 10206 1\r\n+ chr1 4558 14764 uc001aab.1 0 - affyHuEx1 3553 32\r\n+ chr1 4558 14764 uc001aab.1 0 - netXenTro2 3050 1\r\n+ chr1 4558 14764 uc001aab.1 0 - intronEst 10206 1\r\n+ chr1 4558 14764 uc001aab.1 0 - xenoMrna 10203 1\r\n+ chr1 4558 14764 uc001aab.1 0 - ctgPos 10206 1\r\n+ chr1 4558 14764 uc001aab.1 0 - clonePos 10206 1\r\n+ chr1 4558 14764 uc001aab.1 0 - chai'..b'558 14764 uc001aab.1 0 - snp126orthoPanTro2RheMac2 61 58\r\n+ chr1 4558 14764 uc001aab.1 0 - snp126 205 192\r\n+ chr1 4558 14764 uc001aab.1 0 - chainEquCab1 10206 1\r\n+ chr1 4558 14764 uc001aab.1 0 - netGalGal3 3686 1\r\n+ chr1 4558 14764 uc001aab.1 0 - phastCons28wayPlacMammal 10172 3\r\n+\r\n+Where::\r\n+\r\n+ The first added column is the table name.\r\n+ The second added column is the number of bases covered by the table.\r\n+ The third added column is the number of regions from the table that is covered by the interval.\r\n+\r\n+Alternatively, requesting a summary, using the intervals below and selecting several tables::\r\n+\r\n+ chr1 4558 14764 uc001aab.1 0 -\r\n+ chr1 4558 19346 uc001aac.1 0 -\r\n+\r\n+results in::\r\n+\r\n+ #tableName tableSize tableRegionCount allIntervalCount allIntervalSize allCoverage allTableRegionsOverlaped allIntervalsOverlapingTable nrIntervalCount nrIntervalSize nrCoverage nrTableRegionsOverlaped nrIntervalsOverlapingTable\r\n+ snp126Exceptions 133601 92469 2 24994 388 359 2 1 14788 237 
217 1\r\n+ genomicSuperDups 12268847 657 2 24994 24994 2 2 1 14788 14788 1 1\r\n+ chainOryLat1 70337730 2542 2 24994 7436 2 2 1 14788 3718 1 1\r\n+ affyHuEx1 15703901 112274 2 24994 7846 70 2 1 14788 4293 38 1\r\n+ netXenTro2 111440392 1877 2 24994 6100 2 2 1 14788 3050 1 1\r\n+ snp126orthoPanTro2RheMac2 700436 690674 2 24994 124 118 2 1 14788 63 60 1\r\n+ intronEst 135796064 2332 2 24994 24994 2 2 1 14788 14788 1 1\r\n+ xenoMrna 129031327 1586 2 24994 20406 2 2 1 14788 10203 1 1\r\n+ snp126 956976 838091 2 24994 498 461 2 1 14788 293 269 1\r\n+ clonePos 224999719 39 2 24994 24994 2 2 1 14788 14788 1 1\r\n+ chainStrPur2Link 7948016 119841 2 24994 2646 58 2 1 14788 1323 29 1\r\n+ affyTxnPhase3HeLaNuclear 136797870 140244 2 24994 22601 17 2 1 14788 13590 9 1\r\n+ multiz28way 225928588 38 2 24994 24994 2 2 1 14788 14788 1 1\r\n+ ctgPos 224999719 39 2 24994 24994 2 2 1 14788 14788 1 1\r\n+ chainEquCab1 246306414 141 2 24994 24994 2 2 1 14788 14788 1 1\r\n+ netGalGal3 203351973 461 2 24994 7372 2 2 1 14788 3686 1 1\r\n+ phastCons28wayPlacMammal 221017670 22803 2 24994 24926 6 2 1 14788 14754 3 1\r\n+\r\n+Where::\r\n+ \r\n+ tableName is the name of the table\r\n+ tableChromosomeCoverage is the number of positions existing in the table for only the chromosomes that were referenced by the interval file\r\n+ tableChromosomeCount is the number of regions existing in the table for only the chromosomes that were referenced by the interval file\r\n+ tableRegionCoverage is the number of positions existing in the table between the minimal and maximal bounding regions that were referenced by the interval file\r\n+ tableRegionCount is the number of regions existing in the table between the minimal and maximal bounding regions that were referenced by the interval file\r\n+ \r\n+ allIntervalCount is the number of provided intervals\r\n+ allIntervalSize is the sum of the lengths of the provided interval file\r\n+ allCoverage is the sum of the coverage for each provided interval\r\n+ 
allTableRegionsOverlapped is the sum of the number of regions of the table (non-unique) that were overlapped for each interval\r\n+ allIntervalsOverlappingTable is the number of provided intervals which overlap the table\r\n+ \r\n+ nrIntervalCount is the number of non-redundant intervals\r\n+ nrIntervalSize is the sum of the lengths of non-redundant intervals\r\n+ nrCoverage is the sum of the coverage of non-redundant intervals\r\n+ nrTableRegionsOverlapped is the number of regions of the table (unique) that were overlapped by the non-redundant intervals\r\n+ nrIntervalsOverlappingTable is the number of non-redundant intervals which overlap the table\r\n+ \r\n+\r\n+.. class:: infomark\r\n+\r\n+**TIP:** non-redundant (nr) refers to the set of intervals that remains after the intervals provided have been merged to resolve overlaps\r\n+\r\n+------\r\n+\r\n+**Citation**\r\n+\r\n+For the underlying data, please see http://genome.ucsc.edu/cite.html for the proper citation.\r\n+\r\n+If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.*\r\n+\r\n+ </help>\r\n+</tool>\r\n'

diff -r 000000000000 -r 3b33da018e74 annotation_profiler_for_interval.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_profiler_for_interval.py Mon May 19 12:33:42 2014 -0400

[

b'@@ -0,0 +1,358 @@\n+#!/usr/bin/env python\r\n+#Dan Blankenberg\r\n+#For a set of intervals, this tool returns the same set of intervals \r\n+#with 2 additional fields: the name of a Table/Feature and the number of\r\n+#bases covered. The original intervals are repeated for each Table/Feature.\r\n+\r\n+import sys, struct, optparse, os, random\r\n+import bx.intervals.io\r\n+import bx.bitset\r\n+try:\r\n+ import psyco\r\n+ psyco.full()\r\n+except:\r\n+ pass\r\n+\r\n+assert sys.version_info[:2] >= ( 2, 4 )\r\n+\r\n+class CachedRangesInFile:\r\n+ DEFAULT_STRUCT_FORMAT = \'<I\'\r\n+ def __init__( self, filename, profiler_info ):\r\n+ self.file_size = os.stat( filename ).st_size\r\n+ self.file = open( filename, \'rb\' )\n+ self.filename = filename\r\n+ self.fmt = profiler_info.get( \'profiler_struct_format\', self.DEFAULT_STRUCT_FORMAT )\r\n+ self.fmt_size = int( profiler_info.get( \'profiler_struct_size\', struct.calcsize( self.fmt ) ) )\r\n+ self.length = int( self.file_size / self.fmt_size / 2 )\r\n+ self._cached_ranges = [ None for i in xrange( self.length ) ]\r\n+ def __getitem__( self, i ):\n+ if self._cached_ranges[i] is not None:\r\n+ return self._cached_ranges[i]\r\n+ if i < 0: i = self.length + i\r\n+ offset = i * self.fmt_size * 2\r\n+ self.file.seek( offset )\r\n+ try:\r\n+ start = struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]\r\n+ end = struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]\r\n+ except Exception, e:\n+ raise IndexError, e\r\n+ self._cached_ranges[i] = ( start, end )\r\n+ return start, end\r\n+ def __len__( self ):\r\n+ return self.length\r\n+\r\n+class RegionCoverage:\r\n+ def __init__( self, filename_base, profiler_info ):\r\n+ try:\r\n+ self._coverage = CachedRangesInFile( "%s.covered" % filename_base, profiler_info )\r\n+ except Exception, e:\r\n+ #print "Error loading coverage file %s: %s" % ( "%s.covered" % filename_base, e )\r\n+ self._coverage = []\r\n+ try: \r\n+ self._total_coverage = int( open( 
"%s.total_coverage" % filename_base ).read() )\r\n+ except Exception, e:\r\n+ #print "Error loading total coverage file %s: %s" % ( "%s.total_coverage" % filename_base, e )\r\n+ self._total_coverage = 0\r\n+ def get_start_index( self, start ):\r\n+ #binary search: returns index of range closest to start\r\n+ if start > self._coverage[-1][1]:\r\n+ return len( self._coverage ) - 1\r\n+ i = 0\r\n+ j = len( self._coverage) - 1\r\n+ while i < j:\r\n+ k = ( i + j ) / 2\r\n+ if start <= self._coverage[k][1]:\r\n+ j = k\r\n+ else:\r\n+ i = k + 1\r\n+ return i\r\n+ def get_coverage( self, start, end ):\r\n+ return self.get_coverage_regions_overlap( start, end )[0]\r\n+ def get_coverage_regions_overlap( self, start, end ):\r\n+ return self.get_coverage_regions_index_overlap( start, end )[0:2]\r\n+ def get_coverage_regions_index_overlap( self, start, end ):\r\n+ if len( self._coverage ) < 1 or start > self._coverage[-1][1] or end < self._coverage[0][0]:\r\n+ return 0, 0, 0\r\n+ if self._total_coverage and start <= self._coverage[0][0] and end >= self._coverage[-1][1]:\r\n+ return self._total_coverage, len( self._coverage ), 0\r\n+ coverage = 0\r\n+ region_count = 0\r\n+ start_index = self.get_start_index( start )\r\n+ for i in xrange( start_index, len( self._coverage ) ):\r\n+ c_start, c_end = self._coverage[i]\r\n+ if c_start > end:\r\n+ break\r\n+ if c_start <= end and c_end >= start:\r\n+ coverage += min( end, c_end ) - max( start, c_start )\r\n+ region_count += 1\r\n+ return coverage, region_count, start_index\r\n+\r\n+class CachedCoverageReader:\r\n+ def __init__( self, base_file_path, buffer = 10, table_names ='..b' if len( fields ) == 2:\r\n+ self.chroms[ fields[0] ] = int( fields[1] )\r\n+ else:\r\n+ self.chroms[ fields[0] ] = self.default_bitset_size\r\n+ def get( self, name ):\r\n+ return self.chroms.get( name, self.default_bitset_size )\r\n+\r\n+def parse_profiler_info( filename ):\r\n+ profiler_info = {}\r\n+ try:\r\n+ for line in open( filename ):\r\n+ fields = 
line.rstrip( \'\\n\\r\' ).split( \'\\t\', 1 )\r\n+ if len( fields ) == 2:\r\n+ if fields[0] in profiler_info:\r\n+ if not isinstance( profiler_info[ fields[0] ], list ):\r\n+ profiler_info[ fields[0] ] = [ profiler_info[ fields[0] ] ]\r\n+ profiler_info[ fields[0] ].append( fields[1] )\r\n+ else:\r\n+ profiler_info[ fields[0] ] = fields[1]\r\n+ except:\r\n+ pass #likely missing file\r\n+ return profiler_info\r\n+\r\n+def __main__():\r\n+ parser = optparse.OptionParser()\r\n+ parser.add_option(\r\n+ \'-k\',\'--keep_empty\',\r\n+ action="store_true",\r\n+ dest=\'keep_empty\',\r\n+ default=False,\r\n+ help=\'Keep tables with 0 coverage\'\r\n+ )\r\n+ parser.add_option(\r\n+ \'-b\',\'--buffer\',\r\n+ dest=\'buffer\',\r\n+ type=\'int\',default=10,\r\n+ help=\'Number of Chromosomes to keep buffered\'\r\n+ )\r\n+ parser.add_option(\r\n+ \'-c\',\'--chrom_col\',\r\n+ dest=\'chrom_col\',\r\n+ type=\'int\',default=1,\r\n+ help=\'Chromosome column\'\r\n+ )\r\n+ parser.add_option(\r\n+ \'-s\',\'--start_col\',\r\n+ dest=\'start_col\',\r\n+ type=\'int\',default=2,\r\n+ help=\'Start Column\'\r\n+ )\r\n+ parser.add_option(\r\n+ \'-e\',\'--end_col\',\r\n+ dest=\'end_col\',\r\n+ type=\'int\',default=3,\r\n+ help=\'End Column\'\r\n+ )\r\n+ parser.add_option(\r\n+ \'-p\',\'--path\',\r\n+ dest=\'path\',\r\n+ type=\'str\',default=\'/galaxy/data/annotation_profiler/hg18\',\r\n+ help=\'Path to profiled data for this organism\'\r\n+ )\r\n+ parser.add_option(\r\n+ \'-t\',\'--table_names\',\r\n+ dest=\'table_names\',\r\n+ type=\'str\',default=\'None\',\r\n+ help=\'Table names requested\'\r\n+ )\r\n+ parser.add_option(\r\n+ \'-i\',\'--input\',\r\n+ dest=\'interval_filename\',\r\n+ type=\'str\',\r\n+ help=\'Input Interval File\'\r\n+ )\r\n+ parser.add_option(\r\n+ \'-o\',\'--output\',\r\n+ dest=\'out_filename\',\r\n+ type=\'str\',\r\n+ help=\'Input Interval File\'\r\n+ )\r\n+ parser.add_option(\r\n+ \'-S\',\'--summary\',\r\n+ action="store_true",\r\n+ dest=\'summary\',\r\n+ default=False,\r\n+ 
help=\'Display Summary Results\'\r\n+ )\r\n+ \r\n+ options, args = parser.parse_args()\r\n+ \r\n+ assert os.path.isdir( options.path ), IOError( "Configuration error: Table directory is missing (%s)" % options.path )\r\n+ \r\n+ #get profiler_info\r\n+ profiler_info = parse_profiler_info( os.path.join( options.path, \'profiler_info.txt\' ) )\r\n+ \r\n+ table_names = options.table_names.split( "," )\r\n+ if table_names == [\'None\']: table_names = None\r\n+ coverage_reader = CachedCoverageReader( options.path, buffer = options.buffer, table_names = table_names, profiler_info = profiler_info )\r\n+ \r\n+ if options.summary:\r\n+ profile_summary( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader, ChromosomeLengths( profiler_info ) )\r\n+ else:\r\n+ profile_per_interval( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader )\r\n+ \r\n+ #print out data version info\r\n+ print \'Data version (%s:%s:%s)\' % ( profiler_info.get( \'dbkey\', \'unknown\' ), profiler_info.get( \'profiler_hash\', \'unknown\' ), profiler_info.get( \'dump_time\', \'unknown\' ) )\r\n+\r\n+if __name__ == "__main__": __main__()\r\n'

diff -r 000000000000 -r 3b33da018e74 scripts/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/README.txt Mon May 19 12:33:42 2014 -0400

[

@@ -0,0 +1,54 @@
+This file explains how to create annotation indexes for the annotation profiler tool. Annotation profiler indexes use an exceedingly simple binary format,
+containing no header information and consisting of an ordered linear list of the (start, stop) regions covered by a UCSC table, partitioned
+by chromosome name; each start and stop value is encoded individually using the struct format '<I' (little-endian unsigned 32-bit integer). Genomic regions are merged by overlap / direct adjacency (e.g. a table having ranges of: 1-10, 6-12, 12-20 and 25-28 results in two merged ranges of: 1-20 and 25-28).
+
+Files are arranged like:
+/profiled_annotations/DBKEY/TABLE_NAME/
+                                       CHROMOSOME_NAME.covered
+                                       CHROMOSOME_NAME.total_coverage
+                                       CHROMOSOME_NAME.total_regions
+/profiled_annotations/DBKEY/
+                            DBKEY_tables.xml
+                            chromosomes.txt
+                            profiler_info.txt
+
+
+where CHROMOSOME_NAME.covered is the binary file, CHROMOSOME_NAME.total_coverage is a text file containing the integer count of bases covered by the
+table, and CHROMOSOME_NAME.total_regions is a text file containing the integer count of the regions found in CHROMOSOME_NAME.covered.
+
+DBKEY_tables.xml should be appended to the annotation profiler's available-tables configuration file (tool-data/annotation_profiler_options.xml).
+The DBKEY should also be added as a new line to the annotation profiler valid builds file (tool-data/annotation_profiler_valid_builds.txt).
+The output (/profiled_annotations/DBKEY) should be made available as GALAXY_ROOT/tool-data/annotation_profiler/DBKEY.
+
+profiler_info.txt contains info on the generated annotations, stored as one tab-delimited label/value pair per line:
+        profiler_version - the version of the build_profile_indexes.py script that was used to generate the profiled data
+        dbkey - the dbkey used for the run
+        chromosomes - contains the names and lengths of chromosomes that were used to parse single-chromosome tables (tables divided into individual files by chromosome)
+        dump_time - the declared dump time of the database, taken from trackDb.txt.gz
+        profiled_time - seconds since the epoch, in UTC, at which the database dump was profiled
+        database_hash - an MD5 hex digest of all the profiled table info
+
+
+Typical usage includes:
+
+python build_profile_indexes.py -d hg19 -i /ucsc_data/hg19/database/ > hg19.txt
+
+where the genome build is hg19 and /ucsc_data/hg19/database/ contains the downloaded database dump from UCSC (e.g. obtained by rsync: rsync -avzP rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/ /ucsc_data/hg19/database/).
+
+
+
+By default, chromosome names come from a file named 'chromInfo.txt.gz' found in the input directory, with FTP used as a backup.
+When FTP is used to obtain the names of chromosomes from UCSC for a particular genome build, alternate FTP sites and paths can be specified by using the --ftp_site and --ftp_path options.
+Chromosome names can instead be provided on the command line via the --chromosomes option, which accepts a comma-separated list of: ChromName1[=length],ChromName2[=length],...
+
+
+
+    usage = "usage: %prog options"
+    parser = OptionParser( usage=usage )
+    parser.add_option( '-d', '--dbkey', dest='dbkey', default='hg18', help='dbkey to process' )
+    parser.add_option( '-i', '--input_dir', dest='input_dir', default=os.path.join( 'golden_path','%s', 'database' ), help='Input Directory' )
+    parser.add_option( '-o', '--output_dir', dest='output_dir', default=os.path.join( 'profiled_annotations','%s' ), help='Output Directory' )
+    parser.add_option( '-c', '--chromosomes', dest='chromosomes', default='', help='Comma separated list of: ChromName1[=length],ChromName2[=length],...' )
+    parser.add_option( '-b', '--bitset_size', dest='bitset_size', default=DEFAULT_BITSET_SIZE, type='int', help='Default BitSet size; overridden by sizes specified in chromInfo.txt.gz or by --chromosomes' )
+    parser.add_option( '-f', '--ftp_site', dest='ftp_site', default='hgdownload.cse.ucsc.edu', help='FTP site; used for chromosome info when chromInfo.txt.gz method fails' )
+    parser.add_option( '-p', '--ftp_path', dest='ftp_path', default='/goldenPath/%s/chromosomes/', help='FTP Path; used for chromosome info when chromInfo.txt.gz method fails' )

diff -r 000000000000 -r 3b33da018e74 scripts/build_profile_indexes.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/build_profile_indexes.py Mon May 19 12:33:42 2014 -0400

[

b'@@ -0,0 +1,338 @@\n+#!/usr/bin/env python\n+#Dan Blankenberg\n+\n+VERSION = \'1.0.0\' # version of this script\n+\n+from optparse import OptionParser\n+import os, gzip, struct, time\n+from ftplib import FTP #do we want a diff method than using FTP to determine Chrom Names, eg use local copy\n+\n+#import md5 from hashlib; if python2.4 or less, use old md5\n+try:\n+ from hashlib import md5\n+except ImportError:\n+ from md5 import new as md5\n+\n+#import BitSet from bx-python, try using eggs and package resources, fall back to any local installation\n+try:\n+ from galaxy import eggs\n+ import pkg_resources\n+ pkg_resources.require( "bx-python" )\n+except: pass #Maybe there is a local installation available\n+from bx.bitset import BitSet\n+\n+#Define constants\n+STRUCT_FMT = \'<I\'\n+STRUCT_SIZE = struct.calcsize( STRUCT_FMT )\n+DEFAULT_BITSET_SIZE = 300000000\n+CHUNK_SIZE = 1024\n+\n+#Headers used to parse .sql files to determine column indexes for chromosome name, start and end\n+alias_spec = { \n+ \'chromCol\' : [ \'chrom\' , \'CHROMOSOME\' , \'CHROM\', \'Chromosome Name\', \'tName\' ], \n+ \'startCol\' : [ \'start\' , \'START\', \'chromStart\', \'txStart\', \'Start Position (bp)\', \'tStart\', \'genoStart\' ],\n+ \'endCol\' : [ \'end\' , \'END\' , \'STOP\', \'chromEnd\', \'txEnd\', \'End Position (bp)\', \'tEnd\', \'genoEnd\' ], \n+}\n+\n+#Headers used to parse trackDb.txt.gz\n+#TODO: these should be parsed directly from trackDb.sql\n+trackDb_headers = ["tableName", "shortLabel", "type", "longLabel", "visibility", "priority", "colorR", "colorG", "colorB", "altColorR", "altColorG", "altColorB", "useScore", "private", "restrictCount", "restrictList", "url", "html", "grp", "canPack", "settings"]\n+\n+def get_columns( filename ):\n+ input_sql = open( filename ).read()\n+ input_sql = input_sql.split( \'CREATE TABLE \' )[1].split( \';\' )[0]\n+ input_sql = input_sql.split( \' (\', 1 )\n+ table_name = input_sql[0].strip().strip( \'`\' )\n+ input_sql = [ 
split.strip().split( \' \' )[0].strip().strip( \'`\' ) for split in input_sql[1].rsplit( \')\', 1 )[0].strip().split( \'\\n\' ) ]\n+ print input_sql\n+ chrom_col = None\n+ start_col = None\n+ end_col = None\n+ for col_name in alias_spec[\'chromCol\']:\n+ for i, header_name in enumerate( input_sql ):\n+ if col_name == header_name:\n+ chrom_col = i\n+ break\n+ if chrom_col is not None:\n+ break\n+ \n+ for col_name in alias_spec[\'startCol\']:\n+ for i, header_name in enumerate( input_sql ):\n+ if col_name == header_name:\n+ start_col = i\n+ break\n+ if start_col is not None:\n+ break\n+\n+ for col_name in alias_spec[\'endCol\']:\n+ for i, header_name in enumerate( input_sql ):\n+ if col_name == header_name:\n+ end_col = i\n+ break\n+ if end_col is not None:\n+ break\n+\n+ return table_name, chrom_col, start_col, end_col\n+\n+\n+def create_grouping_xml( input_dir, output_dir, dbkey ):\n+ output_filename = os.path.join( output_dir, \'%s_tables.xml\' % dbkey )\n+ def load_groups( file_name = \'grp.txt.gz\' ):\n+ groups = {}\n+ for line in gzip.open( os.path.join( input_dir, file_name ) ):\n+ fields = line.split( \'\\t\' )\n+ groups[fields[0]] = { \'desc\': fields[1], \'priority\': fields[2] }\n+ return groups\n+ f = gzip.open( os.path.join( input_dir, \'trackDb.txt.gz\' ) )\n+ out = open( output_filename, \'wb\' )\n+ tables = {}\n+ cur_buf = \'\'\n+ while True:\n+ line = f.readline()\n+ if not line: break\n+ #remove new lines\n+ line = line.rstrip( \'\\n\\r\' )\n+ line = line.replace( \'\\\\\\t\', \' \' ) #replace escaped tabs with space\n+ cur_buf += "%s\\n" % line.rstrip( \'\\\\\' )\n+ if line.endswith( \'\\\\\' ):\n+ continue #line is wrapped, next line\n+ #all fields should be loaded now...\n+ fields = cur_buf.split( \'\\t\' )\n+ cur_buf = \'\' #reset buffer\n+ assert len( fields'..b'ngths[ chrom ] = options.bitset_size\n+ #sort chroms by length of name, decending; necessary for when table names start with chrom name\n+ chroms = list( reversed( [ chrom for 
chrom_len, chrom in sorted( [ ( len( chrom ), chrom ) for chrom in chroms ] ) ] ) )\n+ \n+ #parse tables from local files\n+ #loop through directory contents, if file ends in \'.sql\', process table\n+ for filename in os.listdir( input_dir ):\n+ if filename.endswith ( \'.sql\' ):\n+ base_filename = filename[ 0:-len( \'.sql\' ) ]\n+ table_out_dir = os.path.join( output_dir, base_filename )\n+ #some tables are chromosome specific, lets strip off the chrom name\n+ for chrom in chroms:\n+ if base_filename.startswith( "%s_" % chrom ):\n+ #found chromosome\n+ table_out_dir = os.path.join( output_dir, base_filename[len( "%s_" % chrom ):] )\n+ break\n+ #create table dir\n+ if not os.path.exists( table_out_dir ):\n+ os.mkdir( table_out_dir ) #table dir may already exist in the case of single chrom tables\n+ print "Created table dir (%s)." % table_out_dir\n+ else:\n+ print "Table dir (%s) already exists." % table_out_dir\n+ #find column assignments\n+ table_name, chrom_col, start_col, end_col = get_columns( "%s.sql" % os.path.join( input_dir, base_filename ) )\n+ if chrom_col is None or start_col is None or end_col is None:\n+ print "Table %s (%s) does not appear to have a chromosome, a start, or a stop." % ( table_name, "%s.sql" % os.path.join( input_dir, base_filename ) )\n+ if not os.listdir( table_out_dir ):\n+ print "Removing empty table (%s) directory (%s)." 
% ( table_name, table_out_dir )\n+ os.rmdir( table_out_dir )\n+ continue\n+ #build bitsets from table\n+ bitset_dict = {}\n+ for line in gzip.open( \'%s.txt.gz\' % os.path.join( input_dir, base_filename ) ):\n+ fields = line.strip().split( \'\\t\' )\n+ chrom = fields[ chrom_col ]\n+ start = int( fields[ start_col ] )\n+ end = int( fields[ end_col ] )\n+ if chrom not in bitset_dict:\n+ bitset_dict[ chrom ] = BitSet( chrom_lengths.get( chrom, options.bitset_size ) )\n+ bitset_dict[ chrom ].set_range( start, end - start )\n+ #write bitsets as profiled annotations\n+ for chrom_name, chrom_bits in bitset_dict.iteritems():\n+ out = open( os.path.join( table_out_dir, \'%s.covered\' % chrom_name ), \'wb\' )\n+ end = 0\n+ total_regions = 0\n+ total_coverage = 0\n+ max_size = chrom_lengths.get( chrom_name, options.bitset_size )\n+ while True:\n+ start = chrom_bits.next_set( end )\n+ if start >= max_size:\n+ break\n+ end = chrom_bits.next_clear( start )\n+ out.write( struct.pack( STRUCT_FMT, start ) )\n+ out.write( struct.pack( STRUCT_FMT, end ) )\n+ total_regions += 1\n+ total_coverage += end - start\n+ if end >= max_size:\n+ break\n+ out.close()\n+ open( os.path.join( table_out_dir, \'%s.total_regions\' % chrom_name ), \'wb\' ).write( str( total_regions ) )\n+ open( os.path.join( table_out_dir, \'%s.total_coverage\' % chrom_name ), \'wb\' ).write( str( total_coverage ) )\n+ \n+ #create xml\n+ create_grouping_xml( input_dir, output_dir, options.dbkey )\n+ #create database dump info file, for database version control\n+ write_database_dump_info( input_dir, output_dir, options.dbkey, chrom_lengths, options.bitset_size )\n+ \n+if __name__ == "__main__": __main__()\n'

diff -r 000000000000 -r 3b33da018e74 test-data/3.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/3.bed Mon May 19 12:33:42 2014 -0400

@@ -0,0 +1,25 @@
+chr1 147962006 147975713 NM_005997 0 - 147962192 147975670 0 6 574,145,177,115,153,160, 0,1543,7859,9048,9340,13547,
+chr1 147984101 148035079 BC007833 0 + 147984545 148033414 0 14 529,32,81,131,118,153,300,206,84,49,85,130,46,1668, 0,25695,28767,33118,33695,33998,35644,38005,39629,40577,41402,43885,48367,49310,
+chr1 148077485 148111797 NM_002651 0 - 148078400 148111728 0 12 1097,121,133,266,124,105,110,228,228,45,937,77, 0,2081,2472,6871,9907,10257,11604,14199,15637,18274,23636,34235,
+chr1 148185113 148187485 NM_002796 0 + 148185136 148187378 0 7 163,207,147,82,117,89,120, 0,416,877,1199,1674,1977,2252,
+chr2 118288484 118306183 NM_006773 0 + 118288583 118304530 0 14 184,285,144,136,101,200,115,140,162,153,114,57,178,1796, 0,2765,4970,6482,6971,7183,7468,9890,10261,10768,11590,14270,14610,15903,
+chr2 118389378 118390700 BC005078 0 - 118390395 118390500 0 1 1322, 0,
+chr2 220108603 220116964 NM_001927 0 + 220108689 220116217 0 9 664,61,96,162,126,221,44,83,789, 0,1718,1874,2118,2451,2963,5400,7286,7572,
+chr2 220229182 220233943 NM_024536 0 - 220229609 220233765 0 4 1687,180,574,492, 0,1990,2660,4269,
+chr5 131170738 131357870 AF099740 0 - 131311206 131357817 0 31 112,124,120,81,65,40,120,129,61,88,94,79,72,102,144,117,89,73,96,135,135,78,74,52,33,179,100,102,65,115,248, 0,11593,44117,47607,104668,109739,114675,126366,135488,137518,138009,140437,152389,153373,155388,159269,160793,162981,164403,165577,166119,167611,169501,178260,179675,180901,181658,182260,182953,183706,186884,
+chr5 131424245 131426795 NM_000588 0 + 131424298 131426383 0 5 215,42,90,42,535, 0,313,1658,1872,2015,
+chr5 131556201 131590458 NM_004199 0 - 131556601 131582218 0 15 471,97,69,66,54,100,71,177,194,240,138,152,97,100,170, 0,2316,2802,5596,6269,11138,11472,15098,16528,17674,21306,24587,25142,25935,34087,
+chr5 131621285 131637046 NM_003687 0 + 131621326 131635821 0 7 134,152,82,179,164,118,1430, 0,4915,8770,13221,13609,14097,14331,
+chr6 108298214 108386086 NM_007214 0 - 108299600 108385906 0 21 1530,105,99,102,159,174,60,83,148,155,93,133,95,109,51,59,62,113,115,100,304, 0,2490,6246,10831,12670,23164,23520,27331,31052,32526,34311,36130,36365,38609,41028,42398,43048,51479,54500,59097,87568,
+chr6 108593954 108616704 NM_003269 0 + 108594662 108615360 0 9 733,146,88,236,147,97,150,106,1507, 0,5400,8778,10445,12037,14265,14749,15488,21243,
+chr6 108639410 108689143 NM_152827 0 - 108640045 108688818 0 3 741,125,487, 0,2984,49246,
+chr6 108722790 108950942 NM_145315 0 + 108722976 108950321 0 13 325,224,52,102,131,100,59,83,71,101,141,114,750, 0,28931,52094,60760,61796,71339,107102,152319,181970,182297,215317,224802,227402,
+chr7 113320332 113924911 AK131266 0 + 113862563 113893433 0 20 285,91,178,90,58,75,138,51,201,178,214,105,88,84,77,102,122,70,164,1124, 0,201692,340175,448290,451999,484480,542213,543265,543478,545201,556083,558358,565876,567599,573029,573245,575738,577123,577946,603455,
+chr7 116511232 116557294 NM_003391 0 - 116512159 116556994 0 5 1157,265,278,227,383, 0,20384,37843,43339,45679,
+chr7 116713967 116902666 NM_000492 0 + 116714099 116901113 0 27 185,111,109,216,90,164,126,247,93,183,192,95,87,724,129,38,251,80,151,228,101,249,156,90,173,106,1754, 0,24290,29071,50936,54313,55285,56585,60137,62053,68678,79501,107776,110390,111971,114967,122863,123569,126711,130556,131618,134650,147559,162475,172879,184725,185496,186945,
+chr7 116944658 117107512 AF377960 0 - 116945541 116979926 0 23 1129,102,133,64,186,206,179,188,153,100,87,80,96,276,118,255,151,100,204,1654,225,108,173, 0,7364,8850,10413,13893,14398,17435,24259,24615,35177,35359,45901,47221,49781,56405,66857,69787,72208,73597,80474,100111,150555,162681,
+chr8 118880786 119193239 NM_000127 0 - 118881131 119192466 0 11 531,172,161,90,96,119,133,120,108,94,1735, 0,5355,7850,13505,19068,20309,23098,30863,36077,37741,310718,
+chr9 128763240 128783870 NM_174933 0 + 128764156 128783586 0 12 261,118,74,159,76,48,56,63,129,117,127,370, 0,522,875,5630,12374,12603,15040,15175,18961,19191,20037,20260,
+chr9 128787362 128789566 NM_014908 0 - 128787519 128789136 0 1 2204, 0,
+chr9 128789530 128848928 NM_015354 0 + 128789552 128848511 0 44 54,55,74,85,81,45,93,120,212,115,201,90,66,120,127,153,127,88,77,115,121,67,129,140,107,207,170,70,68,196,78,86,146,182,201,93,159,138,75,228,132,74,130,594, 0,1491,5075,8652,9254,10312,11104,11317,20808,21702,23060,25462,31564,32908,33566,34851,35204,35595,35776,37202,38860,39111,39891,40349,42422,45499,45827,46675,47158,47621,50453,50840,51474,51926,53831,54186,55119,55619,57449,57605,57947,58352,58541,58804,
+chr9 128849867 128870133 NM_020145 0 - 128850516 128869987 0 11 757,241,101,90,24,63,93,134,129,142,209, 0,1071,1736,2085,2635,4201,6376,6736,13056,14247,20057,

diff -r 000000000000 -r 3b33da018e74 test-data/4.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/4.bed Mon May 19 12:33:42 2014 -0400

@@ -0,0 +1,1 @@
+chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +

diff -r 000000000000 -r 3b33da018e74 test-data/annotation_profiler_1.out
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/annotation_profiler_1.out Mon May 19 12:33:42 2014 -0400

@@ -0,0 +1,9 @@
+chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + multiz17way 1700000 1
+chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + mrna 1476531 12
+chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + multiz28way 1700000 1
+chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + refGene 1247808 15
+chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + knownAlt 14617 57
+chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + affyGnf1h 16218 2
+chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + snp126 8224 7262
+chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + acembly 1532618 20
+chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 + knownGene 1282789 18

diff -r 000000000000 -r 3b33da018e74 test-data/annotation_profiler_2.out
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/annotation_profiler_2.out Mon May 19 12:33:42 2014 -0400

@@ -0,0 +1,10 @@
+#tableName tableChromosomeCoverage tableChromosomeCount tableRegionCoverage tableRegionCount allIntervalCount allIntervalSize allCoverage allTableRegionsOverlaped allIntervalsOverlapingTable nrIntervalCount nrIntervalSize nrCoverage nrTableRegionsOverlaped nrIntervalsOverlapingTable
+multiz17way 1232617592 115 107496500 7 25 2178864 2178864 25 25 24 2178828 2178828 7 24
+mrna 610115393 8453 53577685 617 25 2178864 1904380 38 24 24 2178828 1904344 33 23
+multiz28way 1233785185 143 107466479 10 25 2178864 2178864 25 25 24 2178828 2178828 8 24
+refGene 496767116 7324 46112187 488 25 2178864 1677947 30 23 24 2178828 1677911 27 22
+knownAlt 8647368 20213 766619 1630 25 2178864 5612 31 11 24 2178828 5612 31 11
+affyGnf1h 24034558 3995 2446754 307 25 2178864 191851 9 6 24 2178828 191851 9 6
+snp126 5297125 4456213 382226 331523 25 2178864 9205 7074 25 24 2178828 9205 7074 24
+acembly 710938193 13800 63146381 938 25 2178864 1903560 35 24 24 2178828 1903524 30 23
+knownGene 555770538 7921 50317496 558 25 2178864 1822985 30 23 24 2178828 1822949 27 22

diff -r 000000000000 -r 3b33da018e74 tool-data/annotation_profiler_options.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/annotation_profiler_options.xml.sample Mon May 19 12:33:42 2014 -0400

b'@@ -0,0 +1,1101 @@\n+<filter type="data_meta" data_ref="input1" meta_key="dbkey" value="hg18">\n+ <options>\n+ <option name="Mapping and Sequencing Tracks" value="group-map">\n+ <option name="STS Markers" value="stsMap"/>\n+ <option name="Fosmid End Pairs" value="fosEndPairs"/>\n+ <option name="Chromosome Band" value="cytoBand"/>\n+ <option name="BAC End Pairs" value="bacEndPairs"/>\n+ <option name="FISH Clones" value="fishClones"/>\n+ <option name="GC Percent" value="gc5Base"/>\n+ <option name="Coverage" value="clonePos"/>\n+ <option name="Recomb Rate" value="recombRate"/>\n+ <option name="Map Contigs" value="ctgPos"/>\n+ <option name="Assembly" value="gold"/>\n+ <option name="Gap" value="gap"/>\n+ <option name="Chromosome Band (Ideogram)" value="cytoBandIdeo"/>\n+ </option>\n+ <option name="Comparative Genomics" value="group-compGeno">\n+ <option name="chainRn4" value="chainRn4"/>\n+ <option name="chainOryLat1" value="chainOryLat1"/>\n+ <option name="chainTetNig1" value="chainTetNig1"/>\n+ <option name="netDanRer5" value="netDanRer5"/>\n+ <option name="netPanTro2" value="netPanTro2"/>\n+ <option name="chainCanFam2" value="chainCanFam2"/>\n+ <option name="netPonAbe2" value="netPonAbe2"/>\n+ <option name="netGasAcu1" value="netGasAcu1"/>\n+ <option name="netStrPur2" value="netStrPur2"/>\n+ <option name="chainDanRer5" value="chainDanRer5"/>\n+ <option name="netGalGal3" value="netGalGal3"/>\n+ <option name="netMonDom4" value="netMonDom4"/>\n+ <option name="netOryLat1" value="netOryLat1"/>\n+ <option name="netMm9" value="netMm9"/>\n+ <option name="chainRheMac2" value="chainRheMac2"/>\n+ <option name="netOrnAna1" value="netOrnAna1"/>\n+ <option name="chainFr2" value="chainFr2"/>\n+ <option name="netXenTro2" value="netXenTro2"/>\n+ <option name="Cons Indels MmCf" value="consIndelsHgMmCanFam"/>\n+ <option name="Tetraodon Ecores" value="ecoresTetNig1"/>\n+ <option name="netTetNig1" value="netTetNig1"/>\n+ <option name="chainGalGal3" value="chainGalGal3"/>\n+ <option 
name="chainStrPur2" value="chainStrPur2"/>\n+ <option name="chainOrnAna1" value="chainOrnAna1"/>\n+ <option name="netFelCat3" value="netFelCat3"/>\n+ <option name="chainEquCab1" value="chainEquCab1"/>\n+ <option name="chainPonAbe2" value="chainPonAbe2"/>\n+ <option name="chainFelCat3" value="chainFelCat3"/>\n+ <option name="mostConserved28way" value="subtracks-mostConserved28way">\n+ <option name="Most Conserved" value="mostConserved28way"/>\n+ <option name="Vertebrate" value="phastConsElements28way"/>\n+ <option name="Mammal" value="phastConsElements28wayPlacMammal"/>\n+ </option>\n+ <option name="chainAnoCar1" value="chainAnoCar1"/>\n+ <option name="17-Way Cons" value="multiz17way"/>\n+ <option name="chainXenTro2" value="chainXenTro2"/>\n+ <option name="17-Way Most Cons" value="phastConsElements17way"/>\n+ <option name="chainGasAcu1" value="chainGasAcu1"/>\n+ <option name="Conservation" value="multiz28way"/>\n+ <option name="chainCalJac1" value="chainCalJac1"/>\n+ <option name="netCanFam2" value="netCanFam2"/>\n+ <option name="chainMm9" value="chainMm9"/>\n+ <option name="chainMonDom4" value="chainMonDom4"/>\n+ <option name="netCalJac1" value="netCalJac1"/>\n+ <option name="netRheMac2" value="netRheMac2"/>\n+ <option name="chainPanTro2" value="chainPanTro2"/>\n+ <option name="netEquCab1" value="netEquCab1"/>\n+ <option name="netFr2" value="netFr2"/>\n+ <option name="netAnoCar1" value="netAnoCar1"/>\n+ <option name="netBosTau4" value="netBosTau4"/>\n+ <option name="chainBosTau4" value="chainBosTau4"/>\n+ <option name="netRn4" value="netRn4"/>\n+ </option>\n+ <option name="Phenotype and Di'..b'e="encodeEgaspFullExogean"/>\n+ <option name="ExonHunter" value="encodeEgaspFullExonhunter"/>\n+ <option name="Fgenesh++" value="encodeEgaspFullFgenesh"/>\n+ <option name="GeneID" value="encodeEgaspFullGeneId"/>\n+ <option name="GeneID U12" value="encodeEgaspFullGeneIdU12"/>\n+ <option name="GeneMark" value="encodeEgaspFullGenemark"/>\n+ <option name="Jigsaw" 
value="encodeEgaspFullJigsaw"/>\n+ <option name="Pairgn/NSCAN-E/+" value="encodeEgaspFullPairagonAny"/>\n+ <option name="Pairgn/NSCAN-E" value="encodeEgaspFullPairagonMrna"/>\n+ <option name="NSCAN" value="encodeEgaspFullPairagonMultiple"/>\n+ <option name="SGP2" value="encodeEgaspFullSgp2"/>\n+ <option name="SGP2 U12" value="encodeEgaspFullSgp2U12"/>\n+ <option name="Fgenesh Pseudo" value="encodeEgaspFullSoftberryPseudo"/>\n+ <option name="SPIDA Exons" value="encodeEgaspFullSpida"/>\n+ <option name="Twinscan" value="encodeEgaspFullTwinscan"/>\n+ </option>\n+ <option name="encodeGencodeGeneOct05" value="subtracks-encodeGencodeGeneOct05">\n+ <option name="Gencode Ref" value="encodeGencodeGeneKnownOct05"/>\n+ <option name="Gencode Genes Oct05" value="encodeGencodeGeneOct05"/>\n+ <option name="Gencode Pseudo" value="encodeGencodeGenePseudoOct05"/>\n+ <option name="Gencode Putative" value="encodeGencodeGenePutativeOct05"/>\n+ </option>\n+ <option name="Gencode Introns Oct05" value="encodeGencodeIntronOct05"/>\n+ <option name="encodePseudogene" value="subtracks-encodePseudogene">\n+ <option name="Pseudogenes" value="encodePseudogene"/>\n+ <option name="Consensus Pseudogenes" value="encodePseudogeneConsensus"/>\n+ <option name="GIS Pseudogenes" value="encodePseudogeneGIS"/>\n+ <option name="Havana-Gencode Pseudogenes" value="encodePseudogeneHavana"/>\n+ <option name="UCSC Retrogenes" value="encodePseudogeneUcsc"/>\n+ <option name="UCSC Pseudogenes" value="encodePseudogeneUcsc2"/>\n+ <option name="Yale Pseudogenes" value="encodePseudogeneYale"/>\n+ </option>\n+ <option name="ENCODE Regions" value="encodeRegions"/>\n+ <option name="encodeEgaspUpdate" value="subtracks-encodeEgaspUpdate">\n+ <option name="Augustus Update" value="encodeEgaspUpdAugustusAbinitio"/>\n+ <option name="August/EST/Ms Upd" value="encodeEgaspUpdAugustusAny"/>\n+ <option name="August/Mouse Upd" value="encodeEgaspUpdAugustusDual"/>\n+ <option name="Augustus/EST Upd" 
value="encodeEgaspUpdAugustusEst"/>\n+ <option name="Exogean Update" value="encodeEgaspUpdExogean"/>\n+ <option name="FGenesh++ Upd" value="encodeEgaspUpdFgenesh"/>\n+ <option name="GeneID Update" value="encodeEgaspUpdGeneId"/>\n+ <option name="GeneID U12 Upd" value="encodeEgaspUpdGeneIdU12"/>\n+ <option name="Jigsaw Update" value="encodeEgaspUpdJigsaw"/>\n+ <option name="SGP2 Update" value="encodeEgaspUpdSgp2"/>\n+ <option name="SGP2 U12 Update" value="encodeEgaspUpdSgp2U12"/>\n+ <option name="Yale Pseudo Upd" value="encodeEgaspUpdYalePseudo"/>\n+ <option name="EGASP Update" value="encodeEgaspUpdate"/>\n+ </option>\n+ <option name="encodeEgaspPartial" value="subtracks-encodeEgaspPartial">\n+ <option name="ACEScan Cons Alt" value="encodeEgaspPartAceCons"/>\n+ <option name="ACEScan Other" value="encodeEgaspPartAceOther"/>\n+ <option name="Augustus" value="encodeEgaspPartAugustusAbinitio"/>\n+ <option name="Augustus/EST/Mouse" value="encodeEgaspPartAugustusAny"/>\n+ <option name="Augustus/Mouse" value="encodeEgaspPartAugustusDual"/>\n+ <option name="Augustus/EST" value="encodeEgaspPartAugustusEst"/>\n+ <option name="GeneZilla" value="encodeEgaspPartGenezilla"/>\n+ <option name="SAGA" value="encodeEgaspPartSaga"/>\n+ <option name="EGASP Partial" value="encodeEgaspPartial"/>\n+ </option>\n+ </option>\n+ </options>\n+</filter>\n'

diff -r 000000000000 -r 3b33da018e74 tool-data/annotation_profiler_valid_builds.txt.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/annotation_profiler_valid_builds.txt.sample Mon May 19 12:33:42 2014 -0400

@@ -0,0 +1,1 @@
+hg18

diff -r 000000000000 -r 3b33da018e74 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Mon May 19 12:33:42 2014 -0400

@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="bx-python" version="0.7.1">
+    <repository changeset_revision="2d0c08728bca" name="package_bx_python_0_7" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" />
+  </package>
+</tool_dependency>