# HG changeset patch # User xuebing # Date 1331340473 18000 # Node ID 33c067c3ae34b5102bec00ac97d87c1fc99b93f4 # Parent c2a356708570a886143ca45f196a4000e7fa40ef Deleted selected files diff -r c2a356708570 -r 33c067c3ae34 fimo2.xml --- a/fimo2.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ - - using FIMO - fimo - #if $background_select.bg_select == "fromfile": - -bgfile $bgfile - #end if - - $norc --max-stored-scores 5000000 --output-pthresh $pth --verbosity 1 $motif $database - && mv fimo_out/fimo.html ${html_outfile} - - && mv fimo_out/fimo.txt ${txt_outfile} - - && rm -rf fimo_out - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool uses FIMO to find matches of a motif in a fasta file. See more details: - -http://meme.sdsc.edu/meme/fimo-intro.html - - - diff -r c2a356708570 -r 33c067c3ae34 mytools.zip Binary file mytools.zip has changed diff -r c2a356708570 -r 33c067c3ae34 tools/.DS_Store Binary file tools/.DS_Store has changed diff -r c2a356708570 -r 33c067c3ae34 tools/._.DS_Store Binary file tools/._.DS_Store has changed diff -r c2a356708570 -r 33c067c3ae34 tools/._mytools Binary file tools/._mytools has changed diff -r c2a356708570 -r 33c067c3ae34 tools/._tool_conf.xml Binary file tools/._tool_conf.xml has changed diff -r c2a356708570 -r 33c067c3ae34 tools/annotation_profiler/annotation_profiler.xml --- a/tools/annotation_profiler/annotation_profiler.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,136 +0,0 @@ - - for a set of genomic intervals - annotation_profiler_for_interval.py -i $input1 -c ${input1.metadata.chromCol} -s ${input1.metadata.startCol} -e ${input1.metadata.endCol} -o $out_file1 $keep_empty -p ${GALAXY_DATA_INDEX_DIR}/annotation_profiler/$dbkey $summary -b 3 -t $table_names - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Takes an input set of intervals and for each interval determines the base coverage of the interval by a set of features (tables) available from UCSC. Genomic regions from the input feature data have been merged by overlap / direct adjacency (e.g. a table having ranges of: 1-10, 6-12, 12-20 and 25-28 results in two merged ranges of: 1-20 and 25-28). - -By default, this tool will check the coverage of your intervals against all available features; you may, however, choose to select only those tables that you want to include. Selecting a section heading will effectively cause all of its children to be selected. - -You may alternatively choose to receive a summary across all of the intervals that you provide. 
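The merge rule described above (overlap or direct adjacency collapses ranges) is easy to pin down concretely. A minimal illustrative sketch, not the tool's own code, reproducing the 1-10, 6-12, 12-20, 25-28 example::

    def merge_regions(regions):
        # Sort by start, then fold a range into the previous merged range
        # whenever it overlaps or directly abuts it (start <= previous end).
        merged = []
        for start, end in sorted(regions):
            if merged and start <= merged[-1][1]:
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end])
        return merged

    print(merge_regions([(1, 10), (6, 12), (12, 20), (25, 28)]))
    # -> [[1, 20], [25, 28]]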
- ------ - -**Example** - -Using the interval below and selecting several tables:: - - chr1 4558 14764 uc001aab.1 0 - - -results in:: - - chr1 4558 14764 uc001aab.1 0 - snp126Exceptions 151 142 - chr1 4558 14764 uc001aab.1 0 - genomicSuperDups 10206 1 - chr1 4558 14764 uc001aab.1 0 - chainOryLat1 3718 1 - chr1 4558 14764 uc001aab.1 0 - multiz28way 10206 1 - chr1 4558 14764 uc001aab.1 0 - affyHuEx1 3553 32 - chr1 4558 14764 uc001aab.1 0 - netXenTro2 3050 1 - chr1 4558 14764 uc001aab.1 0 - intronEst 10206 1 - chr1 4558 14764 uc001aab.1 0 - xenoMrna 10203 1 - chr1 4558 14764 uc001aab.1 0 - ctgPos 10206 1 - chr1 4558 14764 uc001aab.1 0 - clonePos 10206 1 - chr1 4558 14764 uc001aab.1 0 - chainStrPur2Link 1323 29 - chr1 4558 14764 uc001aab.1 0 - affyTxnPhase3HeLaNuclear 9011 8 - chr1 4558 14764 uc001aab.1 0 - snp126orthoPanTro2RheMac2 61 58 - chr1 4558 14764 uc001aab.1 0 - snp126 205 192 - chr1 4558 14764 uc001aab.1 0 - chainEquCab1 10206 1 - chr1 4558 14764 uc001aab.1 0 - netGalGal3 3686 1 - chr1 4558 14764 uc001aab.1 0 - phastCons28wayPlacMammal 10172 3 - -Where:: - - The first added column is the table name. - The second added column is the number of bases covered by the table. - The third added column is the number of regions from the table that is covered by the interval. - -Alternatively, requesting a summary, using the intervals below and selecting several tables:: - - chr1 4558 14764 uc001aab.1 0 - - chr1 4558 19346 uc001aac.1 0 - - -results in:: - - #tableName tableSize tableRegionCount allIntervalCount allIntervalSize allCoverage allTableRegionsOverlaped allIntervalsOverlapingTable nrIntervalCount nrIntervalSize nrCoverage nrTableRegionsOverlaped nrIntervalsOverlapingTable - snp126Exceptions 133601 92469 2 24994 388 359 2 1 14788 237 217 1 - genomicSuperDups 12268847 657 2 24994 24994 2 2 1 14788 14788 1 1 - chainOryLat1 70337730 2542 2 24994 7436 2 2 1 14788 3718 1 1 - affyHuEx1 15703901 112274 2 24994 7846 70 2 1 14788 4293 38 1 - netXenTro2 111440392 1877 2 24994 6100 2 2 1 14788 3050 1 1 - snp126orthoPanTro2RheMac2 700436 690674 2 24994 124 118 2 1 14788 63 60 1 - intronEst 135796064 2332 2 24994 24994 2 2 1 14788 14788 1 1 - xenoMrna 129031327 1586 2 24994 20406 2 2 1 14788 10203 1 1 - snp126 956976 838091 2 24994 498 461 2 1 14788 293 269 1 - clonePos 224999719 39 2 24994 24994 2 2 1 14788 14788 1 1 - chainStrPur2Link 7948016 119841 2 24994 2646 58 2 1 14788 1323 29 1 - affyTxnPhase3HeLaNuclear 136797870 140244 2 24994 22601 17 2 1 14788 13590 9 1 - multiz28way 225928588 38 2 24994 24994 2 2 1 14788 14788 1 1 - ctgPos 224999719 39 2 24994 24994 2 2 1 14788 14788 1 1 - chainEquCab1 246306414 141 2 24994 24994 2 2 1 14788 14788 1 1 - netGalGal3 203351973 461 2 24994 7372 2 2 1 14788 3686 1 1 - phastCons28wayPlacMammal 221017670 22803 2 24994 24926 6 2 1 14788 14754 3 1 - -Where:: - - tableName is the name of the table - tableChromosomeCoverage is the number of positions existing in the table for only the chromosomes that were referenced by the interval file - tableChromosomeCount is the number of regions existing in the table for only the chromosomes that were referenced by the interval file - tableRegionCoverage is the number of positions existing in the table between the minimal and maximal bounding regions that were referenced by the interval file - tableRegionCount is the number of regions existing in the table between the minimal and maximal bounding regions that were referenced by the interval file - - allIntervalCount is the number of provided intervals - allIntervalSize is 
the sum of the lengths of the provided interval file - allCoverage is the sum of the coverage for each provided interval - allTableRegionsOverlapped is the sum of the number of regions of the table (non-unique) that were overlapped for each interval - allIntervalsOverlappingTable is the number of provided intervals which overlap the table - - nrIntervalCount is the number of non-redundant intervals - nrIntervalSize is the sum of the lengths of non-redundant intervals - nrCoverage is the sum of the coverage of non-redundant intervals - nrTableRegionsOverlapped is the number of regions of the table (unique) that were overlapped by the non-redundant intervals - nrIntervalsOverlappingTable is the number of non-redundant intervals which overlap the table - - -.. class:: infomark - -**TIP:** non-redundant (nr) refers to the set of intervals that remains after the intervals provided have been merged to resolve overlaps - - - diff -r c2a356708570 -r 33c067c3ae34 tools/annotation_profiler/annotation_profiler_for_interval.py --- a/tools/annotation_profiler/annotation_profiler_for_interval.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,360 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -#For a set of intervals, this tool returns the same set of intervals -#with 2 additional fields: the name of a Table/Feature and the number of -#bases covered. The original intervals are repeated for each Table/Feature. - -import sys, struct, optparse, os, random -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.intervals.io -import bx.bitset -try: - import psyco - psyco.full() -except: - pass - -assert sys.version_info[:2] >= ( 2, 4 ) - -class CachedRangesInFile: - DEFAULT_STRUCT_FORMAT = ' self._coverage[-1][1]: - return len( self._coverage ) - 1 - i = 0 - j = len( self._coverage) - 1 - while i < j: - k = ( i + j ) / 2 - if start <= self._coverage[k][1]: - j = k - else: - i = k + 1 - return i - def get_coverage( self, start, end ): - return self.get_coverage_regions_overlap( start, end )[0] - def get_coverage_regions_overlap( self, start, end ): - return self.get_coverage_regions_index_overlap( start, end )[0:2] - def get_coverage_regions_index_overlap( self, start, end ): - if len( self._coverage ) < 1 or start > self._coverage[-1][1] or end < self._coverage[0][0]: - return 0, 0, 0 - if self._total_coverage and start <= self._coverage[0][0] and end >= self._coverage[-1][1]: - return self._total_coverage, len( self._coverage ), 0 - coverage = 0 - region_count = 0 - start_index = self.get_start_index( start ) - for i in xrange( start_index, len( self._coverage ) ): - c_start, c_end = self._coverage[i] - if c_start > end: - break - if c_start <= end and c_end >= start: - coverage += min( end, c_end ) - max( start, c_start ) - region_count += 1 - return coverage, region_count, start_index - -class CachedCoverageReader: - def __init__( self, base_file_path, buffer = 10, table_names = None, profiler_info = None ): - self._base_file_path = base_file_path - self._buffer = buffer #number of chromosomes to keep in memory at a time - self._coverage = {} - if table_names is None: table_names = [ table_dir for table_dir in os.listdir( self._base_file_path ) if os.path.isdir( os.path.join( self._base_file_path, table_dir ) ) ] - for tablename in table_names: self._coverage[tablename] = {} - if profiler_info is None: profiler_info = {} - self._profiler_info = profiler_info - def iter_table_coverage_by_region( self, chrom, start, end ): - for 
tablename, coverage, regions in self.iter_table_coverage_regions_by_region( chrom, start, end ): - yield tablename, coverage - def iter_table_coverage_regions_by_region( self, chrom, start, end ): - for tablename, coverage, regions, index in self.iter_table_coverage_regions_index_by_region( chrom, start, end ): - yield tablename, coverage, regions - def iter_table_coverage_regions_index_by_region( self, chrom, start, end ): - for tablename, chromosomes in self._coverage.iteritems(): - if chrom not in chromosomes: - if len( chromosomes ) >= self._buffer: - #randomly remove one chromosome from this table - del chromosomes[ chromosomes.keys().pop( random.randint( 0, self._buffer - 1 ) ) ] - chromosomes[chrom] = RegionCoverage( os.path.join ( self._base_file_path, tablename, chrom ), self._profiler_info ) - coverage, regions, index = chromosomes[chrom].get_coverage_regions_index_overlap( start, end ) - yield tablename, coverage, regions, index - -class TableCoverageSummary: - def __init__( self, coverage_reader, chrom_lengths ): - self.coverage_reader = coverage_reader - self.chrom_lengths = chrom_lengths - self.chromosome_coverage = {} #dict of bitset by chromosome holding user's collapsed input intervals - self.total_interval_size = 0 #total size of user's input intervals - self.total_interval_count = 0 #total number of user's input intervals - self.table_coverage = {} #dict of total coverage by user's input intervals by table - self.table_chromosome_size = {} #dict of dict of table:chrom containing total coverage of table for a chrom - self.table_chromosome_count = {} #dict of dict of table:chrom containing total number of coverage ranges of table for a chrom - self.table_regions_overlaped_count = {} #total number of table regions overlaping user's input intervals (non unique) - self.interval_table_overlap_count = {} #total number of user input intervals which overlap table - self.region_size_errors = {} #dictionary of lists of invalid ranges by chromosome - def add_region( self, chrom, start, end ): - chrom_length = self.chrom_lengths.get( chrom ) - region_start = min( start, chrom_length ) - region_end = min( end, chrom_length ) - region_length = region_end - region_start - - if region_length < 1 or region_start != start or region_end != end: - if chrom not in self.region_size_errors: - self.region_size_errors[chrom] = [] - self.region_size_errors[chrom].append( ( start, end ) ) - if region_length < 1: return - - self.total_interval_size += region_length - self.total_interval_count += 1 - if chrom not in self.chromosome_coverage: - self.chromosome_coverage[chrom] = bx.bitset.BitSet( chrom_length ) - - self.chromosome_coverage[chrom].set_range( region_start, region_length ) - for table_name, coverage, regions in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, region_start, region_end ): - if table_name not in self.table_coverage: - self.table_coverage[table_name] = 0 - self.table_chromosome_size[table_name] = {} - self.table_regions_overlaped_count[table_name] = 0 - self.interval_table_overlap_count[table_name] = 0 - self.table_chromosome_count[table_name] = {} - if chrom not in self.table_chromosome_size[table_name]: - self.table_chromosome_size[table_name][chrom] = self.coverage_reader._coverage[table_name][chrom]._total_coverage - self.table_chromosome_count[table_name][chrom] = len( self.coverage_reader._coverage[table_name][chrom]._coverage ) - self.table_coverage[table_name] += coverage - if coverage: - self.interval_table_overlap_count[table_name] += 1 - 
self.table_regions_overlaped_count[table_name] += regions - def iter_table_coverage( self ): - def get_nr_coverage(): - #returns non-redundant coverage, where user's input intervals have been collapse to resolve overlaps - table_coverage = {} #dictionary of tables containing number of table bases overlaped by nr intervals - interval_table_overlap_count = {} #dictionary of tables containing number of nr intervals overlaping table - table_regions_overlap_count = {} #dictionary of tables containing number of regions overlaped (unique) - interval_count = 0 #total number of nr intervals - interval_size = 0 #holds total size of nr intervals - region_start_end = {} #holds absolute start,end for each user input chromosome - for chrom, chromosome_bitset in self.chromosome_coverage.iteritems(): - #loop through user's collapsed input intervals - end = 0 - last_end_index = {} - interval_size += chromosome_bitset.count_range() - while True: - if end >= chromosome_bitset.size: break - start = chromosome_bitset.next_set( end ) - if start >= chromosome_bitset.size: break - end = chromosome_bitset.next_clear( start ) - interval_count += 1 - if chrom not in region_start_end: - region_start_end[chrom] = [start, end] - else: - region_start_end[chrom][1] = end - for table_name, coverage, region_count, start_index in self.coverage_reader.iter_table_coverage_regions_index_by_region( chrom, start, end ): - if table_name not in table_coverage: - table_coverage[table_name] = 0 - interval_table_overlap_count[table_name] = 0 - table_regions_overlap_count[table_name] = 0 - table_coverage[table_name] += coverage - if coverage: - interval_table_overlap_count[table_name] += 1 - table_regions_overlap_count[table_name] += region_count - if table_name in last_end_index and last_end_index[table_name] == start_index: - table_regions_overlap_count[table_name] -= 1 - last_end_index[table_name] = start_index + region_count - 1 - table_region_coverage = {} #total coverage for tables by bounding nr interval region - table_region_count = {} #total number for tables by bounding nr interval region - for chrom, start_end in region_start_end.items(): - for table_name, coverage, region_count in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, start_end[0], start_end[1] ): - if table_name not in table_region_coverage: - table_region_coverage[table_name] = 0 - table_region_count[table_name] = 0 - table_region_coverage[table_name] += coverage - table_region_count[table_name] += region_count - return table_region_coverage, table_region_count, interval_count, interval_size, table_coverage, table_regions_overlap_count, interval_table_overlap_count - table_region_coverage, table_region_count, nr_interval_count, nr_interval_size, nr_table_coverage, nr_table_regions_overlap_count, nr_interval_table_overlap_count = get_nr_coverage() - for table_name in self.table_coverage: - #TODO: determine a type of statistic, then calculate and report here - yield table_name, sum( self.table_chromosome_size.get( table_name, {} ).values() ), sum( self.table_chromosome_count.get( table_name, {} ).values() ), table_region_coverage.get( table_name, 0 ), table_region_count.get( table_name, 0 ), self.total_interval_count, self.total_interval_size, self.table_coverage[table_name], self.table_regions_overlaped_count.get( table_name, 0), self.interval_table_overlap_count.get( table_name, 0 ), nr_interval_count, nr_interval_size, nr_table_coverage[table_name], nr_table_regions_overlap_count.get( table_name, 0 ), nr_interval_table_overlap_count.get( 
table_name, 0 ) - -def profile_per_interval( interval_filename, chrom_col, start_col, end_col, out_filename, keep_empty, coverage_reader ): - out = open( out_filename, 'wb' ) - for region in bx.intervals.io.NiceReaderWrapper( open( interval_filename, 'rb' ), chrom_col = chrom_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ): - for table_name, coverage, region_count in coverage_reader.iter_table_coverage_regions_by_region( region.chrom, region.start, region.end ): - if keep_empty or coverage: - #only output regions that have atleast 1 base covered unless empty are requested - out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), table_name, coverage, region_count ) ) - out.close() - -def profile_summary( interval_filename, chrom_col, start_col, end_col, out_filename, keep_empty, coverage_reader, chrom_lengths ): - out = open( out_filename, 'wb' ) - table_coverage_summary = TableCoverageSummary( coverage_reader, chrom_lengths ) - for region in bx.intervals.io.NiceReaderWrapper( open( interval_filename, 'rb' ), chrom_col = chrom_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ): - table_coverage_summary.add_region( region.chrom, region.start, region.end ) - - out.write( "#tableName\ttableChromosomeCoverage\ttableChromosomeCount\ttableRegionCoverage\ttableRegionCount\tallIntervalCount\tallIntervalSize\tallCoverage\tallTableRegionsOverlaped\tallIntervalsOverlapingTable\tnrIntervalCount\tnrIntervalSize\tnrCoverage\tnrTableRegionsOverlaped\tnrIntervalsOverlapingTable\n" ) - for table_name, table_chromosome_size, table_chromosome_count, table_region_coverage, table_region_count, total_interval_count, total_interval_size, total_coverage, table_regions_overlaped_count, interval_region_overlap_count, nr_interval_count, nr_interval_size, nr_coverage, nr_table_regions_overlaped_count, nr_interval_table_overlap_count in table_coverage_summary.iter_table_coverage(): - if keep_empty or total_coverage: - #only output tables that have atleast 1 base covered unless empty are requested - out.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( table_name, table_chromosome_size, table_chromosome_count, table_region_coverage, table_region_count, total_interval_count, total_interval_size, total_coverage, table_regions_overlaped_count, interval_region_overlap_count, nr_interval_count, nr_interval_size, nr_coverage, nr_table_regions_overlaped_count, nr_interval_table_overlap_count ) ) - out.close() - - #report chrom size errors as needed: - if table_coverage_summary.region_size_errors: - print "Regions provided extended beyond known chromosome lengths, and have been truncated as necessary, for the following intervals:" - for chrom, regions in table_coverage_summary.region_size_errors.items(): - if len( regions ) > 3: - extra_region_info = ", ... " - else: - extra_region_info = "" - print "%s has max length of %s, exceeded by %s%s." 
% ( chrom, chrom_lengths.get( chrom ), ", ".join( map( str, regions[:3] ) ), extra_region_info ) - -class ChromosomeLengths: - def __init__( self, profiler_info ): - self.chroms = {} - self.default_bitset_size = int( profiler_info.get( 'bitset_size', bx.bitset.MAX ) ) - chroms = profiler_info.get( 'chromosomes', None ) - if chroms: - for chrom in chroms.split( ',' ): - for fields in chrom.rsplit( '=', 1 ): - if len( fields ) == 2: - self.chroms[ fields[0] ] = int( fields[1] ) - else: - self.chroms[ fields[0] ] = self.default_bitset_size - def get( self, name ): - return self.chroms.get( name, self.default_bitset_size ) - -def parse_profiler_info( filename ): - profiler_info = {} - try: - for line in open( filename ): - fields = line.rstrip( '\n\r' ).split( '\t', 1 ) - if len( fields ) == 2: - if fields[0] in profiler_info: - if not isinstance( profiler_info[ fields[0] ], list ): - profiler_info[ fields[0] ] = [ profiler_info[ fields[0] ] ] - profiler_info[ fields[0] ].append( fields[1] ) - else: - profiler_info[ fields[0] ] = fields[1] - except: - pass #likely missing file - return profiler_info - -def __main__(): - parser = optparse.OptionParser() - parser.add_option( - '-k','--keep_empty', - action="store_true", - dest='keep_empty', - default=False, - help='Keep tables with 0 coverage' - ) - parser.add_option( - '-b','--buffer', - dest='buffer', - type='int',default=10, - help='Number of Chromosomes to keep buffered' - ) - parser.add_option( - '-c','--chrom_col', - dest='chrom_col', - type='int',default=1, - help='Chromosome column' - ) - parser.add_option( - '-s','--start_col', - dest='start_col', - type='int',default=2, - help='Start Column' - ) - parser.add_option( - '-e','--end_col', - dest='end_col', - type='int',default=3, - help='End Column' - ) - parser.add_option( - '-p','--path', - dest='path', - type='str',default='/galaxy/data/annotation_profiler/hg18', - help='Path to profiled data for this organism' - ) - parser.add_option( - '-t','--table_names', - dest='table_names', - type='str',default='None', - help='Table names requested' - ) - parser.add_option( - '-i','--input', - dest='interval_filename', - type='str', - help='Input Interval File' - ) - parser.add_option( - '-o','--output', - dest='out_filename', - type='str', - help='Input Interval File' - ) - parser.add_option( - '-S','--summary', - action="store_true", - dest='summary', - default=False, - help='Display Summary Results' - ) - - options, args = parser.parse_args() - - assert os.path.isdir( options.path ), IOError( "Configuration error: Table directory is missing (%s)" % options.path ) - - #get profiler_info - profiler_info = parse_profiler_info( os.path.join( options.path, 'profiler_info.txt' ) ) - - table_names = options.table_names.split( "," ) - if table_names == ['None']: table_names = None - coverage_reader = CachedCoverageReader( options.path, buffer = options.buffer, table_names = table_names, profiler_info = profiler_info ) - - if options.summary: - profile_summary( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader, ChromosomeLengths( profiler_info ) ) - else: - profile_per_interval( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader ) - - #print out data version info - print 'Data version (%s:%s:%s)' % ( profiler_info.get( 'dbkey', 'unknown' ), profiler_info.get( 'profiler_hash', 'unknown' ), 
profiler_info.get( 'dump_time', 'unknown' ) ) - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/bedtools/._bedToBam.xml Binary file tools/bedtools/._bedToBam.xml has changed diff -r c2a356708570 -r 33c067c3ae34 tools/bedtools/bedToBam.xml --- a/tools/bedtools/bedToBam.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ - - convert BED or GFF or VCF to BAM - bedToBam -i $input -g $genome $bed12 $mapq $ubam > $outfile - - - - - - - - - - - - - - - - - -**What it does** - -Program: bedToBam (v2.13.3) -Author: Aaron Quinlan (aaronquinlan@gmail.com) -Summary: Converts feature records to BAM format. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_destination/epigraph.xml --- a/tools/data_destination/epigraph.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ - - - and prediction with EpiGRAPH - GENOME=${input1.dbkey} NAME=${input1.name} INFO=${input1.info} - - - - - - - - - - - -.. class:: warningmark - -After clicking the **Execute** button, you will be redirected to the EpiGRAPH website. Please be patient while the dataset is being imported. Inside EpiGRAPH, buttons are available to send the results of the EpiGRAPH analysis back to Galaxy. In addition, you can always abandon an EpiGRAPH session and return to Galaxy by directing your browser to your current Galaxy instance. - ------ - -.. class:: infomark - -**What it does** - -This tool sends the selected dataset to EpiGRAPH in order to perform an in-depth analysis with statistical and machine learning methods. - ------ - -.. class:: infomark - -**EpiGRAPH outline** - -The EpiGRAPH_ web service enables biologists to uncover hidden associations in vertebrate genome and epigenome datasets. Users can upload or import sets of genomic regions and EpiGRAPH will test a wide range of attributes (including DNA sequence and structure, gene density, chromatin modifications and evolutionary conservation) for enrichment or depletion among these regions. Furthermore, EpiGRAPH learns to predictively identify genomic regions that exhibit similar properties. - -.. _EpiGRAPH: http://epigraph.mpi-inf.mpg.de/ - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_destination/epigraph_test.xml --- a/tools/data_destination/epigraph_test.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ - - - and prediction with EpiGRAPH Test - GENOME=${input1.dbkey} NAME=${input1.name} INFO=${input1.info} - - - - - - - - - - - -.. class:: warningmark - -After clicking the **Execute** button, you will be redirected to the EpiGRAPH test website. Please be patient while the dataset is being imported. Inside EpiGRAPH, buttons are available to send the results of the EpiGRAPH analysis back to Galaxy. In addition, you can always abandon an EpiGRAPH session and return to Galaxy by directing your browser to your current Galaxy instance. - ------ - -.. class:: infomark - -**What it does** - -This tool sends the selected dataset to EpiGRAPH in order to perform an in-depth analysis with statistical and machine learning methods. - ------ - -.. class:: infomark - -**EpiGRAPH outline** - -The EpiGRAPH_ web service enables biologists to uncover hidden associations in vertebrate genome and epigenome datasets. 
Users can upload or import sets of genomic regions and EpiGRAPH will test a wide range of attributes (including DNA sequence and structure, gene density, chromatin modifications and evolutionary conservation) for enrichment or depletion among these regions. Furthermore, EpiGRAPH learns to predictively identify genomic regions that exhibit similar properties. - -.. _EpiGRAPH: http://epigraph.mpi-inf.mpg.de/ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/access_libraries.xml --- a/tools/data_source/access_libraries.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ - - - stored locally - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/bed_convert.xml --- a/tools/data_source/bed_convert.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ - - creates a bed or xbed file containing from text query - noop - - creates a bed or xbed file containing user assigned input of $input - - - - - - - User specifies delimiter, header information, and column assignments and the file will be converted to BED or xBED. - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/biomart.xml --- a/tools/data_source/biomart.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - - - Central server - data_source.py $output $__app__.config.output_size_limit - - go to BioMart Central $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/biomart_test.xml --- a/tools/data_source/biomart_test.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - - - Test server - data_source.py $output $__app__.config.output_size_limit - - go to BioMart Central $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/bx_browser.xml --- a/tools/data_source/bx_browser.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ - - - - browser - data_source.py $output $__app__.config.output_size_limit - - go to BX Browser $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/cbi_rice_mart.xml --- a/tools/data_source/cbi_rice_mart.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ - - - - rice mart - data_source.py $output $__app__.config.output_size_limit - - go to RMap rice mart $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/data_source.py --- a/tools/data_source/data_source.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,110 +0,0 @@ -#!/usr/bin/env python -# Retrieves data from external data source applications and stores in a dataset file. -# Data source application parameters are temporarily stored in the dataset file. -import socket, urllib, sys, os -from galaxy import eggs #eggs needs to be imported so that galaxy.util can find docutils egg... 
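A note on data_source.py, which begins here: as its own comment explains further down, Python 2's urllib exposes no per-request timeout, so the script sets a process-wide socket default before fetching anything to avoid hanging on a dead server. A condensed sketch of that pattern (Python 2; the URL is a placeholder, not from the tool)::

    import socket
    import urllib

    # Every socket created after this call inherits the 600-second timeout,
    # including those opened internally by urllib.urlopen().
    socket.setdefaulttimeout(600)

    page = urllib.urlopen('http://example.org/dataset')  # placeholder URL
    data = page.read()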
-from galaxy.util.json import from_json_string, to_json_string -import galaxy.model # need to import model before sniff to resolve a circular import dependency -from galaxy.datatypes import sniff -from galaxy.datatypes.registry import Registry -from galaxy.jobs import TOOL_PROVIDED_JOB_METADATA_FILE - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -GALAXY_PARAM_PREFIX = 'GALAXY' -GALAXY_ROOT_DIR = os.path.realpath( os.path.join( os.path.split( os.path.realpath( __file__ ) )[0], '..', '..' ) ) -GALAXY_DATATYPES_CONF_FILE = os.path.join( GALAXY_ROOT_DIR, 'datatypes_conf.xml' ) - -def load_input_parameters( filename, erase_file = True ): - datasource_params = {} - try: - json_params = from_json_string( open( filename, 'r' ).read() ) - datasource_params = json_params.get( 'param_dict' ) - except: - json_params = None - for line in open( filename, 'r' ): - try: - line = line.strip() - fields = line.split( '\t' ) - datasource_params[ fields[0] ] = fields[1] - except: - continue - if erase_file: - open( filename, 'w' ).close() #open file for writing, then close, removes params from file - return json_params, datasource_params - -def __main__(): - filename = sys.argv[1] - try: - max_file_size = int( sys.argv[2] ) - except: - max_file_size = 0 - - job_params, params = load_input_parameters( filename ) - if job_params is None: #using an older tabular file - enhanced_handling = False - job_params = dict( param_dict = params ) - job_params[ 'output_data' ] = [ dict( out_data_name = 'output', - ext = 'data', - file_name = filename, - extra_files_path = None ) ] - job_params[ 'job_config' ] = dict( GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE = TOOL_PROVIDED_JOB_METADATA_FILE ) - else: - enhanced_handling = True - json_file = open( job_params[ 'job_config' ][ 'TOOL_PROVIDED_JOB_METADATA_FILE' ], 'w' ) #specially named file for output junk to pass onto set metadata - - datatypes_registry = Registry( root_dir = job_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config = job_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] ) - - URL = params.get( 'URL', None ) #using exactly URL indicates that only one dataset is being downloaded - URL_method = params.get( 'URL_method', None ) - - # The Python support for fetching resources from the web is layered. urllib uses the httplib - # library, which in turn uses the socket library. As of Python 2.3 you can specify how long - # a socket should wait for a response before timing out. By default the socket module has no - # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2 - # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by - # doing the following. - socket.setdefaulttimeout( 600 ) - - for data_dict in job_params[ 'output_data' ]: - cur_filename = data_dict.get( 'file_name', filename ) - cur_URL = params.get( '%s|%s|URL' % ( GALAXY_PARAM_PREFIX, data_dict[ 'out_data_name' ] ), URL ) - if not cur_URL: - open( cur_filename, 'w' ).write( "" ) - stop_err( 'The remote data source application has not sent back a URL parameter in the request.' 
) - - # The following calls to urllib.urlopen() will use the above default timeout - try: - if not URL_method or URL_method == 'get': - page = urllib.urlopen( cur_URL ) - elif URL_method == 'post': - page = urllib.urlopen( cur_URL, urllib.urlencode( params ) ) - except Exception, e: - stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) ) - if max_file_size: - file_size = int( page.info().get( 'Content-Length', 0 ) ) - if file_size > max_file_size: - stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, max_file_size ) ) - #do sniff stream for multi_byte - try: - cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( cur_filename, os.O_WRONLY | os.O_CREAT ), cur_filename ) - except Exception, e: - stop_err( 'Unable to fetch %s:\n%s' % ( cur_URL, e ) ) - - #here import checks that upload tool performs - if enhanced_handling: - try: - ext = sniff.handle_uploaded_dataset_file( filename, datatypes_registry, ext = data_dict[ 'ext' ], is_multi_byte = is_multi_byte ) - except Exception, e: - stop_err( str( e ) ) - info = dict( type = 'dataset', - dataset_id = data_dict[ 'dataset_id' ], - ext = ext) - - json_file.write( "%s\n" % to_json_string( info ) ) - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/echo.py --- a/tools/data_source/echo.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ -#!/usr/bin/env python - -""" -Script that just echos the command line. -""" - -import sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -print '-' * 20, "
" -for elem in sys.argv: - print elem, "
" -print '-' * 20, "
" \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/echo.xml --- a/tools/data_source/echo.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ - - - - - - echoes parameters - - - echo.py $input $database $output - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/encode_db.xml --- a/tools/data_source/encode_db.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ - - - - - - at NHGRI - - - - fetch.py "$url" $output - - - - - go to EncodeDB $GALAXY_URL - - - - - - - - - - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/epigraph_import.xml --- a/tools/data_source/epigraph_import.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - - - server - data_source.py $output $__app__.config.output_size_limit - - go to EpiGRAPH server $GALAXY_URL - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/epigraph_import_test.xml --- a/tools/data_source/epigraph_import_test.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - - - test server - data_source.py $output $__app__.config.output_size_limit - - go to EpiGRAPH server $GALAXY_URL - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/eupathdb.xml --- a/tools/data_source/eupathdb.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ - - server - data_source.py $output $__app__.config.output_size_limit - - go to EuPathDB server $GALAXY_URL - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/fetch.py --- a/tools/data_source/fetch.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -#!/usr/bin/env python - -""" -Script that just echos the command line. 
-""" - -import sys, os, urllib - -assert sys.version_info[:2] >= ( 2, 4 ) - -BUFFER = 1048576 - -url = sys.argv[1] -out_name = sys.argv[2] - -out = open(out_name, 'wt') -try: - page = urllib.urlopen(url) - while 1: - data = page.read(BUFFER) - if not data: - break - out.write(data) -except Exception, e: - print 'Error getting the data -> %s' % e -out.close() diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/fly_modencode.xml --- a/tools/data_source/fly_modencode.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ - - - server - data_source.py $output $__app__.config.output_size_limit - - go to modENCODE fly server $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/flymine.xml --- a/tools/data_source/flymine.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ - - - - server - data_source.py $output $__app__.config.output_size_limit - - go to Flymine server $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/flymine_test.xml --- a/tools/data_source/flymine_test.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ - - - - server - data_source.py $output $__app__.config.output_size_limit - - go to Flymine server $GALAXY_URL - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/genbank.py --- a/tools/data_source/genbank.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -#!/usr/bin/env python -from Bio import GenBank -import sys, os, textwrap - -assert sys.version_info[:2] >= ( 2, 4 ) - -def make_fasta(rec): - '''Creates fasta format from a record''' - gi = rec.annotations.get('gi','') - org = rec.annotations.get('organism','') - date = rec.annotations.get('date','') - head = '>gi:%s, id:%s, org:%s, date:%s\n' % (gi, rec.id, org, date) - body = '\n'.join(textwrap.wrap(rec.seq.data, width=80)) - return head, body - -if __name__ == '__main__': - - mode = sys.argv[1] - text = sys.argv[2] - output_file = sys.argv[3] - - print 'Searching for %s
' % text - - # check if inputs are all numbers - try: - gi_list = text.split() - tmp = map(int, gi_list) - except ValueError: - gi_list = GenBank.search_for(text, max_ids=10) - - fp = open(output_file, 'wt') - record_parser = GenBank.FeatureParser() - ncbi_dict = GenBank.NCBIDictionary(mode, 'genbank', parser = record_parser) - for gid in gi_list: - res = ncbi_dict[gid] - head, body = make_fasta(res) - fp.write(head+body+'\n') - print head - fp.close() - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/genbank.xml --- a/tools/data_source/genbank.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ - - - genbank.py $mode "$text" $output - - - - - - - - - - - - - - -At the moment this tool allows the following simple searches: - -- by GI: **51594135** -- by accession: **CF622840** -- using text: **human hbb1** (this feature is experimental) - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/gramene_mart.xml --- a/tools/data_source/gramene_mart.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - - - Central server - data_source.py $output $__app__.config.output_size_limit - - go to GrameneMart Central $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/hapmapmart.xml --- a/tools/data_source/hapmapmart.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ - - - - HapMap Biomart - data_source.py $output $__app__.config.output_size_limit - - go to HapMap BioMart $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/hbvar.xml --- a/tools/data_source/hbvar.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ - - - - Human Hemoglobin Variants and Thalassemias - - - - - go to HbVar database $GALAXY_URL $tool_id - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/hbvar_filter.py --- a/tools/data_source/hbvar_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ -#TODO: Set dbkey to proper UCSC build, if known -import urllib - -from galaxy import datatypes, config -import tempfile, shutil - -def exec_before_job( app, inp_data, out_data, param_dict, tool=None): - """Sets the name of the data""" - data_name = param_dict.get( 'name', 'HbVar query' ) - data_type = param_dict.get( 'type', 'txt' ) - if data_type == 'txt': data_type='interval' #All data is TSV, assume interval - name, data = out_data.items()[0] - data = app.datatypes_registry.change_datatype(data, data_type) - data.name = data_name - out_data[name] = data - -def exec_after_process(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None): - """Verifies the data after the run""" - - URL = param_dict.get( 'URL', None ) - URL = URL + '&_export=1&GALAXY_URL=0' - if not URL: - raise Exception('Datasource has not sent back a URL parameter') - - CHUNK_SIZE = 2**20 # 1Mb - MAX_SIZE = CHUNK_SIZE * 100 - - try: - page = urllib.urlopen(URL) - except Exception, exc: - raise Exception('Problems connecting to %s (%s)' % (URL, exc) ) - - name, data = out_data.items()[0] - - fp = open(data.file_name, 'wb') - size = 0 - while 1: - chunk = page.read(CHUNK_SIZE) - if not chunk: - break - if size > MAX_SIZE: - raise Exception('----- maximum datasize exceeded ---') - size += len(chunk) - fp.write(chunk) 
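The download loop above streams the response in 1 MB chunks and aborts once the hard MAX_SIZE cap is exceeded, rather than trusting a Content-Length header. A self-contained sketch of the same chunked-read-with-limit pattern (Python 2; URL and output path are placeholders, not from the tool)::

    import urllib

    CHUNK_SIZE = 2 ** 20           # 1 MB per read
    MAX_SIZE = CHUNK_SIZE * 100    # refuse anything over ~100 MB

    page = urllib.urlopen('http://example.org/query')  # placeholder URL
    fp = open('result.dat', 'wb')                      # placeholder path
    size = 0
    while 1:
        chunk = page.read(CHUNK_SIZE)
        if not chunk:
            break
        if size > MAX_SIZE:
            fp.close()
            raise Exception('maximum datasize exceeded')
        size += len(chunk)
        fp.write(chunk)
    fp.close()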
- - fp.close() - #Set meta data, format file to be valid interval type - if isinstance(data.datatype, datatypes.interval.Interval): - data.set_meta(first_line_is_header=True) - #check for missing meta data, if all there, comment first line and process file - if not data.missing_meta(): - line_ctr = -1 - temp = tempfile.NamedTemporaryFile('w') - temp_filename = temp.name - temp.close() - temp = open(temp_filename,'w') - chromCol = int(data.metadata.chromCol) - 1 - startCol = int(data.metadata.startCol) - 1 - strandCol = int(data.metadata.strandCol) - 1 - - - for line in open(data.file_name, 'r'): - line_ctr += 1 - - fields = line.strip().split('\t') - - temp.write("%s\n" % '\t'.join(fields)) - - temp.close() - shutil.move(temp_filename,data.file_name) - - else: - data = app.datatypes_registry.change_datatype(data, 'tabular') - data.set_size() - data.set_peek() - app.model.context.add( data ) - app.model.context.flush() diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/import.py --- a/tools/data_source/import.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -#!/usr/bin/env python - -""" -Script that imports locally stored data as a new dataset for the user -Usage: import id outputfile -""" -import sys, os - -assert sys.version_info[:2] >= ( 2, 4 ) - -BUFFER = 1048576 - -dataid = sys.argv[1] -out_name = sys.argv[2] - - -id2name = { - 'eryth' : 'ErythPreCRMmm3_cusTrk.txt', - 'cishg16' : 'ReglRegHBBhg16CusTrk.txt', - 'cishg17' : 'ReglRegHBBhg17CusTrk.txt', - 'exons' : 'ExonsKnownGenes_mm3.txt', - 'krhg16' : 'known_regulatory_hg16.bed', - 'krhg17' : 'known_regulatory_hg17.bed', - 'tARhg16mmc' : 'hg16.mouse.t_AR.cold.bed', - 'tARhg16mmm' : 'hg16.mouse.t_AR.medium.bed', - 'tARhg16mmh' : 'hg16.mouse.t_AR.hot.bed', - 'tARhg16rnc' : 'hg16.rat.t_AR.cold.bed', - 'tARhg16rnm' : 'hg16.rat.t_AR.medium.bed', - 'tARhg16rnh' : 'hg16.rat.t_AR.hot.bed', - 'phastConsHg16' : 'phastConsMost_hg16.bed', - 'omimhg16' : 'omimDisorders_hg16.tab', - 'omimhg17' : 'omimDisorders_hg17.tab', - -} - -fname = id2name.get(dataid, '') -if not fname: - print 'Importing invalid data %s' % dataid - sys.exit() -else: - print 'Imported %s' % fname - -# this path is hardcoded -inp_name = os.path.join('database', 'import', fname) - -try: - inp = open(inp_name, 'rt') -except: - print 'Could not find file %s' % inp_name - sys.exit() - -out = open(out_name, 'wt') - -while 1: - data = inp.read(BUFFER) - if not data: - break - out.write(data) - -inp.close() -out.close() diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/import.xml --- a/tools/data_source/import.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ - - (PSU prepared queries) - import.py $data $output - - $data - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/metabolicmine.xml --- a/tools/data_source/metabolicmine.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ - - - server - data_source.py $output $__app__.config.output_size_limit - - go to metabolicMine server $GALAXY_URL - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/microbial_import.py --- a/tools/data_source/microbial_import.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,86 +0,0 @@ -#!/usr/bin/env python - -""" -Script that imports locally stored data as a new dataset for the user -Usage: import id outputfile -""" -import sys, os -from shutil import copyfile - 
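The configuration parsing further down in this script reads tab-separated records out of microbial_data.loc. A minimal sketch, not the tool's code, of how one DATA record unpacks into the fields the script uses (the sample line comes from a comment in microbial_import_code.py later in this patch)::

    line = "DATA\t12521_12521_CDS\t12521\tCP000315\tCDS\tbed\t/home/djb396/alignments/playground/bacteria/12521/CP000315.CDS.bed"
    fields = line.rstrip('\r\n').split('\t')
    info_type, uid, org_num, chr_acc, feature, filetype, path = fields
    # microbial_import.py keys its lookup table on uid, storing
    # (description, path, build, file_type, chr_acc):
    available_files = {uid: (uid, path, org_num, filetype, chr_acc)}
    print(available_files['12521_12521_CDS'])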
-assert sys.version_info[:2] >= ( 2, 4 ) - -BUFFER = 1048576 - -uids = sys.argv[1].split(",") -out_file1 = sys.argv[2] - -#remove NONE from uids -have_none = True -while have_none: - try: - uids.remove('None') - except: - have_none = False - - -#create dictionary keyed by uid of tuples of (displayName,filePath,build) for all files -available_files = {} -try: - filename = sys.argv[-1] - for i, line in enumerate( file( filename ) ): - if not line or line[0:1] == "#" : continue - fields = line.split('\t') - try: - info_type = fields.pop(0) - - if info_type.upper()=="DATA": - uid = fields.pop(0) - org_num = fields.pop(0) - chr_acc = fields.pop(0) - feature = fields.pop(0) - filetype = fields.pop(0) - path = fields.pop(0).replace("\r","").replace("\n","") - - file_type = filetype - build = org_num - description = uid - else: - continue - except: - continue - - available_files[uid]=(description,path,build,file_type,chr_acc) -except: - print >>sys.stderr, "It appears that the configuration file for this tool is missing." - -#create list of tuples of (displayName,FileName,build) for desired files -desired_files = [] -for uid in uids: - try: - desired_files.append(available_files[uid]) - except: - continue - -#copy first file to contents of given output file -file1_copied = False -while not file1_copied: - try: - first_file = desired_files.pop(0) - except: - print >>sys.stderr, "There were no valid files requested." - sys.exit() - file1_desc, file1_path, file1_build, file1_type,file1_chr_acc = first_file - try: - copyfile(file1_path,out_file1) - print "#File1\t"+file1_desc+"\t"+file1_chr_acc+"\t"+file1_build+"\t"+file1_type - file1_copied = True - except: - print >>sys.stderr, "The file specified is missing." - continue - #print >>sys.stderr, "The file specified is missing." - - -#Tell post-process filter where remaining files reside -for extra_output in desired_files: - file_desc, file_path, file_build, file_type,file_chr_acc = extra_output - print "#NewFile\t"+file_desc+"\t"+file_chr_acc+"\t"+file_build+"\t"+file_path+"\t"+file_type diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/microbial_import.xml --- a/tools/data_source/microbial_import.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ - - microbial_import.py $CDS,$tRNA,$rRNA,$sequence,$GeneMark,$GeneMarkHMM,$Glimmer3 $output ${GALAXY_DATA_INDEX_DIR}/microbial_data.loc - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool will allow you to obtain various genomic datasets for any completed Microbial Genome Project as listed at NCBI_. - -.. _NCBI: http://www.ncbi.nlm.nih.gov/genomes/lproks.cgi?view=1 - -Current datasets available include - 1. CDS - 2. tRNA - 3. rRNA - 4. FASTA Sequences - 5. GeneMark Annotations - 6. GeneMarkHMM Annotations - 7. Glimmer3 Annotations - ------ - -Organisms in **bold** are available at the UCSC Browser. - ------ - -.. class:: infomark - -**Note:** Having trouble locating your organism? Click here_ for a list of available species and their location. - -.. 
_here: http://wiki.g2.bx.psu.edu/Main/Data%20Libraries/Microbes - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/microbial_import_code.py --- a/tools/data_source/microbial_import_code.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,154 +0,0 @@ - -def load_microbial_data( GALAXY_DATA_INDEX_DIR, sep='\t' ): - # FIXME: this function is duplicated in the DynamicOptions class. It is used here only to - # set data.name in exec_after_process(). - microbe_info= {} - orgs = {} - - filename = "%s/microbial_data.loc" % GALAXY_DATA_INDEX_DIR - for i, line in enumerate( open( filename ) ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - fields = line.split( sep ) - #read each line, if not enough fields, go to next line - try: - info_type = fields.pop(0) - if info_type.upper() == "ORG": - #ORG 12521 Clostridium perfringens SM101 bacteria Firmicutes CP000312,CP000313,CP000314,CP000315 http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids=12521 - org_num = fields.pop(0) - name = fields.pop(0) - kingdom = fields.pop(0) - group = fields.pop(0) - chromosomes = fields.pop(0) - info_url = fields.pop(0) - link_site = fields.pop(0) - if org_num not in orgs: - orgs[ org_num ] = {} - orgs[ org_num ][ 'chrs' ] = {} - orgs[ org_num ][ 'name' ] = name - orgs[ org_num ][ 'kingdom' ] = kingdom - orgs[ org_num ][ 'group' ] = group - orgs[ org_num ][ 'chromosomes' ] = chromosomes - orgs[ org_num ][ 'info_url' ] = info_url - orgs[ org_num ][ 'link_site' ] = link_site - elif info_type.upper() == "CHR": - #CHR 12521 CP000315 Clostridium perfringens phage phiSM101, complete genome 38092 110684521 CP000315.1 - org_num = fields.pop(0) - chr_acc = fields.pop(0) - name = fields.pop(0) - length = fields.pop(0) - gi = fields.pop(0) - gb = fields.pop(0) - info_url = fields.pop(0) - chr = {} - chr[ 'name' ] = name - chr[ 'length' ] = length - chr[ 'gi' ] = gi - chr[ 'gb' ] = gb - chr[ 'info_url' ] = info_url - if org_num not in orgs: - orgs[ org_num ] = {} - orgs[ org_num ][ 'chrs' ] = {} - orgs[ org_num ][ 'chrs' ][ chr_acc ] = chr - elif info_type.upper() == "DATA": - #DATA 12521_12521_CDS 12521 CP000315 CDS bed /home/djb396/alignments/playground/bacteria/12521/CP000315.CDS.bed - uid = fields.pop(0) - org_num = fields.pop(0) - chr_acc = fields.pop(0) - feature = fields.pop(0) - filetype = fields.pop(0) - path = fields.pop(0) - data = {} - data[ 'filetype' ] = filetype - data[ 'path' ] = path - data[ 'feature' ] = feature - - if org_num not in orgs: - orgs[ org_num ] = {} - orgs[ org_num ][ 'chrs' ] = {} - if 'data' not in orgs[ org_num ][ 'chrs' ][ chr_acc ]: - orgs[ org_num ][ 'chrs' ][ chr_acc ][ 'data' ] = {} - orgs[ org_num ][ 'chrs' ][ chr_acc ][ 'data' ][ uid ] = data - else: continue - except: continue - for org_num in orgs: - org = orgs[ org_num ] - if org[ 'kingdom' ] not in microbe_info: - microbe_info[ org[ 'kingdom' ] ] = {} - if org_num not in microbe_info[ org[ 'kingdom' ] ]: - microbe_info[ org[ 'kingdom' ] ][org_num] = org - return microbe_info - -#post processing, set build for data and add additional data to history -from galaxy import datatypes, config, jobs, tools -from shutil import copyfile - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - base_dataset = out_data.items()[0][1] - history = base_dataset.history - if history == None: - print "unknown history!" 
- return - kingdom = param_dict.get( 'kingdom', None ) - #group = param_dict.get( 'group', None ) - org = param_dict.get( 'org', None ) - - #if not (kingdom or group or org): - if not (kingdom or org): - print "Parameters are not available." - #workflow passes galaxy.tools.parameters.basic.UnvalidatedValue instead of values - if isinstance( kingdom, tools.parameters.basic.UnvalidatedValue ): - kingdom = kingdom.value - if isinstance( org, tools.parameters.basic.UnvalidatedValue ): - org = org.value - - GALAXY_DATA_INDEX_DIR = app.config.tool_data_path - microbe_info = load_microbial_data( GALAXY_DATA_INDEX_DIR, sep='\t' ) - new_stdout = "" - split_stdout = stdout.split("\n") - basic_name = "" - for line in split_stdout: - fields = line.split("\t") - if fields[0] == "#File1": - description = fields[1] - chr = fields[2] - dbkey = fields[3] - file_type = fields[4] - name, data = out_data.items()[0] - data.set_size() - basic_name = data.name - data.name = data.name + " (" + microbe_info[kingdom][org]['chrs'][chr]['data'][description]['feature'] +" for " + microbe_info[kingdom][org]['name'] + ":" + chr + ")" - data.dbkey = dbkey - data.info = data.name - data = app.datatypes_registry.change_datatype( data, file_type ) - data.init_meta() - data.set_peek() - app.model.context.add( data ) - app.model.context.flush() - elif fields[0] == "#NewFile": - description = fields[1] - chr = fields[2] - dbkey = fields[3] - filepath = fields[4] - file_type = fields[5] - newdata = app.model.HistoryDatasetAssociation( create_dataset = True, sa_session = app.model.context ) #This import should become a library - newdata.set_size() - newdata.extension = file_type - newdata.name = basic_name + " (" + microbe_info[kingdom][org]['chrs'][chr]['data'][description]['feature'] +" for "+microbe_info[kingdom][org]['name']+":"+chr + ")" - app.model.context.add( newdata ) - app.model.context.flush() - app.security_agent.copy_dataset_permissions( base_dataset.dataset, newdata.dataset ) - history.add_dataset( newdata ) - app.model.context.add( history ) - app.model.context.flush() - try: - copyfile(filepath,newdata.file_name) - newdata.info = newdata.name - newdata.state = jobs.JOB_OK - except: - newdata.info = "The requested file is missing from the system." 
- newdata.state = jobs.JOB_ERROR - newdata.dbkey = dbkey - newdata.init_meta() - newdata.set_peek() - app.model.context.flush() diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/modmine.xml --- a/tools/data_source/modmine.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ - - - - server - data_source.py $output $__app__.config.output_size_limit - - go to modENCODE modMine server $GALAXY_URL - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ratmine.xml --- a/tools/data_source/ratmine.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,34 +0,0 @@ - - - - server - data_source.py $output $__app__.config.output_size_limit - - go to Ratmine server $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_archaea.xml --- a/tools/data_source/ucsc_archaea.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ - - - - table browser - - - ucsc_proxy.py $param_file $output - - - - go to UCSC $init $hgta_outputType - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_filter.py --- a/tools/data_source/ucsc_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ -# runs after the job (and after the default post-filter) -from galaxy import datatypes, jobs - -def validate(incoming): - """Validator""" - #raise Exception, 'not quite right' - pass - -def exec_before_job( app, inp_data, out_data, param_dict, tool=None): - """Sets the name of the data""" - outputType = param_dict.get( 'hgta_outputType', None ) - if isinstance(outputType, list) and len(outputType)>0: outputType = outputType[-1] - items = out_data.items() - - for name, data in items: - data.name = param_dict.get('display', data.name) - data.dbkey = param_dict.get('dbkey', '???') - - if outputType == 'wigData': - ext = "wig" - elif outputType == 'maf': - ext = "maf" - elif outputType == 'gff': - ext = "gff" - elif outputType == 'gff3': - ext = "gff3" - else: - if 'hgta_doPrintSelectedFields' in param_dict: - ext = "interval" - elif 'hgta_doGetBed' in param_dict: - ext = "bed" - elif 'hgta_doGenomicDna' in param_dict: - ext = "fasta" - elif 'hgta_doGenePredSequence' in param_dict: - ext = "fasta" - else: - ext = "interval" - - data = app.datatypes_registry.change_datatype(data, ext) - out_data[name] = data - -def exec_after_process( app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None): - """Verifies the data after the run""" - items = out_data.items() - for name, data in items: - data.set_size() - try: - err_msg, err_flag = 'Errors:', False - line_count = 0 - num_lines = len(file(data.file_name).readlines()) - for line in file(data.file_name): - line_count += 1 - if line and line[0] == '-': - if line_count + 3 == num_lines and not err_flag: - err_flag = True - err_msg = "Warning: It appears that your results have been truncated by UCSC. View the bottom of your result file for details." 
- break - err_flag = True - err_msg = err_msg +" (line "+str(line_count)+")"+line - data.set_peek() - if isinstance(data.datatype, datatypes.interval.Interval) and data.missing_meta(): - data = app.datatypes_registry.change_datatype(data, 'tabular') - out_data[name] = data - if err_flag: - raise Exception(err_msg) - except Exception, exc: - data.info = data.info + "\n" + str(exc) - data.blurb = "error" diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_proxy.py --- a/tools/data_source/ucsc_proxy.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ -#!/usr/bin/env python -import urllib -import sys, os - -assert sys.version_info[:2] >= ( 2, 4 ) - -CHUNK = 2**20 # 1Mb -MAXSIZE = CHUNK * 100 -if __name__ == '__main__': - - if len(sys.argv) != 3: - print 'Usage ucsc.py input_params output_file' - sys.exit() - - inp_file = sys.argv[1] - out_file = sys.argv[2] - - DEFAULT_URL = "http://genome.ucsc.edu/hgTables?" - - # this must stay a list to allow multiple selections for the same widget name (checkboxes) - params = [] - for line in file(inp_file): - line = line.strip() - if line: - parts = line.split('=') - if len(parts) == 0: - key = "" - value = "" - elif len(parts) == 1: - key = parts[0] - value = "" - else: - key = parts[0] - value = parts[1] - if key == 'display': - print value - # get url from params, refered from proxy.py, initialized by the tool xml - elif key == 'proxy_url': - DEFAULT_URL = value - else: - params.append( (key, value) ) - - #print params - - encoded_params = urllib.urlencode(params) - url = DEFAULT_URL + encoded_params - - #print url - - page = urllib.urlopen(url) - - fp = open(out_file, 'wt') - size = 0 - while 1: - data = page.read(CHUNK) - if not data: - break - if size > MAXSIZE: - fp.write('----- maximum datasize exceeded ---\n') - break - size += len(data) - fp.write(data) - - fp.close() - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_proxy.xml --- a/tools/data_source/ucsc_proxy.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ - - - - table browser proxy - - - ucsc_proxy.py $param_file $output - - - - go to UCSC $init $hgta_outputType - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_tablebrowser.xml --- a/tools/data_source/ucsc_tablebrowser.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - - - table browser - data_source.py $output $__app__.config.output_size_limit - - go to UCSC Table Browser $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_tablebrowser_archaea.xml --- a/tools/data_source/ucsc_tablebrowser_archaea.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - - - table browser - data_source.py $output $__app__.config.output_size_limit - - go to UCSC Table Browser $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_tablebrowser_test.xml --- a/tools/data_source/ucsc_tablebrowser_test.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - - - table browser - data_source.py $output $__app__.config.output_size_limit - - go to UCSC Table Browser $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_testproxy.xml --- 
a/tools/data_source/ucsc_testproxy.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ - - - - table browser proxy - - - ucsc_proxy.py $param_file $output - - - - go to UCSC genome-test $init $hgta_outputType - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/upload.py --- a/tools/data_source/upload.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,394 +0,0 @@ -#!/usr/bin/env python -#Processes uploads from the user. - -# WARNING: Changes in this tool (particularly as related to parsing) may need -# to be reflected in galaxy.web.controllers.tool_runner and galaxy.tools - -import urllib, sys, os, gzip, tempfile, shutil, re, gzip, zipfile, codecs, binascii -from galaxy import eggs -# need to import model before sniff to resolve a circular import dependency -import galaxy.model -from galaxy.datatypes.checkers import * -from galaxy.datatypes import sniff -from galaxy.datatypes.binary import * -from galaxy.datatypes.images import Pdf -from galaxy.datatypes.registry import Registry -from galaxy import util -from galaxy.datatypes.util.image_util import * -from galaxy.util.json import * - -try: - import Image as PIL -except ImportError: - try: - from PIL import Image as PIL - except: - PIL = None - -try: - import bz2 -except: - bz2 = None - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg, ret=1 ): - sys.stderr.write( msg ) - sys.exit( ret ) -def file_err( msg, dataset, json_file ): - json_file.write( to_json_string( dict( type = 'dataset', - ext = 'data', - dataset_id = dataset.dataset_id, - stderr = msg ) ) + "\n" ) - # never remove a server-side upload - if dataset.type in ( 'server_dir', 'path_paste' ): - return - try: - os.remove( dataset.path ) - except: - pass -def safe_dict(d): - """ - Recursively clone json structure with UTF-8 dictionary keys - http://mellowmachines.com/blog/2009/06/exploding-dictionary-with-unicode-keys-as-python-arguments/ - """ - if isinstance(d, dict): - return dict([(k.encode('utf-8'), safe_dict(v)) for k,v in d.iteritems()]) - elif isinstance(d, list): - return [safe_dict(x) for x in d] - else: - return d -def check_bam( file_path ): - return Bam().sniff( file_path ) -def check_sff( file_path ): - return Sff().sniff( file_path ) -def check_pdf( file_path ): - return Pdf().sniff( file_path ) -def check_bigwig( file_path ): - return BigWig().sniff( file_path ) -def check_bigbed( file_path ): - return BigBed().sniff( file_path ) -def parse_outputs( args ): - rval = {} - for arg in args: - id, files_path, path = arg.split( ':', 2 ) - rval[int( id )] = ( path, files_path ) - return rval -def add_file( dataset, registry, json_file, output_path ): - data_type = None - line_count = None - converted_path = None - stdout = None - link_data_only = dataset.get( 'link_data_only', 'copy_files' ) - - try: - ext = dataset.file_type - except AttributeError: - file_err( 'Unable to process uploaded file, missing file_type parameter.', dataset, json_file ) - return - - if dataset.type == 'url': - try: - temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' ) - except Exception, e: - file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file ) - return - dataset.path = temp_name - # See if we have an empty file - if not os.path.exists( dataset.path ): - file_err( 'Uploaded temporary file (%s) does not exist.' 
% dataset.path, dataset, json_file ) - return - if not os.path.getsize( dataset.path ) > 0: - file_err( 'The uploaded file is empty', dataset, json_file ) - return - if not dataset.type == 'url': - # Already set is_multi_byte above if type == 'url' - try: - dataset.is_multi_byte = util.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) ) - except UnicodeDecodeError, e: - dataset.is_multi_byte = False - # Is dataset an image? - image = check_image( dataset.path ) - if image: - if not PIL: - image = None - # get_image_ext() returns None if not a supported image type - ext = get_image_ext( dataset.path, image ) - data_type = ext - # Is dataset content multi-byte? - elif dataset.is_multi_byte: - data_type = 'multi-byte char' - ext = sniff.guess_ext( dataset.path, is_multi_byte=True ) - # Is dataset content supported sniffable binary? - elif check_bam( dataset.path ): - ext = 'bam' - data_type = 'bam' - elif check_sff( dataset.path ): - ext = 'sff' - data_type = 'sff' - elif check_pdf( dataset.path ): - ext = 'pdf' - data_type = 'pdf' - elif check_bigwig( dataset.path ): - ext = 'bigwig' - data_type = 'bigwig' - elif check_bigbed( dataset.path ): - ext = 'bigbed' - data_type = 'bigbed' - if not data_type: - # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress - is_gzipped, is_valid = check_gzip( dataset.path ) - if is_gzipped and not is_valid: - file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) - return - elif is_gzipped and is_valid: - if link_data_only == 'copy_files': - # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format - CHUNK_SIZE = 2**20 # 1Mb - fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) - gzipped_file = gzip.GzipFile( dataset.path, 'rb' ) - while 1: - try: - chunk = gzipped_file.read( CHUNK_SIZE ) - except IOError: - os.close( fd ) - os.remove( uncompressed ) - file_err( 'Problem decompressing gzipped data', dataset, json_file ) - return - if not chunk: - break - os.write( fd, chunk ) - os.close( fd ) - gzipped_file.close() - # Replace the gzipped file with the decompressed file if it's safe to do so - if dataset.type in ( 'server_dir', 'path_paste' ): - dataset.path = uncompressed - else: - shutil.move( uncompressed, dataset.path ) - if dataset.name.endswith( '.gz' ): - dataset.name = dataset.name[:-3] - data_type = 'gzip' - if not data_type and bz2 is not None: - # See if we have a bz2 file, much like gzip - is_bzipped, is_valid = check_bz2( dataset.path ) - if is_bzipped and not is_valid: - file_err( 'The bz2-compressed uploaded file contains inappropriate content', dataset, json_file ) - return - elif is_bzipped and is_valid: - if link_data_only == 'copy_files': - # We need to uncompress the temp_name file - CHUNK_SIZE = 2**20 # 1Mb - fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) - bzipped_file = bz2.BZ2File( dataset.path, 'rb' ) - while 1: - try: - chunk = bzipped_file.read( CHUNK_SIZE ) - except IOError: - os.close( fd ) - os.remove( uncompressed ) - file_err( 'Problem decompressing bz2 compressed data', dataset, json_file ) - return - if not chunk: - break - os.write( fd, chunk ) - os.close( fd ) - bzipped_file.close() - # Replace the bzipped file with the decompressed file if it's safe to do so - if dataset.type in ( 'server_dir', 'path_paste' ): - dataset.path = uncompressed - 
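# For server-side imports ( 'server_dir', 'path_paste' ) the original
# file must stay untouched, so only dataset.path is repointed at the
# decompressed temp copy; an ordinary upload is replaced in place below.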
else: - shutil.move( uncompressed, dataset.path ) - if dataset.name.endswith( '.bz2' ): - dataset.name = dataset.name[:-4] - data_type = 'bz2' - if not data_type: - # See if we have a zip archive - is_zipped = check_zip( dataset.path ) - if is_zipped: - if link_data_only == 'copy_files': - CHUNK_SIZE = 2**20 # 1Mb - uncompressed = None - uncompressed_name = None - unzipped = False - z = zipfile.ZipFile( dataset.path ) - for name in z.namelist(): - if name.endswith('/'): - continue - if unzipped: - stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' - break - fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) - if sys.version_info[:2] >= ( 2, 6 ): - zipped_file = z.open( name ) - while 1: - try: - chunk = zipped_file.read( CHUNK_SIZE ) - except IOError: - os.close( fd ) - os.remove( uncompressed ) - file_err( 'Problem decompressing zipped data', dataset, json_file ) - return - if not chunk: - break - os.write( fd, chunk ) - os.close( fd ) - zipped_file.close() - uncompressed_name = name - unzipped = True - else: - # python < 2.6 doesn't have a way to read members in chunks(!) - try: - outfile = open( uncompressed, 'wb' ) - outfile.write( z.read( name ) ) - outfile.close() - uncompressed_name = name - unzipped = True - except IOError: - os.close( fd ) - os.remove( uncompressed ) - file_err( 'Problem decompressing zipped data', dataset, json_file ) - return - z.close() - # Replace the zipped file with the decompressed file if it's safe to do so - if uncompressed is not None: - if dataset.type in ( 'server_dir', 'path_paste' ): - dataset.path = uncompressed - else: - shutil.move( uncompressed, dataset.path ) - dataset.name = uncompressed_name - data_type = 'zip' - if not data_type: - if check_binary( dataset.path ): - # We have a binary dataset, but it is not Bam, Sff or Pdf - data_type = 'binary' - #binary_ok = False - parts = dataset.name.split( "." ) - if len( parts ) > 1: - ext = parts[-1].strip().lower() - if ext not in unsniffable_binary_formats: - file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file ) - return - elif ext in unsniffable_binary_formats and dataset.file_type != ext: - err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext ) - file_err( err_msg, dataset, json_file ) - return - if not data_type: - # We must have a text file - if check_html( dataset.path ): - file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file ) - return - if data_type != 'binary': - if link_data_only == 'copy_files': - in_place = True - if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]: - in_place = False - if dataset.space_to_tab: - line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place ) - else: - line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place ) - if dataset.file_type == 'auto': - ext = sniff.guess_ext( dataset.path, registry.sniff_order ) - else: - ext = dataset.file_type - data_type = ext - # Save job info for the framework - if ext == 'auto' and dataset.ext: - ext = dataset.ext - if ext == 'auto': - ext = 'data' - datatype = registry.get_datatype_by_extension( ext ) - if dataset.type in ( 'server_dir', 'path_paste' ) and link_data_only == 'link_to_files': - # Never alter a file that will not be copied to Galaxy's local file store. 
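# "Grooming" is a datatype-specific normalization pass (for BAM data,
# for example, a coordinate sort); it rewrites the file, which is why a
# linked-but-not-copied dataset that needs grooming is rejected below.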
- if datatype.dataset_content_needs_grooming( dataset.path ): - err_msg = 'The uploaded files need grooming, so change your Copy data into Galaxy? selection to be ' + \ - 'Copy files into Galaxy instead of Link to files without copying into Galaxy so grooming can be performed.' - file_err( err_msg, dataset, json_file ) - return - if link_data_only == 'copy_files' and dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]: - # Move the dataset to its "real" path - if converted_path is not None: - shutil.copy( converted_path, output_path ) - try: - os.remove( converted_path ) - except: - pass - else: - # This should not happen, but it's here just in case - shutil.copy( dataset.path, output_path ) - elif link_data_only == 'copy_files': - shutil.move( dataset.path, output_path ) - # Write the job info - stdout = stdout or 'uploaded %s file' % data_type - info = dict( type = 'dataset', - dataset_id = dataset.dataset_id, - ext = ext, - stdout = stdout, - name = dataset.name, - line_count = line_count ) - json_file.write( to_json_string( info ) + "\n" ) - if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ): - # Groom the dataset content if necessary - datatype.groom_dataset_content( output_path ) -def add_composite_file( dataset, registry, json_file, output_path, files_path ): - if dataset.composite_files: - os.mkdir( files_path ) - for name, value in dataset.composite_files.iteritems(): - value = util.bunch.Bunch( **value ) - if dataset.composite_file_paths[ value.name ] is None and not value.optional: - file_err( 'A required composite data file was not provided (%s)' % name, dataset, json_file ) - break - elif dataset.composite_file_paths[value.name] is not None: - dp = dataset.composite_file_paths[value.name][ 'path' ] - isurl = dp.find('://') <> -1 # todo fixme - if isurl: - try: - temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dp ), prefix='url_paste' ) - except Exception, e: - file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file ) - return - dataset.path = temp_name - dp = temp_name - if not value.is_binary: - if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ): - sniff.convert_newlines_sep2tabs( dp ) - else: - sniff.convert_newlines( dp ) - shutil.move( dp, os.path.join( files_path, name ) ) - # Move the dataset to its "real" path - shutil.move( dataset.primary_file, output_path ) - # Write the job info - info = dict( type = 'dataset', - dataset_id = dataset.dataset_id, - stdout = 'uploaded %s file' % dataset.file_type ) - json_file.write( to_json_string( info ) + "\n" ) - -def __main__(): - - if len( sys.argv ) < 4: - print >>sys.stderr, 'usage: upload.py ...' 
- sys.exit( 1 ) - - output_paths = parse_outputs( sys.argv[4:] ) - json_file = open( 'galaxy.json', 'w' ) - - registry = Registry( sys.argv[1], sys.argv[2] ) - - for line in open( sys.argv[3], 'r' ): - dataset = from_json_string( line ) - dataset = util.bunch.Bunch( **safe_dict( dataset ) ) - try: - output_path = output_paths[int( dataset.dataset_id )][0] - except: - print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id - sys.exit( 1 ) - if dataset.type == 'composite': - files_path = output_paths[int( dataset.dataset_id )][1] - add_composite_file( dataset, registry, json_file, output_path, files_path ) - else: - add_file( dataset, registry, json_file, output_path ) - # clean up paramfile - try: - os.remove( sys.argv[3] ) - except: - pass - -if __name__ == '__main__': - __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/upload.xml --- a/tools/data_source/upload.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,213 +0,0 @@ - - - - - from your computer - - - - upload.py $GALAXY_ROOT_DIR $GALAXY_DATATYPES_CONF_FILE $paramfile - #set $outnum = 0 - #while $varExists('output%i' % $outnum): - #set $output = $getVar('output%i' % $outnum) - #set $outnum += 1 - #set $file_name = $output.file_name - ## FIXME: This is not future-proof for other uses of external_filename (other than for use by the library upload's "link data" feature) - #if $output.dataset.dataset.external_filename: - #set $file_name = "None" - #end if - ${output.dataset.dataset.id}:${output.files_path}:${file_name} - #end while - - - - - - - - - - - - - - not ( ( isinstance( value, unicode ) or isinstance( value, str ) ) and value != "" ) - - - - - - - - - - - - - -**Auto-detect** - -The system will attempt to detect Axt, Fasta, Fastqsolexa, Gff, Gff3, Html, Lav, Maf, Tabular, Wiggle, Bed and Interval (Bed with headers) formats. If your file is not detected properly as one of the known formats, it most likely means that it has some format problems (e.g., different number of columns on different rows). You can still coerce the system to set your data to the format you think it should be. You can also upload compressed files, which will automatically be decompressed. - ------ - -**Ab1** - -A binary sequence file in 'ab1' format with a '.ab1' file extension. You must manually select this 'File Format' when uploading the file. - ------ - -**Axt** - -blastz pairwise alignment format. Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields. - ------ - -**Bam** - -A binary file compressed in the BGZF format with a '.bam' file extension. - ------ - -**Bed** - -* Tab delimited format (tabular) -* Does not require header line -* Contains 3 required fields: - - - chrom - The name of the chromosome (e.g. chr3, chrY, chr2_random) or contig (e.g. ctgY1). - - chromStart - The starting position of the feature in the chromosome or contig. The first base in a chromosome is numbered 0. - - chromEnd - The ending position of the feature in the chromosome or contig. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. - -* May contain 9 additional optional BED fields: - - - name - Defines the name of the BED line. 
This label is displayed to the left of the BED line in the Genome Browser window when the track is open to full display mode or directly to the left of the item in pack mode. - - score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). - - strand - Defines the strand - either '+' or '-'. - - thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays). - - thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays). - - itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RGB value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or fewer) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser. - - blockCount - The number of blocks (exons) in the BED line. - - blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. - - blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. - -* Example:: - - chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512 - chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601 - ------ - -**Fasta** - -A sequence in FASTA format consists of a single-line description, followed by lines of sequence data. The first character of the description line is a greater-than (">") symbol in the first column. All lines should be shorter than 80 characters:: - - >sequence1 - atgcgtttgcgtgc - gtcggtttcgttgc - >sequence2 - tttcgtgcgtatag - tggcgcggtga - ------ - -**FastqSolexa** - -FastqSolexa is the Illumina (Solexa) variant of the Fastq format, which stores sequences and quality scores in a single file:: - - @seq1 - GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT - +seq1 - hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh - @seq2 - GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG - +seq2 - hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO - -Or:: - - @seq1 - GAATTGATCAGGACATAGGACAACTGTAGGCACCAT - +seq1 - 40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 - @seq2 - GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG - +seq2 - 40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 - ------ - -**Gff** - -GFF lines have nine required fields that must be tab-separated. - ------ - -**Gff3** - -The GFF3 format addresses the most common extensions to GFF, while preserving backward compatibility with previous formats. - ------ - -**Interval (Genomic Intervals)** - -- Tab delimited format (tabular) -- File must start with definition line in the following format (columns may be in any order).:: - - #CHROM START END STRAND - -- CHROM - The name of the chromosome (e.g. chr3, chrY, chr2_random) or contig (e.g. ctgY1). -- START - The starting position of the feature in the chromosome or contig. The first base in a chromosome is numbered 0. -- END - The ending position of the feature in the chromosome or contig. The chromEnd base is not included in the display of the feature. 
For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. -- STRAND - Defines the strand - either '+' or '-'. - -- Example:: - - #CHROM START END STRAND NAME COMMENT - chr1 10 100 + exon myExon - chrX 1000 10050 - gene myGene - ------ - -**Lav** - -Lav is the primary output format for BLASTZ. The first line of a .lav file begins with #:lav.. - ------ - -**MAF** - -TBA and multiz multiple alignment format. The first line of a .maf file begins with ##maf. This word is followed by white-space-separated "variable=value" pairs. There should be no white space surrounding the "=". - ------ - -**Scf** - -A binary sequence file in 'scf' format with a '.scf' file extension. You must manually select this 'File Format' when uploading the file. - ------ - -**Sff** - -A binary file in 'Standard Flowgram Format' with a '.sff' file extension. - ------ - -**Tabular (tab delimited)** - -Any data in tab delimited format (tabular) - ------ - -**Wig** - -The wiggle format is line-oriented. Wiggle data is preceded by a track definition line, which adds a number of options for controlling the default display of this track. - ------ - -**Other text type** - -Any text file - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/worm_modencode.xml --- a/tools/data_source/worm_modencode.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ - - - server - data_source.py $output $__app__.config.output_size_limit - - go to modENCODE worm server $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/wormbase.xml --- a/tools/data_source/wormbase.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ - - - server - data_source.py $output $__app__.config.output_size_limit - - go to Wormbase server $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/wormbase_test.xml --- a/tools/data_source/wormbase_test.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ - - - test server - data_source.py $output $__app__.config.output_size_limit - - go to Wormbase test server $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/yeastmine.xml --- a/tools/data_source/yeastmine.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ - - - server - data_source.py $output $__app__.config.output_size_limit - - go to yeastMine server $GALAXY_URL - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_IvC_all.pl --- a/tools/discreteWavelet/execute_dwt_IvC_all.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,210 +0,0 @@ -#!/usr/bin/perl -w -use warnings; -use IO::Handle; - -$usage = "execute_dwt_IvC_all.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] [PDF.out] \n"; -die $usage unless @ARGV == 4; - -#get the input arguments -my $firstInputFile = $ARGV[0]; -my $secondInputFile = $ARGV[1]; -my $firstOutputFile = $ARGV[2]; -my $secondOutputFile = $ARGV[3]; - -open (INPUT1, "<", $firstInputFile) || die("Could not open file $firstInputFile \n"); -open (INPUT2, "<", $secondInputFile) || die("Could not open file $secondInputFile \n"); -open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n"); -open (OUTPUT2, ">", $secondOutputFile) || 
die("Could not open file $secondOutputFile \n"); -open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); - -#save all error messages into the error file $errorFile using the error file handle ERROR -STDERR -> fdopen( \*ERROR, "w" ) or die ("Could not direct errors to the error file error.txt \n"); - - -print "There are two input data files: \n"; -print "The input data file is: $firstInputFile \n"; -print "The control data file is: $secondInputFile \n"; - -# IvC test -$test = "IvC"; - -# construct an R script to implement the IvC test -print "\n"; - -$r_script = "get_dwt_IvC_test.r"; -print "$r_script \n"; - -# R script -open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n"; -print Rcmd " - ########################################################################################### - # code to do wavelet Indel vs. Control - # signal is the difference I-C; function is second moment i.e. variance from zero not mean - # to perform wavelet transf. of signal, scale-by-scale analysis of the function - # create null bands by permuting the original data series - # generate plots and table matrix of correlation coefficients including p-values - ############################################################################################ - library(\"Rwave\"); - library(\"wavethresh\"); - library(\"waveslim\"); - - options(echo = FALSE) - - # normalize data - norm <- function(data){ - v <- (data - mean(data))/sd(data); - if(sum(is.na(v)) >= 1){ - v <- data; - } - return(v); - } - - dwt_cor <- function(data.short, names.short, data.long, names.long, test, pdf, table, filter = 4, bc = \"symmetric\", wf = \"haar\", boundary = \"reflection\") { - print(test); - print(pdf); - print(table); - - pdf(file = pdf); - final_pvalue = NULL; - title = NULL; - - short.levels <- wd(data.short[, 1], filter.number = filter, bc = bc)\$nlevels; - title <- c(\"motif\"); - for (i in 1:short.levels){ - title <- c(title, paste(i, \"moment2\", sep = \"_\"), paste(i, \"pval\", sep = \"_\"), paste(i, \"test\", sep = \"_\")); - } - print(title); - - # loop to compare a vs a - for(i in 1:length(names.short)){ - wave1.dwt = NULL; - m2.dwt = diff = var.dwt = NULL; - out = NULL; - out <- vector(length = length(title)); - - print(names.short[i]); - print(names.long[i]); - - # need exit if not comparing motif(a) vs motif(a) - if (names.short[i] != names.long[i]){ - stop(paste(\"motif\", names.short[i], \"is not the same as\", names.long[i], sep = \" \")); - } - else { - # signal is the difference I-C data sets - diff<-data.short[,i]-data.long[,i]; - - # normalize the signal - diff<-norm(diff); - - # function is 2nd moment - # 2nd moment m_j = 1/N[sum_N(W_j + V_J)^2] = 1/N sum_N(W_j)^2 + (X_bar)^2 - wave1.dwt <- dwt(diff, wf = wf, short.levels, boundary = boundary); - var.dwt <- wave.variance(wave1.dwt); - m2.dwt <- vector(length = short.levels) - for(level in 1:short.levels){ - m2.dwt[level] <- var.dwt[level, 1] + (mean(diff)^2); - } - - # CI bands by permutation of time series - feature1 = feature2 = NULL; - feature1 = data.short[, i]; - feature2 = data.long[, i]; - null = results = med = NULL; - m2_25 = m2_975 = NULL; - - for (k in 1:1000) { - nk_1 = nk_2 = NULL; - m2_null = var_null = NULL; - null.levels = null_wave1 = null_diff = NULL; - nk_1 <- sample(feature1, length(feature1), replace = FALSE); - nk_2 <- sample(feature2, length(feature2), replace = FALSE); - null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels; - null_diff <- nk_1-nk_2; - null_diff <- norm(null_diff); - null_wave1 <- 
dwt(null_diff, wf = wf, short.levels, boundary = boundary); - var_null <- wave.variance(null_wave1); - m2_null <- vector(length = null.levels); - for(level in 1:null.levels){ - m2_null[level] <- var_null[level, 1] + (mean(null_diff)^2); - } - null= rbind(null, m2_null); - } - - null <- apply(null, 2, sort, na.last = TRUE); - m2_25 <- null[25,]; - m2_975 <- null[975,]; - med <- apply(null, 2, median, na.rm = TRUE); - - # plot - results <- cbind(m2.dwt, m2_25, m2_975); - matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2), xlab = \"Wavelet Scale\", ylab = c(\"Wavelet 2nd Moment\", test), main = (names.short[i]), cex.main = 0.75); - abline(h = 1); - - # get pvalues by comparison to null distribution - out <- c(names.short[i]); - for (m in 1:length(m2.dwt)){ - print(paste(\"scale\", m, sep = \" \")); - print(paste(\"m2\", m2.dwt[m], sep = \" \")); - print(paste(\"median\", med[m], sep = \" \")); - out <- c(out, format(m2.dwt[m], digits = 4)); - pv = NULL; - if(is.na(m2.dwt[m])){ - pv <- \"NA\"; - } - else { - if (m2.dwt[m] >= med[m]){ - # R tail test - tail <- \"R\"; - pv <- (length(which(null[, m] >= m2.dwt[m])))/(length(na.exclude(null[, m]))); - } - else{ - if (m2.dwt[m] < med[m]){ - # L tail test - tail <- \"L\"; - pv <- (length(which(null[, m] <= m2.dwt[m])))/(length(na.exclude(null[, m]))); - } - } - } - out <- c(out, pv); - print(pv); - out <- c(out, tail); - } - final_pvalue <-rbind(final_pvalue, out); - print(out); - } - } - - colnames(final_pvalue) <- title; - write.table(final_pvalue, file = table, sep = \"\\t\", quote = FALSE, row.names = FALSE); - dev.off(); - }\n"; - -print Rcmd " - # execute - # read in data - - inputData <- read.delim(\"$firstInputFile\"); - inputDataNames <- colnames(inputData); - - controlData <- read.delim(\"$secondInputFile\"); - controlDataNames <- colnames(controlData); - - # call the test function to implement IvC test - dwt_cor(inputData, inputDataNames, controlData, controlDataNames, test = \"$test\", pdf = \"$secondOutputFile\", table = \"$firstOutputFile\"); - print (\"done with the correlation test\"); -\n"; - -print Rcmd "#eof\n"; - -close Rcmd; - -system("echo \"wavelet IvC test started on \`hostname\` at \`date\`\"\n"); -system("R --no-restore --no-save --no-readline < $r_script > $r_script.out\n"); -system("echo \"wavelet IvC test ended on \`hostname\` at \`date\`\"\n"); - -#close the input and output and error files -close(ERROR); -close(OUTPUT2); -close(OUTPUT1); -close(INPUT2); -close(INPUT1); \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_IvC_all.xml --- a/tools/discreteWavelet/execute_dwt_IvC_all.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ - - between two datasets using Discrete Wavelet Transforms - - - execute_dwt_IvC_all.pl $inputFile1 $inputFile2 $outputFile1 $outputFile2 - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -This program generates plots and computes a table of second moments, p-values, and test orientations at multiple scales for the correlation between the occurrences of features in one dataset and their occurrences in another, using a multiscale wavelet analysis technique. - -The program assumes that the user has two sets of DNA sequences, S1 and S2, each of which consists of one or more sequences of equal length. Each sequence in each set is divided into the same number of intervals n such that n = 2^k, where k is a positive integer and k >= 1. 
Thus, n could be any value of the set {2, 4, 8, 16, 32, 64, 128, ...}. k represents the number of scales. - -The program has two input files obtained as follows: - -For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S1 and S2, and builds two tabular files representing the count results in each interval of S1 and S2. These are the input files of the program. - -The program gives two output files: - -- The first output file is a TABULAR format file representing the second moments, p-values, and test orientations for each feature at each scale. -- The second output file is a PDF file consisting of as many figures as the number of features, such that each figure represents the values of the second moment for that feature at every scale. - ------ - -.. class:: warningmark - -**Note** - -In order to obtain empirical p-values, the program implements a random permutation test, so it gives slightly different results each time it is run on the same input file. - ------ - -**Example** - -Counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S1 gives the following tabular file:: - - deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget - 226 403 416 221 1165 - 236 444 380 241 1223 - 242 496 391 195 1116 - 243 429 364 191 1118 - 244 410 371 236 1063 - 230 386 370 217 1087 - 275 404 402 214 1044 - 265 443 365 231 1086 - 255 390 354 246 1114 - 281 384 406 232 1102 - 263 459 369 251 1135 - 280 433 400 251 1159 - 278 385 382 231 1147 - 248 393 389 211 1162 - 251 403 385 246 1114 - 239 383 347 227 1172 - -And counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S2 gives the following tabular file:: - - deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget - 235 374 407 257 1159 - 244 356 353 212 1128 - 233 343 322 204 1110 - 222 329 398 253 1054 - 216 325 328 253 1129 - 257 368 352 221 1115 - 238 360 346 224 1102 - 225 350 377 248 1107 - 230 330 365 236 1132 - 241 389 357 220 1120 - 274 354 392 235 1120 - 250 379 354 210 1102 - 254 329 320 251 1080 - 221 355 406 279 1127 - 224 330 390 249 1129 - 246 366 364 218 1176 - - -We notice that the number of scales here is 4 because 16 = 2^4. Running the program on the above input files gives the following output: - -The first output file:: - - motif 1_moment2 1_pval 1_test 2_moment2 2_pval 2_test 3_moment2 3_pval 3_test 4_moment2 4_pval 4_test - - deletionHoptspot 0.8751 0.376 L 1.549 0.168 R 0.6152 0.434 L 0.5735 0.488 R - insertionHoptspot 0.902 0.396 L 1.172 0.332 R 0.6843 0.456 L 1.728 0.213 R - dnaPolPauseFrameshift 1.65 0.013 R 0.267 0.055 L 0.1387 0.124 L 0.4516 0.498 L - topoisomeraseCleavageSite 0.7443 0.233 L 1.023 0.432 R 1.933 0.155 R 1.09 0.3 R - translinTarget 0.5084 0.057 L 0.8219 0.446 L 3.604 0.019 R 0.4377 0.492 L - -The second output file: - -.. image:: ./static/operation_icons/dwt_IvC_1.png -.. image:: ./static/operation_icons/dwt_IvC_2.png -.. image:: ./static/operation_icons/dwt_IvC_3.png -.. image:: ./static/operation_icons/dwt_IvC_4.png -.. 
image:: ./static/operation_icons/dwt_IvC_5.png - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_cor_aVa_perClass.pl --- a/tools/discreteWavelet/execute_dwt_cor_aVa_perClass.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,221 +0,0 @@ -#!/usr/bin/perl -w - -use warnings; -use IO::Handle; - -$usage = "execute_dwt_cor_aVa_perClass.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] [PDF.out] \n"; -die $usage unless @ARGV == 4; - -#get the input arguments -my $firstInputFile = $ARGV[0]; -my $secondInputFile = $ARGV[1]; -my $firstOutputFile = $ARGV[2]; -my $secondOutputFile = $ARGV[3]; - -open (INPUT1, "<", $firstInputFile) || die("Could not open file $firstInputFile \n"); -open (INPUT2, "<", $secondInputFile) || die("Could not open file $secondInputFile \n"); -open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n"); -open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \n"); -open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); - -#save all error messages into the error file $errorFile using the error file handle ERROR -STDERR -> fdopen( \*ERROR, "w" ) or die ("Could not direct errors to the error file error.txt \n"); - -print "There are two input data files: \n"; -print "The input data file is: $firstInputFile \n"; -print "The control data file is: $secondInputFile \n"; - -# IvC test -$test = "cor_aVa"; - -# construct an R script to implement the IvC test -print "\n"; - -$r_script = "get_dwt_cor_aVa_test.r"; -print "$r_script \n"; - -open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n"; -print Rcmd " - ################################################################################## - # code to do all correlation tests of form: motif(a) vs. 
motif(a) - # add code to create null bands by permuting the original data series - # generate plots and table matrix of correlation coefficients including p-values - ################################################################################## - library(\"Rwave\"); - library(\"wavethresh\"); - library(\"waveslim\"); - - options(echo = FALSE) - - # normalize data - norm <- function(data){ - v <- (data - mean(data))/sd(data); - if(sum(is.na(v)) >= 1){ - v <- data; - } - return(v); - } - - dwt_cor <- function(data.short, names.short, data.long, names.long, test, pdf, table, filter = 4, bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") { - print(test); - print(pdf); - print(table); - - pdf(file = pdf); - final_pvalue = NULL; - title = NULL; - - short.levels <- wd(data.short[, 1], filter.number = filter, bc = bc)\$nlevels; - title <- c(\"motif\"); - for (i in 1:short.levels){ - title <- c(title, paste(i, \"cor\", sep = \"_\"), paste(i, \"pval\", sep = \"_\")); - } - print(title); - - # normalize the raw data - data.short <- apply(data.short, 2, norm); - data.long <- apply(data.long, 2, norm); - - for(i in 1:length(names.short)){ - # Kendall Tau - # DWT wavelet correlation function - # include significance to compare - wave1.dwt = wave2.dwt = NULL; - tau.dwt = NULL; - out = NULL; - - print(names.short[i]); - print(names.long[i]); - - # need exit if not comparing motif(a) vs motif(a) - if (names.short[i] != names.long[i]){ - stop(paste(\"motif\", names.short[i], \"is not the same as\", names.long[i], sep = \" \")); - } - else { - wave1.dwt <- dwt(data.short[, i], wf = wf, short.levels, boundary = boundary); - wave2.dwt <- dwt(data.long[, i], wf = wf, short.levels, boundary = boundary); - tau.dwt <- vector(length=short.levels) - - #perform cor test on wavelet coefficients per scale - for(level in 1:short.levels){ - w1_level = w2_level = NULL; - w1_level <- (wave1.dwt[[level]]); - w2_level <- (wave2.dwt[[level]]); - tau.dwt[level] <- cor.test(w1_level, w2_level, method = method)\$estimate; - } - - # CI bands by permutation of time series - feature1 = feature2 = NULL; - feature1 = data.short[, i]; - feature2 = data.long[, i]; - null = results = med = NULL; - cor_25 = cor_975 = NULL; - - for (k in 1:1000) { - nk_1 = nk_2 = NULL; - null.levels = NULL; - cor = NULL; - null_wave1 = null_wave2 = NULL; - - nk_1 <- sample(feature1, length(feature1), replace = FALSE); - nk_2 <- sample(feature2, length(feature2), replace = FALSE); - null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels; - cor <- vector(length = null.levels); - null_wave1 <- dwt(nk_1, wf = wf, short.levels, boundary = boundary); - null_wave2 <- dwt(nk_2, wf = wf, short.levels, boundary = boundary); - - for(level in 1:null.levels){ - null_level1 = null_level2 = NULL; - null_level1 <- (null_wave1[[level]]); - null_level2 <- (null_wave2[[level]]); - cor[level] <- cor.test(null_level1, null_level2, method = method)\$estimate; - } - null = rbind(null, cor); - } - - null <- apply(null, 2, sort, na.last = TRUE); - print(paste(\"NAs\", length(which(is.na(null))), sep = \" \")); - cor_25 <- null[25,]; - cor_975 <- null[975,]; - med <- (apply(null, 2, median, na.rm = TRUE)); - - # plot - results <- cbind(tau.dwt, cor_25, cor_975); - matplot(results, type = \"b\", pch = \"*\" , lty = 1, col = c(1, 2, 2), ylim = c(-1, 1), xlab = \"Wavelet Scale\", ylab = \"Wavelet Correlation Kendall's Tau\", main = (paste(test, names.short[i], sep = \" \")), cex.main = 0.75); - abline(h = 0); - - # get pvalues by 
comparison to null distribution - ### modify pval calculation for error type II of T test #### - out <- (names.short[i]); - for (m in 1:length(tau.dwt)){ - print(paste(\"scale\", m, sep = \" \")); - print(paste(\"tau\", tau.dwt[m], sep = \" \")); - print(paste(\"med\", med[m], sep = \" \")); - out <- c(out, format(tau.dwt[m], digits = 3)); - pv = NULL; - if(is.na(tau.dwt[m])){ - pv <- \"NA\"; - } - else { - if (tau.dwt[m] >= med[m]){ - # R tail test - print(paste(\"R\")); - ### per sv ok to use inequality not strict - pv <- (length(which(null[, m] >= tau.dwt[m])))/(length(na.exclude(null[, m]))); - if (tau.dwt[m] == med[m]){ - print(\"tau == med\"); - print(summary(null[, m])); - } - } - else if (tau.dwt[m] < med[m]){ - # L tail test - print(paste(\"L\")); - pv <- (length(which(null[, m] <= tau.dwt[m])))/(length(na.exclude(null[, m]))); - } - } - out <- c(out, pv); - print(paste(\"pval\", pv, sep = \" \")); - } - final_pvalue <- rbind(final_pvalue, out); - print(out); - } - } - colnames(final_pvalue) <- title; - write.table(final_pvalue, file = table, sep = \"\\t\", quote = FALSE, row.names = FALSE) - dev.off(); - }\n"; - -print Rcmd " - # execute - # read in data - - inputData1 = inputData2 = NULL; - inputData.short1 = inputData.short2 = NULL; - inputDataNames.short1 = inputDataNames.short2 = NULL; - - inputData1 <- read.delim(\"$firstInputFile\"); - inputData.short1 <- inputData1[, +c(1:ncol(inputData1))]; - inputDataNames.short1 <- colnames(inputData.short1); - - inputData2 <- read.delim(\"$secondInputFile\"); - inputData.short2 <- inputData2[, +c(1:ncol(inputData2))]; - inputDataNames.short2 <- colnames(inputData.short2); - - # cor test for motif(a) in inputData1 vs motif(a) in inputData2 - dwt_cor(inputData.short1, inputDataNames.short1, inputData.short2, inputDataNames.short2, test = \"$test\", pdf = \"$secondOutputFile\", table = \"$firstOutputFile\"); - print (\"done with the correlation test\"); - - #eof\n"; -close Rcmd; - -system("echo \"wavelet IvC test started on \`hostname\` at \`date\`\"\n"); -system("R --no-restore --no-save --no-readline < $r_script > $r_script.out\n"); -system("echo \"wavelet IvC test ended on \`hostname\` at \`date\`\"\n"); - -#close the input and output and error files -close(ERROR); -close(OUTPUT2); -close(OUTPUT1); -close(INPUT2); -close(INPUT1); - diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_cor_aVa_perClass.xml --- a/tools/discreteWavelet/execute_dwt_cor_aVa_perClass.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ - - between two datasets using Discrete Wavelet Transforms - - - execute_dwt_cor_aVa_perClass.pl $inputFile1 $inputFile2 $outputFile1 $outputFile2 - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -This program generates plots and computes a table of correlation coefficients and p-values at multiple scales for the correlation between the occurrences of features in one dataset and their occurrences in another, using a multiscale wavelet analysis technique. - -The program assumes that the user has two sets of DNA sequences, S1 and S2, each of which consists of one or more sequences of equal length. Each sequence in each set is divided into the same number of intervals n such that n = 2^k, where k is a positive integer and k >= 1. Thus, n could be any value of the set {2, 4, 8, 16, 32, 64, 128, ...}. k represents the number of scales. 
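As an illustrative aside (not part of the original tool), the relationship between the interval count n and the scale count k can be checked with a few lines of Python; the helper name below is hypothetical::

    def num_scales(n):
        # n must be a power of two, n = 2^k with k >= 1
        if n < 2 or n & (n - 1) != 0:
            raise ValueError("interval count must be a power of two >= 2")
        return n.bit_length() - 1

    assert num_scales(16) == 4   # 16 intervals per sequence -> 4 scales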
- -The program has two input files obtained as follows: - -For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S1 and S2, and builds two tabular files representing the count results in each interval of S1 and S2. These are the input files of the program. - -The program gives two output files: - -- The first output file is a TABULAR format file representing the coefficient correlations and p-values for each feature at each scale. -- The second output file is a PDF file consisting of as many figures as the number of features, such that each figure represents the values of the coefficient correlation for that feature at every scale. - ------ - -.. class:: warningmark - -**Note** - -In order to obtain empirical p-values, the program implements a random permutation test, so it gives slightly different results each time it is run on the same input file. - ------ - -**Example** - -Counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S1 gives the following tabular file:: - - deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget - 269 366 330 238 1129 - 239 328 327 283 1188 - 254 351 358 297 1151 - 262 371 355 256 1107 - 254 361 352 234 1192 - 265 354 367 240 1182 - 255 359 333 235 1217 - 271 389 387 272 1241 - 240 305 341 249 1159 - 272 351 337 257 1169 - 275 351 337 233 1158 - 305 331 361 253 1172 - 277 341 343 253 1113 - 266 362 355 267 1162 - 235 326 329 241 1230 - 254 335 360 251 1172 - -And counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S2 gives the following tabular file:: - - deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget - 104 146 142 113 478 - 89 146 151 94 495 - 100 176 151 88 435 - 96 163 128 114 468 - 99 138 144 91 513 - 112 126 162 106 468 - 86 127 145 83 491 - 104 145 171 110 496 - 91 121 147 104 469 - 103 141 145 98 458 - 92 134 142 117 468 - 97 146 145 107 471 - 115 121 136 109 470 - 113 135 138 101 491 - 111 150 138 102 451 - 94 128 151 138 481 - - -We notice that the number of scales here is 4 because 16 = 2^4. Running the program on the above input files gives the following output: - -The first output file:: - - motif 1_cor 1_pval 2_cor 2_pval 3_cor 3_pval 4_cor 4_pval - - deletionHoptspot 0.4 0.072 0.143 0.394 -0.667 0.244 1 0.491 - insertionHoptspot 0.343 0.082 -0.0714 0.446 -1 0.12 1 0.502 - dnaPolPauseFrameshift 0.617 0.004 -0.5 0.13 0.667 0.234 1 0.506 - topoisomeraseCleavageSite -0.183 0.242 -0.286 0.256 0.333 0.353 -1 0.489 - translinTarget 0.0167 0.503 -0.0714 0.469 1 0.136 1 0.485 - -The second output file: - -.. image:: ./static/operation_icons/dwt_cor_aVa_1.png -.. image:: ./static/operation_icons/dwt_cor_aVa_2.png -.. image:: ./static/operation_icons/dwt_cor_aVa_3.png -.. image:: ./static/operation_icons/dwt_cor_aVa_4.png -.. 
image:: ./static/operation_icons/dwt_cor_aVa_5.png - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_cor_aVb_all.pl --- a/tools/discreteWavelet/execute_dwt_cor_aVb_all.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,223 +0,0 @@ -#!/usr/bin/perl -w - -use warnings; -use IO::Handle; - -$usage = "execute_dwt_cor_aVb_all.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] [PDF.out] \n"; -die $usage unless @ARGV == 4; - -#get the input arguments -my $firstInputFile = $ARGV[0]; -my $secondInputFile = $ARGV[1]; -my $firstOutputFile = $ARGV[2]; -my $secondOutputFile = $ARGV[3]; - -open (INPUT1, "<", $firstInputFile) || die("Could not open file $firstInputFile \n"); -open (INPUT2, "<", $secondInputFile) || die("Could not open file $secondInputFile \n"); -open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n"); -open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \n"); -open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); - -#save all error messages into the error file $errorFile using the error file handle ERROR -STDERR -> fdopen( \*ERROR, "w" ) or die ("Could not direct errors to the error file error.txt \n"); - -print "There are two input data files: \n"; -print "The input data file is: $firstInputFile \n"; -print "The control data file is: $secondInputFile \n"; - -# IvC test -$test = "cor_aVb_all"; - -# construct an R script to implement the IvC test -print "\n"; - -$r_script = "get_dwt_cor_aVa_test.r"; -print "$r_script \n"; - - -# R script -open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n"; -print Rcmd " - ################################################################################# - # code to do all correlation tests of form: motif(a) vs. 
motif(b) - # add code to create null bands by permuting the original data series - # generate plots and table matrix of correlation coefficients including p-values - ################################################################################# - library(\"Rwave\"); - library(\"wavethresh\"); - library(\"waveslim\"); - - options(echo = FALSE) - - # normalize data - norm <- function(data){ - v <- (data - mean(data))/sd(data); - if(sum(is.na(v)) >= 1){ - v <- data; - } - return(v); - } - - dwt_cor <- function(data.short, names.short, data.long, names.long, test, pdf, table, filter = 4, bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") { - print(test); - print(pdf); - print(table); - - pdf(file = pdf); - final_pvalue = NULL; - title = NULL; - - short.levels <- wd(data.short[, 1], filter.number = filter, bc = bc)\$nlevels; - title <- c(\"motif1\", \"motif2\"); - for (i in 1:short.levels){ - title <- c(title, paste(i, \"cor\", sep = \"_\"), paste(i, \"pval\", sep = \"_\")); - } - print(title); - - # normalize the raw data - data.short <- apply(data.short, 2, norm); - data.long <- apply(data.long, 2, norm); - - # loop to compare a vs b - for(i in 1:length(names.short)){ - for(j in 1:length(names.long)){ - if(i >= j){ - next; - } - else { - # Kendall Tau - # DWT wavelet correlation function - # include significance to compare - wave1.dwt = wave2.dwt = NULL; - tau.dwt = NULL; - out = NULL; - - print(names.short[i]); - print(names.long[j]); - - # need exit if not comparing motif(a) vs motif(a) - if (names.short[i] == names.long[j]){ - stop(paste(\"motif\", names.short[i], \"is the same as\", names.long[j], sep = \" \")); - } - else { - wave1.dwt <- dwt(data.short[, i], wf = wf, short.levels, boundary = boundary); - wave2.dwt <- dwt(data.long[, j], wf = wf, short.levels, boundary = boundary); - tau.dwt <-vector(length = short.levels) - - # perform cor test on wavelet coefficients per scale - for(level in 1:short.levels){ - w1_level = w2_level = NULL; - w1_level <- (wave1.dwt[[level]]); - w2_level <- (wave2.dwt[[level]]); - tau.dwt[level] <- cor.test(w1_level, w2_level, method = method)\$estimate; - } - - # CI bands by permutation of time series - feature1 = feature2 = NULL; - feature1 = data.short[, i]; - feature2 = data.long[, j]; - null = results = med = NULL; - cor_25 = cor_975 = NULL; - - for (k in 1:1000) { - nk_1 = nk_2 = NULL; - null.levels = NULL; - cor = NULL; - null_wave1 = null_wave2 = NULL; - - nk_1 <- sample(feature1, length(feature1), replace = FALSE); - nk_2 <- sample(feature2, length(feature2), replace = FALSE); - null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels; - cor <- vector(length = null.levels); - null_wave1 <- dwt(nk_1, wf = wf, short.levels, boundary = boundary); - null_wave2 <- dwt(nk_2, wf = wf, short.levels, boundary = boundary); - - for(level in 1:null.levels){ - null_level1 = null_level2 = NULL; - null_level1 <- (null_wave1[[level]]); - null_level2 <- (null_wave2[[level]]); - cor[level] <- cor.test(null_level1, null_level2, method = method)\$estimate; - } - null = rbind(null, cor); - } - - null <- apply(null, 2, sort, na.last = TRUE); - cor_25 <- null[25, ]; - cor_975 <- null[975, ]; - med <- (apply(null, 2, median, na.rm = TRUE)); - - # plot - results <- cbind(tau.dwt, cor_25, cor_975); - matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2), ylim = c(-1, 1), xlab = \"Wavelet Scale\", ylab = \"Wavelet Correlation Kendall's Tau\", main = (paste(test, names.short[i], \"vs.\", names.long[j], sep = \" 
\")), cex.main = 0.75); - abline(h = 0); - - # get pvalues by comparison to null distribution - ### modify pval calculation for error type II of T test #### - out <- c(names.short[i],names.long[j]); - for (m in 1:length(tau.dwt)){ - print(m); - print(tau.dwt[m]); - out <- c(out, format(tau.dwt[m], digits = 3)); - pv = NULL; - if(is.na(tau.dwt[m])){ - pv <- \"NA\"; - } - else{ - if (tau.dwt[m] >= med[m]){ - # R tail test - pv <- (length(which(null[, m] >= tau.dwt[m])))/(length(na.exclude(null[, m]))); - } - else{ - if (tau.dwt[m] < med[m]){ - # L tail test - pv <- (length(which(null[, m] <= tau.dwt[m])))/(length(na.exclude(null[, m]))); - } - } - } - out <- c(out, pv); - print(pv); - } - final_pvalue <-rbind(final_pvalue, out); - print(out); - } - } - } - } - colnames(final_pvalue) <- title; - write.table(final_pvalue, file = table, sep = \"\\t\", quote = FALSE, row.names = FALSE) - dev.off(); - }\n"; - -print Rcmd " - # execute - # read in data - - inputData1 = inputData2 = NULL; - inputData.short1 = inputData.short2 = NULL; - inputDataNames.short1 = inputDataNames.short2 = NULL; - - inputData1 <- read.delim(\"$firstInputFile\"); - inputData.short1 <- inputData1[, +c(1:ncol(inputData1))]; - inputDataNames.short1 <- colnames(inputData.short1); - - inputData2 <- read.delim(\"$secondInputFile\"); - inputData.short2 <- inputData2[, +c(1:ncol(inputData2))]; - inputDataNames.short2 <- colnames(inputData.short2); - - # cor test for motif(a) in inputData1 vs motif(b) in inputData2 - dwt_cor(inputData.short1, inputDataNames.short1, inputData.short2, inputDataNames.short2, test = \"$test\", pdf = \"$secondOutputFile\", table = \"$firstOutputFile\"); - print (\"done with the correlation test\"); - - #eof\n"; -close Rcmd; - -system("echo \"wavelet IvC test started on \`hostname\` at \`date\`\"\n"); -system("R --no-restore --no-save --no-readline < $r_script > $r_script.out\n"); -system("echo \"wavelet IvC test ended on \`hostname\` at \`date\`\"\n"); - -#close the input and output and error files -close(ERROR); -close(OUTPUT2); -close(OUTPUT1); -close(INPUT2); -close(INPUT1); diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_cor_aVb_all.xml --- a/tools/discreteWavelet/execute_dwt_cor_aVb_all.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ - - between two datasets using Discrete Wavelet Transfoms - - - execute_dwt_cor_aVb_all.pl $inputFile1 $inputFile2 $outputFile1 $outputFile2 - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -This program generates plots and computes table matrix of coefficient correlations and p-values at multiple scales for the correlation between the occurrences of features in one dataset and their occurrences in another using multiscale wavelet analysis technique. - -The program assumes that the user has two sets of DNA sequences, S1 and S1, each of which consists of one or more sequences of equal length. Each sequence in each set is divided into the same number of multiple intervals n such that n = 2^k, where k is a positive integer and k >= 1. Thus, n could be any value of the set {2, 4, 8, 16, 32, 64, 128, ...}. k represents the number of scales. - -The program has two input files obtained as follows: - -For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S1 and S1, and builds two tabular files representing the count results in each interval of S1 and S1. These are the input files of the program. 
- -The program gives two output files: - -- The first output file is a TABULAR format file representing the coefficient correlations and p-values for each feature at each scale. -- The second output file is a PDF file consisting of as many figures as the number of features, such that each figure represents the values of the coefficient correlations for that feature at every scale. - ----- - -.. class:: warningmark - -**Note** - -In order to obtain empirical p-values, a random permutation test is implemented by the program, so the program gives slightly different results each time it is run on the same input file. - ----- - -**Example** - -Counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S1 gives the following tabular file:: - - deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget - 82 162 158 79 459 - 111 196 154 75 459 - 98 178 160 79 475 - 113 201 170 113 436 - 113 173 147 95 446 - 107 150 155 84 436 - 106 166 175 96 448 - 113 176 135 106 514 - 113 170 152 87 450 - 95 152 167 93 467 - 91 171 169 118 426 - 84 139 160 100 459 - 92 154 164 104 440 - 100 145 154 98 472 - 91 161 152 71 461 - 117 164 139 97 463 - -And counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S2 gives the following tabular file:: - - deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget - 269 366 330 238 1129 - 239 328 327 283 1188 - 254 351 358 297 1151 - 262 371 355 256 1107 - 254 361 352 234 1192 - 265 354 367 240 1182 - 255 359 333 235 1217 - 271 389 387 272 1241 - 240 305 341 249 1159 - 272 351 337 257 1169 - 275 351 337 233 1158 - 305 331 361 253 1172 - 277 341 343 253 1113 - 266 362 355 267 1162 - 235 326 329 241 1230 - 254 335 360 251 1172 - - -We notice that the number of scales here is 4 because 16 = 2^4. Running the program on the above input files gives the following output: - -The first output file:: - - motif1 motif2 1_cor 1_pval 2_cor 2_pval 3_cor 3_pval 4_cor 4_pval - - deletionHoptspot insertionHoptspot -0.1 0.346 -0.214 0.338 1 0.127 1 0.467 - deletionHoptspot dnaPolPauseFrameshift 0.167 0.267 -0.214 0.334 1 0.122 1 0.511 - deletionHoptspot topoisomeraseCleavageSite 0.167 0.277 0.143 0.412 -0.667 0.243 1 0.521 - deletionHoptspot translinTarget 0 0.505 0.0714 0.441 1 0.124 1 0.518 - insertionHoptspot dnaPolPauseFrameshift -0.202 0.238 0.143 0.379 -1 0.122 1 0.517 - insertionHoptspot topoisomeraseCleavageSite -0.0336 0.457 0.214 0.29 0.667 0.252 1 0.503 - insertionHoptspot translinTarget 0.0672 0.389 0.429 0.186 -1 0.119 1 0.506 - dnaPolPauseFrameshift topoisomeraseCleavageSite -0.353 0.101 0.357 0.228 0 0.612 -1 0.49 - dnaPolPauseFrameshift translinTarget -0.151 0.303 -0.571 0.09 -0.333 0.37 -1 1 - topoisomeraseCleavageSite translinTarget -0.37 0.077 -0.222 0.297 0.667 0.234 -1 0.471 - -The second output file: - -.. image:: ./static/operation_icons/dwt_cor_aVb_all_1.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_2.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_3.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_4.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_5.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_6.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_7.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_8.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_9.png -.. 
image:: ./static/operation_icons/dwt_cor_aVb_all_10.png - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_var_perClass.pl --- a/tools/discreteWavelet/execute_dwt_var_perClass.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,320 +0,0 @@ -#!/usr/bin/perl -w - -use warnings; -use IO::Handle; -use POSIX qw(floor ceil); - -# example: perl execute_dwt_var_perClass.pl hg18_NCNR_10bp_3flanks_deletionHotspot_data_del.txt deletionHotspot 3flanks del - -$usage = "execute_dwt_var_perClass.pl [TABULAR.in] [TABULAR.out] [TABULAR.out] [PDF.out] \n"; -die $usage unless @ARGV == 4; - -#get the input arguments -my $inputFile = $ARGV[0]; -my $firstOutputFile = $ARGV[1]; -my $secondOutputFile = $ARGV[2]; -my $thirdOutputFile = $ARGV[3]; - -open (INPUT, "<", $inputFile) || die("Could not open file $inputFile \n"); -open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n"); -open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \n"); -open (OUTPUT3, ">", $thirdOutputFile) || die("Could not open file $thirdOutputFile \n"); -open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); - -#save all error messages into the error file $errorFile using the error file handle ERROR -STDERR -> fdopen( \*ERROR, "w" ) or die ("Could not direct errors to the error file error.txt \n"); - -# choosing meaningful names for the output files -$max_dwt = $firstOutputFile; -$pvalue = $secondOutputFile; -$pdf = $thirdOutputFile; - -# count the number of columns in the input file -while($buffer = <INPUT>){ - #if ($buffer =~ m/interval/){ - chomp($buffer); - $buffer =~ s/^#\s*//; - @contrl = split(/\t/, $buffer); - last; - #} -} -print "The number of columns in the input file is: " . (@contrl) . "\n"; -print "\n"; - -# count the number of motifs in the input file -$count = 0; -for ($i = 0; $i < @contrl; $i++){ - $count++; - print "# $contrl[$i]\n"; -} -print "The number of motifs in the input file is: $count \n"; - -# check if the number of motifs is not a multiple of 12, and round up if so -$count2 = ($count/12); -if ($count2 =~ m/(\D)/){ - print "the number of motifs is not a multiple of 12 \n"; - $count2 = ceil($count2); -} -else { - print "the number of motifs is a multiple of 12 \n"; } -print "There will be $count2 subfiles\n\n"; - -# split infile into subfiles only 12 motifs per file for R plotting -for ($x = 1; $x <= $count2; $x++){ - $a = (($x - 1) * 12 + 1); - $b = $x * 12; - - if ($x < $count2){ - print "# data.short $x <- data_test[, +c($a:$b)]; \n"; - } - else{ - print "# data.short $x <- data_test[, +c($a:ncol(data_test))]; \n"; - } -} - -print "\n"; -print "There are 4 output files: \n"; -print "The first output file is a pdf file\n"; -print "The second output file is a max_dwt file\n"; -print "The third output file is a pvalues file\n"; -print "The fourth output file is a test_final_pvalues file\n"; - -# write R script -$r_script = "get_dwt_varPermut_getMax.r"; -print "The R file name is: $r_script \n"; - -open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n"; - -print Rcmd " - ###################################################################### - # plot power spectra, i.e. 
wavelet variance by class - # add code to create null bands by permuting the original data series - # get class of maximum significant variance per feature - # generate plots and table matrix of variance including p-values - ###################################################################### - library(\"Rwave\"); - library(\"wavethresh\"); - library(\"waveslim\"); - - options(echo = FALSE) - - # normalize data - norm <- function(data){ - v <- (data-mean(data))/sd(data); - if(sum(is.na(v)) >= 1){ - v<-data; - } - return(v); - } - - dwt_var_permut_getMax <- function(data, names, filter = 4, bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") { - max_var = NULL; - matrix = NULL; - title = NULL; - final_pvalue = NULL; - short.levels = NULL; - scale = NULL; - - print(names); - - par(mfcol = c(length(names), length(names)), mar = c(0, 0, 0, 0), oma = c(4, 3, 3, 2), xaxt = \"s\", cex = 1, las = 1); - - short.levels <- wd(data[, 1], filter.number = filter, bc = bc)\$nlevels; - - title <- c(\"motif\"); - for (i in 1:short.levels){ - title <- c(title, paste(i, \"var\", sep = \"_\"), paste(i, \"pval\", sep = \"_\"), paste(i, \"test\", sep = \"_\")); - } - print(title); - - # normalize the raw data - data<-apply(data,2,norm); - - for(i in 1:length(names)){ - for(j in 1:length(names)){ - temp = NULL; - results = NULL; - wave1.dwt = NULL; - out = NULL; - - out <- vector(length = length(title)); - temp <- vector(length = short.levels); - - if(i < j) { - plot(temp, type = \"n\", axes = FALSE, xlab = NA, ylab = NA); - box(col = \"grey\"); - grid(ny = 0, nx = NULL); - } else { - if (i > j){ - plot(temp, type = \"n\", axes = FALSE, xlab = NA, ylab = NA); - box(col = \"grey\"); - grid(ny = 0, nx = NULL); - } else { - - wave1.dwt <- dwt(data[, i], wf = wf, short.levels, boundary = boundary); - - temp_row = (short.levels + 1 ) * -1; - temp_col = 1; - temp <- wave.variance(wave1.dwt)[temp_row, temp_col]; - - #permutations code : - feature1 = NULL; - null = NULL; - var_25 = NULL; - var_975 = NULL; - med = NULL; - - feature1 = data[, i]; - for (k in 1:1000) { - nk_1 = NULL; - null.levels = NULL; - var = NULL; - null_wave1 = NULL; - - nk_1 = sample(feature1, length(feature1), replace = FALSE); - null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels; - var <- vector(length = null.levels); - null_wave1 <- dwt(nk_1, wf = wf, short.levels, boundary = boundary); - var <- wave.variance(null_wave1)[-(null.levels + 1), 1]; - null= rbind(null, var); - } - null <- apply(null, 2, sort, na.last = TRUE); - var_25 <- null[25, ]; - var_975 <- null[975, ]; - med <- (apply(null, 2, median, na.rm = TRUE)); - - # plot - results <- cbind(temp, var_25, var_975); - matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2), axes = F); - - # get pvalues by comparison to null distribution - out <- (names[i]); - for (m in 1:length(temp)){ - print(paste(\"scale\", m, sep = \" \")); - print(paste(\"var\", temp[m], sep = \" \")); - print(paste(\"med\", med[m], sep = \" \")); - pv = tail = NULL; - out <- c(out, format(temp[m], digits = 3)); - if (temp[m] >= med[m]){ - # R tail test - print(\"R\"); - tail <- \"R\"; - pv <- (length(which(null[, m] >= temp[m])))/(length(na.exclude(null[, m]))); - - } else { - if (temp[m] < med[m]){ - # L tail test - print(\"L\"); - tail <- \"L\"; - pv <- (length(which(null[, m] <= temp[m])))/(length(na.exclude(null[, m]))); - } - } - out <- c(out, pv); - print(pv); - out <- c(out, tail); - } - final_pvalue <-rbind(final_pvalue, out); - - - # get variances outside 
null bands by comparing temp to null - ## temp stores variance for each scale, and null stores permuted variances for null bands - for (n in 1:length(temp)){ - if (temp[n] <= var_975[n]){ - temp[n] <- NA; - } else { - temp[n] <- temp[n]; - } - } - matrix <- rbind(matrix, temp) - } - } - # labels - if (i == 1){ - mtext(names[j], side = 2, line = 0.5, las = 3, cex = 0.25); - } - if (j == 1){ - mtext(names[i], side = 3, line = 0.5, cex = 0.25); - } - if (j == length(names)){ - axis(1, at = (1:short.levels), las = 3, cex.axis = 0.5); - } - } - } - colnames(final_pvalue) <- title; - #write.table(final_pvalue, file = \"test_final_pvalue.txt\", sep = \"\\t\", quote = FALSE, row.names = FALSE, append = TRUE); - - # get maximum variance larger than expectation by comparison to null bands - varnames <- vector(); - for(i in 1:length(names)){ - name1 = paste(names[i], \"var\", sep = \"_\") - varnames <- c(varnames, name1) - } - rownames(matrix) <- varnames; - colnames(matrix) <- (1:short.levels); - max_var <- names; - scale <- vector(length = length(names)); - for (x in 1:nrow(matrix)){ - if (length(which.max(matrix[x, ])) == 0){ - scale[x] <- NA; - } - else{ - scale[x] <- colnames(matrix)[which.max(matrix[x, ])]; - } - } - max_var <- cbind(max_var, scale); - write.table(max_var, file = \"$max_dwt\", sep = \"\\t\", quote = FALSE, row.names = FALSE, append = TRUE); - return(final_pvalue); - }\n"; - -print Rcmd " - # execute - # read in data - - data_test = NULL; - data_test <- read.delim(\"$inputFile\"); - - pdf(file = \"$pdf\", width = 11, height = 8); - - # loop to read and execute on all $count2 subfiles - final = NULL; - for (x in 1:$count2){ - sub = NULL; - sub_names = NULL; - a = NULL; - b = NULL; - - a = ((x - 1) * 12 + 1); - b = x * 12; - - if (x < $count2){ - sub <- data_test[, +c(a:b)]; - sub_names <- colnames(data_test)[a:b]; - final <- rbind(final, dwt_var_permut_getMax(sub, sub_names)); - } - else{ - sub <- data_test[, +c(a:ncol(data_test))]; - sub_names <- colnames(data_test)[a:ncol(data_test)]; - final <- rbind(final, dwt_var_permut_getMax(sub, sub_names)); - - } - } - - dev.off(); - - write.table(final, file = \"$pvalue\", sep = \"\\t\", quote = FALSE, row.names = FALSE); - - #eof\n"; - -close Rcmd; - -system("echo \"wavelet ANOVA started on \`hostname\` at \`date\`\"\n"); -system("R --no-restore --no-save --no-readline < $r_script > $r_script.out"); -system("echo \"wavelet ANOVA ended on \`hostname\` at \`date\`\"\n"); - -#close the input and output and error files -close(ERROR); -close(OUTPUT3); -close(OUTPUT2); -close(OUTPUT1); -close(INPUT); \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_var_perClass.xml --- a/tools/discreteWavelet/execute_dwt_var_perClass.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,105 +0,0 @@ - - in one dataset using Discrete Wavelet Transforms - - - execute_dwt_var_perClass.pl $inputFile $outputFile1 $outputFile2 $outputFile3 - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -This program generates plots and computes a table matrix of maximum variances, p-values, and test orientations at multiple scales for the occurrences of a class of features in one dataset of DNA sequences, using a multiscale wavelet analysis technique. - -The program assumes that the user has one set of DNA sequences, S, which consists of one or more sequences of equal length. 
Each sequence in S is divided into the same number of intervals n such that n = 2^k, where k is a positive integer and k >= 1. Thus, n could be any value of the set {2, 4, 8, 16, 32, 64, 128, ...}. k represents the number of scales. - -The program has one input file obtained as follows: - -For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S, and builds a tabular file representing the count results in each interval of S. This is the input file of the program. - -The program gives three output files: - -- The first output file is a TABULAR format file giving the scale at which each feature has its maximum variance. - -- The second output file is a TABULAR format file representing the variances, p-values, and test orientation for the occurrences of features at each scale, based on a random permutation test and using a multiscale wavelet analysis technique. - -- The third output file is a PDF file plotting the wavelet variances of each feature at each scale. - ----- - -.. class:: warningmark - -**Note** - -- If the number of features is greater than 12, the program will divide each output file into subfiles, such that each subfile represents the results of a group of 12 features, except the last subfile, which represents the results of the rest. For example, if the number of features is 17, the p-values file will consist of two subfiles, the first for the features 1-12 and the second for the features 13-17. As for the PDF file, it will consist of two pages in this case. - -- In order to obtain empirical p-values, a random permutation test is implemented by the program, so the program gives slightly different results each time it is run on the same input file. - ----- - - -**Example** - -Counting the occurrences of 8 features (motifs) in 16 intervals (one line per interval) of a set of DNA sequences in S gives the following tabular file:: - - deletionHoptspot insertionHoptspot dnaPolPauseFrameshift indelHotspot topoisomeraseCleavageSite translinTarget vDjRecombinationSignal x-likeSite - 226 403 416 221 1165 832 749 1056 - 236 444 380 241 1223 746 782 1207 - 242 496 391 195 1116 643 770 1219 - 243 429 364 191 1118 694 783 1223 - 244 410 371 236 1063 692 805 1233 - 230 386 370 217 1087 657 787 1215 - 275 404 402 214 1044 697 831 1188 - 265 443 365 231 1086 694 782 1184 - 255 390 354 246 1114 642 773 1176 - 281 384 406 232 1102 719 787 1191 - 263 459 369 251 1135 643 810 1215 - 280 433 400 251 1159 701 777 1151 - 278 385 382 231 1147 697 707 1161 - 248 393 389 211 1162 723 759 1183 - 251 403 385 246 1114 752 776 1153 - 239 383 347 227 1172 759 789 1141 - -We notice that the number of scales here is 4 because 16 = 2^4. 
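To make the permutation test in the Note above concrete, its core can be sketched in R. This is a minimal sketch, assuming the waveslim package is installed and that a data frame named counts (a hypothetical name) holds the example table above; taking the 2.5%/97.5% quantiles stands in for the script's sorting of 1,000 permuted values and reading off rows 25 and 975::

    library(waveslim)

    x <- counts$topoisomeraseCleavageSite  # one feature, 16 = 2^4 intervals
    J <- log2(length(x))
    w <- dwt(x, wf = "haar", n.levels = J, boundary = "reflection")
    obs <- wave.variance(w)[1:J, 1]        # observed wavelet variance per scale

    # null bands: wavelet variance of 1,000 random permutations of the series
    null <- t(replicate(1000, {
        p <- dwt(sample(x), wf = "haar", n.levels = J, boundary = "reflection")
        wave.variance(p)[1:J, 1]
    }))
    lower <- apply(null, 2, quantile, probs = 0.025)
    upper <- apply(null, 2, quantile, probs = 0.975)

    # a scale is called significant when obs falls outside [lower, upper];
    # the max_dwt output reports, per feature, the scale whose variance
    # exceeds the upper band by the largest amount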
Running the program on the above input file gives the following 3 output files: - -The first output file:: - - motifs max_var at scale - deletionHoptspot NA - insertionHoptspot NA - dnaPolPauseFrameshift NA - indelHotspot NA - topoisomeraseCleavageSite 3 - translinTarget NA - vDjRecombinationSignal NA - x.likeSite NA - -The second output file:: - - motif 1_var 1_pval 1_test 2_var 2_pval 2_test 3_var 3_pval 3_test 4_var 4_pval 4_test - - deletionHoptspot 0.457 0.048 L 1.18 0.334 R 1.61 0.194 R 3.41 0.055 R - insertionHoptspot 0.556 0.109 L 1.34 0.272 R 1.59 0.223 R 2.02 0.157 R - dnaPolPauseFrameshift 1.42 0.089 R 0.66 0.331 L 0.421 0.305 L 0.121 0.268 L - indelHotspot 0.373 0.021 L 1.36 0.254 R 1.24 0.301 R 4.09 0.047 R - topoisomeraseCleavageSite 0.305 0.002 L 0.936 0.489 R 3.78 0.01 R 1.25 0.272 R - translinTarget 0.525 0.061 L 1.69 0.11 R 2.02 0.131 R 0.00891 0.069 L - vDjRecombinationSignal 0.68 0.138 L 0.957 0.46 R 2.35 0.071 R 1.03 0.357 R - x.likeSite 0.928 0.402 L 1.33 0.261 R 0.735 0.431 L 0.783 0.422 R - -The third output file: - -.. image:: ./static/operation_icons/dwt_var_perClass.png - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_var_perFeature.pl --- a/tools/discreteWavelet/execute_dwt_var_perFeature.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,199 +0,0 @@ -#!/usr/bin/perl -w -# Author: Erika Kvikstad - -use warnings; -use IO::Handle; -use POSIX qw(floor ceil); - -$usage = "execute_dwt_var_perFeature.pl [TABULAR.in] [FEATURE] [ALPHA] [TABULAR.out] [PDF.out] \n"; -die $usage unless @ARGV == 5; - -#get the input arguments -my $inputFile = $ARGV[0]; -my @features = split(/,/,$ARGV[1]); -my $features_count = scalar(@features); -my $alpha = $ARGV[2]; -my $outFile1 = $ARGV[3]; -my $outFile2 = $ARGV[4]; - -open (INPUT, "<", $inputFile) || die("Could not open file $inputFile \n"); -open (OUTPUT2, ">", $outFile1) || die("Could not open file $outFile1 \n"); -open (OUTPUT3, ">", $outFile2) || die("Could not open file $outFile2 \n"); -#open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); - -# choosing meaningful names for the output files -$pvalue = $outFile1; -$pdf = $outFile2; - -# write R script -$r_script = "get_dwt_varPermut.r"; - -open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n"; - -print Rcmd " - ###################################################################### - # plot multiscale wavelet variance - # create null bands by permuting the original data series - # generate plots and table of wavelet variance including p-values - ###################################################################### - options(echo = FALSE) - #library(\"Rwave\"); - #library(\"wavethresh\"); - #library(\"waveslim\"); - # turn off diagnostics for debugging only; turn back on for functional tests - require(\"Rwave\",quietly=TRUE,warn.conflicts = FALSE); - require(\"wavethresh\",quietly=TRUE,warn.conflicts = FALSE); - require(\"waveslim\",quietly=TRUE,warn.conflicts = FALSE); - require(\"bitops\",quietly=TRUE,warn.conflicts = FALSE); - - # to determine if data is properly formatted 2^N observations - is.power2<- function(x){x && !(bitAnd(x,x - 1));} - - # dwt : discrete wavelet transform using the Haar wavelet filter, the simplest wavelet function; can later be modified to let the user define the wavelet filter function - dwt_var_permut_getMax <- function(data, names, alpha, filter = 1,family=\"DaubExPhase\", bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") { - 
max_var = NULL; - matrix = NULL; - title = NULL; - final_pvalue = NULL; - J = NULL; - scale = NULL; - out = NULL; - - print(class(data)); - print(names); - print(alpha); - - par(mar=c(5,4,4,3),oma = c(4, 4, 3, 2), xaxt = \"s\", cex = 1, las = 1); - - title<-c(\"Wavelet\",\"Variance\",\"Pvalue\",\"Test\"); - print(title); - - for(i in 1:length(names)){ - temp = NULL; - results = NULL; - wave1.dwt = NULL; - - # if data fails formatting check, do something - - print(is.numeric(as.matrix(data)[, i])); - if(!is.numeric(as.matrix(data)[, i])) - stop(\"data must be a numeric vector\"); - - print(length(as.matrix(data)[, i])); - print(is.power2(length(as.matrix(data)[, i]))); - if(!is.power2(length(as.matrix(data)[, i]))) - stop(\"data length must be a power of two\"); - - - J <- wd(as.matrix(data)[, i], filter.number = filter, family=family, bc = bc)\$nlevels; - print(J); - temp <- vector(length = J); - wave1.dwt <- dwt(as.matrix(data)[, i], wf = wf, J, boundary = boundary); - #print(wave1.dwt); - - temp <- wave.variance(wave1.dwt)[-(J+1), 1]; - print(temp); - - #permutations code : - feature1 = NULL; - null = NULL; - var_lower=limit_lower=NULL; - var_upper=limit_upper=NULL; - med = NULL; - - limit_lower = alpha/2*1000; - print(limit_lower); - limit_upper = (1-alpha/2)*1000; - print(limit_upper); - - feature1 = as.matrix(data)[,i]; - for (k in 1:1000) { - nk_1 = NULL; - null.levels = NULL; - var = NULL; - null_wave1 = NULL; - - nk_1 = sample(feature1, length(feature1), replace = FALSE); - null.levels <- wd(nk_1, filter.number = filter,family=family ,bc = bc)\$nlevels; - var <- vector(length = length(null.levels)); - null_wave1 <- dwt(nk_1, wf = wf, J, boundary = boundary); - var<- wave.variance(null_wave1)[-(null.levels+1), 1]; - null= rbind(null, var); - } - null <- apply(null, 2, sort, na.last = TRUE); - var_lower <- null[limit_lower, ]; - var_upper <- null[limit_upper, ]; - med <- (apply(null, 2, median, na.rm = TRUE)); - - # plot - results <- cbind(temp, var_lower, var_upper); - print(results); - matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2),xaxt='n',xlab=\"Wavelet Scale\",ylab=\"Wavelet variance\" ); - mtext(names[i], side = 3, line = 0.5, cex = 1); - axis(1, at = 1:J , labels=c(2^(0:(J-1))), las = 3, cex.axis = 1); - - # get pvalues by comparison to null distribution - #out <- (names[i]); - for (m in 1:length(temp)){ - print(paste(\"scale\", m, sep = \" \")); - print(paste(\"var\", temp[m], sep = \" \")); - print(paste(\"med\", med[m], sep = \" \")); - pv = tail =scale = NULL; - scale=2^(m-1); - #out <- c(out, format(temp[m], digits = 3)); - if (temp[m] >= med[m]){ - # R tail test - print(\"R\"); - tail <- \"R\"; - pv <- (length(which(null[, m] >= temp[m])))/(length(na.exclude(null[, m]))); - - } else { - if (temp[m] < med[m]){ - # L tail test - print(\"L\"); - tail <- \"L\"; - pv <- (length(which(null[, m] <= temp[m])))/(length(na.exclude(null[, m]))); - } - } - print(pv); - out<-rbind(out,c(paste(\"Scale\", scale, sep=\"_\"),format(temp[m], digits = 3),pv,tail)); - } - final_pvalue <-rbind(final_pvalue, out); - } - colnames(final_pvalue) <- title; - return(final_pvalue); -}\n"; - -print Rcmd " -# execute -# read in data -data_test = final = NULL; -sub = sub_names = NULL; -data_test <- read.delim(\"$inputFile\",header=FALSE); -pdf(file = \"$pdf\", width = 11, height = 8)\n"; - -for ($x=0;$x<$features_count;$x++){ - $feature=$features[$x]; -print Rcmd " - if ($feature > ncol(data_test)) - stop(\"column $feature doesn't exist\"); - sub<-data_test[,$feature]; - 
#sub_names <- colnames(data_test); - sub_names<-colnames(data_test)[$feature]; - final <- rbind(final,dwt_var_permut_getMax(sub, sub_names,$alpha));\n"; -} - -print Rcmd " - - dev.off(); - write.table(final, file = \"$pvalue\", sep = \"\\t\", quote = FALSE, row.names = FALSE); - -#eof\n"; - -close Rcmd; -system("R --no-restore --no-save --no-readline < $r_script > $r_script.out"); - -#close the input and output and error files -close(OUTPUT3); -close(OUTPUT2); -close(INPUT); diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_var_perFeature.xml --- a/tools/discreteWavelet/execute_dwt_var_perFeature.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ - - using Discrete Wavelet Transforms - - - execute_dwt_var_perFeature.pl $inputFile $feature $alpha $outputFile1 $outputFile2 - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -This tool computes the scale-specific variance in wavelet coefficients obtained from the discrete wavelet transform of a feature of interest. - -Input data consists of an ordered series of data, S, equispaced and of sample size N, where N is of the form N = 2^k, and k is a positive integer and represents the number of levels of wavelet decomposition. S could be a time series, or a set of DNA sequences. The user calculates a statistic of interest for each feature in each interval of S: say, expression level of a particular gene in a time course, or the number of LINE elements per window across a chromosome. This tool then performs a discrete wavelet transform of the feature of interest, and plots the resulting variance in wavelet coefficients per wavelet scale. In addition, statistical significance of variances is determined by 1,000 random permutations of the intervals in S, to generate null bands (representing the user-provided alpha value) corresponding to the empirical distribution of wavelet variances under the null hypothesis of no inherent order to the series in S. - -This tool generates two output files: - -- The first output file is a TABULAR format file representing the variances, p-values, and test orientation for the features at each wavelet scale based on a random permutation test. - -- The second output file is a PDF image plotting the wavelet variances of each feature at each scale. - ----- - -.. class:: warningmark - -**Note** -In order to obtain empirical p-values, a random permutation scheme is implemented by the tool, so the output may show slight variations in results each time it is run on the same input file. - ----- - - -**Example** - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_antigenic.xml --- a/tools/emboss_5/emboss_antigenic.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ - - Predicts potentially antigenic regions of a protein sequence, using the method of Kolaskar and Tongaonkar. - emboss - antigenic -sequence $input1 -outfile $out_file1 -minlen $minlen -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/antigenic.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_backtranseq.xml --- a/tools/emboss_5/emboss_backtranseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,220 +0,0 @@ - - Back translate a protein sequence - emboss - backtranseq -sequence $input1 -outfile $out_file1 -cfile $cfile -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/backtranseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_banana.pl --- a/tools/emboss_5/emboss_banana.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,16 +0,0 @@ -#! /usr/bin/perl -w -use strict; - -my $cmd_string = join (" ",@ARGV); -#my $cmd_string = "/home/djb396/temp/emboss/bin/banana -sequence /home/djb396/universe-prototype/test.fasta -outfile result.txt -graph png -goutfile results -auto"; -my $results = `$cmd_string`; -my @files = split("\n",$results); -foreach my $thisLine (@files) -{ - if ($thisLine =~ /Created /i) - { - $thisLine =~ /[\w|\.]+$/; - $thisLine =$&; - print "outfile: $thisLine\n"; - } -} diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_banana.xml --- a/tools/emboss_5/emboss_banana.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ - - Bending and curvature plot in B-DNA - emboss - banana -sequence $input1 -outfile $out_file1 -graph none -auto - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/banana.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_biosed.xml --- a/tools/emboss_5/emboss_biosed.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ - - Replace or delete sequence sections - emboss - biosed -sequence $input1 -outseq $out_file1 -target $target -replace $replace -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/biosed.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_btwisted.xml --- a/tools/emboss_5/emboss_btwisted.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ - - Calculates the twisting in a B-DNA sequence - emboss - btwisted -sequence $input1 -outfile $out_file1 -auto - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/btwisted.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cai.xml --- a/tools/emboss_5/emboss_cai.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,185 +0,0 @@ - - CAI codon adaptation index - emboss - cai -seqall $input1 -outfile $out_file1 -cfile $cfile -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cai.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cai_custom.xml --- a/tools/emboss_5/emboss_cai_custom.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ - - CAI codon adaptation index using custom codon usage file - emboss - cai -seqall $input1 -outfile $out_file1 -cfile $input2 -auto - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cai_custom.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_chaos.xml --- a/tools/emboss_5/emboss_chaos.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ - - Create a chaos game representation plot for a sequence - emboss - emboss_single_outputfile_wrapper.pl chaos -sequence $input1 -graph png -goutfile $out_file1 -auto - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/chaos.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_charge.xml --- a/tools/emboss_5/emboss_charge.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ - - Protein charge plot - emboss - charge -seqall $input1 -outfile $out_file1 -window $window -auto - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/charge.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_checktrans.xml --- a/tools/emboss_5/emboss_checktrans.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ - - Reports STOP codons and ORF statistics of a protein - emboss - checktrans -sequence $input1 -outfile $out_file1 -outseq $out_file2 -osformat3 $out_format2 -outfeat $out_file3 -offormat4 $out_format3 -orfml $orfml -addlast $addlast -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/checktrans.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_chips.xml --- a/tools/emboss_5/emboss_chips.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - Codon usage statistics - emboss - chips -seqall $input1 -outfile $out_file1 -sum $sum -auto - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/chips.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cirdna.xml --- a/tools/emboss_5/emboss_cirdna.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ - - Draws circular maps of DNA constructs - emboss - emboss_single_outputfile_wrapper.pl cirdna -infile $input1 -graphout png -goutfile $out_file1 -auto - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cirdna.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_codcmp.xml --- a/tools/emboss_5/emboss_codcmp.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,330 +0,0 @@ - - Codon usage table comparison - emboss - codcmp -first $cfile1 -second $cfile2 -outfile $out_file1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/codcmp.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_coderet.xml --- a/tools/emboss_5/emboss_coderet.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ - - Extract CDS, mRNA and translations from feature tables - emboss - - coderet -seqall $input1 -outfile $out_file1 -auto - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/coderet.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_compseq.xml --- a/tools/emboss_5/emboss_compseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - Count composition of dimer/trimer/etc words in a sequence - emboss - compseq -sequence $input1 -outfile $out_file1 -word $word -frame $frame -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/compseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cpgplot.xml --- a/tools/emboss_5/emboss_cpgplot.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ - - Plot CpG rich areas - emboss - emboss_cpgplot_wrapper.pl cpgplot -sequence $input1 -window $window -minlen $minlen -minpc $minpc -outfile $outfile -graph png -goutfile $goutfile -outfeat $outfeat -minoe $minoe -auto - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cpgplot.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cpgplot_wrapper.pl --- a/tools/emboss_5/emboss_cpgplot_wrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -#! /usr/bin/perl -w -use strict; -use File::Copy; - -my $cmd_string = join (" ",@ARGV); -my $results = `$cmd_string`; -my @files = split("\n",$results); -my $fileNameOut = $ARGV[14]; -move($fileNameOut.".1.png",$fileNameOut); diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cpgreport.xml --- a/tools/emboss_5/emboss_cpgreport.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ - - Reports all CpG rich regions - emboss - cpgreport -sequence $input1 -outfile $out_file1 -outfeat $out_file2 -offormat3 $out_format2 -score $score -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cpgreport.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cusp.xml --- a/tools/emboss_5/emboss_cusp.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - Create a codon usage table - emboss - cusp -sequence $input1 -outfile $out_file1 -auto - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cusp.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cutseq.xml --- a/tools/emboss_5/emboss_cutseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - Removes a specified section from a sequence - emboss - cutseq -sequence $input1 -outseq $out_file1 -from $from -to $to -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cutseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_dan.xml --- a/tools/emboss_5/emboss_dan.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ - - Calculates DNA RNA/DNA melting temperature - emboss - emboss_single_outputfile_wrapper.pl dan -sequence $input1 -windowsize $window -goutfile $out_file1 -graph png -plot $plot1 -shiftincrement $shift -dnaconc $dnaconc - -saltconc $saltconc -product $product -formamide $formamide -mismatch $mismatch -prodlen $prodlen -thermo $thermo -temperature $temperature -rna $rna -outfile $out_file1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dan.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_degapseq.xml --- a/tools/emboss_5/emboss_degapseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - - Removes gap characters from sequences - emboss - degapseq -sequence $input1 -outseq $out_file1 -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/degapseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_descseq.xml --- a/tools/emboss_5/emboss_descseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - Alter the name or description of a sequence - emboss - descseq -sequence $input1 -outseq $out_file1 -name "$seqname" -description "$desc" -append $append -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/descseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_diffseq.xml --- a/tools/emboss_5/emboss_diffseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ - - Find differences between nearly identical sequences - emboss - diffseq -asequence $input1 -bsequence $input2 -outfile $out_file1 -aoutfeat $out_file2 -boutfeat $out_file3 -wordsize $wordsize -globaldifferences $globaldifferences -rformat3 - $out_format1 -offormat4 $out_format2 -offormat5 $out_format3 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/diffseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_digest.xml --- a/tools/emboss_5/emboss_digest.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ - - Protein proteolytic enzyme or reagent cleavage digest - emboss - digest -seqall $input1 -outfile $out_file1 -menu $menu -unfavoured $unfavoured -overlap $overlap -allpartials $allpartials -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/digest.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_dotmatcher.xml --- a/tools/emboss_5/emboss_dotmatcher.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ - - Displays a thresholded dotplot of two sequences - emboss - emboss_single_outputfile_wrapper.pl dotmatcher -asequence $input1 -bsequence $input2 -goutfile $out_file1 -windowsize $windowsize -threshold $threshold -graph png -xygraph png - -auto - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dotmatcher.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_dotpath.xml --- a/tools/emboss_5/emboss_dotpath.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ - - Non-overlapping wordmatch dotplot of two sequences - emboss - emboss_single_outputfile_wrapper.pl dotpath -asequence $input1 -bsequence $input2 -goutfile $out_file1 -wordsize $wordsize -overlaps $overlaps -boxit $boxit -graph png - -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dotpath.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_dottup.xml --- a/tools/emboss_5/emboss_dottup.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - Displays a wordmatch dotplot of two sequences - emboss - emboss_single_outputfile_wrapper.pl dottup -asequence $input1 -bsequence $input2 -goutfile $out_file1 -wordsize $wordsize -boxit $boxit -graph png -xygraph png -auto - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dottup.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_dreg.xml --- a/tools/emboss_5/emboss_dreg.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ - - Regular expression search of a nucleotide sequence - emboss - dreg -sequence $input1 -outfile $out_file1 -pattern "$pattern" -raccshow3 "no" -rusashow3 "no" -rdesshow3 "no" -auto - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dreg.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_einverted.xml --- a/tools/emboss_5/emboss_einverted.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ - - Finds DNA inverted repeats - emboss - einverted -sequence $input1 -outfile $out_file1 -gap $gap -threshold $threshold -match $match -mismatch $mismatch -maxrepeat $maxrepeat -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/einverted.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_epestfind.xml --- a/tools/emboss_5/emboss_epestfind.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ - - Finds PEST motifs as potential proteolytic cleavage sites - emboss - emboss_single_outputfile_wrapper.pl epestfind -sequence $input1 -goutfile $ofile2 -outfile $ofile1 -window $window -order $order -potential $potential -poor $poor - -invalid $invalid -map $map -graph png -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/epestfind.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_equicktandem.xml --- a/tools/emboss_5/emboss_equicktandem.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ - - Finds tandem repeats - emboss - equicktandem -sequence $input1 -outfile $out_file1 -origfile $ofile2 -maxrepeat $maxrepeat -threshold $threshold -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/equicktandem.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_est2genome.xml --- a/tools/emboss_5/emboss_est2genome.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,103 +0,0 @@ - - Align EST and genomic DNA sequences - emboss - est2genome -estsequence $input1 -genomesequence $input2 -outfile $out_file1 -match $match -mismatch $mismatch -gappenalty $gappenalty -intronpenalty $intronpenalty -splicepenalty - $splicepenalty -minscore $minscore -reverse $reverse -splice $splice -mode $mode -best $best -shuffle $shuffle -seed $seed -align $align -width $width -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/est2genome.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_etandem.xml --- a/tools/emboss_5/emboss_etandem.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ - - Looks for tandem repeats in a nucleotide sequence - emboss - etandem -sequence $input1 -outfile $out_file1 -origfile $ofile2 -minrepeat $minrepeat -maxrepeat $maxrepeat -threshold $threshold -mismatch $mismatch -uniform $uniform -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/etandem.html - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_extractfeat.xml --- a/tools/emboss_5/emboss_extractfeat.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ - - - Extract features from a sequence - emboss - extractfeat -sequence $input1 -outseq $out_file1 -before $before -after $after -source "$source" -type "$type" -sense $sense -minscore $minscore -maxscore $maxscore -tag "$tag" -value - "$value" -join $join -featinname $featinname -describe "$describe" -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/extractfeat.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_extractseq.xml --- a/tools/emboss_5/emboss_extractseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ - - Extract regions from a sequence - emboss - extractseq -sequence $input1 -outseq $out_file1 -regions $regions -separate $separate -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/extractseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_format_corrector.py --- a/tools/emboss_5/emboss_format_corrector.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -#EMBOSS format corrector - -import operator -#from galaxy import datatypes - -#Properly set file formats after job run -def exec_after_process( app, inp_data, out_data, param_dict,tool, stdout, stderr): -#Properly set file formats before job run -#def exec_before_job(trans, inp_data, out_data, param_dict,tool): - #why isn't items an ordered list? - items = out_data.items() - #lets sort it ourselves.... - items = sorted(items, key=operator.itemgetter(0)) - #items is now sorted... 
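# note (added for orientation): the three passes below all walk the sorted
# outputs in order; the first maps each out_format<N> parameter onto a Galaxy
# datatype (ncbi -> fasta, excel -> tabular, text -> txt), the second switches
# a dataset to 'html' when html_out<N> is "yes", and the third switches a
# dataset to 'png' when plot<N> is "yes"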
- - #normal filetype correction - data_count=1 - for name, data in items: - outputType = param_dict.get( 'out_format'+str(data_count), None ) - #print "data_count",data_count, "name", name, "outputType", outputType - if outputType !=None: - if outputType == 'ncbi': - outputType = "fasta" - elif outputType == 'excel': - outputType = "tabular" - elif outputType == 'text': - outputType = "txt" - data = app.datatypes_registry.change_datatype(data, outputType) - app.model.context.add( data ) - app.model.context.flush() - data_count+=1 - - #html filetype correction - data_count=1 - for name, data in items: - wants_plot = param_dict.get( 'html_out'+str(data_count), None ) - ext = "html" - if wants_plot == "yes": - data = app.datatypes_registry.change_datatype(data, ext) - app.model.context.add( data ) - app.model.context.flush() - data_count+=1 - - #png file correction - data_count=1 - for name, data in items: - wants_plot = param_dict.get( 'plot'+str(data_count), None ) - ext = "png" - if wants_plot == "yes": - data = app.datatypes_registry.change_datatype(data, ext) - app.model.context.add( data ) - app.model.context.flush() - data_count+=1 diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_freak.xml --- a/tools/emboss_5/emboss_freak.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ - - Residue/base frequency table or plot - emboss - freak -seqall $input1 -outfile $out_file1 -window $window -letters $letters -graph png -step $step -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/freak.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_fuzznuc.xml --- a/tools/emboss_5/emboss_fuzznuc.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ - - Nucleic acid pattern search - emboss - fuzznuc -sequence $input1 -outfile $out_file1 -pattern '$pattern' -pmismatch $mismatch -complement $complement -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/fuzznuc.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_fuzzpro.xml --- a/tools/emboss_5/emboss_fuzzpro.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ - - Protein pattern search - emboss - fuzzpro -sequence $input1 -outfile $out_file1 -pattern "$pattern" -pmismatch $mismatch -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/fuzzpro.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_fuzztran.xml --- a/tools/emboss_5/emboss_fuzztran.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,95 +0,0 @@ - - Protein pattern search after translation - emboss - fuzztran -sequence $input1 -outfile $out_file1 -pattern "$pattern" -pmismatch $mismatch -frame $frame -table $table -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/fuzztran.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_garnier.xml --- a/tools/emboss_5/emboss_garnier.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - - Predicts protein secondary structure - emboss - garnier -sequence $input1 -outfile $out_file1 -idc $idc -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/4.0/emboss/apps/garnier.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_geecee.xml --- a/tools/emboss_5/emboss_geecee.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ - - Calculates fractional GC content of nucleic acid sequences - emboss - geecee -sequence $input1 -outfile $out_file1 -auto - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/geecee.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_getorf.xml --- a/tools/emboss_5/emboss_getorf.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,129 +0,0 @@ - - Finds and extracts open reading frames (ORFs) - emboss - getorf -sequence $input1 -outseq $out_file1 -table $table -minsize $minsize -maxsize $maxsize -find $find -methionine $methionine -circular $circular -reverse $reverse -flanking $flanking - -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/getorf.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_helixturnhelix.xml --- a/tools/emboss_5/emboss_helixturnhelix.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ - - Report nucleic acid binding motifs - emboss - helixturnhelix -sequence $input1 -outfile $out_file1 -mean $mean -sd $sd -minsd $minsd -eightyseven $eightyseven -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/helixturnhelix.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_hmoment.xml --- a/tools/emboss_5/emboss_hmoment.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ - - Hydrophobic moment calculation - emboss - hmoment -seqall $input1 -outfile $out_file1 -window $window -aangle $aangle -graph png -auto - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/hmoment.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_iep.xml --- a/tools/emboss_5/emboss_iep.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ - - Calculates the isoelectric point of a protein - emboss - iep -sequence $input1 -outfile $out_file1 -step $step -amino $amino -graph png -termini $termini -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/iep.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_infoseq.xml --- a/tools/emboss_5/emboss_infoseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ - - - Displays some simple information about sequences - emboss - infoseq -sequence $input1 -outfile $out_file1 -html $html_out1 -heading $heading -usa $usa -name $disname -accession $accession -gi $gi -version $version -type $type -length $length -pgc - $pgc -description $description -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/infoseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_infoseq_wrapper.pl --- a/tools/emboss_5/emboss_infoseq_wrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -#! /usr/bin/perl -w -use strict; - -my $cmd_string = join (" ",@ARGV); -my $results = `$cmd_string`; -if ($ARGV[6]=~/yes/) -{ - print "Extension: html\n"; -} diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_isochore.xml --- a/tools/emboss_5/emboss_isochore.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ - - Plots isochores in large DNA sequences - emboss - emboss_single_outputfile_wrapper.pl isochore -sequence $input1 -outfile $ofile2 -goutfile $ofile1 -graph png -window $window -shift $shift -auto - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - -**Syntax** - -This application plots GC content over a sequence. It is intended for large sequences such as complete chromosomes or large genomic contigs, although interesting results can also be obtained from shorter sequences. You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/isochore.html - -- Both **Window size** and **Shift increment** are intergers. - ------ - -**Example** - -- Input sequences:: - - >hg18_dna range=chrX:151073054-151073376 5'pad=0 3'pad=0 revComp=FALSE strand=? 
repeatMasking=none - TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCA - GTAAAAAGAAGGCTGTTGTTCCGTGAAATACTGTCTTTATGCCTCAGATT - TGGAGTGCTCAGAGCCTCTGCAGCAAAGATTTGGCATGTGTCCTAGGCCT - GCTCAGAGCAGCAAATCCCACCCTCTTGGAGAATGAGACTCATAGAGGGA - CAGCTCCCTCCTCAGAGGCTTCTCTAATGGGACTCCAAAGAGCAAACACT - CAGCCCCATGAGGACTGGCCAGGCCAAGTGGTGTGTGGGAACAGGGAGCA - GCGGTTTCCAAGAGGATACAGTA - -- Output data file:: - - Position Percent G+C 1 .. 323 - 80 0.422 - 112 0.460 - 144 0.509 - 176 0.534 - 208 0.553 - 240 0.553 - -- Output graphics file: - -.. image:: ./static/emboss_icons/isochore.png - - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_lindna.xml --- a/tools/emboss_5/emboss_lindna.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,99 +0,0 @@ - - - Draws linear maps of DNA constructs - emboss - lindna -infile $input1 -graphout png -goutfile $out_file1 -ruler $ruler -blocktype $blocktype -maxgroups $maxgroups -maxlabels $maxlabels -intersymbol $intersymbol -intercolour $intercolour - -interticks $interticks -gapsize $gapsize -ticklines $ticklines -textheight $textheight -textlength $textlength -margin $margin -tickheight $tickheight -blockheight $blockheight -rangeheight - $rangeheight -gapgroup $gapgroup -postext $postext -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/lindna.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_marscan.xml --- a/tools/emboss_5/emboss_marscan.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ - - Finds MAR/SAR sites in nucleic sequences - emboss - marscan -sequence $input1 -outfile $out_file1 -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/marscan.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_maskfeat.xml --- a/tools/emboss_5/emboss_maskfeat.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - Mask off features of a sequence - emboss - maskfeat -sequence $input1 -outseq $out_file1 -type "$type" -tolower $tolower -maskchar "$maskchar" -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/maskfeat.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_maskseq.xml --- a/tools/emboss_5/emboss_maskseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - Mask off regions of a sequence - emboss - maskseq -sequence $input1 -outseq $out_file1 -regions "$regions" -tolower $tolower -maskchar "$maskchar" -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
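As a concrete reference for the isochore example above, the windowed G+C table can be reproduced in a few lines of Python. A sketch under assumptions: the window advances by the shift increment and each value is reported at the window midpoint (consistent with the 32-base spacing in the sample output, though the exact window and shift behind it are not recoverable from this hunk)::

    def gc_profile(seq, window, shift):
        # Yield (midpoint position, fraction G+C) per window, mirroring the
        # "Position  Percent G+C" table in the isochore example.
        seq = seq.upper()
        for start in range(0, len(seq) - window + 1, shift):
            chunk = seq[start:start + window]
            gc = (chunk.count('G') + chunk.count('C')) / float(len(chunk))
            yield start + window // 2, round(gc, 3)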
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/maskseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_matcher.xml --- a/tools/emboss_5/emboss_matcher.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ - - Finds the best local alignments between two sequences - emboss - matcher -asequence $input1 -bsequence $input2 -outfile $out_file1 -alternatives $alternatives -gapopen $gapopen -gapextend $gapextend -aformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/matcher.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_megamerger.xml --- a/tools/emboss_5/emboss_megamerger.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ - - Merge two large overlapping nucleic acid sequences - emboss - megamerger -asequence $input1 -bsequence $input2 -outseq $out_file1 -outfile $out_file2 -wordsize $wordsize -prefer $prefer -osformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/megamerger.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_merger.xml --- a/tools/emboss_5/emboss_merger.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ - - Merge two overlapping nucleic acid sequences - emboss - merger -asequence $input1 -bsequence $input2 -outseq $out_file1 -outfile $out_file2 -gapopen $gapopen -gapextend $gapextend -osformat4 $out_format1 -aformat3 $out_format2 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/merger.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_msbar.xml --- a/tools/emboss_5/emboss_msbar.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,117 +0,0 @@ - - Mutate sequence beyond all recognition - emboss - msbar -sequence $input1 -outseq $out_file1 -count $count -point $point -block $block -codon $codon -inframe $inframe -minimum $minimum -maximum $maximum -osformat2 $out_format1 - -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/msbar.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_multiple_outputfile_wrapper.pl --- a/tools/emboss_5/emboss_multiple_outputfile_wrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -#! 
/usr/bin/perl -w -use strict; - -my $cmd_string = join (" ",@ARGV); -my $results = `$cmd_string`; -my @files = split("\n",$results); -foreach my $thisLine (@files) -{ - if ($thisLine =~ /Created /) - { - $thisLine =~ /[\w|\.]+$/; - $thisLine =$&; - print "outfile: $thisLine\n"; - } - else - { - print $thisLine,"\n"; - } -} diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_needle.xml --- a/tools/emboss_5/emboss_needle.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,126 +0,0 @@ - - Needleman-Wunsch global alignment - emboss - needle -asequence $input1 -bsequence $input2 -outfile $out_file1 -gapopen $gapopen -gapextend $gapextend -brief $brief -aformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -needle reads any two sequences of the same type (DNA or protein). - ------ - -**Syntax** - -This tool uses the Needleman-Wunsch global alignment algorithm to find the optimum alignment (including gaps) of two sequences when considering their entire length. - -- **Optimal alignment:** Dynamic programming methods ensure the optimal global alignment by exploring all possible alignments and choosing the best. - -- **The Needleman-Wunsch algorithm** is a member of the class of algorithms that can calculate the best score and alignment in the order of mn steps, (where 'n' and 'm' are the lengths of the two sequences). - -- **Gap open penalty:** [10.0 for any sequence] The gap open penalty is the score taken away when a gap is created. The best value depends on the choice of comparison matrix. The default value assumes you are using the EBLOSUM62 matrix for protein sequences, and the EDNAFULL matrix for nucleotide sequences. (Floating point number from 1.0 to 100.0) - -- **Gap extension penalty:** [0.5 for any sequence] The gap extension, penalty is added to the standard gap penalty for each base or residue in the gap. This is how long gaps are penalized. Usually you will expect a few long gaps rather than many short gaps, so the gap extension penalty should be lower than the gap penalty. An exception is where one or both sequences are single reads with possible sequencing errors in which case you would expect many single base gaps. You can get this result by setting the gap open penalty to zero (or very low) and using the gap extension penalty to control gap scoring. (Floating point number from 0.0 to 10.0) - -You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/needle.html - ------ - -**Example** - -- Input File:: - - >hg18_dna range=chrX:151073054-151073136 5'pad=0 3'pad=0 revComp=FALSE strand=? 
repeatMasking=none - TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCA - GTAAAAAGAAGGCTGTTGTTCCGTGAAATACTG - -- If both Sequence1 and Sequence2 take the above file as input, Gap open penalty equals 10.0, Gap extension penalty equals 0.5, Brief identity and similarity is set to Yes, Output Alignment File Format is set to SRS pairs, the output file is:: - - ######################################## - # Program: needle - # Rundate: Mon Apr 02 2007 14:23:16 - # Align_format: srspair - # Report_file: ./database/files/dataset_7.dat - ######################################## - - #======================================= - # - # Aligned_sequences: 2 - # 1: hg18_dna - # 2: hg18_dna - # Matrix: EDNAFULL - # Gap_penalty: 10.0 - # Extend_penalty: 0.5 - # - # Length: 83 - # Identity: 83/83 (100.0%) - # Similarity: 83/83 (100.0%) - # Gaps: 0/83 ( 0.0%) - # Score: 415.0 - # - #======================================= - - hg18_dna 1 TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCA 50 - |||||||||||||||||||||||||||||||||||||||||||||||||| - hg18_dna 1 TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCA 50 - - hg18_dna 51 GTAAAAAGAAGGCTGTTGTTCCGTGAAATACTG 83 - ||||||||||||||||||||||||||||||||| - hg18_dna 51 GTAAAAAGAAGGCTGTTGTTCCGTGAAATACTG 83 - - #--------------------------------------- - #--------------------------------------- - - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_newcpgreport.xml --- a/tools/emboss_5/emboss_newcpgreport.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ - - Report CpG rich areas - emboss - newcpgreport -sequence $input1 -window $window -shift $shift -minlen $minlen -minpc $minpc -outfile $out_file1 -minoe $minoe -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/newcpgreport.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_newcpgseek.xml --- a/tools/emboss_5/emboss_newcpgseek.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ - - Reports CpG rich region - emboss - newcpgseek -sequence $input1 -outfile $out_file1 -score $score -auto - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/newcpgseek.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_newseq.xml --- a/tools/emboss_5/emboss_newseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - Type in a short new sequence - emboss - newseq -outseq $out_file1 -name "$seqname" -description "$description" -type $type -sequence "$sequence" -osformat5 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
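The needle help above describes the Needleman-Wunsch recurrence in words; the toy scorer below makes it concrete. This is an illustrative sketch, not needle itself: it uses EDNAFULL-style +5/-4 match/mismatch scores but a single linear gap penalty, whereas needle applies separate -gapopen/-gapextend penalties and full scoring matrices::

    def nw_score(a, b, match=5, mismatch=-4, gap=-10):
        # F[i][j] = best global-alignment score of a[:i] versus b[:j].
        # Filling the (m+1) x (n+1) table is the "order of mn steps" noted above.
        m, n = len(a), len(b)
        F = [[0] * (n + 1) for _ in range(m + 1)]
        for i in range(1, m + 1):
            F[i][0] = i * gap
        for j in range(1, n + 1):
            F[0][j] = j * gap
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                s = match if a[i - 1] == b[j - 1] else mismatch
                F[i][j] = max(F[i - 1][j - 1] + s,
                              F[i - 1][j] + gap,
                              F[i][j - 1] + gap)
        return F[m][n]

With no gaps and a +5 match score, aligning the 83-base example sequence against itself gives 83 * 5 = 415, matching the "Score: 415.0" line in the sample report.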
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/newseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_noreturn.xml --- a/tools/emboss_5/emboss_noreturn.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ - - Removes carriage return from ASCII files - emboss - noreturn -infile $input1 -outfile $out_file1 -system $system -auto - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/noreturn.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_notseq.xml --- a/tools/emboss_5/emboss_notseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ - - Exclude a set of sequences and write out the remaining ones - emboss - notseq -sequence $input1 -outseq $out_file1 -exclude "$exclude" -osformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/notseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_nthseq.xml --- a/tools/emboss_5/emboss_nthseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ - - Writes one sequence from a multiple set of sequences - emboss - nthseq -sequence $input1 -outseq $out_file1 -number $number -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/nthseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_octanol.xml --- a/tools/emboss_5/emboss_octanol.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ - - - Displays protein hydropathy - emboss - emboss_single_outputfile_wrapper.pl octanol -sequence $input1 -graph png -goutfile $out_file1 -width $width -octanolplot $octanolplot -interfaceplot $interfaceplot - -differenceplot $differenceplot -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/octanol.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_oddcomp.xml --- a/tools/emboss_5/emboss_oddcomp.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ - - - Find protein sequence regions with a biased composition - emboss - oddcomp -sequence $input1 -infile $input2 -outfile $out_file1 -window $window -ignorebz $ignorebz -auto - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/oddcomp.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_palindrome.xml --- a/tools/emboss_5/emboss_palindrome.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ - - Looks for inverted repeats in a nucleotide sequence - emboss - palindrome -sequence $input1 -outfile $out_file1 -minpallen $minpallen -maxpallen $maxpallen -gaplimit $gaplimit -nummismatches $nummismatches -overlap $overlap -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/palindrome.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pasteseq.xml --- a/tools/emboss_5/emboss_pasteseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ - - Insert one sequence into another - emboss - pasteseq -asequence $input2 -bsequence $input1 -outseq $out_file1 -pos $pos -osformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input datasets need to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pasteseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_patmatdb.xml --- a/tools/emboss_5/emboss_patmatdb.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ - - Search a protein sequence with a motif - emboss - patmatdb -sequence $input1 -outfile $out_file1 -motif "$motif" -rformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/patmatdb.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pepcoil.xml --- a/tools/emboss_5/emboss_pepcoil.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ - - Predicts coiled coil regions - emboss - pepcoil -sequence $input1 -outfile $out_file1 -window $window -coil $coil -frame $frame -other $other -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepcoil.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pepinfo.xml --- a/tools/emboss_5/emboss_pepinfo.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,28 +0,0 @@ - - - Plots simple amino acid properties in parallel - emboss - emboss_single_outputfile_wrapper.pl pepinfo -sequence $input1 -outfile $out_file1 -goutfile $out_file2 -graph png -hwindow $hwindow $plot_type -auto - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepinfo.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pepnet.xml --- a/tools/emboss_5/emboss_pepnet.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ - - - Displays proteins as a helical net - emboss - pepnet -sequence $input1 -graph png -goutfile $out_file1 -squares $squares -diamonds $diamonds -octags $octags -amphipathic $amphipathic -auto - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepnet.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pepstats.xml --- a/tools/emboss_5/emboss_pepstats.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - Protein statistics - emboss - pepstats -sequence $input1 -outfile $out_file1 -termini $termini -auto - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepstats.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pepwheel.xml --- a/tools/emboss_5/emboss_pepwheel.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ - - - Shows protein sequences as helices - emboss - emboss_single_outputfile_wrapper.pl pepwheel -sequence $input1 -graph png -goutfile $out_file1 -squares $squares -diamonds $diamonds -octags $octags -amphipathic - $amphipathic -steps $steps -turns $turns -wheel $wheel -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepwheel.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pepwindow.xml --- a/tools/emboss_5/emboss_pepwindow.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ - - - Displays protein hydropathy - emboss - emboss_single_outputfile_wrapper.pl pepwindow -sequence $input1 -graph png -goutfile $out_file1 -length $length -auto - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepwindow.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pepwindowall.xml --- a/tools/emboss_5/emboss_pepwindowall.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ - - - Displays protein hydropathy of a set of sequences - emboss - emboss_single_outputfile_wrapper.pl pepwindowall -sequence $input1 -graph png -goutfile $out_file1 -length $length -auto - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepwindowall.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_plotcon.xml --- a/tools/emboss_5/emboss_plotcon.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ - - - Plot quality of conservation of a sequence alignment - emboss - emboss_single_outputfile_wrapper.pl plotcon -sequences $input1 -graph png -goutfile $out_file1 -winsize $winsize -auto - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/plotcon.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_plotorf.xml --- a/tools/emboss_5/emboss_plotorf.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ - - - Plot potential open reading frames - emboss - emboss_single_outputfile_wrapper.pl plotorf -sequence $input1 -graph png -goutfile $out_file1 -start $start -stop $stop -auto - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/plotorf.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_polydot.xml --- a/tools/emboss_5/emboss_polydot.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ - - - Displays all-against-all dotplots of a set of sequences - emboss - emboss_single_outputfile_wrapper.pl polydot -sequence $input1 -graph png -goutfile $output2 -outfeat $output1 -wordsize $wordsize -boxit $boxit -dumpfeat yes -gap - $gap -auto - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/polydot.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_preg.xml --- a/tools/emboss_5/emboss_preg.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ - - Regular expression search of a protein sequence - emboss - preg -sequence $input1 -outfile $out_file1 -pattern "$pattern" -auto - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/preg.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_prettyplot.xml --- a/tools/emboss_5/emboss_prettyplot.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,113 +0,0 @@ - - - Displays aligned sequences, with colouring and boxing - emboss - prettyplot -sequences $input1 -graph png -goutfile $out_file1 -residuesperline $residuesperline -resbreak $resbreak -ccolours $ccolours -cidentity $cidentity -csimilarity $csimilarity - -cother $cother -docolour $docolour -gtitle $title -pair $pair -identity $identity -box $box -boxcol $boxcol -boxcolval $boxcolval -name $name -maxnamelen $maxnamelen -number $number -listoptions - $listoptions -consensus $consensus -collision $collision -alternative $alternative -showscore $showscore -portrait $portrait -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/prettyplot.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_prettyseq.xml --- a/tools/emboss_5/emboss_prettyseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ - - Output sequence with translated ranges - emboss - prettyseq -sequence $input1 -outfile $out_file1 -ruler $ruler -plabel $plabel -nlabel $nlabel -width $width -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/prettyseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_primersearch.xml --- a/tools/emboss_5/emboss_primersearch.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ - - Searches DNA sequences for matches with primer pairs - emboss - primersearch -seqall $input1 -infile $input2 -outfile $out_file1 -mismatchpercent $mismatchpercent -auto - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/primersearch.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_revseq.xml --- a/tools/emboss_5/emboss_revseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ - - Reverse and complement a sequence - emboss - revseq -sequence $input1 -outseq $out_file1 -reverse $reverse -complement $complement -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/revseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_seqmatchall.xml --- a/tools/emboss_5/emboss_seqmatchall.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ - - All-against-all comparison of a set of sequences - emboss - seqmatchall -sequence $input1 -outfile $out_file1 -wordsize $wordsize -aformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - . - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/seqmatchall.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_seqret.xml --- a/tools/emboss_5/emboss_seqret.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,70 +0,0 @@ - - Reads and writes sequences - emboss - seqret -sequence $input1 -outseq $out_file1 -feature $feature -firstonly $firstonly -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/seqret.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_showfeat.xml --- a/tools/emboss_5/emboss_showfeat.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ - - - Show features of a sequence - emboss - showfeat -sequence $input1 -outfile $out_file1 -matchsource "$matchsource" -matchtype "$matchtype" -matchtag "$matchtag" -matchvalue "$matchvalue" -sort $sort -annotation "$annotation" -id - $id -description "$description" -scale "$scale" -width "$width" -collapse $collapse -forward $forward -reverse $reverse -unknown $unknown -strand $strand -source $source -position $position -type - $type -tags $tags -values $values -stricttags $stricttags -html $html_out1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/showfeat.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_shuffleseq.xml --- a/tools/emboss_5/emboss_shuffleseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ - - - Shuffles a set of sequences maintaining composition - emboss - shuffleseq -sequence $input1 -outseq $out_file1 -shuffle "$shuffle" -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/shuffleseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_sigcleave.xml --- a/tools/emboss_5/emboss_sigcleave.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ - - Reports protein signal cleavage sites - emboss - sigcleave -sequence $input1 -outfile $out_file1 -minweight "$minweight" -prokaryote $prokaryote -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/sigcleave.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_single_outputfile_wrapper.pl --- a/tools/emboss_5/emboss_single_outputfile_wrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ -#! /usr/bin/perl -w -use strict; -use File::Copy; - -my $cmd_string = join (" ",@ARGV); -my $results = `$cmd_string`; -my @files = split("\n",$results); -my $fileNameOut = $ARGV[6]; -my ($drive, $outputDir, $file) = File::Spec->splitpath( $fileNameOut ); -my $destination = $fileNameOut; - -foreach my $thisLine (@files) -{ - if ($thisLine =~ /Created /) - { - $thisLine =~ /[\w|\.]+$/; - $thisLine =$&; - #print "outfile: $thisLine\n"; - #there is only one file to move, so we can quit after finding it - move($drive.$outputDir.$thisLine,$fileNameOut); - exit(1); - } - else - { - print $thisLine,"\n"; - } -} diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_sirna.xml --- a/tools/emboss_5/emboss_sirna.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,118 +0,0 @@ - - Finds siRNA duplexes in mRNA - emboss - sirna -sequence $input1 -outfile $ofile1 -outseq $ofile2 -poliii $poliii -aa $aa -tt $tt -polybase $polybase -context $context -rformat2 $out_format1 -osformat3 $out_format2 - -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/sirna.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_sixpack.xml --- a/tools/emboss_5/emboss_sixpack.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,162 +0,0 @@ - - - Display a DNA sequence with 6-frame translation and ORFs - emboss - sixpack -sequence $input1 -outfile $ofile1 -outseq $ofile2 -table $table -firstorf $firstorf -lastorf $lastorf -mstart $mstart -reverse $reverse -orfminsize $orfminsize -uppercase - "$uppercase" -number $number -width "$width" -length "$length" -margin "$margin" -name $disp_name -description $description -offset "$offset" -html $html_out1 -osformat $out_format2 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/sixpack.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_skipseq.xml --- a/tools/emboss_5/emboss_skipseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ - - Reads and writes sequences, skipping first few - emboss - skipseq -sequence $input1 -outseq $out_file1 -skip "$skip" -feature $feature -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/skipseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_splitter.xml --- a/tools/emboss_5/emboss_splitter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ - - Split a sequence into (overlapping) smaller sequences - emboss - splitter -sequence $input1 -outseq $out_file1 -size "$size" -overlap "$overlap" -addoverlap $addoverlap -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/splitter.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_supermatcher.xml --- a/tools/emboss_5/emboss_supermatcher.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ - - - Match large sequences against one or more other sequences - emboss - supermatcher -asequence $input1 -bsequence $input2 -gapopen "$gapopen" -gapextend "$gapextend" -width "$width" -wordlen "$wordlen" -outfile $ofile1 -errorfile $ofile2 -aformat3 - $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/supermatcher.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_syco.xml --- a/tools/emboss_5/emboss_syco.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,197 +0,0 @@ - - - Synonymous codon usage Gribskov statistic plot - emboss - emboss_single_outputfile_wrapper.pl syco -sequence $input1 -graph png -goutfile $ofile1 -outfile $ofile2 -cfile $cfile -window "$window" -uncommon $uncommon -minimum "$minimum" - -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/syco.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_tcode.xml --- a/tools/emboss_5/emboss_tcode.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ - - Fickett TESTCODE statistic to identify protein-coding DNA - emboss - tcode -sequence $input1 -outfile $out_file1 -window "$window" -step "$step" -rformat $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/tcode.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_textsearch.xml --- a/tools/emboss_5/emboss_textsearch.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - - Search sequence documentation. Slow, use SRS and Entrez! - emboss - textsearch -sequence $input1 -outfile $out_file1 -pattern "$pattern" -casesensitive -heading $heading -usa $usa -accession $accession -name $search_name -description $description -html - $html_out1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/textsearch.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_tmap.xml --- a/tools/emboss_5/emboss_tmap.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ - - Displays membrane spanning regions - emboss - emboss_single_outputfile_wrapper.pl tmap -sequences $input1 -outfile $out_file1 -goutfile $out_file2 -graph png -rformat $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/tmap.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_tranalign.xml --- a/tools/emboss_5/emboss_tranalign.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ - - Align nucleic coding regions given the aligned proteins - emboss - tranalign -asequence $input1 -bsequence $input2 -outseq $out_file1 -table $table -osformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/tranalign.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_transeq.xml --- a/tools/emboss_5/emboss_transeq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,122 +0,0 @@ - - Translate nucleic acid sequences - emboss - transeq -sequence $input1 -outseq $out_file1 -frame $frame -table $table -regions "$regions" -trim $trim -clean $clean -alternative $alternative -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/transeq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_trimest.xml --- a/tools/emboss_5/emboss_trimest.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,92 +0,0 @@ - - Trim poly-A tails off EST sequences - emboss - trimest -sequence $input1 -outseq $out_file1 -minlength "$minlength" -mismatches "$mismatches" -reverse $reverse -tolower $tolower -fiveprime $fiveprime -osformat2 $out_format1 - -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/trimest.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_trimseq.xml --- a/tools/emboss_5/emboss_trimseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,97 +0,0 @@ - - Trim ambiguous bits off the ends of sequences - emboss - trimseq -sequence $input1 -outseq $out_file1 -window "$window" -percent "$percent" -strict $strict -star $star -left $left -right $right -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/trimseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_twofeat.xml --- a/tools/emboss_5/emboss_twofeat.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,130 +0,0 @@ - - Finds neighbouring pairs of features in sequences - emboss - twofeat -sequence $input1 -outfile $out_file1 -atype "$atype" -btype "$btype" -minrange "$minrange" -maxrange "$maxrange" -asource "$asource" -asense $asense -aminscore "$aminscore" - -amaxscore "$amaxscore" -atag "$atag" -avalue "$avalue" -bsource "$bsource" -bsense "$bsense" -bminscore "$bminscore" -bmaxscore "$bmaxscore" -btag "$btag" -bvalue "$bvalue" -overlap $overlap - -rangetype $rangetype -sense $sense -order $order -twoout $twoout -typeout "$typeout" -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/twofeat.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_union.xml --- a/tools/emboss_5/emboss_union.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ - - Reads sequence fragments and builds one sequence - emboss - union -sequence $input1 -outseq $out_file1 -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/union.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_vectorstrip.xml --- a/tools/emboss_5/emboss_vectorstrip.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ - - Strips out DNA between a pair of vector sequences - emboss - vectorstrip -sequence $input1 -vectorsfile $input2 -outseq $ofile1 -outfile $ofile2 -vectorfile yes -mismatch "$mismatch" -besthits $besthits -linkera "$linkera" -linkerb - "$linkerb" -osformat4 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/vectorstrip.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_water.xml --- a/tools/emboss_5/emboss_water.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,66 +0,0 @@ - - Smith-Waterman local alignment - emboss - water -asequence $input1 -bsequence $input2 -outfile $out_file1 -gapopen "$gapopen" -gapextend "$gapextend" -brief $brief -aformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input datasets need to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/water.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_wobble.xml --- a/tools/emboss_5/emboss_wobble.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ - - Wobble base plot - emboss - emboss_single_outputfile_wrapper.pl wobble -sequence $input1 -graph png -goutfile $ofile1 -outfile $ofile2 -window "$window" -bases "$bases" -auto - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/wobble.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_wordcount.xml --- a/tools/emboss_5/emboss_wordcount.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ - - Counts words of a specified size in a DNA sequence - emboss - wordcount -sequence $input1 -outfile $out_file1 -wordsize "$wordsize" -auto - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/wordcount.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_wordmatch.xml --- a/tools/emboss_5/emboss_wordmatch.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,74 +0,0 @@ - - Finds all exact matches of a given size between 2 sequences - emboss - wordmatch -asequence $input1 -bsequence $input2 -outfile $out_file1 -aoutfeat $out_file2 -boutfeat $out_file3 -wordsize "$wordsize" -aformat3 $out_format1 -offormat4 $out_format2 - -offormat5 $out_format3 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input datasets need to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/wordmatch.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/encode/gencode_partition.xml --- a/tools/encode/gencode_partition.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ - - an interval file - split_by_partitions.py ${GALAXY_DATA_INDEX_DIR} $input1 $out_file1 ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} ${input1.metadata.strandCol} - - - - - - - - - - - - - -For detailed information about partitioning, click here_. - -.. _here: http://genome.imim.es/gencode/wiki/index.php/Collecting_Feature_Sets_from_All_Analysis_Groups - -Datasets are partitioned according to the protocol below: - -A partition scheme has been defined that is similar to what has previously been done with TARs/TRANSFRAGs such that any feature can be classified as falling into one of the following 6 categories: - 1. **Coding** -- coding exons defined from the GENCODE experimentally verified coding set (coding in any transcript) - 2. **5UTR** -- 5' UTR exons defined from the GENCODE experimentally verified coding set (5' UTR in some transcript but never coding in any other) - 3. **3UTR** -- 3' UTR exons defined from the GENCODE experimentally verified coding set (3' UTR in some transcript but never coding in any other) - 4. **Intronic Proximal** -- intronic and no more than 5kb away from an exon. - 5. **Intergenic Proximal** -- between genes and no more than 5kb away from an exon. - 6. **Intronic Distal** -- intronic and greater than 5kb away from an exon. - 7. **Intergenic Distal** -- between genes and greater than 5kb away from an exon. - ------ - -.. class:: infomark - -**Note:** Features overlapping more than one partition will take the identity of the lower-numbered partition. - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/encode/random_intervals.xml --- a/tools/encode/random_intervals.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - -create a random set of intervals - random_intervals_no_bits.py $regions $input2 $input1 $out_file1 ${input2.metadata.chromCol} ${input2.metadata.startCol} ${input2.metadata.endCol} ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} ${input1.metadata.strandCol} $use_mask $strand_overlaps ${GALAXY_DATA_INDEX_DIR} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -This tool currently only works with ENCODE data from genome builds hg16 or hg17. - ------ - -.. class:: infomark - -**Note:** If you do not wish to mask a set of intervals, change the Use Mask option to No, this option will override any Mask files selected. - ------ - -**Syntax** - -This tool will attempt to create a random set of intervals that mimic those found within your source file. You may also specify a set of intervals to mask. - -**Allow overlaps** options - * **Across Strands** - random regions are allowed to overlap only if they are on different strands. - * **Any** - all overlaps are allowed. - * **None** - no overlapping regions are allowed. - -**Regions to use** options - * Bounding region of interest based on the dataset build. 
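The help text above outlines the sampling problem informally; at its core is a rejection loop per mimicked interval. The sketch below is deliberately simplified: one bounding region, no strand handling, and the "None" overlap policy folded into the mask, unlike the full random_intervals_no_bits.py deleted below::

    import random

    def sample_intervals(lengths, bound_start, bound_end, mask, max_tries=1000):
        # Place one random same-length interval per source interval, rejecting
        # candidates that touch a masked or already-placed range.
        placed = []
        busy = list(mask)
        for length in sorted(lengths, reverse=True):  # longest first, as in the tool
            for _ in range(max_tries):
                start = random.randint(bound_start, bound_end - length)
                end = start + length
                if all(end <= s or start >= e for s, e in busy):
                    placed.append((start, end))
                    busy.append((start, end))
                    break
        return placed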
- - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/encode/random_intervals_no_bits.py --- a/tools/encode/random_intervals_no_bits.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,253 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -#%prog bounding_region_file mask_intervals_file intervals_to_mimic_file out_file mask_chr mask_start mask_end interval_chr interval_start interval_end interval_strand use_mask allow_strand_overlaps -import sys, random -from copy import deepcopy -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import bx.intervals.io -import bx.intervals.intersection -import psyco_full - -assert sys.version_info[:2] >= ( 2, 4 ) - -max_iters = 5 - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -#Try to add a random region -def add_random_region( mimic_region, bound, exist_regions, plus_mask, minus_mask, overlaps ): - region_length, region_strand = mimic_region - plus_count = plus_mask.count_range() - minus_count = minus_mask.count_range() - gaps = [] - - if region_strand == "-": - gaps = minus_mask.get_gaps( region_length ) - else: - gaps = plus_mask.get_gaps( region_length ) - - while True: - try: - gap_length, gap_start, gap_end = gaps.pop( random.randint( 0, len( gaps ) - 1 ) ) - except: - break - try: - start = random.randint( bound.start + gap_start, bound.start + gap_end - region_length - 1 ) - except ValueError, ve: - stop_err( "Exception thrown generating random start value: %s" %str( ve ) ) - - end = start + region_length - try_plus_mask = plus_mask.copy() - try_minus_mask = minus_mask.copy() - - if region_strand == "-": - try_minus_mask.set_range( start - bound.start, end - bound.start ) - else: - try_plus_mask.set_range( start - bound.start, end - bound.start ) - - rand_region = bx.intervals.io.GenomicInterval( None, [bound.chrom, start, end, region_strand], 0, 1, 2, 3, "+", fix_strand=True ) - - if try_plus_mask.count_range() == plus_count + region_length or try_minus_mask.count_range() == minus_count + region_length: - if overlaps in ["strand", "all"]: #overlaps allowed across strands - exist_regions.append( rand_region ) - if overlaps == "strand": - return exist_regions, True, try_plus_mask, try_minus_mask - else: #overlaps allowed everywhere - return exist_regions, True, plus_mask, minus_mask - else: #no overlapping anywhere - exist_regions.append( rand_region ) - if region_strand == "-": - return exist_regions, True, try_minus_mask.copy(), try_minus_mask - else: - return exist_regions, True, try_plus_mask, try_plus_mask.copy() - return exist_regions, False, plus_mask, minus_mask - -def main(): - includes_strand = False - region_uid = sys.argv[1] - mask_fname = sys.argv[2] - intervals_fname = sys.argv[3] - out_fname = sys.argv[4] - try: - mask_chr = int( sys.argv[5] ) - 1 - except: - stop_err( "'%s' is an invalid chrom column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[5] ) ) - try: - mask_start = int( sys.argv[6] ) - 1 - except: - stop_err( "'%s' is an invalid start column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[6] ) ) - try: - mask_end = int( sys.argv[7] ) - 1 - except: - stop_err( "'%s' is an invalid end column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." 
% str( sys.argv[7] ) ) - try: - interval_chr = int( sys.argv[8] ) - 1 - except: - stop_err( "'%s' is an invalid chrom column for 'File to Mimick' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[8] ) ) - try: - interval_start = int( sys.argv[9] ) - 1 - except: - stop_err( "'%s' is an invalid start column for 'File to Mimick' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[9] ) ) - try: - interval_end = int( sys.argv[10] ) - 1 - except: - stop_err( "'%s' is an invalid end column for 'File to Mimick' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[10] ) ) - try: - interval_strand = int( sys.argv[11] ) - 1 - includes_strand = True - except: - interval_strand = -1 - if includes_strand: - use_mask = sys.argv[12] - overlaps = sys.argv[13] - else: - use_mask = sys.argv[11] - overlaps = sys.argv[12] - available_regions = {} - loc_file = "%s/regions.loc" % sys.argv[-1] - - for i, line in enumerate( file( loc_file ) ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - fields = line.split( '\t' ) - #read each line, if not enough fields, go to next line - try: - build = fields[0] - uid = fields[1] - description = fields[2] - filepath = fields[3] - available_regions[uid] = filepath - except: - continue - - if region_uid not in available_regions: - stop_err( "Region '%s' is invalid." % region_uid ) - region_fname = available_regions[region_uid].strip() - - #set up bounding regions to hold random intervals - bounds = [] - for bound in bx.intervals.io.NiceReaderWrapper( open( region_fname, 'r' ), chrom_col=0, start_col=1, end_col=2, fix_strand=True, return_header=False, return_comments=False ): - bounds.append( bound ) - #set up length and number of regions to mimic - regions = [ [] for i in range( len( bounds ) ) ] - - for region in bx.intervals.io.NiceReaderWrapper( open( intervals_fname, 'r' ), chrom_col=interval_chr, start_col=interval_start, end_col=interval_end, strand_col=interval_strand, fix_strand=True, return_header=False, return_comments=False ): - #loop through bounds, find first proper bounds then add - #if an interval crosses bounds, it will be added to the first bound - for i in range( len( bounds ) ): - if bounds[i].chrom != region.chrom: - continue - intersecter = bx.intervals.intersection.Intersecter() - intersecter.add_interval( bounds[i] ) - if len( intersecter.find( region.start, region.end ) ) > 0: - regions[i].append( ( region.end - region.start, region.strand ) ) #add region to proper bound and go to next region - break - for region in regions: - region.sort() - region.reverse() - - #read mask file - mask = [] - if use_mask != "no_mask": - for region in bx.intervals.io.NiceReaderWrapper( open( mask_fname, 'r' ), chrom_col=mask_chr, start_col=mask_start, end_col=mask_end, fix_strand=True, return_header=False, return_comments=False ): - mask.append( region ) - - try: - out_file = open ( out_fname, "w" ) - except: - stop_err( "Error opening output file '%s'." 
% out_fname ) - - i = 0 - i_iters = 0 - region_count = 0 - best_regions = [] - num_fail = 0 - while i < len( bounds ): - i_iters += 1 - #order regions to mimic - regions_to_mimic = regions[i][0:] - if len( regions_to_mimic ) < 1: #if no regions to mimic, skip - i += 1 - i_iters = 0 - continue - #set up region mask - plus_mask = Region( bounds[i].end - bounds[i].start ) - for region in mask: - if region.chrom != bounds[i].chrom: continue - mask_start = region.start - bounds[i].start - mask_end = region.end - bounds[i].start - if mask_start >= 0 and mask_end > 0: - plus_mask.set_range( mask_start, mask_end ) - minus_mask = plus_mask.copy() - random_regions = [] - num_added = 0 - for j in range( len( regions[i] ) ): - random_regions, added, plus_mask, minus_mask = add_random_region( regions_to_mimic[j], bounds[i], random_regions, plus_mask, minus_mask, overlaps ) - if added: - num_added += 1 - if num_added == len( regions_to_mimic ) or i_iters >= max_iters: - if len( best_regions ) > len( random_regions ): - random_regions = best_regions.copy() - num_fail += ( len( regions_to_mimic ) - len( random_regions ) ) - i_iters = 0 - best_regions = [] - for region in random_regions: - print >>out_file, "%s\t%d\t%d\t%s\t%s\t%s" % ( region.chrom, region.start, region.end, "region_" + str( region_count ), "0", region.strand ) - region_count += 1 - else: - i -= 1 - if len( best_regions ) < len( random_regions ): - best_regions = random_regions[:] - i+=1 - - out_file.close() - if num_fail: - print "After %i iterations, %i regions could not be added." % (max_iters, num_fail) - if use_mask == "use_mask": - print "The mask you have provided may be too restrictive." - -class Region( list ): - """ - A list for on/off regions - """ - def __init__( self, size=0 ): - for i in range( size ): - self.append( False ) - def copy( self ): - return deepcopy( self ) - def set_range( self, start=0, end=None ): - if start < 0: - start = 0 - if ( not end and end != 0 ) or end > len( self ): - end = len( self ) - for i in range( start, end ): - self[i]=True - def count_range( self, start=0, end=None ): - if start < 0: - start = 0 - if ( not end and end != 0 ) or end > len( self ): - end = len( self ) - return self[start:end].count( True ) - def get_gaps( self, min_size = 0 ): - gaps = [] - start = end = 0 - while True: - try: - start = self[end:].index( False ) + end - except: - break - try: - end = self[start:].index( True ) + start - except: - end = len( self ) - if end > start and end - start >= min_size: - gaps.append( ( end - start, start, end ) ) - gaps.sort() - gaps.reverse() - return gaps - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/encode/split_by_partitions.py --- a/tools/encode/split_by_partitions.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,125 +0,0 @@ -#!/usr/bin/env python -#Original script from /home/james/work/encode/feature_partitions/split_by_partitions.py - -#Usage: python(2.4) split_by_partitions.py partition_index in_file out_file chrCol startCol endCol strandCol - -from __future__ import division - -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.bitset import * -from bx.bitset_builders import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - GALAXY_DATA_INDEX_DIR = sys.argv[1] - partition_index = '%s/encode_feature_partitions/partition_list.txt' % GALAXY_DATA_INDEX_DIR - partition_offset = 
"%s/encode_feature_partitions/" % GALAXY_DATA_INDEX_DIR - - warnings = [] - - # Load up the partitions - partitions = list() - try: - for line in open( partition_index ): - name, score, filename = line.split() - partitions.append( ( name, score, binned_bitsets_from_file( open( partition_offset+filename ) ) ) ) - except: - stop_err( "Error loading partitioning dataset." ) - - try: - in_file = open( sys.argv[2] ) - except: - stop_err( "Bad input data." ) - - try: - out_file = open( sys.argv[3], "w" ) - except: - stop_err( "Bad output file." ) - - try: - chrCol = int( sys.argv[4] ) - 1 - except: - stop_err( "Bad chr column: %s" % ( str( sys.argv[4] ) ) ) - try: - startCol = int( sys.argv[5] ) - 1 - except: - stop_err( "Bad start column: %s" % ( str( sys.argv[5] ) ) ) - try: - endCol = int( sys.argv[6] ) - 1 - except: - stop_err( "Bad end column: %s" % ( str( sys.argv[6] ) ) ) - try: - strandCol = int( sys.argv[7] )-1 - except: - strandCol = -1 - - line_count = 0 - skipped_lines = 0 - first_invalid_line = None - invalid_line = '' - try: - for line in in_file: - line_count += 1 - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - fields = line.split( '\t' ) - try: - chr, start, end = fields[chrCol], int( fields[startCol] ), int( fields[endCol] ) - except: - skipped_lines += 1 - if first_invalid_line is None: - first_invalid_line = line_count - invalid_line = line - continue - label = "input_line_" + str( line_count ) #if input file type was known to be bed, then could guess at label column - - if strandCol < 0: - strand = "+" - else: - try: - strand = fields[strandCol] - except: - strand = "+" - - # Find which partition it overlaps - overlap = 0 - for name, score, bb in partitions: - # Is there at least 1bp overlap? - if chr in bb: - overlap = bb[chr].count_range( start, end-start ) - if overlap > 0: - break - else: - # No overlap with any partition? For now throw this since the - # partitions tile the encode regions completely, indicate an interval - # that does not even overlap an encode region - warning = "warning: Interval (%s, %d, %d) does not overlap any partition" % ( chr, start, end ) + ", line[" + str( line_count ) + "]. " - warnings.append( warning ) - name = "no_overlap" - score = 0 - # Annotate with the name of the partition - frac_overlap = overlap / ( end-start ) - # BED6 plus? - print >>out_file, "%s\t%d\t%d\t%s\t%s\t%s\t%s\t%0.4f" % ( chr, start, end, label, score, strand, name, frac_overlap ) - except: - out_file.close() - in_file.close() - stop_err( "Unknown error while processing line # %d: %s" % ( line_count, line ) ) - out_file.close() - in_file.close() - - if warnings: - warn_msg = "This tool is useful on ENCODE regions only, %d warnings, 1st is: " % len( warnings ) - warn_msg += warnings[0] - print warn_msg - if skipped_lines: - print "Skipped %d invalid lines starting at line # %d: %s" % ( skipped_lines, first_invalid_line, invalid_line ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/evolution/add_scores.xml --- a/tools/evolution/add_scores.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ - - interspecies conservation scores - - - add_scores $input1 ${input1.metadata.dbkey} ${input1.metadata.chromCol} ${input1.metadata.startCol} ${GALAXY_DATA_INDEX_DIR}/add_scores.loc $out_file1 - - - - - - - - - - - - - - - add_scores - - - - - - - - - - - - - - -.. class:: warningmark - -This currently works only for build hg18. 
-
------
-
-**Dataset formats**
-
-The input can be any interval_ format dataset. The output is also in interval format.
-(`Dataset missing?`_)
-
-.. _interval: ./static/formatHelp.html#interval
-.. _Dataset missing?: ./static/formatHelp.html
-
------
-
-**What it does**
-
-This tool adds a column that measures interspecies conservation at each SNP
-position, using conservation scores for primates pre-computed by the
-phyloP program. PhyloP performs an exact P-value computation under a
-continuous Markov substitution model.
-
-The chromosome and start position
-are used to look up the scores, so if a larger interval is in the input,
-only the score for the first nucleotide is returned.
-
------
-
-**Example**
-
-- input file, with SNPs::
-
- chr22 14440426 14440427 C/T
- chr22 14494851 14494852 A/G
- chr22 14494911 14494912 A/T
- chr22 14550435 14550436 A/G
- chr22 14611956 14611957 G/T
- chr22 14612076 14612077 A/G
- chr22 14668537 14668538 C
- chr22 14668703 14668704 A/T
- chr22 14668775 14668776 G
- chr22 14680074 14680075 A/T
- etc.
-
-- output file, showing conservation scores for primates::
-
- chr22 14440426 14440427 C/T 0.509
- chr22 14494851 14494852 A/G 0.427
- chr22 14494911 14494912 A/T NA
- chr22 14550435 14550436 A/G NA
- chr22 14611956 14611957 G/T -2.142
- chr22 14612076 14612077 A/G 0.369
- chr22 14668537 14668538 C 0.419
- chr22 14668703 14668704 A/T -1.462
- chr22 14668775 14668776 G 0.470
- chr22 14680074 14680075 A/T 0.303
- etc.
-
- "NA" means that the phyloP score was not available.
-
------
-
-**Reference**
-
-Siepel A, Pollard KS, Haussler D. (2006)
-New methods for detecting lineage-specific selection.
-In Proceedings of the 10th International Conference on Research in Computational
-Molecular Biology (RECOMB 2006), pp. 190-205.
-
-
-
diff -r c2a356708570 -r 33c067c3ae34 tools/evolution/codingSnps.pl
--- a/tools/evolution/codingSnps.pl Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,528 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-#########################################################################
-# codingSnps.pl
-# This takes a bed file with the names being / separated nts
-# and a gene bed file with cds start and stop.
-# It then checks for changes in coding regions, reporting
-# those that cause a frameshift or substitution in the amino acid.
-#########################################################################
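
The frameshift test applied later in the script reduces to modular arithmetic: an indel shifts the reading frame exactly when the number of inserted or deleted bases is not a multiple of three. The script implements this by counting '-' characters in the '/'-separated allele string; a minimal sketch of that rule, with an illustrative function name::

    def is_frameshift(allele):
        # Mirrors the script's test: count '-' characters in an allele string
        # such as '---/ACT' or 'A/-'; a multiple of three is an in-frame indel.
        return allele.count('-') % 3 != 0

    print(is_frameshift('---/ACT'))  # False: three bases deleted, frame kept
    print(is_frameshift('A/-'))      # True: single-base deletion shifts frame
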
-
-my $seqFlag = "2bit"; #flag to set sequence type 2bit|nib
-if (!@ARGV or scalar @ARGV < 3) {
- print "Usage: codingSnps.pl snps.bed genes.bed (/dir/*$seqFlag|Galaxy build= loc=) [chr=# start=# end=# snp=# keepColumns=1] > codingSnps.txt\n";
- exit;
-}
-my $uniq = 0; #flag for whether want uniq positions
-my $syn = 0; #flag for if want synonymous changes rather than non-syn
-my $keep = 0; #keep old columns and append new ones
-my $snpFile = shift @ARGV;
-my $geneFile = shift @ARGV;
-my $nibDir = shift @ARGV; #2bit or nib, depending on flag above
-if ($nibDir eq 'Galaxy') { getGalaxyInfo(); }
-my $col0 = 0; #bed like columns in default positions
-my $col1 = 1;
-my $col2 = 2;
-my $col3 = 3;
-#column positions 1 based coming in (for Galaxy)
-foreach (@ARGV) {
- if (/chr=(\d+)/) { $col0 = $1 -1; }
- elsif (/start=(\d+)/) { $col1 = $1 -1; }
- elsif (/end=(\d+)/) { $col2 = $1 -1; }
- elsif (/snp=(\d+)/) { $col3 = $1 -1; }
- elsif (/keepColumns=1/) { $keep = 1; }
-}
-if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) {
- print STDERR "ERROR column numbers are given with origin 1\n";
- exit 1;
-}
-my @genes; #bed lines for genes, sorted by chrom and start
-my %chrSt; #index in array where each chrom starts
-my %codon; #hash of codon amino acid conversions
-my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom
-my $ignoreN = 1; #skip N
-
-my %amb = (
-"R" => "A/G",
-"Y" => "C/T",
-"S" => "C/G",
-"W" => "A/T",
-"K" => "G/T",
-"M" => "A/C",
-"B" => "C/G/T",
-"D" => "A/G/T",
-"H" => "A/C/T",
-"V" => "A/C/G",
-"N" => "A/C/G/T"
-);
-fill_codon();
-open(FH, "cat $geneFile | sort -k1,1 -k2,2n |")
- or die "Couldn't open and sort $geneFile, $!\n";
-my $i = 0;
-while(<FH>) {
- chomp;
- if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; }
- push(@genes, "$_");
- my @f = split(/\t/);
- if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; }
- $i++;
-}
-close FH or die "Couldn't close $geneFile, $!\n";
-
-if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; }
-
-#open snps sorted as well
-my $s1 = $col0 + 1; #sort order is origin 1
-my $s2 = $col1 + 1;
-open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |")
- or die "Couldn't open and sort $snpFile, $!\n";
-$i = 0;
-my @g; #one genes fields, should be used repeatedly
-my %done;
-while(<FH>) {
- chomp;
- if (/^\s*#/) { next; } #comment
- my @s = split(/\t/); #SNP fields
- if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; }
- my $size = $#s;
- if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) {
- print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n";
- exit 1;
- }
- if ($s[$col1] =~ /\D/) {
- print STDERR "ERROR the start point must be an integer not $s[$col1]\n";
- exit 1;
- }
- if ($s[$col2] =~ /\D/) {
- print STDERR "ERROR the end point must be an integer not $s[$col2]\n";
- exit 1;
- }
- if ($s[$col3] eq 'N' && $ignoreN) { next; }
- if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; }
- if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row
- $i = $chrSt{$s[$col0]};
- @g = split(/\t/, $genes[$i]);
- if (scalar @g < 12) {
- print STDERR "ERROR the gene file must be the whole genes in BED format\n";
- exit 1;
- }
- }elsif (!@g) {
- next; #no gene for this chrom
- }elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new chrom
- $i = $chrSt{$s[$col0]};
- @g = split(/\t/, $genes[$i]);
- }elsif ($s[$col0] ne
$g[0]) { - next; #no gene for this chrom - }elsif ($s[$col1] < $g[1] && $i == $chrSt{$s[$col0]}) { - next; #before any genes - }elsif ($s[$col1] > $g[2] && ($i == $#genes or $genes[$i+1] !~ $s[$col0])) { - next; #after all genes on chr - }else { - while ($s[$col1] > $g[2] && $i < $#genes) { - $i++; - @g = split(/\t/, $genes[$i]); - if ($s[$col0] ne $g[0]) { last; } #end of gene - } - if ($s[$col0] ne $g[0] or $s[$col1] < $g[1] or $s[$col1] > $g[2]) { - next; #no overlap with genes - } - } - - processSnp(\@s, \@g); - if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { next; } - - my $k = $i + 1; #check for more genes without losing data of first - if ($k <= $#genes) { - my @g2 = split(/\t/, $genes[$k]); - while (@g2 && $k <= $#genes) { - @g2 = split(/\t/, $genes[$k]); - if ($s[$col0] ne $g2[0]) { - undef @g2; - last; #not same chrom - }else { - while ($s[$col1] > $g2[2] && $k <= $#genes) { - $k++; - @g2 = split(/\t/, $genes[$k]); - if ($s[$col0] ne $g2[0]) { last; } #end of chrom - } - if ($s[$col0] ne $g2[0] or $s[$col1] < $g2[1] or $s[$col1] > $g2[2]) { - undef @g2; - last; #no overlap with more genes - } - processSnp(\@s, \@g2); - if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { last; } - } - $k++; - } - } -} -close FH or die "Couldn't close $snpFile, $!\n"; - -exit; - -######################################################################## -sub processSnp { - my $sref = shift; - my $gref = shift; - #overlaps gene, but maybe not coding seq - #inside cds - if ($sref->[$col1] + 1 < $gref->[6] or $sref->[$col2] > $gref->[7]) { - return; #outside of coding - } - #now check exon - my $i = 0; - my @st = split(/,/, $gref->[11]); - my @size = split(/,/, $gref->[10]); - if (scalar @st ne $gref->[9]) { return; } #cant do this gene #die "bad gene $gref->[3]\n"; } - my @pos; - my $in = 0; - for($i = 0; $i < $gref->[9]; $i++) { - my $sta = $gref->[1] + $st[$i] + 1; #1 based position - my $end = $sta + $size[$i] - 1; # - if ($ends) { $end = $size[$i]; $sta = $st[$i] + 1; } #ends instead of sizes - if ($end < $gref->[6]) { next; } #utr only - if ($sta > $gref->[7]) { next; } #utr only - #shorten to coding only - if ($sta < $gref->[6]) { $sta = $gref->[6] + 1; } - if ($end > $gref->[7]) { $end = $gref->[7]; } - if ($sref->[$col1] + 1 >= $sta && $sref->[$col2] <= $end) { $in = 1; } - elsif ($sref->[$col1] == $sref->[$col2] && $sref->[$col2] <= $end && $sref->[$col2] >= $sta) { $in = 1; } - push(@pos, ($sta .. $end)); #add exon worth of positions - } - #@pos has coding positions for whole gene (chr coors), - #and $in has whether we need to continue - if (!$in) { return; } #not in coding exon - if ((scalar @pos) % 3 != 0) { return; } #partial gene? 
not even codons - if ($sref->[$col3] =~ /^-+\/[ACTG]+$/ or $sref->[$col3] =~ /^[ACTG]+\/-+$/ or - $sref->[$col3] =~ /^-+$/) { #indel or del - my $copy = $sref->[$col3]; - my $c = ($copy =~ tr/-//); - if ($c % 3 == 0) { return; } #not frameshift - #handle bed4 or any interval file - if (!$keep) { - print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; - print "\t$gref->[3]\tframeshift\n"; - }else { - my @s = @{$sref}; - print join("\t", @s), "\t$gref->[3]\tframeshift\n"; - } - $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; - return; - }elsif ($sref->[$col1] == $sref->[$col2]) { #insertion - my $copy = $sref->[$col3]; - my $c = ($copy =~ tr/\[ACTG]+//); - if ($c % 3 == 0) { return; } #not frameshift - #handle bed4 or any interval file - if (!$keep) { - print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; - print "\t$gref->[3]\tframeshift\n"; - }else { - my @s = @{$sref}; - print join("\t", @s), "\t$gref->[3]\tframeshift\n"; - } - $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; - return; - }elsif ($sref->[$col3] =~ /-/) { #indel and sub? - return; #skip - } - #check for amino acid substitutions - my $s = $sref->[$col1] + 1; - my $e = $sref->[$col2]; - my $len = $sref->[$col2] - $sref->[$col1]; - if ($gref->[5] eq '-') { - @pos = reverse(@pos); - my $t = $s; - $s = $e; - $e = $t; - } - $i = 0; - my $found = 0; - foreach (@pos) { - if ($s == $_) { - $found = 1; - last; - } - $i++; - } - if ($found) { - my $fs = $i; #keep original start index - #have index where substitution starts - my $cp = $i % 3; - $i -= $cp; #i is now first position in codon - my $cdNum = int($i / 3) + 1; - my $ls = $i; - if (!defined $ls) { die "ERROR not defined ls for $fs $sref->[$col2]\n"; } - if (!@pos) { die "ERROR not defined array pos\n"; } - if (!defined $pos[$ls]) { die "ERROR not defined pos at $ls\n"; } - if (!defined $e) { die "ERROR not defined e for $pos[0] $pos[1] $pos[2]\n"; } - while ($ls <= $#pos && $pos[$ls] ne $e) { - $ls++; - } - my $i2 = $ls + (2 - ($ls % 3)); - if ($i2 > $#pos) { return; } #not a full codon, partial gene? - - if ($i2 - $i < 2) { die "not a full codon positions $i to $i2 for $sref->[3]\n"; } - my $oldnts = getnts($sref->[$col0], @pos[$i..$i2]); - if (!$oldnts) { die "Failed to get sequence for $sref->[$col0] $pos[$i] .. 
$pos[$i2]\n"; } - my @vars = split(/\//, $sref->[$col3]); - if ($gref->[5] eq '-') { #complement oldnts and revcomp vars - $oldnts = compl($oldnts); - if (!$oldnts) { return; } #skip this one - $oldnts = join('', (reverse(split(/ */, $oldnts)))); - foreach (@vars) { - $_ = reverse(split(/ */)); #needed for indels - $_ = compl($_); - } - } - my $r = $fs - $i; #difference in old indexes gives new index - my @newnts; - my $changed = ''; - foreach my $v (@vars) { - if (!$v or length($v) != 1) { return; } #only simple changes - my @new = split(/ */, $oldnts); - $changed = splice(@new, $r, $len, split(/ */, $v)); - #should only change single nt - push(@newnts, join("", @new)); - } - #now compute amino acids - my $oldaa = getaa($oldnts); - my @newaa; - my $change = 0; #flag for if there is a change - foreach my $v (@newnts) { - my $t = getaa($v); - if ($t ne $oldaa) { $change = 1; } - push(@newaa, $t); - } - if (!$change && $syn) { - if (!$keep) { - print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; - print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n"; - }else { - my @s = @{$sref}; - print join("\t", @s), - "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n"; - } - return; - }elsif ($syn) { return; } #only want synonymous changes - if (!$change) { return; } #no change in amino acids - if (!$keep) { - print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; - if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref - if (!$changed) { return; } #skip this one - print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n"; - }else { - my @s = @{$sref}; - print join("\t", @s); - if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref - if (!$changed) { return; } #skip this one - print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n"; - } - $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; - } -} - -sub getnts { - my $chr = shift; - my @pos = @_; #list of positions not necessarily in order - #list may be reversed or have gaps(introns), at least 3 bps - my $seq = ''; - if (scalar @pos < 3) { die "too small region for $chr $pos[0]\n"; } - if ($pos[0] < $pos[1]) { #not reversed - my $s = $pos[0]; - for(my $i = 1; $i <= $#pos; $i++) { - if ($pos[$i] == $pos[$i-1] + 1) { next; } - if ($seqFlag eq '2bit') { - $seq .= fetchSeq2bit($chr, $s, $pos[$i-1]); - }else { - $seq .= fetchSeqNib($chr, $s, $pos[$i-1]); - } - $s = $pos[$i]; - } - if (length $seq != scalar @pos) { #still need to fetch seq - if ($seqFlag eq '2bit') { - $seq .= fetchSeq2bit($chr, $s, $pos[$#pos]); - }else { - $seq .= fetchSeqNib($chr, $s, $pos[$#pos]); - } - } - }else { #reversed - my $s = $pos[$#pos]; - for(my $i = $#pos -1; $i >= 0; $i--) { - if ($pos[$i] == $pos[$i+1] + 1) { next; } - if ($seqFlag eq '2bit') { - $seq .= fetchSeq2bit($chr, $s, $pos[$i+1]); - }else { - $seq .= fetchSeqNib($chr, $s, $pos[$i+1]); - } - $s = $pos[$i]; - } - if (length $seq != scalar @pos) { #still need to fetch seq - if ($seqFlag eq '2bit') { - $seq .= fetchSeq2bit($chr, $s, $pos[0]); - }else { - $seq .= fetchSeqNib($chr, $s, $pos[0]); - } - } - } -} - -sub fetchSeq2bit { - my $chr = shift; - my $st = shift; - my $end = shift; - my $strand = '+'; - $st--; #change to UCSC numbering - open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir stdout |") or - die "Couldn't run twoBitToFa, $!\n"; - my $seq = ''; - while () { - chomp; - if (/^>/) { next; } #header - $seq .= uc($_); - } - close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n"; 
- return $seq; -} - -sub fetchSeqNib { - my $chr = shift; - my $st = shift; - my $end = shift; - my $strand = '+'; - $st--; #change to UCSC numbering - open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n"; - my $seq = ''; - while () { - chomp; - if (/^>/) { next; } #header - $seq .= $_; - } - close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n"; - return $seq; -} - -sub compl { - my $nts = shift; - my $comp = ''; - if (!$nts) { die "ERROR called compl with nts undefined"; } - foreach my $n (split(/ */, $nts)) { - if ($n eq 'A') { $comp .= 'T'; } - elsif ($n eq 'T') { $comp .= 'A'; } - elsif ($n eq 'C') { $comp .= 'G'; } - elsif ($n eq 'G') { $comp .= 'C'; } - elsif ($n eq 'N') { $comp .= 'N'; } - elsif ($n eq '-') { $comp .= '-'; } #deletion - else { $comp = undef; } - } - return $comp; -} - -sub getaa { - my $nts = shift; #in multiples of 3 - my $aa = ''; - my @n = split(/ */, $nts); - while (@n) { - my @t = splice(@n, 0, 3); - my $n = uc(join("", @t)); - if (!exists $codon{$n}) { $aa .= 'N'; next; } - $aa .= $codon{$n}; - } - return $aa; -} - -sub fill_codon { -$codon{GCA} = 'Ala'; -$codon{GCC} = 'Ala'; -$codon{GCG} = 'Ala'; -$codon{GCT} = 'Ala'; -$codon{CGG} = 'Arg'; -$codon{CGT} = 'Arg'; -$codon{CGC} = 'Arg'; -$codon{AGA} = 'Arg'; -$codon{AGG} = 'Arg'; -$codon{CGA} = 'Arg'; -$codon{AAC} = 'Asn'; -$codon{AAT} = 'Asn'; -$codon{GAC} = 'Asp'; -$codon{GAT} = 'Asp'; -$codon{TGC} = 'Cys'; -$codon{TGT} = 'Cys'; -$codon{CAG} = 'Gln'; -$codon{CAA} = 'Gln'; -$codon{GAA} = 'Glu'; -$codon{GAG} = 'Glu'; -$codon{GGG} = 'Gly'; -$codon{GGA} = 'Gly'; -$codon{GGC} = 'Gly'; -$codon{GGT} = 'Gly'; -$codon{CAC} = 'His'; -$codon{CAT} = 'His'; -$codon{ATA} = 'Ile'; -$codon{ATT} = 'Ile'; -$codon{ATC} = 'Ile'; -$codon{CTA} = 'Leu'; -$codon{CTC} = 'Leu'; -$codon{CTG} = 'Leu'; -$codon{CTT} = 'Leu'; -$codon{TTG} = 'Leu'; -$codon{TTA} = 'Leu'; -$codon{AAA} = 'Lys'; -$codon{AAG} = 'Lys'; -$codon{ATG} = 'Met'; -$codon{TTC} = 'Phe'; -$codon{TTT} = 'Phe'; -$codon{CCT} = 'Pro'; -$codon{CCA} = 'Pro'; -$codon{CCC} = 'Pro'; -$codon{CCG} = 'Pro'; -$codon{TCA} = 'Ser'; -$codon{AGC} = 'Ser'; -$codon{AGT} = 'Ser'; -$codon{TCC} = 'Ser'; -$codon{TCT} = 'Ser'; -$codon{TCG} = 'Ser'; -$codon{TGA} = 'Stop'; -$codon{TAG} = 'Stop'; -$codon{TAA} = 'Stop'; -$codon{ACT} = 'Thr'; -$codon{ACA} = 'Thr'; -$codon{ACC} = 'Thr'; -$codon{ACG} = 'Thr'; -$codon{TGG} = 'Trp'; -$codon{TAT} = 'Tyr'; -$codon{TAC} = 'Tyr'; -$codon{GTC} = 'Val'; -$codon{GTA} = 'Val'; -$codon{GTG} = 'Val'; -$codon{GTT} = 'Val'; -} - -sub getGalaxyInfo { - my $build; - my $locFile; - foreach (@ARGV) { - if (/build=(.*)/) { $build = $1; } - elsif (/loc=(.*)/) { $locFile = $1; } - } - if (!$build or !$locFile) { - print STDERR "ERROR missing build or locfile for Galaxy input\n"; - exit 1; - } - # read $locFile to get $nibDir (ignoring commets) - open(LF, "< $locFile") || die "open($locFile): $!\n"; - while() { - s/#.*$//; - s/(?:^\s+|\s+$)//g; - next if (/^$/); - - my @t = split(/\t/); - if ($t[0] eq $build) { $nibDir = $t[1]; } - } - close(LF); - if ($nibDir eq 'Galaxy') { - print STDERR "Failed to find sequence directory in locfile $locFile\n"; - } - $nibDir .= "/$build.2bit"; #we want full path and filename -} - diff -r c2a356708570 -r 33c067c3ae34 tools/evolution/codingSnps.xml --- a/tools/evolution/codingSnps.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ - - amino-acid changes caused by a set of SNPs - - - codingSnps.pl $input1 $input2 Galaxy 
build=${input1.metadata.dbkey} loc=${GALAXY_DATA_INDEX_DIR}/codingSnps.loc chr=${input1.metadata.chromCol} start=${input1.metadata.startCol} end=${input1.metadata.endCol} snp=$col1 > $out_file1 - - - - - - - - - - - - - - - - - - - - cat - sort - ucsc_tools - - - - - - - - - - - - - - - - - - -.. class:: infomark - -The build must be defined for the input files and must be the same for both files. -Use the pencil icon to add the build to the files if necessary. - ------ - -**Dataset formats** - -The SNP dataset is in interval_ format, with a column of SNPs as described below. -The gene dataset is in BED_ format with 12 columns. The output dataset is also interval. -(`Dataset missing?`_) - -.. _interval: ./static/formatHelp.html#interval -.. _BED: ./static/formatHelp.html#bed -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -This tool identifies which SNPs create amino-acid changes in the specified -coding regions. The first input file contains the SNPs and must be an interval file. -It needs the chromosome, start, and end position as well as the SNP. The -SNP can be given using ambiguous-nucleotide symbols or a list of two to four -alleles -separated by '/'. Any other columns in the first input file will not be -used but will be kept for the output. The second input file contains the genes -to be used for defining the coding regions. This file must be a BED file with -the first 12 columns standard BED columns. The output is the same as the -first input file with -several columns added: the name field from the line of the gene input file -used, the amino acids, the codon number, and the reference nucleotide that -changed in the amino acid. -The amino acids are listed with the reference amino acid first, then a colon, -and then the amino acids for the alleles. If a SNP is not in a coding region -or is synonymous then it is not included in the output file. - ------ - -**Example** - -- first input file, with SNPs:: - - chr22 15660821 15660822 A/G - chr22 15825725 15825726 G/T - chr22 15827035 15827036 G - chr22 15827135 15827136 C/G - chr22 15830928 15830929 A/G - chr22 15830951 15830952 G - chr22 15830955 15830956 C/T - chr22 15848885 15848886 C/T - chr22 15849048 15849049 A/C - chr22 15919711 15919712 A/G - etc. - - or, indicating polymorphisms using ambiguous-nucleotide symbols:: - - chr22 15660821 15660822 R - chr22 15825725 15825726 K - chr22 15827035 15827036 G - chr22 15827135 15827136 S - chr22 15830928 15830929 R - chr22 15830951 15830952 G - chr22 15830955 15830956 Y - chr22 15848885 15848886 Y - chr22 15849048 15849049 M - chr22 15919711 15919712 R - etc. - -- second input file, with UCSC annotations for human genes:: - - chr22 15688363 15690225 uc010gqr.1 0 + 15688363 15688363 0 2 587,794, 0,1068, - chr22 15822826 15869112 uc002zlw.1 0 - 15823622 15869004 0 10 940,105,97,91,265,86,251,208,304,282, 0,1788,2829,3241,4163,6361,8006,26023,29936,46004, - chr22 15826991 15869112 uc010gqs.1 0 - 15829218 15869004 0 5 1380,86,157,304,282, 0,2196,21858,25771,41839, - chr22 15897459 15919682 uc002zlx.1 0 + 15897459 15897459 0 4 775,128,103,1720, 0,8303,10754,20503, - chr22 15945848 15971389 uc002zly.1 0 + 15945981 15970710 0 13 271,25,147,113,127,48,164,84,85,12,102,42,2193, 0,12103,12838,13816,15396,17037,17180,18535,19767,20632,20894,22768,23348, - etc. 
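
Before the output example, it helps to see how the script reduces a BED12 record like those above to coding positions: each exon block (blockSizes/blockStarts) is clipped to the CDS bounds (thickStart/thickEnd), and blocks left empty are UTR-only and dropped. A sketch using 0-based half-open coordinates with illustrative names; the script itself works in 1-based positions::

    def coding_spans(chrom_start, thick_start, thick_end, block_sizes, block_starts):
        # Clip each exon block of a BED12 record to the CDS bounds
        # [thick_start, thick_end); blocks clipped to nothing are UTR-only.
        spans = []
        for size, rel in zip(block_sizes, block_starts):
            s = max(chrom_start + rel, thick_start)
            e = min(chrom_start + rel + size, thick_end)
            if s < e:
                spans.append((s, e))
        return spans

    # Two exon blocks; the first is partly 5' UTR, the second partly 3' UTR.
    print(coding_spans(100, 150, 800, [200, 300], [0, 500]))  # [(150, 300), (600, 800)]
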
- -- output file, showing non-synonymous substitutions in coding regions:: - - chr22 15825725 15825726 G/T uc002zlw.1 Gln:Pro/Gln 469 T - chr22 15827035 15827036 G uc002zlw.1 Glu:Asp 414 C - chr22 15827135 15827136 C/G uc002zlw.1 Gly:Gly/Ala 381 C - chr22 15830928 15830929 A/G uc002zlw.1 Ala:Ser/Pro 281 C - chr22 15830951 15830952 G uc002zlw.1 Leu:Pro 273 A - chr22 15830955 15830956 C/T uc002zlw.1 Ser:Gly/Ser 272 T - chr22 15848885 15848886 C/T uc002zlw.1 Ser:Trp/Stop 217 G - chr22 15848885 15848886 C/T uc010gqs.1 Ser:Trp/Stop 200 G - chr22 15849048 15849049 A/C uc002zlw.1 Gly:Stop/Gly 163 C - etc. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/evolution/codingSnps_filter.py --- a/tools/evolution/codingSnps_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ -#!/usr/bin/env python - -# runs after the job (and after the default post-filter) -import os -from galaxy import eggs -from galaxy import jobs -from galaxy.tools.parameters import DataToolParameter -# Older py compatibility -try: - set() -except: - from sets import Set as set - -def validate_input( trans, error_map, param_values, page_param_map ): - dbkeys = set() - data_param_names = set() - data_params = 0 - for name, param in page_param_map.iteritems(): - if isinstance( param, DataToolParameter ): - # for each dataset parameter - if param_values.get(name, None) != None: - dbkeys.add( param_values[name].dbkey ) - data_params += 1 - # check meta data - try: - param = param_values[name] - startCol = int( param.metadata.startCol ) - endCol = int( param.metadata.endCol ) - chromCol = int( param.metadata.chromCol ) - if param.metadata.strandCol is not None: - strandCol = int ( param.metadata.strandCol ) - else: - strandCol = 0 - except: - error_msg = "The attributes of this dataset are not properly set. " + \ - "Click the pencil icon in the history item to set the chrom, start, end and strand columns." - error_map[name] = error_msg - data_param_names.add( name ) - if len( dbkeys ) > 1: - for name in data_param_names: - error_map[name] = "All datasets must belong to same genomic build, " \ - "this dataset is linked to build '%s'" % param_values[name].dbkey - if data_params != len(data_param_names): - for name in data_param_names: - error_map[name] = "A dataset of the appropriate type is required" diff -r c2a356708570 -r 33c067c3ae34 tools/evolution/mutate_snp_codon.py --- a/tools/evolution/mutate_snp_codon.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ -#!/usr/bin/env python -""" -Script to mutate SNP codons. 
-Dan Blankenberg -""" - -import sys, string - -def strandify( fields, column ): - strand = '+' - if column >= 0 and column < len( fields ): - strand = fields[ column ] - if strand not in [ '+', '-' ]: - strand = '+' - return strand - -def main(): - # parse command line - input_file = sys.argv[1] - out = open( sys.argv[2], 'wb+' ) - codon_chrom_col = int( sys.argv[3] ) - 1 - codon_start_col = int( sys.argv[4] ) - 1 - codon_end_col = int( sys.argv[5] ) - 1 - codon_strand_col = int( sys.argv[6] ) - 1 - codon_seq_col = int( sys.argv[7] ) - 1 - - snp_chrom_col = int( sys.argv[8] ) - 1 - snp_start_col = int( sys.argv[9] ) - 1 - snp_end_col = int( sys.argv[10] ) - 1 - snp_strand_col = int( sys.argv[11] ) - 1 - snp_observed_col = int( sys.argv[12] ) - 1 - - max_field_index = max( codon_chrom_col, codon_start_col, codon_end_col, codon_strand_col, codon_seq_col, snp_chrom_col, snp_start_col, snp_end_col, snp_strand_col, snp_observed_col ) - - DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) - skipped_lines = 0 - errors = {} - for name, message in [ ('max_field_index','not enough fields'), ( 'codon_len', 'codon length must be 3' ), ( 'codon_seq', 'codon sequence must have length 3' ), ( 'snp_len', 'SNP length must be 3' ), ( 'snp_observed', 'SNP observed values must have length 3' ), ( 'empty_comment', 'empty or comment'), ( 'no_overlap', 'codon and SNP do not overlap' ) ]: - errors[ name ] = { 'count':0, 'message':message } - line_count = 0 - for line_count, line in enumerate( open( input_file ) ): - line = line.rstrip( '\n\r' ) - if line and not line.startswith( '#' ): - fields = line.split( '\t' ) - if max_field_index >= len( fields ): - skipped_lines += 1 - errors[ 'max_field_index' ]['count'] += 1 - continue - - #read codon info - codon_chrom = fields[codon_chrom_col] - codon_start = int( fields[codon_start_col] ) - codon_end = int( fields[codon_end_col] ) - if codon_end - codon_start != 3: - #codons must be length 3 - skipped_lines += 1 - errors[ 'codon_len' ]['count'] += 1 - continue - codon_strand = strandify( fields, codon_strand_col ) - codon_seq = fields[codon_seq_col].upper() - if len( codon_seq ) != 3: - #codon sequence must have length 3 - skipped_lines += 1 - errors[ 'codon_seq' ]['count'] += 1 - continue - - #read snp info - snp_chrom = fields[snp_chrom_col] - snp_start = int( fields[snp_start_col] ) - snp_end = int( fields[snp_end_col] ) - if snp_end - snp_start != 1: - #snps must be length 1 - skipped_lines += 1 - errors[ 'snp_len' ]['count'] += 1 - continue - snp_strand = strandify( fields, snp_strand_col ) - snp_observed = fields[snp_observed_col].split( '/' ) - snp_observed = [ observed for observed in snp_observed if len( observed ) == 1 ] - if not snp_observed: - #sequence replacements must be length 1 - skipped_lines += 1 - errors[ 'snp_observed' ]['count'] += 1 - continue - - #Determine index of replacement for observed values into codon - offset = snp_start - codon_start - #Extract DNA on neg strand codons will have positions reversed relative to interval positions; i.e. 
position 0 == position 2 - if codon_strand == '-': - offset = 2 - offset - if offset < 0 or offset > 2: #assert offset >= 0 and offset <= 2, ValueError( 'Impossible offset determined: %s' % offset ) - #codon and snp do not overlap - skipped_lines += 1 - errors[ 'no_overlap' ]['count'] += 1 - continue - - for observed in snp_observed: - if codon_strand != snp_strand: - #if our SNP is on a different strand than our codon, take complement of provided observed SNP base - observed = observed.translate( DNA_COMP ) - snp_codon = [ char for char in codon_seq ] - snp_codon[offset] = observed.upper() - snp_codon = ''.join( snp_codon ) - - if codon_seq != snp_codon: #only output when we actually have a different codon - out.write( "%s\t%s\n" % ( line, snp_codon ) ) - else: - skipped_lines += 1 - errors[ 'empty_comment' ]['count'] += 1 - if skipped_lines: - print "Skipped %i (%4.2f%%) of %i lines; reasons: %s" % ( skipped_lines, ( float( skipped_lines )/float( line_count ) ) * 100, line_count, ', '.join( [ "%s (%i)" % ( error['message'], error['count'] ) for error in errors.itervalues() if error['count'] ] ) ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/evolution/mutate_snp_codon.xml --- a/tools/evolution/mutate_snp_codon.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ - - with SNPs - mutate_snp_codon.py $input1 $output1 ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} ${input1.metadata.strandCol} $codon_seq_col $snp_chrom_col $snp_start_col $snp_end_col $snp_strand_col $snp_observed_col - - - - - - - - - - - - - - - - - - - - - - - - - -This tool takes an interval file as input. This input should contain a set of codon locations and corresponding DNA sequence (such as from the *Extract Genomic DNA* tool) joined to SNP locations with observed values (such as *all fields from selected table* from the snp130 table of hg18 at the UCSC Table browser). This interval file should have the metadata (chromosome, start, end, strand) set for the columns containing the locations of the codons. The user needs to specify the columns containing the sequence for the codon as well as the genomic positions and observed values (values should be split by '/') for the SNP data as tool input; SNPs positions and sequence substitutes must have a length of exactly 1. Only genomic intervals which yield a different sequence string are output. All sequence characters are converted to uppercase during processing. 
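
The core of the offset arithmetic described above, as a minimal sketch: the SNP's offset within the codon is mirrored for '-' strand codons, and an allele reported on the opposite strand is complemented before substitution. mutate_codon is an illustrative name; the sample values come from the first row of the worked example that follows::

    COMP = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

    def mutate_codon(codon_seq, codon_start, codon_strand, snp_start, snp_strand, observed):
        # 0-based coordinates; on '-' strand codons the offset is mirrored
        # (position 0 <-> 2), and an allele from the other strand is
        # complemented before substitution, as in the script above.
        offset = snp_start - codon_start
        if codon_strand == '-':
            offset = 2 - offset
        if offset < 0 or offset > 2:
            return None  # SNP does not fall inside this codon
        if snp_strand != codon_strand:
            observed = COMP[observed.upper()]
        codon = list(codon_seq.upper())
        codon[offset] = observed.upper()
        return ''.join(codon)

    # First row of the worked example below: GAA with G observed at offset 1.
    print(mutate_codon('GAA', 58995, '+', 58996, '+', 'G'))  # GGA
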
- - For example, using these settings: - - * **metadata** **chromosome**, **start**, **end** and **strand** set to **1**, **2**, **3** and **6**, respectively - * **Codon Sequence column** set to **c8** - * **SNP chromosome column** set to **c17** - * **SNP start column** set to **c18** - * **SNP end column** set to **c19** - * **SNP strand column** set to **c22** - * **SNP observed column** set to **c25** - - with the following input:: - - chr1 58995 58998 NM_001005484 0 + GAA GAA Glu GAA 1177632 28.96 0 2787607 0.422452662804 585 chr1 58996 58997 rs1638318 0 + A A A/G genomic single by-submitter 0 0 unknown exact 3 - chr1 59289 59292 NM_001005484 0 + TTT TTT Phe TTT 714298 17.57 0 1538990 0.464134269878 585 chr1 59290 59291 rs71245814 0 + T T G/T genomic single unknown 0 0 unknown exact 3 - chr1 59313 59316 NM_001005484 0 + AAG AAG Lys AAG 1295568 31.86 0 2289189 0.565950648898 585 chr1 59315 59316 rs2854682 0 - G G C/T genomic single by-submitter 0 0 unknown exact 3 - chr1 59373 59376 NM_001005484 0 + ACA ACA Thr ACA 614523 15.11 0 2162384 0.284187729839 585 chr1 59373 59374 rs2691305 0 - A A C/T genomic single unknown 0 0 unknown exact 3 - chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs2531266 0 + G G C/G genomic single by-submitter 0 0 unknown exact 3 - chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs55874132 0 + G G C/G genomic single unknown 0 0 coding-synon exact 1 - - - will produce:: - - chr1 58995 58998 NM_001005484 0 + GAA GAA Glu GAA 1177632 28.96 0 2787607 0.422452662804 585 chr1 58996 58997 rs1638318 0 + A A A/G genomic single by-submitter 0 0 unknown exact 3 GGA - chr1 59289 59292 NM_001005484 0 + TTT TTT Phe TTT 714298 17.57 0 1538990 0.464134269878 585 chr1 59290 59291 rs71245814 0 + T T G/T genomic single unknown 0 0 unknown exact 3 TGT - chr1 59313 59316 NM_001005484 0 + AAG AAG Lys AAG 1295568 31.86 0 2289189 0.565950648898 585 chr1 59315 59316 rs2854682 0 - G G C/T genomic single by-submitter 0 0 unknown exact 3 AAA - chr1 59373 59376 NM_001005484 0 + ACA ACA Thr ACA 614523 15.11 0 2162384 0.284187729839 585 chr1 59373 59374 rs2691305 0 - A A C/T genomic single unknown 0 0 unknown exact 3 GCA - chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs2531266 0 + G G C/G genomic single by-submitter 0 0 unknown exact 3 GCC - chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs55874132 0 + G G C/G genomic single unknown 0 0 coding-synon exact 1 GCC - - diff -r c2a356708570 -r 33c067c3ae34 tools/extract/extract_genomic_dna.py --- a/tools/extract/extract_genomic_dna.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,283 +0,0 @@ -#!/usr/bin/env python -""" -usage: %prog $input $out_file1 - -1, --cols=N,N,N,N: Columns for start, end, strand in input file - -d, --dbkey=N: Genome build of input file - -o, --output_format=N: the data type of the output file - -g, --GALAXY_DATA_INDEX_DIR=N: the directory containing alignseq.loc - -I, --interpret_features: if true, complete features are interpreted when input is GFF - -F, --fasta=: genomic sequences to use for extraction - -G, --gff: input and output file, when it is interval, coordinates are treated as GFF format (1-based, half-open) rather than 'traditional' 0-based, closed format. 
-""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, string, os, re, tempfile, subprocess -from bx.cookbook import doc_optparse -from bx.intervals.io import Header, Comment -import bx.seq.nib -import bx.seq.twobit -from galaxy.tools.util.galaxyops import * -from galaxy.datatypes.util import gff_util - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def reverse_complement( s ): - complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" } - reversed_s = [] - for i in s: - reversed_s.append( complement_dna[i] ) - reversed_s.reverse() - return "".join( reversed_s ) - -def check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR ): - seq_file = "%s/alignseq.loc" % GALAXY_DATA_INDEX_DIR - seq_path = '' - for line in open( seq_file ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( "#" ) and line.startswith( 'seq' ): - fields = line.split( '\t' ) - if len( fields ) < 3: - continue - if fields[1] == dbkey: - seq_path = fields[2].strip() - break - return seq_path - -def __main__(): - # - # Parse options, args. - # - options, args = doc_optparse.parse( __doc__ ) - try: - chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols ) - dbkey = options.dbkey - output_format = options.output_format - gff_format = options.gff - interpret_features = options.interpret_features - GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR - fasta_file = options.fasta - input_filename, output_filename = args - except: - doc_optparse.exception() - - includes_strand_col = strand_col >= 0 - strand = None - nibs = {} - twobits = {} - - # - # Set path to sequence data. - # - if fasta_file: - # Need to create 2bit file from fasta file. - try: - seq_path = tempfile.NamedTemporaryFile( dir="." ).name - cmd = "faToTwoBit %s %s" % ( fasta_file, seq_path ) - - tmp_name = tempfile.NamedTemporaryFile( dir="." ).name - tmp_stderr = open( tmp_name, 'wb' ) - proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() ) - returncode = proc.wait() - tmp_stderr.close() - - # Get stderr, allowing for case where it's very large. - tmp_stderr = open( tmp_name, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stderr.close() - - # Error checking. - if returncode != 0: - raise Exception, stderr - except Exception, e: - stop_err( 'Error running faToTwoBit. ' + str( e ) ) - else: - seq_path = check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR ) - if not os.path.exists( seq_path ): - # If this occurs, we need to fix the metadata validator. - stop_err( "No sequences are available for '%s', request them by reporting this error." % dbkey ) - - # - # Fetch sequences. - # - - # Get feature's line(s). - def get_lines( feature ): - if isinstance( feature, gff_util.GFFFeature ): - return feature.lines() - else: - return [ feature.rstrip( '\r\n' ) ] - - skipped_lines = 0 - first_invalid_line = 0 - invalid_lines = [] - fout = open( output_filename, "w" ) - warnings = [] - warning = '' - twobitfile = None - file_iterator = open( input_filename ) - if gff_format and interpret_features: - file_iterator = gff_util.GFFReaderWrapper( file_iterator, fix_strand=False ) - line_count = 1 - for feature in file_iterator: - # Ignore comments, headers. 
- if isinstance( feature, ( Header, Comment ) ): - line_count += 1 - continue - - if gff_format and interpret_features: - # Processing features. - gff_util.convert_gff_coords_to_bed( feature ) - chrom = feature.chrom - start = feature.start - end = feature.end - strand = feature.strand - else: - # Processing lines, either interval or GFF format. - line = feature.rstrip( '\r\n' ) - if line and not line.startswith( "#" ): - fields = line.split( '\t' ) - try: - chrom = fields[chrom_col] - start = int( fields[start_col] ) - end = int( fields[end_col] ) - if gff_format: - start, end = gff_util.convert_gff_coords_to_bed( [start, end] ) - if includes_strand_col: - strand = fields[strand_col] - except: - warning = "Invalid chrom, start or end column values. " - warnings.append( warning ) - if not invalid_lines: - invalid_lines = get_lines( feature ) - first_invalid_line = line_count - skipped_lines += len( invalid_lines ) - continue - if start > end: - warning = "Invalid interval, start '%d' > end '%d'. " % ( start, end ) - warnings.append( warning ) - if not invalid_lines: - invalid_lines = get_lines( feature ) - first_invalid_line = line_count - skipped_lines += len( invalid_lines ) - continue - - if strand not in ['+', '-']: - strand = '+' - sequence = '' - else: - continue - - # Open sequence file and get sequence for feature/interval. - if seq_path and os.path.exists( "%s/%s.nib" % ( seq_path, chrom ) ): - # TODO: improve support for GFF-nib interaction. - if chrom in nibs: - nib = nibs[chrom] - else: - nibs[chrom] = nib = bx.seq.nib.NibFile( file( "%s/%s.nib" % ( seq_path, chrom ) ) ) - try: - sequence = nib.get( start, end-start ) - except Exception, e: - warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " %( start, end-start, dbkey ) - warnings.append( warning ) - if not invalid_lines: - invalid_lines = get_lines( feature ) - first_invalid_line = line_count - skipped_lines += len( invalid_lines ) - continue - elif seq_path and os.path.isfile( seq_path ): - if not(twobitfile): - twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) ) - try: - if options.gff and interpret_features: - # Create sequence from intervals within a feature. - sequence = '' - for interval in feature.intervals: - sequence += twobitfile[interval.chrom][interval.start:interval.end] - else: - sequence = twobitfile[chrom][start:end] - except: - warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " %( start, end-start, chrom ) - warnings.append( warning ) - if not invalid_lines: - invalid_lines = get_lines( feature ) - first_invalid_line = line_count - skipped_lines += len( invalid_lines ) - continue - else: - warning = "Chromosome by name '%s' was not found for build '%s'. " % ( chrom, dbkey ) - warnings.append( warning ) - if not invalid_lines: - invalid_lines = get_lines( feature ) - first_invalid_line = line_count - skipped_lines += len( invalid_lines ) - continue - if sequence == '': - warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. 
" \ - % ( chrom, start, end, dbkey ) - warnings.append( warning ) - if not invalid_lines: - invalid_lines = get_lines( feature ) - first_invalid_line = line_count - skipped_lines += len( invalid_lines ) - continue - if includes_strand_col and strand == "-": - sequence = reverse_complement( sequence ) - - if output_format == "fasta" : - l = len( sequence ) - c = 0 - if gff_format: - start, end = gff_util.convert_bed_coords_to_gff( [ start, end ] ) - fields = [dbkey, str( chrom ), str( start ), str( end ), strand] - meta_data = "_".join( fields ) - fout.write( ">%s\n" % meta_data ) - while c < l: - b = min( c + 50, l ) - fout.write( "%s\n" % str( sequence[c:b] ) ) - c = b - else: # output_format == "interval" - if gff_format and interpret_features: - # TODO: need better GFF Reader to capture all information needed - # to produce this line. - meta_data = "\t".join( - [feature.chrom, "galaxy_extract_genomic_dna", "interval", \ - str( feature.start ), str( feature.end ), feature.score, feature.strand, - ".", gff_util.gff_attributes_to_str( feature.attributes, "GTF" ) ] ) - else: - meta_data = "\t".join( fields ) - if gff_format: - format_str = "%s seq \"%s\";\n" - else: - format_str = "%s\t%s\n" - fout.write( format_str % ( meta_data, str( sequence ) ) ) - - # Update line count. - if isinstance( feature, gff_util.GFFFeature ): - line_count += len( feature.intervals ) - else: - line_count += 1 - - fout.close() - - if warnings: - warn_msg = "%d warnings, 1st is: " % len( warnings ) - warn_msg += warnings[0] - print warn_msg - if skipped_lines: - # Error message includes up to the first 10 skipped lines. - print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, '\n'.join( invalid_lines[:10] ) ) - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/extract/extract_genomic_dna.xml --- a/tools/extract/extract_genomic_dna.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,174 +0,0 @@ - - using coordinates from assembled/unassembled genomes - - extract_genomic_dna.py $input $out_file1 -o $out_format -d $dbkey - - #if str( $interpret_features ) == "yes": - -I - #end if - - ## Columns to use in input file. - #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -1 1,4,5,7 --gff - #else: - -1 ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol} - #end if - - #if $seq_source.index_source == "cached": - ## Genomic data from cache. - -g ${GALAXY_DATA_INDEX_DIR} - #else: - ## Genomic data from history. - -F $seq_source.ref_file - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - faToTwoBit - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -This tool requires interval or gff (special tabular formatted data). If your data is not TAB delimited, first use *Text Manipulation->Convert*. - -.. class:: warningmark - -Make sure that the genome build is specified for the dataset from which you are extracting sequences (click the pencil icon in the history item if it is not specified). - -.. class:: warningmark - -All of the following will cause a line from the input dataset to be skipped and a warning generated. The number of warnings and skipped lines is documented in the resulting history item. - - Any lines that do not contain at least 3 columns, a chromosome and numerical start and end coordinates. 
- - Sequences that fall outside of the range of a line's start and end coordinates. - - Chromosome, start or end coordinates that are invalid for the specified build. - - Any lines whose data columns are not separated by a **TAB** character ( other white-space characters are invalid ). - -.. class:: infomark - - **Extract genomic DNA using coordinates from ASSEMBLED genomes and UNassembled genomes** previously were achieved by two separate tools. - ------ - -**What it does** - -This tool uses coordinate, strand, and build information to fetch genomic DNAs in FASTA or interval format. - -If strand is not defined, the default value is "+". - ------ - -**Example** - -If the input dataset is:: - - chr7 127475281 127475310 NM_000230 0 + - chr7 127485994 127486166 NM_000230 0 + - chr7 127486011 127486166 D49487 0 + - -Extracting sequences with **FASTA** output data type returns:: - - >hg17_chr7_127475281_127475310_+ - GTAGGAATCGCAGCGCCAGCGGTTGCAAG - >hg17_chr7_127485994_127486166_+ - GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCG - GATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATC - CAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAG - GATCAATGACATTTCACACACG - >hg17_chr7_127486011_127486166_+ - TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGG - CCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGA - CACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCAC - ACACG - -Extracting sequences with **Interval** output data type returns:: - - chr7 127475281 127475310 NM_000230 0 + GTAGGAATCGCAGCGCCAGCGGTTGCAAG - chr7 127485994 127486166 NM_000230 0 + GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG - chr7 127486011 127486166 D49487 0 + TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG - - - diff -r c2a356708570 -r 33c067c3ae34 tools/extract/liftOver_wrapper.py --- a/tools/extract/liftOver_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,83 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda -""" -Converts coordinates from one build/assembly to another using liftOver binary and mapping files downloaded from UCSC. -""" - -import os, string, subprocess, sys -import tempfile -import re - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def safe_bed_file(infile): - """Make a BED file with track and browser lines ready for liftOver. - - liftOver will fail with track or browser lines. We can make it happy - by converting these to comments. 
See: - - https://lists.soe.ucsc.edu/pipermail/genome/2007-May/013561.html - """ - fix_pat = re.compile("^(track|browser)") - (fd, fname) = tempfile.mkstemp() - in_handle = open(infile) - out_handle = open(fname, "w") - for line in in_handle: - if fix_pat.match(line): - line = "#" + line - out_handle.write(line) - in_handle.close() - out_handle.close() - return fname - -if len( sys.argv ) < 9: - stop_err( "USAGE: prog input out_file1 out_file2 input_dbkey output_dbkey infile_type minMatch multiple " ) - -infile = sys.argv[1] -outfile1 = sys.argv[2] -outfile2 = sys.argv[3] -in_dbkey = sys.argv[4] -mapfilepath = sys.argv[5] -infile_type = sys.argv[6] -gff_option = "" -if infile_type == "gff": - gff_option = "-gff " -minMatch = sys.argv[7] -multiple = int(sys.argv[8]) -multiple_option = "" -if multiple: - minChainT = sys.argv[9] - minChainQ = sys.argv[10] - minSizeQ = sys.argv[11] - multiple_option = " -multiple -minChainT=%s -minChainQ=%s -minSizeQ=%s " %(minChainT,minChainQ,minSizeQ) - -try: - assert float(minMatch) -except: - minMatch = 0.1 -#ensure dbkey is set -if in_dbkey == "?": - stop_err( "Input dataset genome build unspecified, click the pencil icon in the history item to specify it." ) - -if not os.path.isfile( mapfilepath ): - stop_err( "%s mapping is not currently available." % ( mapfilepath.split('/')[-1].split('.')[0] ) ) - -safe_infile = safe_bed_file(infile) -cmd_line = "liftOver " + gff_option + "-minMatch=" + str(minMatch) + multiple_option + " " + safe_infile + " " + mapfilepath + " " + outfile1 + " " + outfile2 + " > /dev/null" - -try: - # have to nest try-except in try-finally to handle 2.4 - try: - proc = subprocess.Popen( args=cmd_line, shell=True, stderr=subprocess.PIPE ) - returncode = proc.wait() - stderr = proc.stderr.read() - if returncode != 0: - raise Exception, stderr - except Exception, e: - raise Exception, 'Exception caught attempting conversion: ' + str( e ) -finally: - os.remove(safe_infile) diff -r c2a356708570 -r 33c067c3ae34 tools/extract/liftOver_wrapper.xml --- a/tools/extract/liftOver_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,145 +0,0 @@ - - between assemblies and genomes - - liftOver_wrapper.py - $input - "$out_file1" - "$out_file2" - $dbkey - $to_dbkey - #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__) or isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gtf').__class__): - "gff" - #else: - "interval" - #end if - $minMatch ${multiple.choice} ${multiple.minChainT} ${multiple.minChainQ} ${multiple.minSizeQ} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ucsc_tools - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -Make sure that the genome build of the input dataset is specified (click the pencil icon in the history item to set it if necessary). - -.. class:: warningmark - -This tool can work with interval, GFF, and GTF datasets. It requires the interval datasets to have chromosome in column 1, -start co-ordinate in column 2 and end co-ordinate in column 3. BED comments -and track and browser lines will be ignored, but if other non-interval lines -are present the tool will return empty output datasets. - ------ - -.. class:: infomark - -**What it does** - -This tool is based on the LiftOver utility and Chain track from `the UC Santa Cruz Genome Browser`__. 
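
In outline, the wrapper does two things: it comments out track and browser lines, which liftOver refuses to process, and then shells out to the UCSC liftOver binary. A condensed sketch of that flow; the file names in the commented usage line are placeholders, not files shipped with the tool::

    import re
    import subprocess
    import tempfile

    def liftover(bed_in, chain, mapped, unmapped, min_match=0.1):
        # liftOver rejects track/browser lines, so comment them out first.
        fix = re.compile(r'^(track|browser)')
        safe = tempfile.NamedTemporaryFile(mode='w', suffix='.bed', delete=False)
        with open(bed_in) as src:
            for line in src:
                safe.write('#' + line if fix.match(line) else line)
        safe.close()
        # Invocation: liftOver oldFile map.chain newFile unMapped
        subprocess.check_call(['liftOver', '-minMatch=%s' % min_match,
                               safe.name, chain, mapped, unmapped])

    # liftover('peaks_hg16.bed', 'hg16ToHg18.over.chain', 'mapped.bed', 'unmapped.bed')
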
- -It converts coordinates and annotations between assemblies and genomes. It produces 2 files, one containing all the mapped coordinates and the other containing the unmapped coordinates, if any. - - .. __: http://genome.ucsc.edu/ - ------ - -**Example** - -Converting the following hg16 intervals to hg18 intervals:: - - chrX 85170 112199 AK002185 0 + - chrX 110458 112199 AK097346 0 + - chrX 112203 121212 AK074528 0 - - -will produce the following hg18 intervals:: - - chrX 132991 160020 AK002185 0 + - chrX 158279 160020 AK097346 0 + - chrX 160024 169033 AK074528 0 - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/extract/phastOdds/get_scores_galaxy.py --- a/tools/extract/phastOdds/get_scores_galaxy.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,147 +0,0 @@ -#!/usr/bin/env python - -""" -usage: %prog data_file.h5 region_mapping.bed in_file out_file chrom_col start_col end_col [options] - -p, --perCol: standardize to lod per column -""" - -from __future__ import division - -import sys -from galaxy import eggs -from numpy import * -from tables import * - -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -from bx import intervals - -# ignore wanrnings about NumArray flavor -from warnings import filterwarnings -from tables.exceptions import FlavorWarning -filterwarnings("ignore", category=FlavorWarning) - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write(msg) - sys.exit() - -def main(): - # Parse command line - options, args = doc_optparse.parse( __doc__ ) - try: - h5_fname = args[0] - mapping_fname = args[1] - in_fname = args[2] - out_fname = args[3] - chrom_col, start_col, end_col = map( lambda x: int( x ) - 1, args[4:7] ) - per_col = bool( options.perCol ) - except Exception, e: - doc_optparse.exception() - - if h5_fname == 'None.h5': - stop_err( 'Invalid genome build, this tool currently only works with data from build hg17. Click the pencil icon in your history item to correct the build if appropriate.' ) - - # Open the h5 file - h5 = openFile( h5_fname, mode = "r" ) - # Load intervals and names for the subregions - intersecters = {} - for i, line in enumerate( file( mapping_fname ) ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - chr, start, end, name = line.split()[0:4] - if not intersecters.has_key( chr ): - intersecters[ chr ] = intervals.Intersecter() - intersecters[ chr ].add_interval( intervals.Interval( int( start ), int( end ), name ) ) - - # Find the subregion containing each input interval - skipped_lines = 0 - first_invalid_line = 0 - invalid_line = '' - out_file = open( out_fname, "w" ) - warnings = [] - warning = '' - for i, line in enumerate( file( in_fname ) ): - line = line.rstrip( '\r\n' ) - if line.startswith( '#' ): - if i == 0: - out_file.write( "%s\tscore\n" % line ) - else: - out_file.write( "%s\n" % line ) - fields = line.split( "\t" ) - try: - chr = fields[ chrom_col ] - start = int( fields[ start_col ] ) - end = int( fields[ end_col ] ) - except: - warning = "Invalid value for chrom, start or end column." - warnings.append( warning ) - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - continue - # Find matching interval - try: - matches = intersecters[ chr ].find( start, end ) - except: - warning = "'%s' is not a valid chrom value for the region. 
" %chr - warnings.append( warning ) - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - continue - if not len( matches ) == 1: - warning = "Interval must match exactly one target region. " - warnings.append( warning ) - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - continue - region = matches[0] - if not ( start >= region.start and end <= region.end ): - warning = "Interval must fall entirely within region. " - warnings.append( warning ) - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - continue - region_name = region.value - rel_start = start - region.start - rel_end = end - region.start - if not rel_start < rel_end: - warning = "Region %s is empty, relative start:%d, relative end:%d. " % ( region_name, rel_start, rel_end ) - warnings.append( warning ) - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - continue - s = h5.getNode( h5.root, "scores_" + region_name ) - c = h5.getNode( h5.root, "counts_" + region_name ) - score = s[rel_end-1] - count = c[rel_end-1] - if rel_start > 0: - score -= s[rel_start-1] - count -= c[rel_start-1] - if per_col: - score /= count - fields.append( str( score ) ) - out_file.write( "%s\n" % "\t".join( fields ) ) - # Close the file handle - h5.close() - out_file.close() - - if warnings: - warn_msg = "PhastOdds scores are only available for ENCODE regions. %d warnings, 1st is: " % len( warnings ) - warn_msg += warnings[0] - print warn_msg - if skipped_lines: - print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/extract/phastOdds/phastOdds_tool.xml --- a/tools/extract/phastOdds/phastOdds_tool.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,67 +0,0 @@ - - for each interval - get_scores_galaxy.py $per_col ${score_file}.h5 ${score_file}.mapping.bed $input $output ${input.metadata.chromCol} ${input.metadata.startCol} ${input.metadata.endCol} - - - - - - - - - - - - - - - - - - - - numpy - tables - - - - - - - - - - - -.. class:: warningmark - -This tool currently only works with interval data from genome build hg17. - -.. class:: warningmark - -This tool assumes that the input dataset is in interval format and contains at least a chrom column, a start column and an end column. These 3 columns can be dispersed throughout any number of other data columns. - ------ - -**Syntax** - -Append a column to each line of an interval file containing the phastOdds score for that interval. 
- ------ - -**Example** - -If your original data has the following format: - -+-----+-----+---+ -|chrom|start|end| -+-----+-----+---+ - -and you choose to compute phastOdds scores, your output will look like this: - -+-----+-----+---+-----+ -|chrom|start|end|score| -+-----+-----+---+-----+ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_compute_length.py --- a/tools/fasta_tools/fasta_compute_length.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -#!/usr/bin/env python -""" -Input: fasta, int -Output: tabular -Return titles with lengths of corresponding seq -""" - -import sys, os - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - - infile = sys.argv[1] - out = open( sys.argv[2], 'w') - keep_first_char = int( sys.argv[3] ) - - fasta_title = '' - seq_len = 0 - - # number of char to keep in the title - if keep_first_char == 0: - keep_first_char = None - else: - keep_first_char += 1 - - first_entry = True - - for line in open( infile ): - line = line.strip() - if not line or line.startswith( '#' ): - continue - if line[0] == '>': - if first_entry == False: - out.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first_char ], seq_len ) ) - else: - first_entry = False - fasta_title = line - seq_len = 0 - else: - seq_len += len(line) - - # last fasta-entry - out.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first_char ], seq_len ) ) - out.close() - -if __name__ == "__main__" : __main__() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_compute_length.xml --- a/tools/fasta_tools/fasta_compute_length.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ - - - fasta_compute_length.py $input $output $keep_first - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool counts the length of each fasta sequence in the file. The output file has two columns per line (separated by tab): fasta titles and lengths of the sequences. The option *How many characters to keep?* allows to select a specified number of letters from the beginning of each FASTA entry. - ------ - -**Example** - -Suppose you have the following FASTA formatted sequences from a Roche (454) FLX sequencing run:: - - >EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG >EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATACTCACAGGCTTATACAATACAAATGTAAfa - -Running this tool while setting **How many characters to keep?** to **14** will produce this:: - - EYKX4VC02EQLO5 108 - EYKX4VC02D4GS2 60 - - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_concatenate_by_species.py --- a/tools/fasta_tools/fasta_concatenate_by_species.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -""" -Takes a Multiple Alignment FASTA file and concatenates -sequences for each species, resulting in one sequence -alignment per species. 
-""" - -import sys, tempfile -from galaxy import eggs -from galaxy.tools.util.maf_utilities import iter_fasta_alignment -from galaxy.util.odict import odict - -def __main__(): - input_filename = sys.argv[1] - output_filename = sys.argv[2] - species = odict() - cur_size = 0 - for components in iter_fasta_alignment( input_filename ): - species_not_written = species.keys() - for component in components: - if component.species not in species: - species[component.species] = tempfile.TemporaryFile() - species[component.species].write( "-" * cur_size ) - species[component.species].write( component.text ) - try: - species_not_written.remove( component.species ) - except ValueError: - #this is a new species - pass - for spec in species_not_written: - species[spec].write( "-" * len( components[0].text ) ) - cur_size += len( components[0].text ) - out = open( output_filename, 'wb' ) - for spec, f in species.iteritems(): - f.seek( 0 ) - out.write( ">%s\n%s\n" % ( spec, f.read() ) ) - out.close() - -if __name__ == "__main__" : __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_concatenate_by_species.xml --- a/tools/fasta_tools/fasta_concatenate_by_species.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - FASTA alignment by species - fasta_concatenate_by_species.py $input1 $out_file1 - - - - - - - - - - - - - - -**What it does** - -This tools attempts to parse FASTA headers to determine the species for each sequence in a multiple FASTA alignment. -It then linearly concatenates the sequences for each species in the file, creating one sequence per determined species. - -------- - -**Example** - -Starting FASTA:: - - >hg18.chr1(+):10016339-10016341|hg18_0 - GT - >panTro2.chr1(+):10195380-10195382|panTro2_0 - GT - >rheMac2.chr1(+):13119747-13119749|rheMac2_0 - GT - >mm8.chr4(-):148269679-148269681|mm8_0 - GT - >canFam2.chr5(+):66213635-66213637|canFam2_0 - GT - - >hg18.chr1(-):100323677-100323679|hg18_1 - GT - >panTro2.chr1(-):101678671-101678673|panTro2_1 - GT - >rheMac2.chr1(-):103154011-103154013|rheMac2_1 - GT - >mm8.chr3(+):116620616-116620618|mm8_1 - GT - >canFam2.chr6(+):52954092-52954094|canFam2_1 - GT - - - -becomes:: - - >hg18 - GTGT - >panTro2 - GTGT - >rheMac2 - GTGT - >mm8 - GTGT - >canFam2 - GTGT - - -.. class:: warningmark - - This tool will only work properly on files with Galaxy style FASTA headers. - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_filter_by_length.py --- a/tools/fasta_tools/fasta_filter_by_length.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,52 +0,0 @@ -#!/usr/bin/env python -""" -Input: fasta, minimal length, maximal length -Output: fasta -Return sequences whose lengths are within the range. -""" - -import sys, os - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def __main__(): - input_filename = sys.argv[1] - try: - min_length = int( sys.argv[2] ) - except: - stop_err( "Minimal length of the return sequence requires a numerical value." ) - try: - max_length = int( sys.argv[3] ) - except: - stop_err( "Maximum length of the return sequence requires a numerical value." 
) - output_filename = sys.argv[4] - output_handle = open( output_filename, 'w' ) - tmp_size = 0 #-1 - tmp_buf = '' - at_least_one = 0 - for line in file(input_filename): - if not line or line.startswith('#'): - continue - if line[0] == '>': - if min_length <= tmp_size <= max_length or (min_length <= tmp_size and max_length == 0): - output_handle.write(tmp_buf) - at_least_one = 1 - tmp_buf = line - tmp_size = 0 - else: - if max_length == 0 or tmp_size < max_length: - tmp_size += len(line.rstrip('\r\n')) - tmp_buf += line - # final flush of buffer - if min_length <= tmp_size <= max_length or (min_length <= tmp_size and max_length == 0): - output_handle.write(tmp_buf.rstrip('\r\n')) - at_least_one = 1 - output_handle.close() - if at_least_one == 0: - print "There is no sequence that falls within your range." - -if __name__ == "__main__" : __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_filter_by_length.xml --- a/tools/fasta_tools/fasta_filter_by_length.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ - - - fasta_filter_by_length.py $input $min_length $max_length $output - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP**. To return sequences longer than a certain length, set *Minimal length* to desired value and leave *Maximum length* set to '0'. - ------ - -**What it does** - -Outputs sequences between *Minimal length* and *Maximum length*. - ------ - -**Example** - -Suppose you have the following FASTA formatted sequences:: - - >seq1 - TCATTTAATGAC - >seq2 - ATGGC - >seq3 - TCACATGATGCCG - >seq4 - ATGGAAGC - -Setting the **Minimal length** to **10**, and the **Maximum length** to **0** will return all sequences longer than 10 bp:: - - >seq1 - TCATTTAATGAC - >seq3 - TCACATGATGCCG - - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_to_tabular.py --- a/tools/fasta_tools/fasta_to_tabular.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ -#!/usr/bin/env python -# This code exists in 2 places: ~/datatypes/converters and ~/tools/fasta_tools -""" -Input: fasta (input file), tabular (output file), int (truncation of id), int (columns from description) -Output: tabular -format convert: fasta to tabular -""" - -import sys, os - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def __main__(): - if len(sys.argv) != 5: - stop_err("Wrong number of argument. 
Expect four (fasta, tabular, truncation, columns)") - infile = sys.argv[1] - outfile = sys.argv[2] - keep_first = int( sys.argv[3] ) - descr_split = int( sys.argv[4] ) - fasta_title = fasta_seq = '' - if keep_first == 0: - keep_first = None - elif descr_split == 1: - #Added one for the ">" character - #(which is removed if using descr_split > 1) - keep_first += 1 - if descr_split < 1: - stop_err("Bad description split value (should be 1 or more)") - out = open( outfile, 'w' ) - for i, line in enumerate( open( infile ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if line.startswith( '>' ): - #Don't want any existing tabs to trigger extra columns: - line = line.replace('\t', ' ') - if i > 0: - out.write('\n') - if descr_split == 1: - out.write(line[1:keep_first]) - else: - words = line[1:].split(None, descr_split-1) - #apply any truncation to first word (the id) - words[0] = words[0][0:keep_first] - #pad with empty columns if required - words += [""]*(descr_split-len(words)) - out.write("\t".join(words)) - out.write('\t') - else: - out.write(line) - if i > 0: - out.write('\n') - out.close() - -if __name__ == "__main__" : __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_to_tabular.xml --- a/tools/fasta_tools/fasta_to_tabular.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,128 +0,0 @@ - - converter - fasta_to_tabular.py $input $output $keep_first $descr_columns - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts FASTA formatted sequences to TAB-delimited format. - -Many tools consider the first word of the FASTA ">" title line to be an identifier, and any remaining text to be a free form description. -It is therefore useful to split this text into two columns in Galaxy (identifier and any description) by setting **How many columns to divide title string into?** to **2**. -In some cases the description can be usefully broken up into more columns -- see the examples . - -The option *How many characters to keep?* allows to select a specified number of letters from the beginning of each FASTA entry. -With the introduction of the **How many columns to divide title string into?** option this setting is of limited use, but does still allow you to truncate the identifier. - ------ - -**Example** - -Suppose you have the following FASTA formatted sequences from a Roche (454) FLX sequencing run:: - - >EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ - TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG - TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG - >EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ - AATAAAACTAAATCAGCAAAGACTGGCAAATACTCACAGGCTTATACAATACAAATGTAA - -Running this tool with the default settings will produce this (2 column output): - -========================================================================== ======================================= -EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG -EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA -========================================================================== ======================================= - -Having the full title line (the FASTA ">" line text) as a column is not always ideal. 
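A compact Python 3 sketch of the conversion the deleted fasta_to_tabular.py performs: the title line is split into a fixed number of columns (identifier first, with optional truncation), followed by the concatenated sequence. The fasta_to_tabular() helper name below is illustrative only::

    def fasta_to_tabular(lines, keep_first=0, descr_split=1):
        rows, title, seq = [], None, []
        for line in lines:
            line = line.rstrip("\r\n")
            if line.startswith(">"):
                if title is not None:
                    rows.append(title + ["".join(seq)])
                # Existing tabs must not create extra columns.
                words = line[1:].replace("\t", " ").split(None, descr_split - 1)
                if keep_first:
                    words[0] = words[0][:keep_first]        # truncate identifier
                words += [""] * (descr_split - len(words))  # pad missing columns
                title, seq = words, []
            elif title is not None:
                seq.append(line)
        if title is not None:
            rows.append(title + ["".join(seq)])
        return ["\t".join(r) for r in rows]

    print("\n".join(fasta_to_tabular(
        [">EYKX4VC02EQLO5 length=108", "TCCGCGCC", "GAGCATG"],
        keep_first=10, descr_split=2)))
    # EYKX4VC02E <TAB> length=108 <TAB> TCCGCGCCGAGCATG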
- -The **How many characters to keep?** option is useful if your identifiers are all the same length. -In this example the identifier is 14 characters, so setting **How many characters to keep?** to **14** (and leaving **How many columns to divide title string into?** as the default, **1**) will produce this (2 column output): - -============== ======================================= -EYKX4VC02EQLO5 TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG -EYKX4VC02D4GS2 AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA -============== ======================================= - -If however your FASTA file has identifiers of variable length, it is better to split the text into at least two columns. -Running this tool with **How many columns to divide title string into?** to **2** will produce this (3 column output): - -============== =========================================================== ======================================= -EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG -EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA -============== =========================================================== ======================================= - -Running this tool with **How many columns to divide title string into?** to **5** will produce this (5 column output): - -============== ========== ============ ======== ========================== ======================================= -EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG -EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA -============== ========== ============ ======== ========================== ======================================= - -Running this tool with **How many columns to divide title string into?** to **5** and **How many characters to keep?** to **10** will produce this (5 column output). -Notice that only the first column is truncated to 10 characters -- and be careful not to trim your sequence names too much (generally they should be unique): - -========== ========== ============ ======== ========================== ======================================= -EYKX4VC02E length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG -EYKX4VC02D length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA -========== ========== ============ ======== ========================== ======================================= - -Note the sequences have been truncated for display purposes in the above tables. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/tabular_to_fasta.py --- a/tools/fasta_tools/tabular_to_fasta.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ -#!/usr/bin/env python -""" -Input: fasta, minimal length, maximal length -Output: fasta -Return sequences whose lengths are within the range. -""" -import sys, os - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def __main__(): - infile = sys.argv[1] - title_col = sys.argv[2] - seq_col = sys.argv[3] - outfile = sys.argv[4] - - if title_col == None or title_col == 'None' or seq_col == None or seq_col == 'None': - stop_err( "Columns not specified." ) - try: - seq_col = int( seq_col ) - 1 - except: - stop_err( "Invalid Sequence Column: %s." 
%str( seq_col ) ) - - title_col_list = title_col.split( ',' ) - out = open( outfile, 'w' ) - skipped_lines = 0 - first_invalid_line = 0 - invalid_line = "" - i = 0 - - for i, line in enumerate( open( infile ) ): - error = False - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - fields = line.split( '\t' ) - fasta_title = [] - for j in title_col_list: - try: - j = int( j ) - 1 - fasta_title.append( fields[j] ) - except: - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - error = True - break - if not error: - try: - fasta_seq = fields[seq_col] - if fasta_title[0].startswith( ">" ): - fasta_title[0] = fasta_title[0][1:] - print >> out, ">%s\n%s" % ( "_".join( fasta_title ), fasta_seq ) - except: - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - out.close() - - if skipped_lines > 0: - print 'Data issue: skipped %d blank or invalid lines starting at #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) - -if __name__ == "__main__" : __main__() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/tabular_to_fasta.xml --- a/tools/fasta_tools/tabular_to_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ - - converts tabular file to FASTA format - tabular_to_fasta.py $input $title_col $seq_col $output - - - - - - - - - - - - - - - - - - -**What it does** - -Converts tab delimited data into FASTA formatted sequences. - ------------ - -**Example** - -Suppose this is a sequence file produced by Illumina (Solexa) sequencer:: - - 5 300 902 419 GACTCATGATTTCTTACCTATTAGTGGTTGAACATC - 5 300 880 431 GTGATATGTATGTTGACGGCCATAAGGCTGCTTCTT - -Selecting **c3** and **c4** as the **Title column(s)** and **c5** as the **Sequence column** will result in:: - - >902_419 - GACTCATGATTTCTTACCTATTAGTGGTTGAACATC - >880_431 - GTGATATGTATGTTGACGGCCATAAGGCTGCTTCTT - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_combiner.py --- a/tools/fastq/fastq_combiner.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -#Dan Blankenberg -import sys, os, shutil -from galaxy_utils.sequence.fastq import fastqWriter, fastqSequencingRead, fastqCombiner, fastqFakeFastaScoreReader -from galaxy_utils.sequence.fasta import fastaReader, fastaNamedReader - -def main(): - #Read command line arguments - fasta_filename = sys.argv[1] - fasta_type = sys.argv[2] or 'fasta' #should always be fasta or csfasta? what if txt? 
- qual_filename = sys.argv[3] - qual_type = sys.argv[4] or 'qualsanger' #qual454 qualsolid - output_filename = sys.argv[5] - force_quality_encoding = sys.argv[6] - if force_quality_encoding == 'None': - force_quality_encoding = None - - format = 'sanger' - if fasta_type == 'csfasta' or qual_type == 'qualsolid': - format = 'cssanger' - elif qual_type == 'qualsolexa': - format = 'solexa' - elif qual_type == 'qualillumina': - format = 'illumina' - - out = fastqWriter( open( output_filename, 'wb' ), format = format, force_quality_encoding = force_quality_encoding ) - if qual_filename == 'None': - qual_input = fastqFakeFastaScoreReader( format, quality_encoding = force_quality_encoding ) - else: - qual_input = fastaNamedReader( open( qual_filename, 'rb' ) ) - - fastq_combiner = fastqCombiner( format ) - i = None - skip_count = 0 - for i, sequence in enumerate( fastaReader( open( fasta_filename, 'rb' ) ) ): - quality = qual_input.get( sequence ) - if quality: - fastq_read = fastq_combiner.combine( sequence, quality ) - out.write( fastq_read ) - else: - skip_count += 1 - out.close() - if i is None: - print "Your file contains no valid FASTA sequences." - else: - print qual_input.has_data() - print 'Combined %s of %s sequences with quality scores (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_combiner.xml --- a/tools/fastq/fastq_combiner.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ - - into FASTQ - fastq_combiner.py '$fasta_file' '${fasta_file.extension}' '$qual_file' '${qual_file.extension}' '$output_file' '$force_quality_encoding' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool joins a FASTA file to a Quality Score file, creating a single FASTQ block for each read. - -Specifying a set of quality scores is optional; when not provided, the output will be fastqsanger or fastqcssanger (when a csfasta is provided) with each quality score being the maximal allowed value (93). - -Use this tool, for example, to convert 454-type output to FASTQ. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. 
<http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_filter.py --- a/tools/fastq/fastq_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,34 +0,0 @@ -#Dan Blankenberg -import sys, os, shutil -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter - -def main(): - #Read command line arguments - input_filename = sys.argv[1] - script_filename = sys.argv[2] - output_filename = sys.argv[3] - additional_files_path = sys.argv[4] - input_type = sys.argv[5] or 'sanger' - - #Save script file for debuging/verification info later - os.mkdir( additional_files_path ) - shutil.copy( script_filename, os.path.join( additional_files_path, 'debug.txt' ) ) - - out = fastqWriter( open( output_filename, 'wb' ), format = input_type ) - - i = None - reads_kept = 0 - for i, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - local = {'fastq_read':fastq_read, 'ret_val':False} - execfile( script_filename, {}, local ) - if local['ret_val']: - out.write( fastq_read ) - reads_kept += 1 - out.close() - if i is None: - print "Your file contains no valid fastq reads." - else: - print 'Kept %s of %s reads (%.2f%%).' % ( reads_kept, i + 1, float( reads_kept ) / float( i + 1 ) * 100.0 ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_filter.xml --- a/tools/fastq/fastq_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,318 +0,0 @@ - - reads by quality score and length - fastq_filter.py $input_file $fastq_filter_file $output_file $output_file.files_path '${input_file.extension[len( 'fastq' ):]}' - - - - - - - - - - - - - - - - - - - - - - - int( float( value ) ) == float( value ) - - - - int( float( value ) ) == float( value ) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -def fastq_read_pass_filter( fastq_read ): - def mean( score_list ): - return float( sum( score_list ) ) / float( len( score_list ) ) - if len( fastq_read ) < $min_size: - return False - if $max_size > 0 and len( fastq_read ) > $max_size: - return False - num_deviates = $max_num_deviants - qual_scores = fastq_read.get_decimal_quality_scores() - for qual_score in qual_scores: - if qual_score < $min_quality or ( $max_quality > 0 and qual_score > $max_quality ): - if num_deviates == 0: - return False - else: - num_deviates -= 1 -#if $paired_end.value == 'single_end': - qual_scores_split = [ qual_scores ] -#else: - qual_scores_split = [ qual_scores[ 0:int( len( qual_scores ) / 2 ) ], qual_scores[ int( len( qual_scores ) / 2 ): ] ] -#end if -#for $fastq_filter in $fastq_filters: - for split_scores in qual_scores_split: - left_column_offset = $fastq_filter[ 'offset_type' ][ 'left_column_offset' ] - right_column_offset = $fastq_filter[ 'offset_type' ][ 'right_column_offset' ] -#if $fastq_filter[ 'offset_type' ]['base_offset_type'] == 'offsets_percent': - left_column_offset = int( round( float( left_column_offset ) / 100.0 * float( len( split_scores ) ) ) ) - right_column_offset = int( round( float( right_column_offset ) / 100.0 * float( len( split_scores ) ) ) ) -#end if - if right_column_offset > 0: - split_scores = split_scores[ left_column_offset:-right_column_offset] - else: - split_scores = split_scores[ left_column_offset:] - if split_scores: ##if a read doesn't have enough columns, it passes by default - if not ( ${fastq_filter[ 'score_operation' ]}( split_scores ) $fastq_filter[ 'score_comparison' ] $fastq_filter[ 
'score' ] ): - return False -#end for - return True -ret_val = fastq_read_pass_filter( fastq_read ) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool allows you to build complex filters to be applied to each read in a FASTQ file. - -**Basic Options:** - * You can specify a minimum and maximum read lengths. - * You can specify minimum and maximum per base quality scores, with optionally specifying the number of bases that are allowed to deviate from this range (default of 0 deviant bases). - * If your data is paired-end, select the proper checkbox; this will cause each read to be internally split down the middle and filters applied to each half using the offsets specified. - -**Advance Options:** - * You can specify any number of advanced filters. - * 5' and 3' offsets are defined, starting at zero, increasing from the respective end of the reads. For example, a quality string of "ABCDEFG", with 5' and 3' offsets of 1 and 1, respectively, specified will yield "BCDEF". - * You can specify either absolute offset values, or percentage offset values. *Absolute Values* based offsets are useful for fixed length reads (e.g. Illumina or SOLiD data). *Percentage of Read Length* based offsets are useful for variable length reads (e.g. 454 data). When using the percent-based method, offsets are rounded to the nearest integer. - * The user specifies the aggregating action (min, max, sum, mean) to perform on the quality score values found between the specified offsets to be used with the user defined comparison operation and comparison value. - * If a set of offsets is specified that causes the remaining quality score list to be of length zero, then the read will **pass** the quality filter unless the size range filter is used to remove these reads. - ------ - -.. class:: warningmark - -Adapter bases in color space reads are excluded from filtering. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. 
<http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_groomer.py --- a/tools/fastq/fastq_groomer.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -#Dan Blankenberg -import sys -from galaxy_utils.sequence.fastq import fastqReader, fastqVerboseErrorReader, fastqAggregator, fastqWriter - -def main(): - input_filename = sys.argv[1] - input_type = sys.argv[2] - output_filename = sys.argv[3] - output_type = sys.argv[4] - force_quality_encoding = sys.argv[5] - summarize_input = sys.argv[6] == 'summarize_input' - if force_quality_encoding == 'None': - force_quality_encoding = None - - aggregator = fastqAggregator() - out = fastqWriter( open( output_filename, 'wb' ), format = output_type, force_quality_encoding = force_quality_encoding ) - read_count = None - if summarize_input: - reader = fastqVerboseErrorReader - else: - reader = fastqReader - for read_count, fastq_read in enumerate( reader( open( input_filename ), format = input_type, apply_galaxy_conventions = True ) ): - if summarize_input: - aggregator.consume_read( fastq_read ) - out.write( fastq_read ) - out.close() - - if read_count is not None: - print "Groomed %i %s reads into %s reads." % ( read_count + 1, input_type, output_type ) - if input_type != output_type and 'solexa' in [ input_type, output_type ]: - print "Converted between Solexa and PHRED scores." - if summarize_input: - print "Based upon quality and sequence, the input data is valid for: %s" % ( ", ".join( aggregator.get_valid_formats() ) or "None" ) - ascii_range = aggregator.get_ascii_range() - decimal_range = aggregator.get_decimal_range() - print "Input ASCII range: %s(%i) - %s(%i)" % ( repr( ascii_range[0] ), ord( ascii_range[0] ), repr( ascii_range[1] ), ord( ascii_range[1] ) ) #print using repr, since \x00 (null) causes info truncation in galaxy when printed - print "Input decimal range: %i - %i" % ( decimal_range[0], decimal_range[1] ) - else: - print "No valid FASTQ reads were provided." - - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_groomer.xml --- a/tools/fastq/fastq_groomer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,371 +0,0 @@ - - convert between various FASTQ quality formats - fastq_groomer.py '$input_file' '$input_type' '$output_file' -#if str( $options_type['options_type_selector'] ) == 'basic': -#if str( $input_type ) == 'cssanger': -'cssanger' -#else: -'sanger' -#end if -'ascii' 'summarize_input' -#else: -'${options_type.output_type}' '${options_type.force_quality_encoding}' '${options_type.summarize_input}' -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool offers several conversions options relating to the FASTQ format. - -When using *Basic* options, the output will be *sanger* formatted or *cssanger* formatted (when the input is Color Space Sanger). 
- -When converting, if a quality score falls outside of the target score range, it will be coerced to the closest available value (i.e. the minimum or maximum). - -When converting between Solexa and the other formats, quality scores are mapped between Solexa and PHRED scales using the equations found in `Cock PJ, Fields CJ, Goto N, Heuer ML, Rice PM. The Sanger FASTQ file format for sequences with quality scores, and the Solexa/Illumina FASTQ variants. Nucleic Acids Res. 2009 Dec 16.`_ - -When converting between color space (csSanger) and base/sequence space (Sanger, Illumina, Solexa) formats, adapter bases are lost or gained; if gained, the base 'G' is used as the adapter. You cannot convert a color space read to base space if there is no adapter present in the color space sequence. Any masked or ambiguous nucleotides in base space will be converted to 'N's when determining color space encoding. - ------ - -**Quality Score Comparison** - -:: - - SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS - ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII - ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ - | | | | | | - 33 59 64 73 104 126 - - S - Sanger Phred+33, 93 values (0, 93) (0 to 60 expected in raw reads) - I - Illumina 1.3 Phred+64, 62 values (0, 62) (0 to 40 expected in raw reads) - X - Solexa Solexa+64, 67 values (-5, 62) (-5 to 40 expected in raw reads) - -Diagram adapted from http://en.wikipedia.org/wiki/FASTQ_format - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - -.. _Cock PJ, Fields CJ, Goto N, Heuer ML, Rice PM. The Sanger FASTQ file format for sequences with quality scores, and the Solexa/Illumina FASTQ variants. Nucleic Acids Res. 2009 Dec 16.: http://www.ncbi.nlm.nih.gov/pubmed/20015970 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_manipulation.py --- a/tools/fastq/fastq_manipulation.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -#Dan Blankenberg -import sys, os, shutil -import imp -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter - -def main(): - #Read command line arguments - input_filename = sys.argv[1] - script_filename = sys.argv[2] - output_filename = sys.argv[3] - additional_files_path = sys.argv[4] - input_type = sys.argv[5] or 'sanger' - - #Save script file for debuging/verification info later - os.mkdir( additional_files_path ) - shutil.copy( script_filename, os.path.join( additional_files_path, 'debug.txt' ) ) - - fastq_manipulator = imp.load_module( 'fastq_manipulator', open( script_filename ), script_filename, ( '', 'r', imp.PY_SOURCE ) ) - - out = fastqWriter( open( output_filename, 'wb' ), format = input_type ) - - i = None - reads_manipulated = 0 - for i, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - new_read = fastq_manipulator.match_and_manipulate_read( fastq_read ) - if new_read: - out.write( new_read ) - if new_read != fastq_read: - reads_manipulated += 1 - out.close() - if i is None: - print "Your file contains no valid FASTQ reads." 
- else: - print 'Manipulated %s of %s reads (%.2f%%).' % ( reads_manipulated, i + 1, float( reads_manipulated ) / float( i + 1 ) * 100.0 ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_manipulation.xml --- a/tools/fastq/fastq_manipulation.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,429 +0,0 @@ - - - reads on various attributes - fastq_manipulation.py $input_file $fastq_manipulation_file $output_file $output_file.files_path '${input_file.extension[len( 'fastq' ):]}' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - int( float( value ) ) == float( value ) - - - - int( float( value ) ) == float( value ) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##create an importable module -#import binascii -import re -import binascii -from string import maketrans -##does read match -def match_read( fastq_read ): - #for $match_block in $match_blocks: - #if $match_block['match_type']['match_type_selector'] == 'identifier': - search_target = fastq_read.identifier[1:] ##don't include @ - #elif $match_block['match_type']['match_type_selector'] == 'sequence': - search_target = fastq_read.sequence - #elif $match_block['match_type']['match_type_selector'] == 'quality': - search_target = fastq_read.quality - #else: - #continue - #end if - if not re.search( binascii.unhexlify( "${ binascii.hexlify( str( match_block['match_type']['match']['match_by'] ) ) }" ), search_target ): - return False - #end for - return True -##modify matched reads -def manipulate_read( fastq_read ): - new_read = fastq_read.clone() - #for $manipulate_block in $manipulate_blocks: - #if $manipulate_block['manipulation_type']['manipulation_type_selector'] == 'identifier': - #if $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'translate': - new_read.identifier = "@%s" % new_read.identifier[1:].translate( maketrans( binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['from'] ) ) }" ), binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['to'] ) ) }" ) ) ) - #end if - #elif $manipulate_block['manipulation_type']['manipulation_type_selector'] == 'sequence': - #if $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'translate': - new_read.sequence = new_read.sequence.translate( maketrans( binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['from'] ) ) }" ), binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['to'] ) ) }" ) ) ) - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'rev_comp': - new_read = new_read.reverse_complement() - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'rev_no_comp': - new_read = new_read.reverse() - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'no_rev_comp': - new_read = new_read.complement() - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'trim': - #if $manipulate_block['manipulation_type']['manipulation']['offset_type']['base_offset_type'] == 'offsets_percent': - left_column_offset = int( round( 
float( ${ manipulate_block['manipulation_type']['manipulation']['offset_type']['left_column_offset'] } ) / 100.0 * float( len( new_read ) ) ) ) - right_column_offset = int( round( float( ${ manipulate_block['manipulation_type']['manipulation']['offset_type']['right_column_offset'] } ) / 100.0 * float( len( new_read ) ) ) ) - #else - left_column_offset = ${ manipulate_block['manipulation_type']['manipulation']['offset_type']['left_column_offset'] } - right_column_offset = ${ manipulate_block['manipulation_type']['manipulation']['offset_type']['right_column_offset'] } - #end if - if right_column_offset > 0: - right_column_offset = -right_column_offset - else: - right_column_offset = None - new_read = new_read.slice( left_column_offset, right_column_offset ) - if not ( ${str( manipulate_block['manipulation_type']['manipulation']['keep_zero_length'] ) == 'keep_zero_length'} or len( new_read ) ): - return None - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'dna_to_rna': - new_read = new_read.sequence_as_DNA() - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'rna_to_dna': - new_read = new_read.sequence_as_RNA() - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'change_adapter': - if new_read.sequence_space == 'color': - new_read = new_read.change_adapter( binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['new_adapter'] ) ) }" ) ) - #end if - #elif $manipulate_block['manipulation_type']['manipulation_type_selector'] == 'quality': - #if $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'translate': - new_read.quality = new_read.quality.translate( maketrans( binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['from'] ) ) }" ), binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['to'] ) ) }" ) ) ) - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'map_score': - def score_method( score ): - raise Exception, "Unimplemented" ##This option is not yet available, need to abstract out e.g. column adding tool action: preventing users from using 'harmful' actions - new_read.quality_map( score_method ) - #end if - #elif $manipulate_block['manipulation_type']['manipulation_type_selector'] == 'miscellaneous': - #if $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'remove': - return None - #end if - #else: - #continue - #end if - #end for - if new_read.description != "+": - new_read.description = "+%s" % new_read.identifier[1:] ##ensure description is still valid - return new_read -def match_and_manipulate_read( fastq_read ): - new_read = fastq_read - if match_read( fastq_read ): - new_read = manipulate_read( fastq_read ) - return new_read - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool allows you to build complex manipulations to be applied to each matching read in a FASTQ file. A read must match all matching directives in order for it to be manipulated; if a read does not match, it is output in a non-modified manner. 
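A stripped-down Python 3 sketch of that match-then-manipulate flow (the real tool compiles the Cheetah template above into an importable module; the regex search and str.translate below mirror its identifier-match and string-translate directives, and the read tuple layout is illustrative only)::

    import re

    def match_and_manipulate(reads, pattern, table):
        for identifier, sequence, quality in reads:
            if re.search(pattern, identifier):
                # Matched reads are manipulated; others pass through unchanged.
                sequence = sequence.translate(table)
            yield identifier, sequence, quality

    # Double-encode color space digits as pseudo-nucleotides
    # (see the worked example below):
    table = str.maketrans("0123.", "ACGTN")
    reads = [("1539:931/1", "T0123.", "IIIIII")]
    for read in match_and_manipulate(reads, r".*", table):
        print(read)  # ('1539:931/1', 'TACGTN', 'IIIIII')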
All reads matching will have each of the specified manipulations performed upon them, in the order specified. - -Regular Expression Matches are made using re.search, see http://docs.python.org/library/re.html for more information. - All matching is performed on a single line string, regardless if e.g. the sequence or quality score spans multiple lines in the original file. - -String translations are performed using string.translate, see http://docs.python.org/library/string.html#string.translate and http://docs.python.org/library/string.html#string.maketrans for more information. - -.. class:: warningmark - -Only color space reads can have adapter bases substituted. - - ------ - -**Example** - -Suppose you have a color space sanger formatted sequence (fastqcssanger) and you want to double-encode the color space into psuedo-nucleotide space (this is different from converting) to allow these reads to be used in tools which do not natively support it (using specially designed indexes). This tool can handle this manipulation, however, this is generally not recommended as results tend to be poorer than those produced from tools which are specially designed to handle color space data. - -Steps: - -1. Click **Add new Match Reads** and leave the matching options set to the default (Matching by sequence name/identifier using the regular expression "\*."; thereby matching all reads). -2. Click **Add new Manipulate Reads**, change **Manipulate Reads on** to "Sequence Content", set **Sequence Manipulation Type** to "Change Adapter Base" and set **New Adapter** to "" (an empty text field). -3. Click **Add new Manipulate Reads**, change **Manipulate Reads on** to "Sequence Content", set **Sequence Manipulation Type** to "String Translate" and set **From** to "0123." and **To** to "ACGTN". -4. Click Execute. The new history item will contained double-encoded psuedo-nucleotide space reads. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. 
<http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_masker_by_quality.py --- a/tools/fastq/fastq_masker_by_quality.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,83 +0,0 @@ -#Dan Blankenberg -import string -from optparse import OptionParser -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter - - -def get_score_comparer( operator ): - if operator == 'gt': - return compare_gt - elif operator == 'ge': - return compare_ge - elif operator == 'eq': - return compare_eq - elif operator == 'lt': - return compare_lt - elif operator == 'le': - return compare_le - elif operator == 'ne': - return compare_ne - raise 'Invalid operator provided: %s' % operator - -def compare_gt( quality_score, threshold_value ): - return quality_score > threshold_value - -def compare_ge( quality_score, threshold_value ): - return quality_score >= threshold_value - -def compare_eq( quality_score, threshold_value ): - return quality_score == threshold_value - -def compare_ne( quality_score, threshold_value ): - return quality_score != threshold_value - -def compare_lt( quality_score, threshold_value ): - return quality_score < threshold_value - -def compare_le( quality_score, threshold_value ): - return quality_score <= threshold_value - -class BaseReplacer( object ): - def __init__( self, replace_character ): - self.replace_character = replace_character - def __call__( self, base_character ): - return self.replace_character - -def main(): - usage = "usage: %prog [options] input_file output_file" - parser = OptionParser( usage=usage ) - parser.add_option( '-f', '--format', dest='format', type='choice', default='sanger', choices=( 'sanger', 'solexa', 'illumina' ), help='FASTQ variant type' ) - parser.add_option( '-m', '--mask_character', dest='mask_character', default='N', help='Mask Character to use' ) - parser.add_option( '-c', '--score_comparison', type="choice", dest='score_comparison', default='le', choices=('gt','ge','eq','lt', 'le', 'ne' ), help='Mask base when score is' ) - parser.add_option( '-s', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score' ) - parser.add_option( "-l", "--lowercase", action="store_true", dest="lowercase", default=False, help="Use lowercase masking") - ( options, args ) = parser.parse_args() - - if len ( args ) != 2: - parser.error( "Need to specify an input file and an output file" ) - - score_comparer = get_score_comparer( options.score_comparison ) - - if options.lowercase: - base_masker = string.lower - else: - base_masker = BaseReplacer( options.mask_character ) - - out = fastqWriter( open( args[1], 'wb' ), format = options.format ) - - num_reads = None - num_reads_excluded = 0 - for num_reads, fastq_read in enumerate( fastqReader( open( args[0] ), format = options.format ) ): - sequence_list = list( fastq_read.sequence ) - for i, quality_score in enumerate( fastq_read.get_decimal_quality_scores() ): - if score_comparer( quality_score, options.quality_score ): - sequence_list[ i ] = base_masker( sequence_list[ i ] ) - fastq_read.sequence = "".join( sequence_list ) - out.write( fastq_read ) - - if num_reads is not None: - print "Processed %i %s reads." % ( num_reads + 1, options.format ) - else: - print "No valid FASTQ reads were provided." 
- -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_masker_by_quality.xml --- a/tools/fastq/fastq_masker_by_quality.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ - - by quality score - fastq_masker_by_quality.py '$input_file' '$output_file' -f '${input_file.extension[len( 'fastq' ):]}' -s '${quality_score}' -c '${score_comparison}' - #if $mask_type.value == 'lowercase' - --lowercase - #else - -m '${mask_type}' - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows masking base characters in FASTQ format files dependent upon user specified quality score value and comparison method. - -This tool is not available for use on color space (csSanger) formats. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_deinterlacer.py --- a/tools/fastq/fastq_paired_end_deinterlacer.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -#Florent Angly -import sys -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner - -def main(): - input_filename = sys.argv[1] - input_type = sys.argv[2] or 'sanger' - mate1_filename = sys.argv[3] - mate2_filename = sys.argv[4] - single1_filename = sys.argv[5] - single2_filename = sys.argv[6] - - type = input_type - input = fastqNamedReader( open( input_filename, 'rb' ), format = type ) - mate1_out = fastqWriter( open( mate1_filename, 'wb' ), format = type ) - mate2_out = fastqWriter( open( mate2_filename, 'wb' ), format = type ) - single1_out = fastqWriter( open( single1_filename, 'wb' ), format = type ) - single2_out = fastqWriter( open( single2_filename, 'wb' ), format = type ) - joiner = fastqJoiner( type ) - - i = None - skip_count = 0 - found = {} - for i, mate1 in enumerate( fastqReader( open( input_filename, 'rb' ), format = type ) ): - - if mate1.identifier in found: - del found[mate1.identifier] - continue - - mate2 = input.get( joiner.get_paired_identifier( mate1 ) ) - - if mate2: - # This is a mate pair - found[mate2.identifier] = None - if joiner.is_first_mate( mate1 ): - mate1_out.write( mate1 ) - mate2_out.write( mate2 ) - else: - mate1_out.write( mate2 ) - mate2_out.write( mate1 ) - else: - # This is a single - skip_count += 1 - if joiner.is_first_mate( mate1 ): - single1_out.write( mate1 ) - else: - single2_out.write( mate1 ) - - if i is None: - print "Your input file contained no valid FASTQ sequences." - else: - if skip_count: - print 'There were %i reads with no mate.' % skip_count - print 'De-interlaced %s pairs of sequences.' 
% ( (i - skip_count + 1)/2 ) - - input.close() - mate1_out.close() - mate2_out.close() - single1_out.close() - single2_out.close() - - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_deinterlacer.xml --- a/tools/fastq/fastq_paired_end_deinterlacer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,70 +0,0 @@ - - on paired end reads - fastq_paired_end_deinterlacer.py '$input_file' '${input_file.extension[len( 'fastq' ):]}' '$output1_pairs_file' '$output2_pairs_file' '$output1_singles_file' '$output2_singles_file' - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -De-interlaces a single fastq dataset representing paired-end run into two fastq datasets containing only the first or second mate read. Reads without mate are saved in separate output files. - -Sequence identifiers for paired-end reads must follow the /1 and /2 convention. - ------ - -**Input** - -A multiple-fastq file containing paired-end reads, for example:: - - @1539:931/1 - ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG - +1539:931/1 - BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - @1539:931/2 - CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT - +1539:931/2 - WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - ------ - -**Output** - -Multi-fastq file with left-hand mate only:: - - @1539:931/1 - ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG - +1539:931/1 - BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - -Multi-fastq file with right-hand mate only:: - - @1539:931/2 - CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT - +1539:931/2 - WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_interlacer.py --- a/tools/fastq/fastq_paired_end_interlacer.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -#Florent Angly -import sys -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner - -def main(): - mate1_filename = sys.argv[1] - mate1_type = sys.argv[2] or 'sanger' - mate2_filename = sys.argv[3] - mate2_type = sys.argv[4] or 'sanger' - outfile_pairs = sys.argv[5] - outfile_singles = sys.argv[6] - - if mate1_type != mate2_type: - print "WARNING: You are trying to interlace files of two different types: %s and %s." % ( mate1_type, mate2_type ) - return - - type = mate1_type - joiner = fastqJoiner( type ) - out_pairs = fastqWriter( open( outfile_pairs, 'wb' ), format = type ) - out_singles = fastqWriter( open( outfile_singles, 'wb' ), format = type ) - - # Pairs + singles present in mate1 - nof_singles = 0 - nof_pairs = 0 - mate2_input = fastqNamedReader( open( mate2_filename, 'rb' ), format = type ) - i = None - for i, mate1 in enumerate( fastqReader( open( mate1_filename, 'rb' ), format = type ) ): - mate2 = mate2_input.get( joiner.get_paired_identifier( mate1 ) ) - if mate2: - out_pairs.write( mate1 ) - out_pairs.write( mate2 ) - nof_pairs += 1 - else: - out_singles.write( mate1 ) - nof_singles += 1 - - # Singles present in mate2 - mate1_input = fastqNamedReader( open( mate1_filename, 'rb' ), format = type ) - j = None - for j, mate2 in enumerate( fastqReader( open( mate2_filename, 'rb' ), format = type ) ): - mate1 = mate1_input.get( joiner.get_paired_identifier( mate2 ) ) - if not mate1: - out_singles.write( mate2 ) - nof_singles += 1 - - if (i is None) and (j is None): - print "Your input files contained no valid FASTQ sequences." 
- else: - print 'There were %s single reads.' % ( nof_singles ) - print 'Interlaced %s pairs of sequences.' % ( nof_pairs ) - - mate1_input.close() - mate2_input.close() - out_pairs.close() - out_singles.close() - - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_interlacer.xml --- a/tools/fastq/fastq_paired_end_interlacer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - on paired end reads - fastq_paired_end_interlacer.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$outfile_pairs' '$outfile_singles' - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool joins paired end FASTQ reads from two separate files, one with the left mates and one with the right mates, into a single files where left mates alternate with their right mates. The join is performed using sequence identifiers, allowing the two files to contain differing ordering. If a sequence identifier does not appear in both files, it is included in a separate file. - -Sequence identifiers with /1 and /2 appended override the left-hand and right-hand designation; i.e. if the reads end with /1 and /2, the read containing /1 will be used as the left-hand read and the read containing /2 will be used as the right-hand read. Sequences without this designation will follow the left-hand and right-hand settings set by the user. - ------ - -**Input** - -Left-hand mates, for example:: - - @1539:931/1 - ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG - +1539:931/1 - BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - -Right-hand mates, for example:: - - @1539:931/2 - CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT - +1539:931/2 - WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - ------ - -**Output** - -A multiple-fastq file containing interlaced left and right paired reads:: - - @1539:931/1 - ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG - +1539:931/1 - BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - @1539:931/2 - CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT - +1539:931/2 - WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - -A multiple-fastq file containing reads that have no mate is also produced. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_joiner.py --- a/tools/fastq/fastq_paired_end_joiner.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -#Dan Blankenberg -import sys, os, shutil -from galaxy_utils.sequence.fastq import fastqReader, fastqNamedReader, fastqWriter, fastqJoiner - -def main(): - #Read command line arguments - input1_filename = sys.argv[1] - input1_type = sys.argv[2] or 'sanger' - input2_filename = sys.argv[3] - input2_type = sys.argv[4] or 'sanger' - output_filename = sys.argv[5] - - if input1_type != input2_type: - print "WARNING: You are trying to join files of two different types: %s and %s." 
% ( input1_type, input2_type ) - - input2 = fastqNamedReader( open( input2_filename, 'rb' ), input2_type ) - joiner = fastqJoiner( input1_type ) - out = fastqWriter( open( output_filename, 'wb' ), format = input1_type ) - - i = None - skip_count = 0 - for i, fastq_read in enumerate( fastqReader( open( input1_filename, 'rb' ), format = input1_type ) ): - identifier = joiner.get_paired_identifier( fastq_read ) - fastq_paired = input2.get( identifier ) - if fastq_paired is None: - skip_count += 1 - else: - out.write( joiner.join( fastq_read, fastq_paired ) ) - out.close() - - if i is None: - print "Your file contains no valid FASTQ reads." - else: - print input2.has_data() - print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_joiner.xml --- a/tools/fastq/fastq_paired_end_joiner.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ - - on paired end reads - fastq_paired_end_joiner.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$output_file' - - - - - - - - - - - - - - - -**What it does** - -This tool joins paired end FASTQ reads from two separate files into a single read in one file. The join is performed using sequence identifiers, allowing the two files to contain differing ordering. If a sequence identifier does not appear in both files, it is excluded from the output. - -Sequence identifiers with /1 and /2 appended override the left-hand and right-hand designation; i.e. if the reads end with /1 and /2, the read containing /1 will be used as the left-hand read and the read containing /2 will be used as the right-hand read. Sequences without this designation will follow the left-hand and right-hand settings set by the user. - ------ - -**Input formats** - -Left-hand Read:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 - GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC - +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 - hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh - -Right-hand Read:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 - GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA - +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 - hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR - ------ - -**Output** - -A multiple-fastq file, for example:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758 - GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA - +HWI-EAS91_1_30788AAXX:7:21:1542:1758 - hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. 
<http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_splitter.py --- a/tools/fastq/fastq_paired_end_splitter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -#Dan Blankenberg -import sys, os, shutil -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqSplitter - -def main(): - #Read command line arguments - input_filename = sys.argv[1] - input_type = sys.argv[2] or 'sanger' - output1_filename = sys.argv[3] - output2_filename = sys.argv[4] - - splitter = fastqSplitter() - out1 = fastqWriter( open( output1_filename, 'wb' ), format = input_type ) - out2 = fastqWriter( open( output2_filename, 'wb' ), format = input_type ) - - i = None - skip_count = 0 - for i, fastq_read in enumerate( fastqReader( open( input_filename, 'rb' ), format = input_type ) ): - read1, read2 = splitter.split( fastq_read ) - if read1 and read2: - out1.write( read1 ) - out2.write( read2 ) - else: - skip_count += 1 - out1.close() - out2.close() - if i is None: - print "Your file contains no valid FASTQ reads." - else: - print 'Split %s of %s reads (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_splitter.xml --- a/tools/fastq/fastq_paired_end_splitter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ - - on joined paired end reads - fastq_paired_end_splitter.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$output1_file' '$output2_file' - - - - - - - - - - - - - - - -**What it does** - -Splits a single fastq dataset representing paired-end run into two datasets (one for each end). This tool works only for datasets where both ends have **the same** length. - -Sequence identifiers will have /1 or /2 appended for the split left-hand and right-hand reads, respectively. - ------ - -**Input format** - -A multiple-fastq file, for example:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758 - GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA - +HWI-EAS91_1_30788AAXX:7:21:1542:1758 - hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR - - ------ - -**Outputs** - -Left-hand Read:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 - GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC - +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 - hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh - -Right-hand Read:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 - GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA - +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 - hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_stats.py --- a/tools/fastq/fastq_stats.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ -#Dan Blankenberg -import sys -from galaxy_utils.sequence.fastq import fastqReader, fastqAggregator - -VALID_NUCLEOTIDES = [ 'A', 'C', 'G', 'T', 'N' ] -VALID_COLOR_SPACE = map( str, range( 7 ) ) + [ '.' 
] -SUMMARY_STAT_ORDER = ['read_count', 'min_score', 'max_score', 'sum_score', 'mean_score', 'q1', 'med_score', 'q3', 'iqr', 'left_whisker', 'right_whisker' ] - -def main(): - input_filename = sys.argv[1] - output_filename = sys.argv[2] - input_type = sys.argv[3] or 'sanger' - - aggregator = fastqAggregator() - num_reads = None - fastq_read = None - for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - aggregator.consume_read( fastq_read ) - out = open( output_filename, 'wb' ) - valid_nucleotides = VALID_NUCLEOTIDES - if fastq_read: - if fastq_read.sequence_space == 'base': - out.write( '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\tA_Count\tC_Count\tG_Count\tT_Count\tN_Count\tother_bases\tother_base_count\n' ) - else: - out.write( '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\t0_Count\t1_Count\t2_Count\t3_Count\t4_Count\t5_Count\t6_Count\t._Count\tother_bases\tother_base_count\n' ) - valid_nucleotides = VALID_COLOR_SPACE - for i in range( aggregator.get_max_read_length() ): - column_stats = aggregator.get_summary_statistics_for_column( i ) - out.write( '%i\t' % ( i + 1 ) ) - out.write( '%s\t' * len( SUMMARY_STAT_ORDER ) % tuple( [ column_stats[ key ] for key in SUMMARY_STAT_ORDER ] ) ) - out.write( '%s\t' % ','.join( map( str, column_stats['outliers'] ) ) ) - base_counts = aggregator.get_base_counts_for_column( i ) - for nuc in valid_nucleotides: - out.write( "%s\t" % base_counts.get( nuc, 0 ) ) - extra_nucs = sorted( [ nuc for nuc in base_counts.keys() if nuc not in valid_nucleotides ] ) - out.write( "%s\t%s\n" % ( ','.join( extra_nucs ), ','.join( str( base_counts[nuc] ) for nuc in extra_nucs ) ) ) - out.close() - if num_reads is None: - print "No valid fastq reads could be processed." - else: - print "%i fastq reads were processed." % ( num_reads + 1 ) - print "Based upon quality values and sequence characters, the input data is valid for: %s" % ( ", ".join( aggregator.get_valid_formats() ) or "None" ) - ascii_range = aggregator.get_ascii_range() - decimal_range = aggregator.get_decimal_range() - print "Input ASCII range: %s(%i) - %s(%i)" % ( repr( ascii_range[0] ), ord( ascii_range[0] ), repr( ascii_range[1] ), ord( ascii_range[1] ) ) #print using repr, since \x00 (null) causes info truncation in galaxy when printed - print "Input decimal range: %i - %i" % ( decimal_range[0], decimal_range[1] ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_stats.xml --- a/tools/fastq/fastq_stats.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ - - by column - fastq_stats.py '$input_file' '$output_file' '${input_file.extension[len( 'fastq' ):]}' - - - - - - - - - - - - - -This tool creates summary statistics on a FASTQ file. - -.. class:: infomark - -**TIP:** This statistics report can be used as input for the **Boxplot** and **Nucleotides Distribution** tools. - ------ - -**The output file will contain the following fields:** - -* column = column number (1 to 36 for a 36-cycles read Solexa file) -* count = number of bases found in this column. -* min = Lowest quality score value found in this column. -* max = Highest quality score value found in this column. -* sum = Sum of quality score values for this column. -* mean = Mean quality score value for this column. -* Q1 = 1st quartile quality score. -* med = Median quality score. -* Q3 = 3rd quartile quality score. -* IQR = Inter-Quartile range (Q3-Q1). 
-* lW = 'Left-Whisker' value (for boxplotting). -* rW = 'Right-Whisker' value (for boxplotting). -* outliers = Scores falling beyond the left and right whiskers (comma separated list). -* A_Count = Count of 'A' nucleotides found in this column. -* C_Count = Count of 'C' nucleotides found in this column. -* G_Count = Count of 'G' nucleotides found in this column. -* T_Count = Count of 'T' nucleotides found in this column. -* N_Count = Count of 'N' nucleotides found in this column. -* Other_Nucs = Comma separated list of other nucleotides found in this column. -* Other_Count = Comma separated count of other nucleotides found in this column. - -For example:: - - #column count min max sum mean Q1 med Q3 IQR lW rW outliers A_Count C_Count G_Count T_Count N_Count other_bases other_base_count - 1 14336356 2 33 450600675 31.4306281875 32.0 33.0 33.0 1.0 31 33 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 4482314 2199633 4425957 3208745 19707 - 2 14336356 2 34 441135033 30.7703737965 30.0 33.0 33.0 3.0 26 34 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25 4419184 2170537 4627987 3118567 81 - 3 14336356 2 34 433659182 30.2489127642 29.0 32.0 33.0 4.0 23 34 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 4310988 2941988 3437467 3645784 129 - 4 14336356 2 34 433635331 30.2472490917 29.0 32.0 33.0 4.0 23 34 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 4110637 3007028 3671749 3546839 103 - 5 14336356 2 34 432498583 30.167957813 29.0 32.0 33.0 4.0 23 34 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 4348275 2935903 3293025 3759029 124 - ------ - -.. class:: warningmark - -Adapter bases in color space reads are excluded from statistics. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_to_fasta.py --- a/tools/fastq/fastq_to_fasta.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ -#Dan Blankenberg -import sys -from galaxy_utils.sequence.fastq import fastqReader -from galaxy_utils.sequence.fasta import fastaWriter - -def main(): - input_filename = sys.argv[1] - output_filename = sys.argv[2] - input_type = sys.argv[3] or 'sanger' #input type should ordinarily be unnecessary - - num_reads = None - fastq_read = None - out = fastaWriter( open( output_filename, 'wb' ) ) - for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - out.write( fastq_read ) - out.close() - if num_reads is None: - print "No valid FASTQ reads could be processed." - else: - print "%i FASTQ reads were converted to FASTA." % ( num_reads + 1 ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_to_fasta.xml --- a/tools/fastq/fastq_to_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ - - converter - fastq_to_fasta.py '$input_file' '$output_file' '${input_file.extension[len( 'fastq' ):]}' - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts FASTQ sequencing reads to FASTA sequences. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. 
Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_to_tabular.py --- a/tools/fastq/fastq_to_tabular.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -#Dan Blankenberg -import sys -from galaxy_utils.sequence.fastq import fastqReader - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - if len(sys.argv) != 5: - stop_err("Wrong number of arguments. Expect: fasta tabular desrc_split [type]") - input_filename = sys.argv[1] - output_filename = sys.argv[2] - descr_split = int( sys.argv[3] ) - 1 - if descr_split < 0: - stop_err("Bad description split value (should be 1 or more)") - input_type = sys.argv[4] or 'sanger' #input type should ordinarily be unnecessary - - num_reads = None - fastq_read = None - out = open( output_filename, 'wb' ) - if descr_split == 0: - #Don't divide the description into multiple columns - for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - out.write( "%s\t%s\t%s\n" % ( fastq_read.identifier[1:].replace( '\t', ' ' ), fastq_read.sequence.replace( '\t', ' ' ), fastq_read.quality.replace( '\t', ' ' ) ) ) - else: - for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - words = fastq_read.identifier[1:].replace( '\t', ' ' ).split(None, descr_split) - #pad with empty columns if required - words += [""]*(descr_split-len(words)) - out.write( "%s\t%s\t%s\n" % ("\t".join(words), fastq_read.sequence.replace( '\t', ' ' ), fastq_read.quality.replace( '\t', ' ' ) ) ) - out.close() - if num_reads is None: - print "No valid FASTQ reads could be processed." - else: - print "%i FASTQ reads were converted to Tabular." % ( num_reads + 1 ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_to_tabular.xml --- a/tools/fastq/fastq_to_tabular.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ - - converter - fastq_to_tabular.py '$input_file' '$output_file' $descr_columns '${input_file.extension[len( 'fastq' ):]}' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts FASTQ sequencing reads to a Tabular file. - -It is conventional to take the first word of the FASTQ "@" title line as the identifier, and any remaining text to be a free form description. -It is therefore often useful to split this text into two columns in Galaxy (identifier and any description) by setting **How many columns to divide title string into?** to **2**. -In some cases the description can be usefully broken up into more columns -- see the examples . - -Tab characters, if present in the source FASTQ title, will be converted to spaces. 
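The column-splitting rule in fastq_to_tabular.py above boils down to the following sketch (the title string is borrowed from the example below)::

    title = "FSRRS4401BE7HA [length=395] [gc=36.46]"
    n_cols = 2                                          # "How many columns to divide title string into?"
    words = title.replace( '\t', ' ' ).split( None, n_cols - 1 )
    words += [ "" ] * ( n_cols - len( words ) )         # pad with empty columns if required
    print "\t".join( words )                            # -> FSRRS4401BE7HA<tab>[length=395] [gc=36.46]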
- ------ - -**Example** - -Consider the following two 454 reads in Sanger FASTQ format (using line wrapping for display, but do note not all tools will accept line wrapped FASTQ files):: - - @FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95] - tcagTTAAGATGGGATAATATCCTCAGATTGCGTGATGAACTTTGTTCTGGTGGAGGAGAAGGAAGTGCATTCGACGTAT - GCCCGTTTGTCGATATTTGtatttaaagtaatccgtcacaaatcagtgacataaatattatttagatttcgggagcaact - ttatttattccacaagcaggtttaaattttaaatttaaattattgcagaagactttaaattaacctcgttgtcggagtca - tttgttcggttattggtcgaaagtaaccncgggaagtgccgaaaactaacaaacaaaagaagatagtgaaattttaatta - aaanaaatagccaaacgtaactaactaaaacggacccgtcgaggaactgccaacggacgacacagggagtagnnn - +FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95] - FFFDDDDDDDA666?688FFHGGIIIIIIIIIIIIIIIIIIHHHIIIIIIIIIGHGFFFFF====DFFFFFFFFFFFFFF - D???:3104/76=:5...4.3,,,366////4<ABBAAA=CCFDDDDDDDD:666CDFFFF=<ABA=;:333111<===9 - 9;B889FFFFFFDDBDBDDD=8844231..,,,-,,,,,,,,1133..---17111,,,,,22555131121.--.,333 - 11,.,,3--,,.,,--,3511123..--!,,,,--,----9,,,,8=,,-,,,-,,,,---26:9:5-..1,,,,11//, - ,,,!,,1917--,,,,-3.,--,,17,,,,---+11113.030000,,,044400036;96662.//;7><;!!! - @FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74] - tcagCCAGCAATTCCGACTTAATTGTTCTTCTTCCATCATTCATCTCGACTAACAGTTCTACGATTAATGAGTTTGGCtt - taatttgttgttcattattgtcacaattacactactgagactgccaaggcacncagggataggnn - +FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74] - FFFFFFFFFDDDDFFFFGFDDDDBAAAAA=<4444@@B=555:BBBBB@@?8:8<?<89898<84442;==3,,,514,, - ,11,,,.,,21777555513,..--1115758.//34488><<;;;;9944/!/4,,,57855!! - -By default this is converted into a 3 column tabular file, with the full FASTQ title used as column 1: - -=================================================================================================== ============== ============== -FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95] tcagTTAA...nnn FFFDDDDD...!!! -FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74] tcagCCAG...gnn FFFFFFFF...5!! -=================================================================================================== ============== ============== - -If you specified the title should be turned into 2 columns, you'd get 4 columns in total: - -============== ==================================================================================== ============== ============== -FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95] tcagTTAA...nnn FFFDDDDD...!!! -FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74] tcagCCAG...gnn FFFFFFFF...5!! -============== ==================================================================================== ============== ============== - -Similarly, for this example treating the title string as 7 columns makes sense: - -============== ============ ========== =========== ============= ============== =================== ============== ============== -FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95] tcagTTAA...nnn FFFDDDDD...!!! -FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74] tcagCCAG...gnn FFFFFFFF...5!! 
-============== ============ ========== =========== ============= ============== =================== ============== ============== - -Note the sequences and quality strings have been truncated for display purposes in the above tables. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_trimmer.py --- a/tools/fastq/fastq_trimmer.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -#Dan Blankenberg -import sys -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter - -def main(): - input_filename = sys.argv[1] - output_filename = sys.argv[2] - left_offset = sys.argv[3] - right_offset = sys.argv[4] - percent_offsets = sys.argv[5] == 'offsets_percent' - input_type = sys.argv[6] or 'sanger' - keep_zero_length = sys.argv[7] == 'keep_zero_length' - - out = fastqWriter( open( output_filename, 'wb' ), format = input_type ) - num_reads_excluded = 0 - num_reads = None - for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - if percent_offsets: - left_column_offset = int( round( float( left_offset ) / 100.0 * float( len( fastq_read ) ) ) ) - right_column_offset = int( round( float( right_offset ) / 100.0 * float( len( fastq_read ) ) ) ) - else: - left_column_offset = int( left_offset ) - right_column_offset = int( right_offset ) - if right_column_offset > 0: - right_column_offset = -right_column_offset - else: - right_column_offset = None - fastq_read = fastq_read.slice( left_column_offset, right_column_offset ) - if keep_zero_length or len( fastq_read ): - out.write( fastq_read ) - else: - num_reads_excluded += 1 - out.close() - if num_reads is None: - print "No valid fastq reads could be processed." - else: - print "%i fastq reads were processed." % ( num_reads + 1 ) - if num_reads_excluded: - print "%i reads of zero length were excluded from the output." % num_reads_excluded - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_trimmer.xml --- a/tools/fastq/fastq_trimmer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,120 +0,0 @@ - - by column - fastq_trimmer.py '$input_file' '$output_file' '${offset_type['left_column_offset']}' '${offset_type['right_column_offset']}' '${offset_type['base_offset_type']}' '${input_file.extension[len( 'fastq' ):]}' '$keep_zero_length' - - - - - - - - - - - int( float( value ) ) == float( value ) - - - - int( float( value ) ) == float( value ) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool allows you to trim the ends of reads. - -You can specify either absolute or percent-based offsets. Offsets are calculated, starting at 0, from the respective end to be trimmed. When using the percent-based method, offsets are rounded to the nearest integer. 
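A minimal sketch of the percent-based arithmetic, using the 36-base read from the example below::

    sequence = "CAATATGTNCTCACTGATAAGTGGATATNAGCNCCA"
    left_pct, right_pct = 6.0, 20.0
    left = int( round( left_pct / 100.0 * len( sequence ) ) )     # -> 2
    right = int( round( right_pct / 100.0 * len( sequence ) ) )   # -> 7
    print sequence[ left : -right or None ]                       # -> ATATGTNCTCACTGATAAGTGGATATN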
- -For example, if you have a read of length 36:: - - @Some FASTQ Sanger Read - CAATATGTNCTCACTGATAAGTGGATATNAGCNCCA - + - =@@.@;B-%?8>CBA@>7@7BBCA4-48%<;;%<B@ - -And you set absolute offsets of 2 and 9:: - - @Some FASTQ Sanger Read - ATATGTNCTCACTGATAAGTGGATA - + - @.@;B-%?8>CBA@>7@7BBCA4-4 - -Or you set percent offsets of 6% and 20% (corresponds to absolute offsets of 2,7 for a read length of 36):: - - @Some FASTQ Sanger Read - ATATGTNCTCACTGATAAGTGGATATN - + - @.@;B-%?8>CBA@>7@7BBCA4-48% - ------ - -.. class:: warningmark - -Trimming a color space read will cause any adapter base to be lost. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_trimmer_by_quality.py --- a/tools/fastq/fastq_trimmer_by_quality.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,126 +0,0 @@ -#Dan Blankenberg -from optparse import OptionParser -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter - -def mean( score_list ): - return float( sum( score_list ) ) / float( len( score_list ) ) - -ACTION_METHODS = { 'min':min, 'max':max, 'sum':sum, 'mean':mean } - -def compare( aggregated_value, operator, threshold_value ): - if operator == '>': - return aggregated_value > threshold_value - elif operator == '>=': - return aggregated_value >= threshold_value - elif operator == '==': - return aggregated_value == threshold_value - elif operator == '<': - return aggregated_value < threshold_value - elif operator == '<=': - return aggregated_value <= threshold_value - elif operator == '!=': - return aggregated_value != threshold_value - -def exclude( value_list, exclude_indexes ): - rval = [] - for i, val in enumerate( value_list ): - if i not in exclude_indexes: - rval.append( val ) - return rval - -def exclude_and_compare( aggregate_action, aggregate_list, operator, threshold_value, exclude_indexes = None ): - if not aggregate_list or compare( aggregate_action( aggregate_list ), operator, threshold_value ): - return True - if exclude_indexes: - for exclude_index in exclude_indexes: - excluded_list = exclude( aggregate_list, exclude_index ) - if not excluded_list or compare( aggregate_action( excluded_list ), operator, threshold_value ): - return True - return False - -def main(): - usage = "usage: %prog [options] input_file output_file" - parser = OptionParser( usage=usage ) - parser.add_option( '-f', '--format', dest='format', type='choice', default='sanger', choices=( 'sanger', 'cssanger', 'solexa', 'illumina' ), help='FASTQ variant type' ) - parser.add_option( '-s', '--window_size', type="int", dest='window_size', default='1', help='Window size' ) - parser.add_option( '-t', '--window_step', type="int", dest='window_step', default='1', help='Window step' ) - parser.add_option( '-e', '--trim_ends', type="choice", dest='trim_ends', default='53', choices=('5','3','53','35' ), help='Ends to Trim' ) - parser.add_option( '-a', '--aggregation_action', type="choice", dest='aggregation_action', default='min', choices=('min','max','sum','mean' ), help='Aggregate action for window' ) - parser.add_option( '-x', '--exclude_count', type="int", dest='exclude_count', default='0', help='Maximum number of bases to exclude from the window during aggregation' ) - parser.add_option( '-c', 
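- # '-c' selects the operator compared between the window's aggregate score and the '-q' cut-off; trimming advances until the comparison holds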
'--score_comparison', type="choice", dest='score_comparison', default='>=', choices=('>','>=','==','<', '<=', '!=' ), help='Keep read when aggregate score is' ) - parser.add_option( '-q', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score' ) - parser.add_option( "-k", "--keep_zero_length", action="store_true", dest="keep_zero_length", default=False, help="Keep reads with zero length") - ( options, args ) = parser.parse_args() - - if len ( args ) != 2: - parser.error( "Need to specify an input file and an output file" ) - - if options.window_size < 1: - parser.error( 'You must specify a strictly positive window size' ) - - if options.window_step < 1: - parser.error( 'You must specify a strictly positive step size' ) - - #determine an exhaustive list of window indexes that can be excluded from aggregation - exclude_window_indexes = [] - last_exclude_indexes = [] - for exclude_count in range( min( options.exclude_count, options.window_size ) ): - if last_exclude_indexes: - new_exclude_indexes = [] - for exclude_list in last_exclude_indexes: - for window_index in range( options.window_size ): - if window_index not in exclude_list: - new_exclude = sorted( exclude_list + [ window_index ] ) - if new_exclude not in exclude_window_indexes + new_exclude_indexes: - new_exclude_indexes.append( new_exclude ) - exclude_window_indexes += new_exclude_indexes - last_exclude_indexes = new_exclude_indexes - else: - for window_index in range( options.window_size ): - last_exclude_indexes.append( [ window_index ] ) - exclude_window_indexes = list( last_exclude_indexes ) - - out = fastqWriter( open( args[1], 'wb' ), format = options.format ) - action = ACTION_METHODS[ options.aggregation_action ] - - num_reads = None - num_reads_excluded = 0 - for num_reads, fastq_read in enumerate( fastqReader( open( args[0] ), format = options.format ) ): - for trim_end in options.trim_ends: - quality_list = fastq_read.get_decimal_quality_scores() - if trim_end == '5': - lwindow_position = 0 #left position of window - while True: - if lwindow_position >= len( quality_list ): - fastq_read.sequence = '' - fastq_read.quality = '' - break - if exclude_and_compare( action, quality_list[ lwindow_position:lwindow_position + options.window_size ], options.score_comparison, options.quality_score, exclude_window_indexes ): - fastq_read = fastq_read.slice( lwindow_position, None ) - break - lwindow_position += options.window_step - else: - rwindow_position = len( quality_list ) #right position of window - while True: - lwindow_position = rwindow_position - options.window_size #left position of window - if rwindow_position <= 0 or lwindow_position < 0: - fastq_read.sequence = '' - fastq_read.quality = '' - break - if exclude_and_compare( action, quality_list[ lwindow_position:rwindow_position ], options.score_comparison, options.quality_score, exclude_window_indexes ): - fastq_read = fastq_read.slice( None, rwindow_position ) - break - rwindow_position -= options.window_step - if options.keep_zero_length or len( fastq_read ): - out.write( fastq_read ) - else: - num_reads_excluded += 1 - out.close() - if num_reads is None: - print "No valid FASTQ reads could be processed." - else: - print "%i FASTQ reads were processed." % ( num_reads + 1 ) - if num_reads_excluded: - print "%i reads of zero length were excluded from the output." 
% num_reads_excluded - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_trimmer_by_quality.xml --- a/tools/fastq/fastq_trimmer_by_quality.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,145 +0,0 @@ - - by sliding window - fastq_trimmer_by_quality.py '$input_file' '$output_file' -f '${input_file.extension[len( 'fastq' ):]}' -s '$window_size' - -t '$step_size' -e '$trim_ends' -a '$aggregation_action' -x '$exclude_count' -c '$score_comparison' -q '$quality_score' - #if $keep_zero_length.value: - -k - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool allows you to trim the ends of reads based upon the aggregate value of quality scores found within a sliding window; a sliding window of size 1 is equivalent to 'simple' trimming of the ends. - -The user specifies the aggregating action (min, max, sum, mean) to perform on the quality score values found within the sliding window to be used with the user defined comparison operation and comparison value. - -The user can provide a maximum count of bases that can be excluded from the aggregation within the window. When set, this tool will first check the aggregation of the entire window, then after removing 1 value, then after removing 2 values, up to the number declared. Setting this value to be equal to or greater than the window size will cause no trimming to occur. - ------ - -.. class:: warningmark - -Trimming a color space read will cause any adapter base to be lost. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/tabular_to_fastq.py --- a/tools/fastq/tabular_to_fastq.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ -#Dan Blankenberg -import sys - -def main(): - input_filename = sys.argv[1] - output_filename = sys.argv[2] - identifier_col = int( sys.argv[3] ) - 1 - sequence_col = int( sys.argv[4] ) - 1 - quality_col = int( sys.argv[5] ) - 1 - - max_col = max( identifier_col, sequence_col, quality_col ) - num_reads = None - fastq_read = None - skipped_lines = 0 - out = open( output_filename, 'wb' ) - for num_reads, line in enumerate( open( input_filename ) ): - fields = line.rstrip( '\n\r' ).split( '\t' ) - if len( fields ) > max_col: - out.write( "@%s\n%s\n+\n%s\n" % ( fields[identifier_col], fields[sequence_col], fields[quality_col] ) ) - else: - skipped_lines += 1 - - out.close() - if num_reads is None: - print "Input was empty." - else: - print "%i tabular lines were written as FASTQ reads. Be sure to use the FASTQ Groomer tool on this output before further analysis." 
% ( num_reads + 1 - skipped_lines ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/tabular_to_fastq.xml --- a/tools/fastq/tabular_to_fastq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ - - converter - tabular_to_fastq.py '$input_file' '$output_file' '$identifier' '$sequence' '$quality' - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool attempts to convert a tabular file containing sequencing read data to a FASTQ formatted file. The FASTQ Groomer tool should always be used on the output of this tool. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fasta_clipping_histogram.xml --- a/tools/fastx_toolkit/fasta_clipping_histogram.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ - - chart - fastx_toolkit - fasta_clipping_histogram.pl $input $outfile - - - - - - - - - - -**What it does** - -This tool creates a histogram image of sequence lengths distribution in a given fasta dataset file. - -**TIP:** Use this tool after clipping your library (with **FASTX Clipper tool**), to visualize the clipping results. - ------ - -**Output Examples** - -In the following library, most sequences are 24-mers to 27-mers. -This could indicate an abundance of endo-siRNAs (depending of course of what you've tried to sequence in the first place). - -.. image:: ./static/fastx_icons/fasta_clipping_histogram_1.png - - -In the following library, most sequences are 19,22 or 23-mers. -This could indicate an abundance of miRNAs (depending of course of what you've tried to sequence in the first place). - -.. image:: ./static/fastx_icons/fasta_clipping_histogram_2.png - - ------ - - -**Input Formats** - -This tool accepts short-reads FASTA files. The reads don't have to be short, but they do have to be on a single line, like so:: - - >sequence1 - AGTAGTAGGTGATGTAGAGAGAGAGAGAGTAG - >sequence2 - GTGTGTGTGGGAAGTTGACACAGTA - >sequence3 - CCTTGAGATTAACGCTAATCAAGTAAAC - - -If the sequences span over multiple lines:: - - >sequence1 - CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAG - TCTTCGGTCATAACACAAACCCAGACCTACGTATATGACAAAGCTAATAG - aactggtctttacctTTAAGTTG - -Use the **FASTA Width Formatter** tool to re-format the FASTA into a single-lined sequences:: - - >sequence1 - CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAGTCTTCGGTCATAACACAAACCCAGACCTACGTATATGACAAAGCTAATAGaactggtctttacctTTAAGTTG - - ------ - - - -**Multiplicity counts (a.k.a reads-count)** - -If the sequence identifier (the text after the '>') contains a dash and a number, it is treated as a multiplicity count value (i.e. how many times that individual sequence repeated in the original FASTA file, before collapsing). - -Example 1 - The following FASTA file *does not* have multiplicity counts:: - - >seq1 - GGATCC - >seq2 - GGTCATGGGTTTAAA - >seq3 - GGGATATATCCCCACACACACACAC - -Each sequence is counts as one, to produce the following chart: - -.. 
image:: ./static/fastx_icons/fasta_clipping_histogram_3.png - - -Example 2 - The following FASTA file have multiplicity counts:: - - >seq1-2 - GGATCC - >seq2-10 - GGTCATGGGTTTAAA - >seq3-3 - GGGATATATCCCCACACACACACAC - -The first sequence counts as 2, the second as 10, the third as 3, to produce the following chart: - -.. image:: ./static/fastx_icons/fasta_clipping_histogram_4.png - -Use the **FASTA Collapser** tool to create FASTA files with multiplicity counts. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fasta_formatter.xml --- a/tools/fastx_toolkit/fasta_formatter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ - - formatter - fastx_toolkit - - zcat -f '$input' | fasta_formatter -w $width -o $output - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool re-formats a FASTA file, changing the width of the nucleotides lines. - -**TIP:** Outputting a single line (with **width = 0**) can be useful for scripting (with **grep**, **awk**, and **perl**). Every odd line is a sequence identifier, and every even line is a nucleotides line. - --------- - -**Example** - -Input FASTA file (each nucleotides line is 50 characters long):: - - >Scaffold3648 - AGGAATGATGACTACAATGATCAACTTAACCTATCTATTTAATTTAGTTC - CCTAATGTCAGGGACCTACCTGTTTTTGTTATGTTTGGGTTTTGTTGTTG - TTGTTTTTTTAATCTGAAGGTATTGTGCATTATATGACCTGTAATACACA - ATTAAAGTCAATTTTAATGAACATGTAGTAAAAACT - >Scaffold9299 - CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAG - TCTTCGGTCATAACACAAACCCAGACCTACGTATATGACAAAGCTAATAG - aactggtctttacctTTAAGTTG - - -Output FASTA file (with width=80):: - - >Scaffold3648 - AGGAATGATGACTACAATGATCAACTTAACCTATCTATTTAATTTAGTTCCCTAATGTCAGGGACCTACCTGTTTTTGTT - ATGTTTGGGTTTTGTTGTTGTTGTTTTTTTAATCTGAAGGTATTGTGCATTATATGACCTGTAATACACAATTAAAGTCA - ATTTTAATGAACATGTAGTAAAAACT - >Scaffold9299 - CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAGTCTTCGGTCATAACACAAACCCAGACCTAC - GTATATGACAAAGCTAATAGaactggtctttacctTTAAGTTG - -Output FASTA file (with width=0 => single line):: - - >Scaffold3648 - AGGAATGATGACTACAATGATCAACTTAACCTATCTATTTAATTTAGTTCCCTAATGTCAGGGACCTACCTGTTTTTGTTATGTTTGGGTTTTGTTGTTGTTGTTTTTTTAATCTGAAGGTATTGTGCATTATATGACCTGTAATACACAATTAAAGTCAATTTTAATGAACATGTAGTAAAAACT - >Scaffold9299 - CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAGTCTTCGGTCATAACACAAACCCAGACCTACGTATATGACAAAGCTAATAGaactggtctttacctTTAAGTTG - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fasta_nucleotide_changer.xml --- a/tools/fastx_toolkit/fasta_nucleotide_changer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,66 +0,0 @@ - - converter - fastx_toolkit - zcat -f '$input' | fasta_nucleotide_changer $mode -v -o $output - - - - - - - - - - - - - - - - -**What it does** - -This tool converts RNA FASTA files to DNA (and vice-versa). - -In **RNA-to-DNA** mode, U's are changed into T's. - -In **DNA-to-RNA** mode, T's are changed into U's. 
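The conversion itself is a plain character substitution; as a minimal sketch, using the let-7 sequence from the example below::

    rna = "UGAGGUAGUAGGUUGUAUAGUU"
    print rna.replace( 'U', 'T' ).replace( 'u', 't' )   # RNA-to-DNA mode
    # -> TGAGGTAGTAGGTTGTATAGTT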
- --------- - -**Example** - -Input RNA FASTA file ( from Sanger's mirBase ):: - - >cel-let-7 MIMAT0000001 Caenorhabditis elegans let-7 - UGAGGUAGUAGGUUGUAUAGUU - >cel-lin-4 MIMAT0000002 Caenorhabditis elegans lin-4 - UCCCUGAGACCUCAAGUGUGA - >cel-miR-1 MIMAT0000003 Caenorhabditis elegans miR-1 - UGGAAUGUAAAGAAGUAUGUA - -Output DNA FASTA file (with RNA-to-DNA mode):: - - >cel-let-7 MIMAT0000001 Caenorhabditis elegans let-7 - TGAGGTAGTAGGTTGTATAGTT - >cel-lin-4 MIMAT0000002 Caenorhabditis elegans lin-4 - TCCCTGAGACCTCAAGTGTGA - >cel-miR-1 MIMAT0000003 Caenorhabditis elegans miR-1 - TGGAATGTAAAGAAGTATGTA - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastq_quality_boxplot.xml --- a/tools/fastx_toolkit/fastq_quality_boxplot.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ - - - fastx_toolkit - - fastq_quality_boxplot_graph.sh -t '$input.name' -i $input -o $output - - - - - - - - - - -**What it does** - -Creates a boxplot graph for the quality scores in the library. - -.. class:: infomark - -**TIP:** Use the **FASTQ Statistics** tool to generate the report file needed for this tool. - ------ - -**Output Examples** - -* Black horizontal lines are medians -* Rectangular red boxes show the Inter-quartile Range (IQR) (top value is Q3, bottom value is Q1) -* Whiskers show outlier at max. 1.5*IQR - - -An excellent quality library (median quality is 40 for almost all 36 cycles): - -.. image:: ./static/fastx_icons/fastq_quality_boxplot_1.png - - -A relatively good quality library (median quality degrades towards later cycles): - -.. image:: ./static/fastx_icons/fastq_quality_boxplot_2.png - -A low quality library (median drops quickly): - -.. image:: ./static/fastx_icons/fastq_quality_boxplot_3.png - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastq_quality_converter.xml --- a/tools/fastx_toolkit/fastq_quality_converter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,97 +0,0 @@ - - (ASCII-Numeric) - fastx_toolkit - zcat -f $input | fastq_quality_converter $QUAL_FORMAT -o $output -Q $offset - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Converts a Solexa FASTQ file to/from numeric or ASCII quality format. - -.. class:: warningmark - -Re-scaling is **not** performed. (e.g. conversion from Phred scale to Solexa scale). - - ------ - -FASTQ with Numeric quality scores:: - - @CSHL__2_FC042AGWWWXX:8:1:120:202 - ACGATAGATCGGAAGAGCTAGTATGCCGTTTTCTGC - +CSHL__2_FC042AGWWWXX:8:1:120:202 - 40 40 40 40 20 40 40 40 40 6 40 40 28 40 40 25 40 20 40 -1 30 40 14 27 40 8 1 3 7 -1 11 10 -1 21 10 8 - @CSHL__2_FC042AGWWWXX:8:1:103:1185 - ATCACGATAGATCGGCAGAGCTCGTTTACCGTCTTC - +CSHL__2_FC042AGWWWXX:8:1:103:1185 - 40 40 40 40 40 35 33 31 40 40 40 32 30 22 40 -0 9 22 17 14 8 36 15 34 22 12 23 3 10 -0 8 2 4 25 30 2 - - -FASTQ with ASCII quality scores:: - - @CSHL__2_FC042AGWWWXX:8:1:120:202 - ACGATAGATCGGAAGAGCTAGTATGCCGTTTTCTGC - +CSHL__2_FC042AGWWWXX:8:1:120:202 - hhhhThhhhFhh\hhYhTh?^hN[hHACG?KJ?UJH - @CSHL__2_FC042AGWWWXX:8:1:103:1185 - ATCACGATAGATCGGCAGAGCTCGTTTACCGTCTTC - +CSHL__2_FC042AGWWWXX:8:1:103:1185 - hhhhhca_hhh`^Vh@IVQNHdObVLWCJ@HBDY^B - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. 
__: http://hannonlab.cshl.edu/fastx_toolkit/ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastq_quality_filter.xml --- a/tools/fastx_toolkit/fastq_quality_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ - - - fastx_toolkit - - zcat -f '$input' | fastq_quality_filter -q $quality -p $percent -v -o $output -#if $input.ext == "fastqsanger": --Q 33 -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool filters reads based on quality scores. - -.. class:: infomark - -Using **percent = 100** requires all cycles of all reads to be at least the quality cut-off value. - -.. class:: infomark - -Using **percent = 50** requires the median quality of the cycles (in each read) to be at least the quality cut-off value. - --------- - -Quality score distribution (of all cycles) is calculated for each read. If it is lower than the quality cut-off value - the read is discarded. - - -**Example**:: - - @CSHL_4_FC042AGOOII:1:2:214:584 - GACAATAAAC - +CSHL_4_FC042AGOOII:1:2:214:584 - 30 30 30 30 30 30 30 30 20 10 - -Using **percent = 50** and **cut-off = 30** - This read will not be discarded (the median quality is higher than 30). - -Using **percent = 90** and **cut-off = 30** - This read will be discarded (90% of the cycles do no have quality equal to / higher than 30). - -Using **percent = 100** and **cut-off = 20** - This read will be discarded (not all cycles have quality equal to / higher than 20). - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastq_to_fasta.xml --- a/tools/fastx_toolkit/fastq_to_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ - - converter - fastx_toolkit - gunzip -cf $input | fastq_to_fasta $SKIPN $RENAMESEQ -o $output -v -#if $input.ext == "fastqsanger": --Q 33 -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts data from Solexa format to FASTA format (scroll down for format description). - --------- - -**Example** - -The following data in Solexa-FASTQ format:: - - @CSHL_4_FC042GAMMII_2_1_517_596 - GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - +CSHL_4_FC042GAMMII_2_1_517_596 - 40 40 40 40 40 40 40 40 40 40 38 40 40 40 40 40 14 40 40 40 40 40 36 40 13 14 24 24 9 24 9 40 10 10 15 40 - -Will be converted to FASTA (with 'rename sequence names' = NO):: - - >CSHL_4_FC042GAMMII_2_1_517_596 - GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - -Will be converted to FASTA (with 'rename sequence names' = YES):: - - >1 - GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_artifacts_filter.xml --- a/tools/fastx_toolkit/fastx_artifacts_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ - - - fastx_toolkit - zcat -f '$input' | fastx_artifacts_filter -v -o "$output" -#if $input.ext == "fastqsanger": --Q 33 -#end if - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool filters sequencing artifacts (reads with all but 3 identical bases). 
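One way to read that rule, as a hedged sketch rather than the tool's actual implementation::

    from collections import Counter

    def looks_like_artifact( sequence ):
        # artifact: at most 3 positions differ from the single most frequent base
        return len( sequence ) - max( Counter( sequence ).values() ) <= 3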
- --------- - -**The following is an example of sequences which will be filtered out**:: - - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAACACAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC - AAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAA - AAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAA - AAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAA - AAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAA - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_barcode_splitter.xml --- a/tools/fastx_toolkit/fastx_barcode_splitter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ - - - fastx_toolkit - fastx_barcode_splitter_galaxy_wrapper.sh $BARCODE $input "$input.name" "$output.files_path" --mismatches $mismatches --partial $partial $EOL > $output - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool splits a Solexa library (FASTQ file) or a regular FASTA file into several files, using barcodes as the split criteria. 
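Conceptually, the start of each read (or its end, when the end-of-line option is set) is matched against every barcode, and the read is assigned to the closest barcode within the allowed number of mismatches. A rough sketch of the distance test::

    def count_mismatches( read, barcode ):
        # position-by-position comparison of the read's start against the barcode
        return sum( 1 for a, b in zip( read, barcode ) if a != b )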
- --------- - -**Barcode file Format** - -Barcode files are simple text files. -Each line should contain an identifier (descriptive name for the barcode), and the barcode itself (A/C/G/T), separated by a TAB character. -Example:: - - #This line is a comment (starts with a 'number' sign) - BC1 GATCT - BC2 ATCGT - BC3 GTGAT - BC4 TGTCT - -For each barcode, a new FASTQ file will be created (with the barcode's identifier as part of the file name). -Sequences matching the barcode will be stored in the appropriate file. - -One additional FASTQ file will be created (the 'unmatched' file), where sequences not matching any barcode will be stored. - -The output of this tool is an HTML file, displaying the split counts and the file locations. - -**Output Example** - -.. image:: ./static/fastx_icons/barcode_splitter_output_example.png - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_barcode_splitter_galaxy_wrapper.sh --- a/tools/fastx_toolkit/fastx_barcode_splitter_galaxy_wrapper.sh Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ -#!/bin/bash - -# FASTX-toolkit - FASTA/FASTQ preprocessing tools. -# Copyright (C) 2009 A. Gordon (gordon@cshl.edu) -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -# -#This is a shell script wrapper for 'fastx_barcode_splitter.pl' -# -# 1. Output files are saved at the dataset's files_path directory. -# -# 2. 'fastx_barcode_splitter.pl' outputs a textual table. -# This script turns it into pretty HTML with working URL -# (so lazy users can just click on the URLs and get their files) - -BARCODE_FILE="$1" -FASTQ_FILE="$2" -LIBNAME="$3" -OUTPUT_PATH="$4" -shift 4 -# The rest of the parameters are passed to the split program - -if [ "$OUTPUT_PATH" == "" ]; then - echo "Usage: $0 [BARCODE FILE] [FASTQ FILE] [LIBRARY_NAME] [OUTPUT_PATH]" >&2 - exit 1 -fi - -#Sanitize library name, make sure we can create a file with this name -LIBNAME=${LIBNAME//\.gz/} -LIBNAME=${LIBNAME//\.txt/} -LIBNAME=${LIBNAME//[^[:alnum:]]/_} - -if [ ! -r "$FASTQ_FILE" ]; then - echo "Error: Input file ($FASTQ_FILE) not found!" >&2 - exit 1 -fi -if [ ! -r "$BARCODE_FILE" ]; then - echo "Error: barcode file ($BARCODE_FILE) not found!" >&2 - exit 1 -fi -mkdir -p "$OUTPUT_PATH" -if [ ! -d "$OUTPUT_PATH" ]; then - echo "Error: failed to create output path '$OUTPUT_PATH'" >&2 - exit 1 -fi - -PUBLICURL="" -BASEPATH="$OUTPUT_PATH/" -#PREFIX="$BASEPATH"`date "+%Y-%m-%d_%H%M__"`"${LIBNAME}__" -PREFIX="$BASEPATH""${LIBNAME}__" -SUFFIX=".txt" - -RESULTS=`zcat -f "$FASTQ_FILE" | fastx_barcode_splitter.pl --bcfile "$BARCODE_FILE" --prefix "$PREFIX" --suffix "$SUFFIX" "$@"` -if [ $? != 0 ]; then - echo "error" -fi - -# -# Convert the textual tab-separated table into simple HTML table, -# with the local path replaces with a valid URL -echo "" -echo "$RESULTS" | sed -r "s|$BASEPATH(.*)|\\1|" | sed ' -i
-s|\t|<\/td><td>|g -a<\/td><\/tr> -' -echo "

" -echo "

" diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_clipper.xml --- a/tools/fastx_toolkit/fastx_clipper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,109 +0,0 @@ - - adapter sequences - fastx_toolkit - - zcat -f $input | fastx_clipper -l $minlength -a $clip_source.clip_sequence -d $keepdelta -o $output -v $KEEP_N $DISCARD_OPTIONS -#if $input.ext == "fastqsanger": - -Q 33 -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - use this for hairpin barcoding. keep at 0 unless you know what you're doing. - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool clips adapters from the 3'-end of the sequences in a FASTA/FASTQ file. - --------- - - -**Clipping Illustration:** - -.. image:: ./static/fastx_icons/fastx_clipper_illustration.png - - - - - - - - -**Clipping Example:** - -.. image:: ./static/fastx_icons/fastx_clipper_example.png - - - -**In the above example:** - -* Sequence no. 1 was discarded since it wasn't clipped (i.e. didn't contain the adapter sequence). (**Output** parameter). -* Sequence no. 5 was discarded --- it's length (after clipping) was shorter than 15 nt (**Minimum Sequence Length** parameter). - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_collapser.xml --- a/tools/fastx_toolkit/fastx_collapser.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,81 +0,0 @@ - - sequences - fastx_toolkit - zcat -f '$input' | fastx_collapser -v -o '$output' -#if $input.ext == "fastqsanger": --Q 33 -#end if - - - - - - - - - - - - -**What it does** - -This tool collapses identical sequences in a FASTA file into a single sequence. - --------- - -**Example** - -Example Input File (Sequence "ATAT" appears multiple times):: - - >CSHL_2_FC0042AGLLOO_1_1_605_414 - TGCG - >CSHL_2_FC0042AGLLOO_1_1_537_759 - ATAT - >CSHL_2_FC0042AGLLOO_1_1_774_520 - TGGC - >CSHL_2_FC0042AGLLOO_1_1_742_502 - ATAT - >CSHL_2_FC0042AGLLOO_1_1_781_514 - TGAG - >CSHL_2_FC0042AGLLOO_1_1_757_487 - TTCA - >CSHL_2_FC0042AGLLOO_1_1_903_769 - ATAT - >CSHL_2_FC0042AGLLOO_1_1_724_499 - ATAT - -Example Output file:: - - >1-1 - TGCG - >2-4 - ATAT - >3-1 - TGGC - >4-1 - TGAG - >5-1 - TTCA - -.. class:: infomark - -Original Sequence Names / Lane descriptions (e.g. "CSHL_2_FC0042AGLLOO_1_1_742_502") are discarded. - -The output sequence name is composed of two numbers: the first is the sequence's number, the second is the multiplicity value. - -The following output:: - - >2-4 - ATAT - -means that the sequence "ATAT" is the second sequence in the file, and it appeared 4 times in the input FASTA file. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_nucleotides_distribution.xml --- a/tools/fastx_toolkit/fastx_nucleotides_distribution.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ - - - fastx_toolkit - fastx_nucleotide_distribution_graph.sh -t '$input.name' -i $input -o $output - - - - - - - - - - -**What it does** - -Creates a stacked-histogram graph for the nucleotide distribution in the Solexa library. - -.. class:: infomark - -**TIP:** Use the **FASTQ Statistics** tool to generate the report file needed for this tool. - ------ - -**Output Examples** - -The following chart clearly shows the barcode used at the 5'-end of the library: **GATCT** - -.. 
image:: ./static/fastx_icons/fastq_nucleotides_distribution_1.png - -In the following chart, one can almost 'read' the most abundant sequence by looking at the dominant values: **TGATA TCGTA TTGAT GACTG AA...** - -.. image:: ./static/fastx_icons/fastq_nucleotides_distribution_2.png - -The following chart shows a growing number of unknown (N) nucleotides towards later cycles (which might indicate a sequencing problem): - -.. image:: ./static/fastx_icons/fastq_nucleotides_distribution_3.png - -But most of the time, the chart will look rather random: - -.. image:: ./static/fastx_icons/fastq_nucleotides_distribution_4.png - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_quality_statistics.xml --- a/tools/fastx_toolkit/fastx_quality_statistics.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,70 +0,0 @@ - - - fastx_toolkit - zcat -f $input | fastx_quality_stats -o $output -Q 33 - - - - - - - - - - - - - - - - - - -**What it does** - -Creates quality statistics report for the given Solexa/FASTQ library. - -.. class:: infomark - -**TIP:** This statistics report can be used as input for **Quality Score** and **Nucleotides Distribution** tools. - ------ - -**The output file will contain the following fields:** - -* column = column number (1 to 36 for a 36-cycles read Solexa file) -* count = number of bases found in this column. -* min = Lowest quality score value found in this column. -* max = Highest quality score value found in this column. -* sum = Sum of quality score values for this column. -* mean = Mean quality score value for this column. -* Q1 = 1st quartile quality score. -* med = Median quality score. -* Q3 = 3rd quartile quality score. -* IQR = Inter-Quartile range (Q3-Q1). -* lW = 'Left-Whisker' value (for boxplotting). -* rW = 'Right-Whisker' value (for boxplotting). -* A_Count = Count of 'A' nucleotides found in this column. -* C_Count = Count of 'C' nucleotides found in this column. -* G_Count = Count of 'G' nucleotides found in this column. -* T_Count = Count of 'T' nucleotides found in this column. -* N_Count = Count of 'N' nucleotides found in this column. - - -For example:: - - 1 6362991 -4 40 250734117 39.41 40 40 40 0 40 40 1396976 1329101 678730 2958184 0 - 2 6362991 -5 40 250531036 39.37 40 40 40 0 40 40 1786786 1055766 1738025 1782414 0 - 3 6362991 -5 40 248722469 39.09 40 40 40 0 40 40 2296384 984875 1443989 1637743 0 - 4 6362991 -4 40 248214827 39.01 40 40 40 0 40 40 2536861 1167423 1248968 1409739 0 - 36 6362991 -5 40 117158566 18.41 7 15 30 23 -5 40 4074444 1402980 63287 822035 245 - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_renamer.xml --- a/tools/fastx_toolkit/fastx_renamer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ - - - fastx_toolkit - zcat -f $input | fastx_renamer -n $TYPE -o $output -v -#if $input.ext == "fastqsanger": --Q 33 -#end if - - - - - - - - - - - - - - - - - -**What it does** - -This tool renames the sequence identifiers in a FASTQ/A file. - -.. class:: infomark - -Use this tool at the beginning of your workflow, as a way to keep the original sequence (before trimming, clipping, barcode-removal, etc). 
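The renaming itself is mechanical: walk the FASTQ file four lines at a time and rewrite the '@' and '+' header lines. A minimal Python sketch of the numeric-counter mode (illustrative only; the real fastx_renamer is part of the compiled FASTX-toolkit and supports more modes)::

    import sys
    from itertools import islice

    def rename_to_counter(in_handle, out_handle):
        # FASTQ records are four lines: @id, sequence, +id, qualities
        counter = 0
        while True:
            record = list(islice(in_handle, 4))
            if len(record) < 4:
                break
            counter += 1
            out_handle.write("@%d\n" % counter)
            out_handle.write(record[1])      # sequence line, kept verbatim
            out_handle.write("+%d\n" % counter)
            out_handle.write(record[3])      # quality line, kept verbatim

    if __name__ == "__main__":
        rename_to_counter(sys.stdin, sys.stdout)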
- --------- - -**Example** - -The following Solexa-FASTQ file:: - - @CSHL_4_FC042GAMMII_2_1_517_596 - GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - +CSHL_4_FC042GAMMII_2_1_517_596 - 40 40 40 40 40 40 40 40 40 40 38 40 40 40 40 40 14 40 40 40 40 40 36 40 13 14 24 24 9 24 9 40 10 10 15 40 - -Renamed to **nucleotides sequence**:: - - @GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - +GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - 40 40 40 40 40 40 40 40 40 40 38 40 40 40 40 40 14 40 40 40 40 40 36 40 13 14 24 24 9 24 9 40 10 10 15 40 - -Renamed to **numeric counter**:: - - @1 - GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - +1 - 40 40 40 40 40 40 40 40 40 40 38 40 40 40 40 40 14 40 40 40 40 40 36 40 13 14 24 24 9 24 9 40 10 10 15 40 - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_reverse_complement.xml --- a/tools/fastx_toolkit/fastx_reverse_complement.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ - - - fastx_toolkit - zcat -f '$input' | fastx_reverse_complement -v -o $output -#if $input.ext == "fastqsanger": --Q 33 -#end if - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool reverse-complements each sequence in a library. -If the library is a FASTQ, the quality-scores are also reversed. - --------- - -**Example** - -Input FASTQ file:: - - @CSHL_1_FC42AGWWWXX:8:1:3:740 - TGTCTGTAGCCTCNTCCTTGTAATTCAAAGNNGGTA - +CSHL_1_FC42AGWWWXX:8:1:3:740 - 33 33 33 34 33 33 33 33 33 33 33 33 27 5 27 33 33 33 33 33 33 27 21 27 33 32 31 29 26 24 5 5 15 17 27 26 - - -Output FASTQ file:: - - @CSHL_1_FC42AGWWWXX:8:1:3:740 - TACCNNCTTTGAATTACAAGGANGAGGCTACAGACA - +CSHL_1_FC42AGWWWXX:8:1:3:740 - 26 27 17 15 5 5 24 26 29 31 32 33 27 21 27 33 33 33 33 33 33 27 5 27 33 33 33 33 33 33 33 33 34 33 33 33 - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_trimmer.xml --- a/tools/fastx_toolkit/fastx_trimmer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,81 +0,0 @@ - - - fastx_toolkit - zcat -f '$input' | fastx_trimmer -v -f $first -l $last -o $output -#if $input.ext == "fastqsanger": --Q 33 -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool trims (cut bases from) sequences in a FASTA/Q file. - --------- - -**Example** - -Input Fasta file (with 36 bases in each sequences):: - - >1-1 - TATGGTCAGAAACCATATGCAGAGCCTGTAGGCACC - >2-1 - CAGCGAGGCTTTAATGCCATTTGGCTGTAGGCACCA - - -Trimming with First=1 and Last=21, we get a FASTA file with 21 bases in each sequences (starting from the first base):: - - >1-1 - TATGGTCAGAAACCATATGCA - >2-1 - CAGCGAGGCTTTAATGCCATT - -Trimming with First=6 and Last=10, will generate a FASTA file with 5 bases (bases 6,7,8,9,10) in each sequences:: - - >1-1 - TCAGA - >2-1 - AGGCT - - ------ - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/CreateInterval.pl --- a/tools/filters/CreateInterval.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -#! 
/usr/bin/perl -w - -# Accepts chrom, start, end, name, and strand -# If strand is void sets it to plus -# CreateInterval.pl $chrom $start $end $name $strand $output - -my $strand = "+"; - -die "Not enough arguments\n" unless @ARGV == 6; - -open OUT, ">$ARGV[5]" or die "Cannot open $ARGV[5]:$!\n"; - -$strand = "-" if $ARGV[4] eq "minus"; -$ARGV[3] =~ s/\s+/_/g; -$ARGV[3] =~ s/\t+/_/g; - -print OUT "$ARGV[0]\t$ARGV[1]\t$ARGV[2]\t$ARGV[3]\t0\t$strand\n"; -close OUT; - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/CreateInterval.xml --- a/tools/filters/CreateInterval.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ - - as a new dataset - CreateInterval.pl $chrom $start $end "$name" $strand $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**TIP**. Once your interval appears in history, you must tell Galaxy which genome it belongs to by clicking pencil icon or the "?" link in the history item. - ------ - -**What it does** - -This tool allows you to create a single genomic interval. The resulting history item will be in the BED format. - ------ - -**Example** - -Typing the following values in the form:: - - Chromosome: chrX - Start position: 151087187 - End position: 151370486 - Name: NM_000808 - Strand: minus - -will create a single interval:: - - chrX 151087187 151370486 NM_000808 0 - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/axt_to_concat_fasta.py --- a/tools/filters/axt_to_concat_fasta.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ -#!/usr/bin/env python -""" -Adapted from bx/scripts/axt_to_concat_fasta.py -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) - -import sys -import bx.align.axt - -def usage(s=None): - message = """ -axt_to_fasta species1 species2 < axt_file > fasta_file -""" - if (s == None): sys.exit (message) - else: sys.exit ("%s\n%s" % (s,message)) - - -def main(): - - # check the command line - species1 = sys.argv[1] - species2 = sys.argv[2] - - # convert the alignment blocks - - reader = bx.align.axt.Reader(sys.stdin,support_ids=True,\ - species1=species1,species2=species2) - sp1text = list() - sp2text = list() - for a in reader: - sp1text.append(a.components[0].text) - sp2text.append(a.components[1].text) - sp1seq = "".join(sp1text) - sp2seq = "".join(sp2text) - print_component_as_fasta(sp1seq,species1) - print_component_as_fasta(sp2seq,species2) - - - -# $$$ this should be moved to a bx.align.fasta module - -def print_component_as_fasta(text,src): - header = ">" + src - print header - print text - - -if __name__ == "__main__": main() - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/axt_to_concat_fasta.xml --- a/tools/filters/axt_to_concat_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ - - Converts an AXT formatted file to a concatenated FASTA alignment - axt_to_concat_fasta.py $dbkey_1 $dbkey_2 < $axt_input > $out_file1 - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**IMPORTANT**: AXT formatted alignments will be phased out from Galaxy in the coming weeks. They will be replaced with pairwise MAF alignments, which are already available. To try pairwise MAF alignments use "Extract Pairwise MAF blocks" tool in *Fetch Sequences and Alignments* section. - --------- - -**Syntax** - -This tool converts an AXT formatted file to the FASTA format, and concatenates the results in the same build. 
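Stripped of the bx-python plumbing the actual converter uses, the concatenation logic reduces to the sketch below. It assumes well-formed AXT input, i.e. every block is exactly one summary line followed by two sequence lines::

    import sys

    def axt_to_concat_fasta(handle, species1, species2):
        # Gather the two aligned sequences of each block, then join them
        lines = [line.strip() for line in handle]
        lines = [line for line in lines if line and not line.startswith("#")]
        pieces = {species1: [], species2: []}
        for i in range(0, len(lines), 3):   # summary + primary + secondary
            pieces[species1].append(lines[i + 1])
            pieces[species2].append(lines[i + 2])
        for name in (species1, species2):
            sys.stdout.write(">%s\n%s\n" % (name, "".join(pieces[name])))

    if __name__ == "__main__":
        axt_to_concat_fasta(sys.stdin, sys.argv[1], sys.argv[2])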
- -- **AXT format** The alignments are produced from Blastz, an alignment tool available from Webb Miller's lab at Penn State University. The lav format Blastz output, which does not include the sequence, was converted to AXT format with lavToAxt. Each alignment block in an AXT file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. - -- **FASTA format** a text-based format for representing both nucleic and protein sequences, in which base pairs or proteins are represented using a single-letter code. - - - This format contains an one line header. It starts with a " >" symbol. The first word on this line is the name of the sequence. The rest of the line is a description of the sequence. - - The remaining lines contain the sequence itself. - - Blank lines in a FASTA file are ignored, and so are spaces or other gap symbols (dashes, underscores, periods) in a sequence. - - Fasta files containing multiple sequences are just the same, with one sequence listed right after another. This format is accepted for many multiple sequence alignment programs. - ------ - -**Example** - -- AXT format:: - - 0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500 - TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA - TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA - - 1 chr19 3008279 3008357 chr11 70573976 70574054 - 3900 - CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA - CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA - -- Convert the above file to concatenated FASTA format:: - - >hg16 - TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGACACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA - >mm5 - TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGACACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/axt_to_fasta.py --- a/tools/filters/axt_to_fasta.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -#!/usr/bin/env python -""" -Adapted from bx/scripts/axt_to_fasta.py -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) - -import sys -import bx.align.axt - -def usage(s=None): - message = """ -axt_to_fasta species1 species2 < axt_file > fasta_file -""" - if (s == None): sys.exit (message) - else: sys.exit ("%s\n%s" % (s,message)) - - -def main(): - - # check the command line - species1 = sys.argv[1] - species2 = sys.argv[2] - - # convert the alignment blocks - - reader = bx.align.axt.Reader(sys.stdin,support_ids=True,\ - species1=species1,species2=species2) - - for a in reader: - if ("id" in a.attributes): id = a.attributes["id"] - else: id = None - print_component_as_fasta(a.components[0],id) - print_component_as_fasta(a.components[1],id) - print - - -# $$$ this should be moved to a bx.align.fasta module - -def print_component_as_fasta(c,id=None): - header = ">%s_%s_%s" % (c.src,c.start,c.start+c.size) - if (id != None): header += " " + id - print header - print c.text - - -if __name__ == "__main__": main() - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/axt_to_fasta.xml --- a/tools/filters/axt_to_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ - - Converts an AXT formatted file to FASTA format - axt_to_fasta.py $dbkey_1 $dbkey_2 < $axt_input > $out_file1 - - - - - - - - - - - - - - 
- - - - -.. class:: warningmark - -**IMPORTANT**: AXT formatted alignments will be phased out from Galaxy in the coming weeks. They will be replaced with pairwise MAF alignments, which are already available. To try pairwise MAF alignments use "Extract Pairwise MAF blocks" tool in *Fetch Sequences and Alignments* section. - --------- - - -**Syntax** - -This tool converts an AXT formatted file to the FASTA format. - -- **AXT format** The alignments are produced from Blastz, an alignment tool available from Webb Miller's lab at Penn State University. The lav format Blastz output, which does not include the sequence, was converted to AXT format with lavToAxt. Each alignment block in an AXT file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. - -- **FASTA format** a text-based format for representing both nucleic and protein sequences, in which base pairs or proteins are represented using a single-letter code. - - - This format contains an one line header. It starts with a " >" symbol. The first word on this line is the name of the sequence. The rest of the line is a description of the sequence. - - The remaining lines contain the sequence itself. - - Blank lines in a FASTA file are ignored, and so are spaces or other gap symbols (dashes, underscores, periods) in a sequence. - - Fasta files containing multiple sequences are just the same, with one sequence listed right after another. This format is accepted for many multiple sequence alignment programs. - ------ - -**Example** - -- AXT format:: - - 0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500 - TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA - TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA - - 1 chr19 3008279 3008357 chr11 70573976 70574054 - 3900 - CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA - CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA - -- Convert the above file to FASTA format:: - - >hg16.chr19(+):3001012-3001075|hg16_0 - TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA - >mm5.chr11(-):70568380-70568443|mm5_0 - TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA - - >hg16.chr19(+):3008279-3008357|hg16_1 - CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA - >mm5.chr11(-):70573976-70574054|mm5_1 - CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/axt_to_lav.py --- a/tools/filters/axt_to_lav.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,176 +0,0 @@ -#!/usr/bin/env python -""" -Application to convert AXT file to LAV file -------------------------------------------- - -:Author: Bob Harris (rsharris@bx.psu.edu) -:Version: $Revision: $ - -The application reads an AXT file from standard input and writes a LAV file to -standard out; some statistics are written to standard error. -""" - -import sys, copy -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import bx.align.axt -import bx.align.lav - -assert sys.version_info[:2] >= ( 2, 4 ) - -def usage(s=None): - message = """ -axt_to_lav primary_spec secondary_spec [--silent] < axt_file > lav_file - Each spec is of the form seq_file[:species_name]:lengths_file. - - seq_file should be a format string for the file names for the individual - sequences, with %s to be replaced by the alignment's src field. 
For example, - "hg18/%s.nib" would prescribe files named "hg18/chr1.nib", "hg18/chr2.nib", - etc. - - species_name is optional. If present, it is prepended to the alignment's src - field. - - Lengths files provide the length of each chromosome (lav format needs this - information but axt file does not contain it). The format is a series of - lines of the form - - The chromosome field in each axt block must match some in - the lengths file. -""" - if (s == None): sys.exit (message) - else: sys.exit ("%s\n%s" % (s,message)) - - -def main(): - global debug - - # parse the command line - - primary = None - secondary = None - silent = False - - # pick off options - - args = sys.argv[1:] - seq_file2 = open(args.pop(-1),'w') - seq_file1 = open(args.pop(-1),'w') - lav_out = args.pop(-1) - axt_in = args.pop(-1) - while (len(args) > 0): - arg = args.pop(0) - val = None - fields = arg.split("=",1) - if (len(fields) == 2): - arg = fields[0] - val = fields[1] - if (val == ""): - usage("missing a value in %s=" % arg) - - if (arg == "--silent") and (val == None): - silent = True - elif (primary == None) and (val == None): - primary = arg - elif (secondary == None) and (val == None): - secondary = arg - else: - usage("unknown argument: %s" % arg) - - if (primary == None): - usage("missing primary file name and length") - - if (secondary == None): - usage("missing secondary file name and length") - - try: - (primaryFile,primary,primaryLengths) = parse_spec(primary) - except: - usage("bad primary spec (must be seq_file[:species_name]:lengths_file") - - try: - (secondaryFile,secondary,secondaryLengths) = parse_spec(secondary) - except: - usage("bad secondary spec (must be seq_file[:species_name]:lengths_file") - - # read the lengths - - speciesToLengths = {} - speciesToLengths[primary] = read_lengths (primaryLengths) - speciesToLengths[secondary] = read_lengths (secondaryLengths) - - # read the alignments - - out = bx.align.lav.Writer(open(lav_out,'w'), \ - attributes = { "name_format_1" : primaryFile, - "name_format_2" : secondaryFile }) - - axtsRead = 0 - axtsWritten = 0 - for axtBlock in bx.align.axt.Reader(open(axt_in), \ - species_to_lengths = speciesToLengths, - species1 = primary, - species2 = secondary, - support_ids = True): - axtsRead += 1 - out.write (axtBlock) - primary_c = axtBlock.get_component_by_src_start(primary) - secondary_c = axtBlock.get_component_by_src_start(secondary) - - print >>seq_file1, ">%s_%s_%s_%s" % (primary_c.src,secondary_c.strand,primary_c.start,primary_c.start+primary_c.size) - print >>seq_file1,primary_c.text - print >>seq_file1 - - print >>seq_file2, ">%s_%s_%s_%s" % (secondary_c.src,secondary_c.strand,secondary_c.start,secondary_c.start+secondary_c.size) - print >>seq_file2,secondary_c.text - print >>seq_file2 - axtsWritten += 1 - - out.close() - seq_file1.close() - seq_file2.close() - - if (not silent): - sys.stdout.write ("%d blocks read, %d written\n" % (axtsRead,axtsWritten)) - -def parse_spec(spec): # returns (seq_file,species_name,lengths_file) - fields = spec.split(":") - if (len(fields) == 2): return (fields[0],"",fields[1]) - elif (len(fields) == 3): return (fields[0],fields[1],fields[2]) - else: raise ValueError - -def read_lengths (fileName): - - chromToLength = {} - - f = file (fileName, "r") - - for lineNumber,line in enumerate(f): - line = line.strip() - if (line == ""): continue - if (line.startswith("#")): continue - - fields = line.split () - if (len(fields) != 2): - raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line) - - chrom = fields[0] 
- try: - length = int(fields[1]) - except: - raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line) - - if (chrom in chromToLength): - raise "%s appears more than once (%s:%d): %s" \ - % (chrom,fileName,lineNumber) - - chromToLength[chrom] = length - - f.close () - - return chromToLength - - -if __name__ == "__main__": main() - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/axt_to_lav.xml --- a/tools/filters/axt_to_lav.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,94 +0,0 @@ - - Converts an AXT formatted file to LAV format - axt_to_lav.py /galaxy/data/$dbkey_1/seq/%s.nib:$dbkey_1:${GALAXY_DATA_INDEX_DIR}/shared/ucsc/chrom/${dbkey_1}.len /galaxy/data/$dbkey_2/seq/%s.nib:$dbkey_2:${GALAXY_DATA_INDEX_DIR}/shared/ucsc/chrom/${dbkey_2}.len $align_input $lav_file $seq_file1 $seq_file2 - - - - - - - - - - - - -.. class:: warningmark - -**IMPORTANT**: AXT formatted alignments will be phased out from Galaxy in the coming weeks. They will be replaced with pairwise MAF alignments, which are already available. To try pairwise MAF alignments use "Extract Pairwise MAF blocks" tool in *Fetch Sequences and Alignments* section. - --------- - - -**Syntax** - -This tool converts an AXT formatted file to the LAV format. - -- **AXT format** The alignments are produced from Blastz, an alignment tool available from Webb Miller's lab at Penn State University. The lav format Blastz output, which does not include the sequence, was converted to AXT format with lavToAxt. Each alignment block in an AXT file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. - -- **LAV format** LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ. - -- **FASTA format** a text-based format for representing both nucleic and protein sequences, in which base pairs or proteins are represented using a single-letter code. - - - This format contains an one line header. It starts with a ">" symbol. The first word on this line is the name of the sequence. The rest of the line is a description of the sequence. - - The remaining lines contain the sequence itself. - - Blank lines in a FASTA file are ignored, and so are spaces or other gap symbols (dashes, underscores, periods) in a sequence. - - Fasta files containing multiple sequences are just the same, with one sequence listed right after another. This format is accepted for many multiple sequence alignment programs. 
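The converter reads each lengths file into a chromosome-to-length dictionary. A cleaned-up sketch of that parser (the original raises bare string exceptions, which modern Python rejects)::

    def read_lengths(filename):
        # Useful lines are "<chromosome> <length>"; blanks and '#' comments are skipped
        chrom_to_length = {}
        handle = open(filename)
        for number, line in enumerate(handle):
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            fields = line.split()
            if len(fields) != 2:
                raise ValueError("bad lengths line (%s:%d): %s"
                                 % (filename, number + 1, line))
            chrom, length = fields[0], int(fields[1])
            if chrom in chrom_to_length:
                raise ValueError("%s appears more than once (%s:%d)"
                                 % (chrom, filename, number + 1))
            chrom_to_length[chrom] = length
        handle.close()
        return chrom_to_length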
- ------ - -**Example** - -- AXT format:: - - 0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500 - TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA - TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA - - 1 chr19 3008279 3008357 chr11 70573976 70574054 - 3900 - CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA - CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA - -- Convert the above file to LAV format:: - - #:lav - s { - "/galaxy/data/hg16/seq/chr19.nib" 1 63811651 0 1 - "/galaxy/data/mm5/seq/chr11.nib-" 1 121648857 0 1 - } - h { - "> hg16.chr19" - "> mm5.chr11 (reverse complement)" - } - a { - s 3500 - b 3001012 70568380 - e 3001075 70568443 - l 3001012 70568380 3001075 70568443 81 - } - a { - s 3900 - b 3008279 70573976 - e 3008357 70574054 - l 3008279 70573976 3008357 70574054 78 - } - #:eof - -- With two files in the FASTA format:: - - >hg16.chr19_-_3001011_3001075 - TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA - - >hg16.chr19_-_3008278_3008357 - CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA - - **and**:: - - >mm5.chr11_-_70568379_70568443 - TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA - - >mm5.chr11_-_70573975_70574054 - CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/axt_to_lav_code.py --- a/tools/filters/axt_to_lav_code.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - for name,data in out_data.items(): - if name == "seq_file2": - data.dbkey = param_dict['dbkey_2'] - app.model.context.add( data ) - app.model.context.flush() - break \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/bed2gff.xml --- a/tools/filters/bed2gff.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ - - converter - bed_to_gff_converter.py $input $out_file1 - - - - - - - - - - - - - - -**What it does** - -This tool converts data from BED format to GFF format (scroll down for format description). - --------- - -**Example** - -The following data in BED format:: - - chr28 346187 388197 BC114771 0 + 346187 388197 0 9 144,81,115,63,155,96,134,105,112, 0,24095,26190,31006,32131,33534,36994,41793,41898, - -Will be converted to GFF (**note** that the start coordinate is incremented by 1):: - - ##gff-version 2 - ##bed_to_gff_converter.py - - chr28 bed2gff mRNA 346188 388197 0 + . mRNA BC114771; - chr28 bed2gff exon 346188 346331 0 + . exon BC114771; - chr28 bed2gff exon 370283 370363 0 + . exon BC114771; - chr28 bed2gff exon 372378 372492 0 + . exon BC114771; - chr28 bed2gff exon 377194 377256 0 + . exon BC114771; - chr28 bed2gff exon 378319 378473 0 + . exon BC114771; - chr28 bed2gff exon 379722 379817 0 + . exon BC114771; - chr28 bed2gff exon 383182 383315 0 + . exon BC114771; - chr28 bed2gff exon 387981 388085 0 + . exon BC114771; - chr28 bed2gff exon 388086 388197 0 + . exon BC114771; - - ------- - -.. class:: informark - -**About formats** - -**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones: - -The first three BED fields (required) are:: - - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. 
chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) - 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). - -The additional BED fields (optional) are:: - - 4. name - The name of the BED line. - 5. score - A score between 0 and 1000. - 6. strand - Defines the strand - either '+' or '-'. - 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. - 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. - 9. reserved - This should always be set to zero. - 10. blockCount - The number of blocks (exons) in the BED line. - 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. - 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. - 13. expCount - The number of experiments. - 14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount. - 15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount. - -**GFF format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields:: - - 1. seqname - Must be a chromosome or scaffold. - 2. source - The program that generated this feature. - 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. end - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. group - All lines with the same group are linked together into a single item. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/bed_to_bigbed.xml --- a/tools/filters/bed_to_bigbed.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ - - converter - bedToBigBed $input1 $chromInfo $out_file1 - #if $settings.settingsType == "full": - -blockSize=${settings.blockSize} -itemsPerSlot=${settings.itemsPerSlot} ${settings.unc} - #end if - 2>&1 || echo "Error running bedToBigBed." >&2 - - - ucsc_tools - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool converts a **sorted** BED file into a bigBed file. - -Currently, the bedFields option to specify the number of non-standard fields is not supported as an AutoSQL file must be provided, which is a format -currently not supported by Galaxy. 
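Note that bedToBigBed requires BED input sorted by chromosome and then by numeric start position. A hypothetical pre-sorting helper, equivalent to ``sort -k1,1 -k2,2n`` (not part of this tool)::

    import sys

    def sort_bed(in_handle, out_handle):
        # Sort by chromosome name, then numeric start (columns 1 and 2)
        def key(line):
            fields = line.split("\t")
            return (fields[0], int(fields[1]))
        data = [line for line in in_handle
                if line.strip() and not line.startswith(("#", "track", "browser"))]
        data.sort(key=key)
        out_handle.writelines(data)

    if __name__ == "__main__":
        sort_bed(sys.stdin, sys.stdout)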
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/bed_to_gff_converter.py --- a/tools/filters/bed_to_gff_converter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -#!/usr/bin/env python -# This code exists in 2 places: ~/datatypes/converters and ~/tools/filters -import sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - input_name = sys.argv[1] - output_name = sys.argv[2] - skipped_lines = 0 - first_skipped_line = 0 - out = open( output_name, 'w' ) - out.write( "##gff-version 2\n" ) - out.write( "##bed_to_gff_converter.py\n\n" ) - i = 0 - for i, line in enumerate( file( input_name ) ): - complete_bed = False - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ) and not line.startswith( 'track' ) and not line.startswith( 'browser' ): - try: - elems = line.split( '\t' ) - if len( elems ) == 12: - complete_bed = True - chrom = elems[0] - if complete_bed: - feature = "mRNA" - else: - try: - feature = elems[3] - except: - feature = 'feature%d' % ( i + 1 ) - start = int( elems[1] ) + 1 - end = int( elems[2] ) - try: - score = elems[4] - except: - score = '0' - try: - strand = elems[5] - except: - strand = '+' - try: - group = elems[3] - except: - group = 'group%d' % ( i + 1 ) - if complete_bed: - out.write( '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s %s;\n' % ( chrom, feature, start, end, score, strand, feature, group ) ) - else: - out.write( '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s;\n' % ( chrom, feature, start, end, score, strand, group ) ) - if complete_bed: - # We have all the info necessary to annotate exons for genes and mRNAs - block_count = int( elems[9] ) - block_sizes = elems[10].split( ',' ) - block_starts = elems[11].split( ',' ) - for j in range( block_count ): - exon_start = int( start ) + int( block_starts[j] ) - exon_end = exon_start + int( block_sizes[j] ) - 1 - out.write( '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\texon %s;\n' % ( chrom, exon_start, exon_end, score, strand, group ) ) - except: - skipped_lines += 1 - if not first_skipped_line: - first_skipped_line = i + 1 - else: - skipped_lines += 1 - if not first_skipped_line: - first_skipped_line = i + 1 - out.close() - info_msg = "%i lines converted to GFF version 2. " % ( i + 1 - skipped_lines ) - if skipped_lines > 0: - info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line ) - print info_msg - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/catWrapper.py --- a/tools/filters/catWrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -#!/usr/bin/env python -#By, Guruprasad Ananda. 
- -from galaxy import eggs -import sys, os - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - outfile = sys.argv[1] - infile = sys.argv[2] - - try: - fout = open(sys.argv[1],'w') - except: - stop_err("Output file cannot be opened for writing.") - - try: - fin = open(sys.argv[2],'r') - except: - stop_err("Input file cannot be opened for reading.") - - if len(sys.argv) < 4: - os.system("cp %s %s" %(infile,outfile)) - sys.exit() - - cmdline = "cat %s " %(infile) - for inp in sys.argv[3:]: - cmdline = cmdline + inp + " " - cmdline = cmdline + ">" + outfile - try: - os.system(cmdline) - except: - stop_err("Error encountered with cat.") - -if __name__ == "__main__": main() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/catWrapper.xml --- a/tools/filters/catWrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ - - tail-to-head - - catWrapper.py - $out_file1 - $input1 - #for $q in $queries - ${q.input2} - #end for - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**WARNING:** Be careful not to concatenate datasets of different kinds (e.g., sequences with intervals). This tool does not check if the datasets being concatenated are in the same format. - ------ - -**What it does** - -Concatenates datasets - ------ - -**Example** - -Concatenating Dataset:: - - chrX 151087187 151087355 A 0 - - chrX 151572400 151572481 B 0 + - -with Dataset1:: - - chr1 151242630 151242955 X 0 + - chr1 151271715 151271999 Y 0 + - chr1 151278832 151279227 Z 0 - - -and with Dataset2:: - - chr2 100000030 200000955 P 0 + - chr2 100000015 200000999 Q 0 + - -will result in the following:: - - chrX 151087187 151087355 A 0 - - chrX 151572400 151572481 B 0 + - chr1 151242630 151242955 X 0 + - chr1 151271715 151271999 Y 0 + - chr1 151278832 151279227 Z 0 - - chr2 100000030 200000955 P 0 + - chr2 100000015 200000999 Q 0 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/changeCase.pl --- a/tools/filters/changeCase.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; - -my $columns = {}; -my $del = ""; -my @in = (); -my @out = (); -my $command = ""; -my $field = 0; - -# a wrapper for changing the case of columns from within galaxy -# isaChangeCase.pl [filename] [columns] [delim] [casing] [output] - -die "Check arguments: $0 [filename] [columns] [delim] [casing] [output]\n" unless @ARGV == 5; - -# process column input -$ARGV[1] =~ s/\s+//g; -foreach ( split /,/, $ARGV[1] ) { - if (m/^c\d{1,}$/i) { - s/c//ig; - $columns->{$_} = --$_; - } -} - -die "No columns specified, columns are not preceeded with 'c', or commas are not used to separate column numbers: $ARGV[1]\n" if keys %$columns == 0; - -my $column_delimiters_href = { - 'TAB' => q{\t}, - 'COMMA' => ",", - 'DASH' => "-", - 'UNDERSCORE' => "_", - 'PIPE' => q{\|}, - 'DOT' => q{\.}, - 'SPACE' => q{\s+} -}; - -$del = $column_delimiters_href->{$ARGV[2]}; - -open (OUT, ">$ARGV[4]") or die "Cannot create $ARGV[4]:$!\n"; -open (IN, "<$ARGV[0]") or die "Cannot open $ARGV[0]:$!\n"; -while () { - chop; - @in = split /$del/; - for ( my $i = 0; $i <= $#in; ++$i) { - if (exists $columns->{$i}) { - push(@out, $ARGV[3] eq 'up' ? 
uc($in[$i]) : lc($in[$i])); - } else { - push(@out, $in[$i]); - } - } - print OUT join("\t",@out), "\n"; - @out = (); -} -close IN; - -close OUT; diff -r c2a356708570 -r 33c067c3ae34 tools/filters/changeCase.xml --- a/tools/filters/changeCase.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,74 +0,0 @@ - - of selected columns - changeCase.pl $input "$cols" $delimiter $casing $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**This tool breaks column assignments.** To re-establish column assignments run the tool and click on the pencil icon in the resulting history item. - -.. class:: warningmark - -The format of the resulting dataset from this tool is always tabular. - ------ - -**What it does** - -This tool selects specified columns from a dataset and converts the values of those columns to upper or lower case. - -- Columns are specified as **c1**, **c2**, and so on. -- Columns can be specified in any order (e.g., **c2,c1,c6**) - ------ - -**Example** - -Changing columns 1 and 3 ( delimited by Comma ) to upper case in:: - - apple,is,good - windows,is,bad - -will result in:: - - APPLE is GOOD - WINDOWS is BAD - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/commWrapper.pl --- a/tools/filters/commWrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; -use File::Temp "tempfile"; -#use POSIX qw(tmpnam); - -my ($input1, $input2, $mode, $out_file1) = @ARGV; - -my ($fh, $file1) = tempfile(); -my ($fh1,$file2) = tempfile(); - -`sort $input1 > $file1`; -`sort $input2 > $file2`; -`comm $mode $file1 $file2 > $out_file1`; -`rm $file1 ; rm $file2`; - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/commWrapper.xml --- a/tools/filters/commWrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ - - between two datasets - commWrapper.pl $input1 $input2 $mode $out_file1 - - - - - - - - - - - - -This tool is based on UNIX shell command comm. It compares two datasets and returns similarities or differences. For example, if you have two datasets:: - - a 1 - b 2 - c 3 - -and:: - - a 1 - f 6 - h 8 - -Using this tool with **Lines unique to Dataset1** option will return:: - - b 2 - c 3 - -If you use **Lines shared between Dataset1 and Dataset2** option output will look like this:: - - a 1 - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/compare.xml --- a/tools/filters/compare.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ - - to find common or distinct rows - joinWrapper.py $input1 $input2 $field1 $field2 $mode $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool finds lines in one dataset that HAVE or DO NOT HAVE a common field with another dataset. 
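In essence this is a hash join on the selected columns: collect the key column of the second dataset into a set, then stream the first dataset and test membership. A minimal sketch of the idea (the tool itself delegates the work to joinWrapper.py)::

    def matching_lines(filename1, filename2, col1, col2, keep_matching=True):
        # Columns are 1-based; datasets are TAB-delimited
        keys = set()
        for line in open(filename2):
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) >= col2:
                keys.add(fields[col2 - 1])
        for line in open(filename1):
            fields = line.rstrip("\r\n").split("\t")
            found = len(fields) >= col1 and fields[col1 - 1] in keys
            if found == keep_matching:
                yield line

    # e.g. list(matching_lines("first.bed", "second.tab", 4, 1)) keeps the lines
    # of the first dataset whose column 4 occurs in column 1 of the second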
- ------ - -**Example** - -If this is **First dataset**:: - - chr1 10 20 geneA - chr1 50 80 geneB - chr5 10 40 geneL - -and this is **Second dataset**:: - - geneA tumor-suppressor - geneB Foxp2 - geneC Gnas1 - geneE INK4a - -Finding lines of the **First dataset** whose 4th column matches the 1st column of the **Second dataset** yields:: - - chr1 10 20 geneA - chr1 50 80 geneB - -Conversely, using option **Non Matching rows of First dataset** on the same fields will yield:: - - chr5 10 40 geneL - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/condense_characters.pl --- a/tools/filters/condense_characters.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,105 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; - -# condenses all consecutive characters of one type -# convert_characters.pl [input] [character] [output] - -die "Check arguments" unless @ARGV == 3; - -my $inputfile = $ARGV[0]; -my $character = $ARGV[1]; -my $outputfile = $ARGV[2]; - - -my $convert_from; -my $convert_to; - - -if ($character eq "s") -{ - $convert_from = '\s'; -} -elsif ($character eq "T") -{ - $convert_from = '\t'; -} -elsif ($character eq "Sp") -{ - $convert_from = " "; -} -elsif ($character eq "Dt") -{ - $convert_from = '\.'; -} -elsif ($character eq "C") -{ - $convert_from = ","; -} -elsif ($character eq "D") -{ - $convert_from = "-"; -} -elsif ($character eq "U") -{ - $convert_from = "_"; -} -elsif ($character eq "P") -{ - $convert_from = '\|'; -} -else -{ - die "Invalid value specified for convert from\n"; -} - - -if ($character eq "T") -{ - $convert_to = "\t"; -} -elsif ($character eq "Sp") -{ - $convert_to = " "; -} -elsif ($character eq "Dt") -{ - $convert_to = "\."; -} -elsif ($character eq "C") -{ - $convert_to = ","; -} -elsif ($character eq "D") -{ - $convert_to = "-"; -} -elsif ($character eq "U") -{ - $convert_to = "_"; -} -elsif ($character eq "P") -{ - $convert_to = "|"; -} -else -{ - die "Invalid value specified for Convert to\n"; -} - -my $fhIn; -open ($fhIn, "< $inputfile") or die "Cannot open source file"; - -my $fhOut; -open ($fhOut, "> $outputfile"); - -while (<$fhIn>) -{ - my $thisLine = $_; - chomp $thisLine; - $thisLine =~ s/${convert_from}+/$convert_to/g; - print $fhOut $thisLine,"\n"; -} -close ($fhIn) or die "Cannot close source file"; -close ($fhOut) or die "Cannot close output file"; diff -r c2a356708570 -r 33c067c3ae34 tools/filters/condense_characters.xml --- a/tools/filters/condense_characters.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ - - consecutive characters - condense_characters.pl $input $character $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool condenses all consecutive characters of a specified type. - ------ - -**Example** - -- Input file:: - - geneX,,,10,,,,,20 - geneY,,5,,,,,12,15,9, - -- Condense all consecutive commas. The above file will be converted into:: - - geneX,10,20 - geneY,5,12,15,9 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/convert_characters.pl --- a/tools/filters/convert_characters.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ -#! 
/usr/bin/perl -w - -use strict; -use warnings; - -# converts all characters of one type into another -# convert_characters.pl [input] [convert_from] [convert_to] [output] - -die "Check argument\n" unless @ARGV == 4; - -my $inputfile = $ARGV[0]; -my $convert_from = $ARGV[1]; -my $convert_to = $ARGV[2]; -my $outputfile = $ARGV[3]; - -if ($convert_from eq "s") -{ - $convert_from = '\s'; -} -elsif ($convert_from eq "T") -{ - $convert_from = '\t'; -} -elsif ($convert_from eq "Sp") -{ - $convert_from = '\s'; -} -elsif ($convert_from eq "Dt") -{ - $convert_from = '\.'; -} -elsif ($convert_from eq "C") -{ - $convert_from = ","; -} -elsif ($convert_from eq "D") -{ - $convert_from = "-"; -} -elsif ($convert_from eq "U") -{ - $convert_from = "_"; -} -elsif ($convert_from eq "P") -{ - $convert_from = '\|'; -} -else -{ - die "Invalid value specified for convert from\n"; -} - - -if ($convert_to eq "T") -{ - $convert_to = "\t"; -} -elsif ($convert_to eq "Sp") -{ - $convert_to = '\s'; -} -elsif ($convert_to eq "Dt") -{ - $convert_to = "\."; -} -elsif ($convert_to eq "C") -{ - $convert_to = ","; -} -elsif ($convert_to eq "D") -{ - $convert_to = "-"; -} -elsif ($convert_to eq "U") -{ - $convert_to = "_"; -} -elsif ($convert_to eq "P") -{ - $convert_to = "|"; -} -else -{ - die "Invalid value specified for convert to\n"; -} - -my $fhIn; -open ($fhIn, "< $inputfile") or die "Cannot open source file"; - -my $fhOut; -open ($fhOut, "> $outputfile"); - -while (<$fhIn>) -{ - my $thisLine = $_; - chomp $thisLine; - $thisLine =~ s/$convert_from{1,}/$convert_to/g; - print $fhOut $thisLine,"\n"; -} -close ($fhIn) or die "Cannot close source file\n"; -close ($fhOut) or die "Cannot close output fil\n"; diff -r c2a356708570 -r 33c067c3ae34 tools/filters/convert_characters.py --- a/tools/filters/convert_characters.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -#!/usr/bin/env python -#By, Guruprasad Ananda. - -from galaxy import eggs -import sys, re - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - if len(sys.argv) != 4: - stop_err("usage: convert_characters infile from_char outfile") - - try: - fin = open(sys.argv[1],'r') - except: - stop_err("Input file cannot be opened for reading.") - - from_char = sys.argv[2] - - try: - fout = open(sys.argv[3],'w') - except: - stop_err("Output file cannot be opened for writing.") - - char_dict = {'T':'\t','s':'\s','Dt':'\.','C':',','D':'-','U':'_','P':'\|','Co':':'} - from_ch = char_dict[from_char] + '+' #making an RE to match 1 or more occurences. - skipped = 0 - - for line in fin: - line = line.strip() - try: - fout.write("%s\n" %(re.sub(from_ch,'\t',line))) - except: - skipped += 1 - - if skipped: - print "Skipped %d lines as invalid." %skipped - -if __name__ == "__main__": - main() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/convert_characters.xml --- a/tools/filters/convert_characters.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - - delimiters to TAB - convert_characters.py $input $convert_from $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Converts all delimiters of a specified type into TABs. Consecutive characters are condensed. For example, if columns are separated by 5 spaces they will converted into 1 tab. 
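As convert_characters.py above shows, the core of the conversion is a single regular-expression substitution; a condensed sketch using the same delimiter codes::

    import re

    # Delimiter codes as used by the tool (see char_dict above)
    CHAR_PATTERNS = {'T': '\\t', 's': '\\s', 'Dt': '\\.', 'C': ',',
                     'D': '-', 'U': '_', 'P': '\\|', 'Co': ':'}

    def convert_delimiters(line, from_char):
        # One or more consecutive delimiters collapse into a single TAB
        return re.sub(CHAR_PATTERNS[from_char] + '+', '\t', line.strip())

    # convert_delimiters("chrX|151283558|151283724", "P")
    #   -> "chrX\t151283558\t151283724"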
- ------ - -**Example** - -- Input file:: - - chrX||151283558|151283724|NM_000808_exon_8_0_chrX_151283559_r|0|- - chrX|151370273|151370486|NM_000808_exon_9_0_chrX_151370274_r|0|- - chrX|151559494|151559583|NM_018558_exon_1_0_chrX_151559495_f|0|+ - chrX|151564643|151564711|NM_018558_exon_2_0_chrX_151564644_f||||0|+ - -- Converting all pipe delimiters of the above file to TABs will get:: - - chrX 151283558 151283724 NM_000808_exon_8_0_chrX_151283559_r 0 - - chrX 151370273 151370486 NM_000808_exon_9_0_chrX_151370274_r 0 - - chrX 151559494 151559583 NM_018558_exon_1_0_chrX_151559495_f 0 + - chrX 151564643 151564711 NM_018558_exon_2_0_chrX_151564644_f 0 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/cutWrapper.pl --- a/tools/filters/cutWrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use warnings; - -my @columns = (); -my $del = ""; -my @in = (); -my @out = (); -my $command = ""; -my $field = 0; - -# a wrapper for cut for use in galaxy -# cutWrapper.pl [filename] [columns] [delim] [output] - -die "Check arguments\n" unless @ARGV == 4; - -$ARGV[1] =~ s/\s+//g; -foreach ( split /,/, $ARGV[1] ) { - if (m/^c\d{1,}$/i) { - push (@columns, $_); - $columns[@columns-1] =~s/c//ig; - } -} - -die "No columns specified, columns are not preceded with 'c', or commas are not used to separate column numbers: $ARGV[1]\n" if @columns == 0; - -my $column_delimiters_href = { - 'T' => q{\t}, - 'C' => ",", - 'D' => "-", - 'U' => "_", - 'P' => q{\|}, - 'Dt' => q{\.}, - 'Sp' => q{\s+} -}; - -$del = $column_delimiters_href->{$ARGV[2]}; - -open (OUT, ">$ARGV[3]") or die "Cannot create $ARGV[2]:$!\n"; -open (IN, "<$ARGV[0]") or die "Cannot open $ARGV[0]:$!\n"; - -while (my $line=) { - if ($line =~ /^#/) { - #Ignore comment lines - } else { - chop($line); - @in = split(/$del/, $line); - foreach $field (@columns) { - if (defined($in[$field-1])) { - push(@out, $in[$field-1]); - } else { - push(@out, "."); - } - } - print OUT join("\t",@out), "\n"; - @out = (); - } -} - -#while () { -# chop; -# @in = split /$del/; -# foreach $field (@columns) { -# if (defined($in[$field-1])) { -# push(@out, $in[$field-1]); -# } else { -# push(@out, "."); -# } -# } -# print OUT join("\t",@out), "\n"; -# @out = (); -#} -close IN; - -close OUT; - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/cutWrapper.xml --- a/tools/filters/cutWrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,202 +0,0 @@ - - columns from a table - cutWrapper.pl $input "$columnList" $delimiter $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**WARNING: This tool breaks column assignments.** To re-establish column assignments run the tools and click on the pencil icon in the latest history item. - -.. class:: infomark - -The output of this tool is always in tabular format (e.g., if your original delimiters are commas, they will be replaced with tabs). For example: - - Cutting columns 1 and 3 from:: - - apple,is,good - windows,is,bad - - will give:: - - apple good - windows bad - ------ - -**What it does** - -This tool selects (cuts out) specified columns from the dataset. - -- Columns are specified as **c1**, **c2**, and so on. 
Column count begins with **1** -- Columns can be specified in any order (e.g., **c2,c1,c6**) -- If you specify more columns than actually present - empty spaces will be filled with dots - ------ - -**Example** - -Input dataset (six columns: c1, c2, c3, c4, c5, and c6):: - - chr1 10 1000 gene1 0 + - chr2 100 1500 gene2 0 + - -**cut** on columns "**c1,c4,c6**" will return:: - - chr1 gene1 + - chr2 gene2 + - -**cut** on columns "**c6,c5,c4,c1**" will return:: - - + 0 gene1 chr1 - + 0 gene2 chr2 - - -**cut** on columns "**c8,c7,c4**" will return:: - - . . gene1 - . . gene2 - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/fileGrep.xml --- a/tools/filters/fileGrep.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - a column from one Query against another Query - cut -f $col $input1 | grep -f - $match $input2 > $out_file1 - - - - - - - - - - - - - -This tool is based on UNIX command grep with option -f. It matches content of one query against another. For example, assume you have two queries - one that contains EST accession numbers and some other information:: - - AA001229 12 12 - A001501 7 7 - AA001641 6 6 - AA001842 6 6 - AA002047 6 6 - AA004638 3 3 - -and another that is a typical BED file describing genomic location of some ESTs:: - - chr7 115443235 115443809 CA947954_exon_0_0_chr7_115443236_f 0 + - chr7 115443236 115443347 DB338189_exon_0_0_chr7_115443237_f 0 + - chr7 115443347 115443768 DB338189_exon_1_0_chr7_115443348_f 0 + - chr7 115443239 115443802 AA001842_exon_0_0_chr7_115443240_f 0 + - chr7 115443243 115443347 DB331869_exon_0_0_chr7_115443244_f 0 + - chr7 115443347 115443373 DB331869_exon_1_0_chr7_115443348_f 0 + - -Using this tool you will be able to tell how many ESTs in Query1 are also preset in Query2 and will output this:: - - chr7 115443239 115443802 AA001842_exon_0_0_chr7_115443240_f 0 - -if **Match** option is chosen. - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/fixedValueColumn.pl --- a/tools/filters/fixedValueColumn.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,34 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; - -# fixedValueColumn.pl $input $out_file1 "expression" "iterate [yes|no]" - -my ($input, $out_file1, $expression, $iterate) = @ARGV; -my $i = 0; -my $numeric = 0; - -die "Check arguments\n" unless @ARGV == 4; - -open (DATA, "<$input") or die "Cannot open $input:$!\n"; -open (OUT, ">$out_file1") or die "Cannot create $out_file1:$!\n"; - -if ($expression =~ m/^\d+$/) { - $numeric = 1; - $i = $expression; -} - -while () { - chop; - if ($iterate eq "no") { - print OUT "$_\t$expression\n"; - } else { - print OUT "$_\t$i\n" if $numeric == 1; - print OUT "$_\t$expression-$i\n" if $numeric == 0; - ++$i; - } -} - -close DATA; -close OUT; diff -r c2a356708570 -r 33c067c3ae34 tools/filters/fixedValueColumn.xml --- a/tools/filters/fixedValueColumn.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ - - to an existing dataset - fixedValueColumn.pl $input $out_file1 "$exp" $iterate - - - - - - - - - - - - - - - - - - - - - -.. 
class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**What it does** - -You can enter any value and it will be added as a new column to your dataset - ------ - -**Example** - -If you original data looks like this:: - - chr1 10 100 geneA - chr2 200 300 geneB - chr2 400 500 geneC - -Typing **+** in the text box will generate:: - - chr1 10 100 geneA + - chr2 200 300 geneB + - chr2 400 500 geneC + - - -You can also add line numbers by selecting **Iterate: YES**. In this case if you enter **1** in the text box you will get:: - - chr1 10 100 geneA 1 - chr2 200 300 geneB 2 - chr2 400 500 geneC 3 - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/extract_GFF_Features.py --- a/tools/filters/gff/extract_GFF_Features.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda -""" -Extract features from GFF file. - -usage: %prog input1 out_file1 column features -""" - -import sys, os - -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - # Parsing Command Line here - options, args = doc_optparse.parse( __doc__ ) - - try: - inp_file, out_file, column, features = args - except: - stop_err( "One or more arguments is missing or invalid.\nUsage: prog input output column features" ) - try: - column = int( column ) - except: - stop_err( "Column %s is an invalid column." % column ) - - if features == None: - stop_err( "Column %d has no features to display, select another column." %( column + 1 ) ) - - fo=open( out_file, 'w' ) - for i, line in enumerate( file( inp_file ) ): - line = line.rstrip( '\r\n' ) - if line and line.startswith( '#' ): - # Keep valid comment lines in the output - fo.write( "%s\n" % line ) - else: - try: - if line.split( '\t' )[column] in features.split( ',' ): - fo.write( "%s\n" % line ) - except: - pass - fo.close() - - print 'Column %d features: %s' %( column + 1, features ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/extract_GFF_Features.xml --- a/tools/filters/gff/extract_GFF_Features.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,114 +0,0 @@ - - from GFF data - extract_GFF_Features.py $input1 $out_file1 ${column_choice.col} ${column_choice.feature} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool extracts selected features from GFF data. - ------ - -**Example** - -Selecting **promoter** from the following GFF data:: - - chr22 GeneA enhancer 10000000 10001000 500 + . TGA - chr22 GeneA promoter 10010000 10010100 900 + . TGA - chr22 GeneB promoter 10020000 10025000 400 - . TGB - chr22 GeneB CCDS2220 10030000 10065000 800 - . TGB - -will produce the following output:: - - chr22 GeneA promoter 10010000 10010100 900 + . TGA - chr22 GeneB promoter 10020000 10025000 400 - . TGB - ----- - -.. class:: infomark - -**About formats** - -**GFF format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields:: - - 1. seqname - Must be a chromosome or scaffold. - 2. source - The program that generated this feature. - 3. 
feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. end - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. group - All lines with the same group are linked together into a single item. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/gff_filter_by_attribute.py --- a/tools/filters/gff/gff_filter_by_attribute.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,163 +0,0 @@ -#!/usr/bin/env python -# This tool takes a gff file as input and creates filters on attributes based on certain properties. -# The tool will skip over invalid lines within the file, informing the user about the number of lines skipped. -# TODO: much of this code is copied from the Filter1 tool (filtering.py in tools/stats/). The commonalities should be -# abstracted and leveraged in each filtering tool. - -from __future__ import division -import sys -from galaxy import eggs -from galaxy.util.json import to_json_string, from_json_string - -# Older py compatibility -try: - set() -except: - from sets import Set as set - -assert sys.version_info[:2] >= ( 2, 4 ) - -# -# Helper functions. -# - -def get_operands( filter_condition ): - # Note that the order of all_operators is important - items_to_strip = ['+', '-', '**', '*', '//', '/', '%', '<<', '>>', '&', '|', '^', '~', '<=', '<', '>=', '>', '==', '!=', '<>', ' and ', ' or ', ' not ', ' is ', ' is not ', ' in ', ' not in '] - for item in items_to_strip: - if filter_condition.find( item ) >= 0: - filter_condition = filter_condition.replace( item, ' ' ) - operands = set( filter_condition.split( ' ' ) ) - return operands - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def check_for_executable( text, description='' ): - # Attempt to determine if the condition includes executable stuff and, if so, exit. - secured = dir() - operands = get_operands( text ) - for operand in operands: - try: - check = int( operand ) - except: - if operand in secured: - stop_err( "Illegal value '%s' in %s '%s'" % ( operand, description, text ) ) - -# -# Process inputs. -# - -in_fname = sys.argv[1] -out_fname = sys.argv[2] -cond_text = sys.argv[3] -attribute_types = from_json_string( sys.argv[4] ) - -# Convert types from str to type objects. -for name, a_type in attribute_types.items(): - check_for_executable(a_type) - attribute_types[ name ] = eval( a_type ) - -# Unescape if input has been escaped -mapped_str = { - '__lt__': '<', - '__le__': '<=', - '__eq__': '==', - '__ne__': '!=', - '__gt__': '>', - '__ge__': '>=', - '__sq__': '\'', - '__dq__': '"', -} -for key, value in mapped_str.items(): - cond_text = cond_text.replace( key, value ) - -# Attempt to determine if the condition includes executable stuff and, if so, exit. -check_for_executable( cond_text, 'condition') - -# Prepare the column variable names and wrappers for column data types. Only -# prepare columns up to largest column in condition. 
-attrs, type_casts = [], [] -for name, attr_type in attribute_types.items(): - attrs.append( name ) - type_cast = "get_value('%(name)s', attribute_types['%(name)s'], attribute_values)" % ( {'name': name} ) - type_casts.append( type_cast ) - -attr_str = ', '.join( attrs ) # 'c1, c2, c3, c4' -type_cast_str = ', '.join( type_casts ) # 'str(c1), int(c2), int(c3), str(c4)' -wrap = "%s = %s" % ( attr_str, type_cast_str ) - -# Stats -skipped_lines = 0 -first_invalid_line = 0 -invalid_line = None -lines_kept = 0 -total_lines = 0 -out = open( out_fname, 'wt' ) - -# Helper function to safely get and type cast a value in a dict. -def get_value(name, a_type, values_dict): - if name in values_dict: - return (a_type)(values_dict[ name ]) - else: - return None - -# Read and filter input file, skipping invalid lines -code = ''' -for i, line in enumerate( file( in_fname ) ): - total_lines += 1 - line = line.rstrip( '\\r\\n' ) - if not line or line.startswith( '#' ): - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - continue - try: - # Place attribute values into variables with attribute - # name; type casting is done as well. - elems = line.split( '\t' ) - attribute_values = {} - for name_value_pair in elems[8].split(";"): - pair = name_value_pair.strip().split(" ") - if pair == '': - continue - name = pair[0].strip() - if name == '': - continue - # Need to strip double quote from value and typecast. - attribute_values[name] = pair[1].strip(" \\"") - %s - if %s: - lines_kept += 1 - print >> out, line - except Exception, e: - print e - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line -''' % ( wrap, cond_text ) - -valid_filter = True -try: - exec code -except Exception, e: - out.close() - if str( e ).startswith( 'invalid syntax' ): - valid_filter = False - stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % cond_text ) - else: - stop_err( str( e ) ) - -if valid_filter: - out.close() - valid_lines = total_lines - skipped_lines - print 'Filtering with %s, ' % ( cond_text ) - if valid_lines > 0: - print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines ) - else: - print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text - if skipped_lines > 0: - print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/gff_filter_by_attribute.xml --- a/tools/filters/gff/gff_filter_by_attribute.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ - - using simple expressions - - gff_filter_by_attribute.py $input $out_file1 "$cond" '${input.metadata.attribute_types}' - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -Double equal signs, ==, must be used as *"equal to"* (e.g., **c1 == 'chr22'**) - -.. class:: infomark - -**TIP:** Attempting to apply a filtering condition may throw exceptions if the data type (e.g., string, integer) in every line of the attribute being filtered is not appropriate for the condition (e.g., attempting certain numerical calculations on strings). If an exception is thrown when applying the condition to a line, that line is skipped as invalid for the filter condition. The number of invalid skipped lines is documented in the resulting history item as a "Condition/data issue". - -.. 
class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -The filter tool allows you to restrict the dataset using simple conditional statements. - -- Make sure that multi-character operators contain no white space ( e.g., **<=** is valid while **< =** is not valid ) -- When using 'equal-to' operator **double equal sign '==' must be used** ( e.g., **attribute_name=='chr1'** ) -- Non-numerical values must be included in single or double quotes ( e.g., **attribute_name=='XX22'** ) - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/gff_filter_by_feature_count.py --- a/tools/filters/gff/gff_filter_by_feature_count.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,88 +0,0 @@ -#!/usr/bin/env python -""" -Filter a gff file using a criterion based on feature counts for a transcript. - -Usage: -%prog input_name output_name feature_name condition -""" -import sys -from galaxy import eggs -from galaxy.datatypes.util.gff_util import GFFReaderWrapper -from bx.intervals.io import GenomicInterval - -# Valid operators, ordered so that complex operators (e.g. '>=') are -# recognized before simple operators (e.g. '>') -ops = [ - '>=', - '<=', - '<', - '>', - '==', - '!=' -] - -# Escape sequences for valid operators. -mapped_ops = { - '__ge__': ops[0], - '__le__': ops[1], - '__lt__': ops[2], - '__gt__': ops[3], - '__eq__': ops[4], - '__ne__': ops[5], -} - - -def __main__(): - # Get args. - input_name = sys.argv[1] - output_name = sys.argv[2] - feature_name = sys.argv[3] - condition = sys.argv[4] - - # Unescape operations in condition str. - for key, value in mapped_ops.items(): - condition = condition.replace( key, value ) - - # Error checking: condition should be of the form - for op in ops: - if op in condition: - empty, number_str = condition.split( op ) - try: - number = float( number_str ) - except: - number = None - if empty != "" or not number: - print >> sys.stderr, "Invalid condition: %s, cannot filter." % condition - return - break - - # Do filtering. - kept_features = 0 - skipped_lines = 0 - first_skipped_line = 0 - out = open( output_name, 'w' ) - for i, feature in enumerate( GFFReaderWrapper( open( input_name ) ) ): - if not isinstance( feature, GenomicInterval ): - continue - count = 0 - for interval in feature.intervals: - if interval.feature == feature_name: - count += 1 - if eval( '%s %s' % ( count, condition ) ): - # Keep feature. - for interval in feature.intervals: - out.write( "\t".join(interval.fields) + '\n' ) - kept_features += 1 - - # Needed because i is 0-based but want to display stats using 1-based. - i += 1 - - # Clean up. - out.close() - info_msg = "%i of %i features kept (%.2f%%) using condition %s. " % \ - ( kept_features, i, float(kept_features)/i * 100.0, feature_name + condition ) - if skipped_lines > 0: - info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line ) - print info_msg - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/gff_filter_by_feature_count.xml --- a/tools/filters/gff/gff_filter_by_feature_count.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ - - using simple expressions - - gff_filter_by_feature_count.py $input_file1 $out_file1 "$feature_name" "$cond" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. 
class:: infomark - -Valid comparison operators are: > < >=, <=, !=, and == - ------ - -**Syntax** - -The filter tool allows you to restrict the dataset based on transcripts' feature counts. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/gtf_filter_by_attribute_values_list.py --- a/tools/filters/gff/gtf_filter_by_attribute_values_list.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,67 +0,0 @@ -# -# Filters a GFF file using a list of attribute values. Attribute values must -# be in the first column of the file; subsequent columns are ignored. -# Usage: -# python gff_filter_by_attribute_values.py -# - -import sys - -def parse_gff_attributes( attr_str ): - """ - Parses a GFF/GTF attribute string and returns a dictionary of name-value - pairs. The general format for a GFF3 attributes string is - name1=value1;name2=value2 - The general format for a GTF attribute string is - name1 "value1" ; name2 "value2" - The general format for a GFF attribute string is a single string that - denotes the interval's group; in this case, method returns a dictionary - with a single key-value pair, and key name is 'group' - """ - attributes_list = attr_str.split(";") - attributes = {} - for name_value_pair in attributes_list: - # Try splitting by space and, if necessary, by '=' sign. - pair = name_value_pair.strip().split(" ") - if len( pair ) == 1: - pair = name_value_pair.strip().split("=") - if len( pair ) == 1: - # Could not split for some reason -- raise exception? - continue - if pair == '': - continue - name = pair[0].strip() - if name == '': - continue - # Need to strip double quote from values - value = pair[1].strip(" \"") - attributes[ name ] = value - - if len( attributes ) == 0: - # Could not split attributes string, so entire string must be - # 'group' attribute. This is the case for strictly GFF files. - attributes['group'] = attr_str - return attributes - -def filter( gff_file, attribute_name, ids_file, output_file ): - # Put ids in dict for quick lookup. - ids_dict = {} - for line in open( ids_file ): - ids_dict[ line.split('\t')[0].strip() ] = True - - # Filter GFF file using ids. - output = open( output_file, 'w' ) - for line in open( gff_file ): - fields = line.split( '\t' ) - attributes = parse_gff_attributes( fields[8] ) - if ( attribute_name in attributes ) and ( attributes[ attribute_name ] in ids_dict ): - output.write( line ) - output.close() - -if __name__ == "__main__": - # Handle args. - if len( sys.argv ) != 5: - print >> sys.stderr, "usage: python %s " % sys.argv[0] - sys.exit( -1 ) - gff_file, attribute_name, ids_file, output_file = sys.argv[1:] - filter( gff_file, attribute_name, ids_file, output_file ) diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/gtf_filter_by_attribute_values_list.xml --- a/tools/filters/gff/gtf_filter_by_attribute_values_list.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - - - gtf_filter_by_attribute_values_list.py $input $attribute_name $ids $output - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool filters a GTF file using a list of attribute values. The attribute values are -taken from the first column in the file; additional columns in the file are ignored. An example -use of this tool is to filter a GTF file using a list of transcript_ids or gene_ids obtained from Cuffdiff. 
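-
-The core of the filter is small enough to sketch inline. This minimal
-version reuses the parse_gff_attributes helper defined above; the file
-names and the choice of transcript_id as the attribute stand in for the
-tool's parameters::
-
-    ids = set(line.split('\t')[0].strip() for line in open('ids.txt'))
-    with open('filtered.gtf', 'w') as out:
-        for line in open('input.gtf'):
-            fields = line.split('\t')
-            if len(fields) > 8:
-                attrs = parse_gff_attributes(fields[8])
-                if attrs.get('transcript_id') in ids:
-                    out.write(line)
-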
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff2bed.xml --- a/tools/filters/gff2bed.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ - - converter - gff_to_bed_converter.py $input $out_file1 - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts data from GFF format to BED format (scroll down for format description). - --------- - -**Example** - -The following data in GFF format:: - - chr22 GeneA enhancer 10000000 10001000 500 + . TGA - chr22 GeneA promoter 10010000 10010100 900 + . TGA - -Will be converted to BED (**note** that 1 is subtracted from the start coordinate):: - - chr22 9999999 10001000 enhancer 0 + - chr22 10009999 10010100 promoter 0 + - ------- - -.. class:: infomark - -**About formats** - -**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones: - -The first three BED fields (required) are:: - - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) - 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). - -The additional BED fields (optional) are:: - - 4. name - The name of the BED line. - 5. score - A score between 0 and 1000. - 6. strand - Defines the strand - either '+' or '-'. - 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. - 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. - 9. reserved - This should always be set to zero. - 10. blockCount - The number of blocks (exons) in the BED line. - 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. - 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. - 13. expCount - The number of experiments. - 14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount. - 15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount. - -**GFF format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields:: - - 1. seqname - Must be a chromosome or scaffold. - 2. source - The program that generated this feature. - 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. end - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. group - All lines with the same group are linked together into a single item. 
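-
-The key arithmetic in the conversion is the coordinate change: GFF
-coordinates are 1-based with inclusive ends, while BED is 0-based and
-half-open, so only the start moves. As a one-function sketch::
-
-    def gff_to_bed_coords(gff_start, gff_end):
-        # GFF: 1-based, end inclusive; BED: 0-based, end exclusive.
-        # 10000000..10001000 in GFF becomes 9999999..10001000 in BED.
-        return gff_start - 1, gff_end
-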
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff_to_bed_converter.py --- a/tools/filters/gff_to_bed_converter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,133 +0,0 @@ -#!/usr/bin/env python -import sys -from galaxy import eggs -from galaxy.datatypes.util.gff_util import parse_gff_attributes - -assert sys.version_info[:2] >= ( 2, 4 ) - -def get_bed_line( chrom, name, strand, blocks ): - """ Returns a BED line for given data. """ - - - if len( blocks ) == 1: - # Use simple BED format if there is only a single block: - # chrom, chromStart, chromEnd, name, score, strand - # - start, end = blocks[0] - return "%s\t%i\t%i\t%s\t0\t%s\n" % ( chrom, start, end, name, strand ) - - # - # Build lists for transcript blocks' starts, sizes. - # - - # Get transcript start, end. - t_start = sys.maxint - t_end = -1 - for block_start, block_end in blocks: - if block_start < t_start: - t_start = block_start - if block_end > t_end: - t_end = block_end - - # Get block starts, sizes. - block_starts = [] - block_sizes = [] - for block_start, block_end in blocks: - block_starts.append( str( block_start - t_start ) ) - block_sizes.append( str( block_end - block_start ) ) - - # - # Create BED entry. - # Bed format: chrom, chromStart, chromEnd, name, score, strand, \ - # thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts - # - # Render complete feature with thick blocks. There's no clear way to do this unless - # we analyze the block names, but making everything thick makes more sense than - # making everything thin. - # - return "%s\t%i\t%i\t%s\t0\t%s\t%i\t%i\t0\t%i\t%s\t%s\n" % \ - ( chrom, t_start, t_end, name, strand, t_start, t_end, len( block_starts ), - ",".join( block_sizes ), ",".join( block_starts ) ) - -def __main__(): - input_name = sys.argv[1] - output_name = sys.argv[2] - skipped_lines = 0 - first_skipped_line = 0 - out = open( output_name, 'w' ) - i = 0 - cur_transcript_chrom = None - cur_transcript_id = None - cur_transcript_strand = None - cur_transcripts_blocks = [] # (start, end) for each block. - for i, line in enumerate( file( input_name ) ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - try: - # GFF format: chrom source, name, chromStart, chromEnd, score, strand, attributes - elems = line.split( '\t' ) - start = str( long( elems[3] ) - 1 ) - coords = [ long( start ), long( elems[4] ) ] - strand = elems[6] - if strand not in ['+', '-']: - strand = '+' - attributes = parse_gff_attributes( elems[8] ) - t_id = attributes.get( "transcript_id", None ) - - if not t_id: - # - # No transcript ID, so write last transcript and write current line as its own line. - # - - # Write previous transcript. - if cur_transcript_id: - # Write BED entry. - out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) ) - - # Replace any spaces in the name with underscores so UCSC will not complain. - name = elems[2].replace(" ", "_") - out.write( get_bed_line( elems[0], name, strand, [ coords ] ) ) - continue - - # There is a transcript ID, so process line at transcript level. - if t_id == cur_transcript_id: - # Line is element of transcript and will be a block in the BED entry. - cur_transcripts_blocks.append( coords ) - continue - - # - # Line is part of new transcript; write previous transcript and start - # new transcript. - # - - # Write previous transcript. - if cur_transcript_id: - # Write BED entry. 
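-                    # A transcript is flushed as a single multi-block BED
-                    # line the first time a line with a different
-                    # transcript_id (or none at all) is seen; its blocks
-                    # were accumulated in cur_transcripts_blocks.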
- out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) ) - - # Start new transcript. - cur_transcript_chrome = elems[0] - cur_transcript_id = t_id - cur_transcript_strand = strand - cur_transcripts_blocks = [] - cur_transcripts_blocks.append( coords ) - except: - skipped_lines += 1 - if not first_skipped_line: - first_skipped_line = i + 1 - else: - skipped_lines += 1 - if not first_skipped_line: - first_skipped_line = i + 1 - - # Write last transcript. - if cur_transcript_id: - # Write BED entry. - out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) ) - out.close() - info_msg = "%i lines converted to BED. " % ( i + 1 - skipped_lines ) - if skipped_lines > 0: - info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line ) - print info_msg - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/grep.py --- a/tools/filters/grep.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ -# Filename: grep.py -# Author: Ian N. Schenck -# Version: 8/23/2005 -# -# This script accepts regular expressions, as well as an "invert" -# option, and applies the regular expression using grep. This wrapper -# provides security and pipeline. -# -# Grep is launched based on these inputs: -# -i Input file -# -o Output file -# -pattern RegEx pattern -# -v true or false (output NON-matching lines) - -import sys -import os -import re -import string -import commands -from tempfile import NamedTemporaryFile - -# This function is exceedingly useful, perhaps package for reuse? -def getopts(argv): - opts = {} - while argv: - if argv[0][0] == '-': - opts[argv[0]] = argv[1] - argv = argv[2:] - else: - argv = argv[1:] - return opts - -def main(): - args = sys.argv[1:] - - try: - opts = getopts(args) - except IndexError: - print "Usage:" - print " -i Input file" - print " -o Output file" - print " -pattern RegEx pattern" - print " -v true or false (Invert match)" - return 0 - - outputfile = opts.get("-o") - if outputfile == None: - print "No output file specified." - return -1 - - inputfile = opts.get("-i") - if inputfile == None: - print "No input file specified." - return -2 - - invert = opts.get("-v") - if invert == None: - print "Match style (Invert or normal) not specified." - return -3 - - pattern = opts.get("-pattern") - if pattern == None: - print "RegEx pattern not specified." - return -4 - - # All inputs have been specified at this point, now validate. - - # replace if input has been escaped, remove sq - # characters that are allowed but need to be escaped - mapped_chars = { '>' :'__gt__', - '<' :'__lt__', - '\'' :'__sq__', - '"' :'__dq__', - '[' :'__ob__', - ']' :'__cb__', - '{' :'__oc__', - '}' :'__cc__' - } - - #with new sanitizing we only need to replace for single quote, but this needs to remain for backwards compatibility - for key, value in mapped_chars.items(): - pattern = pattern.replace(value, key) - - fileRegEx = re.compile("^[A-Za-z0-9./\-_]+$") #why? - invertRegEx = re.compile("(true)|(false)") #why? - - if not fileRegEx.match(outputfile): - print "Illegal output filename." - return -5 - if not fileRegEx.match(inputfile): - print "Illegal input filename." - return -6 - if not invertRegEx.match(invert): - print "Illegal invert option." - return -7 - - # invert grep search? 
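-    # The user-supplied pattern never reaches the shell: it is written to a
-    # temporary file below and handed to grep via -f, so only the fixed
-    # command-line skeleton is interpolated.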
- if invert == "true": - invertflag = " -v" - print "Not matching pattern: %s" % pattern - else: - invertflag = "" - print "Matching pattern: %s" % pattern - - #Create temp file holding pattern - #By using a file to hold the pattern, we don't have worry about sanitizing grep commandline and can include single quotes in pattern - pattern_file_name = NamedTemporaryFile().name - open( pattern_file_name, 'w' ).write( pattern ) - - #generate grep command - commandline = "grep -E %s -f %s %s > %s" % ( invertflag, pattern_file_name, inputfile, outputfile ) - - #run grep - errorcode, stdout = commands.getstatusoutput(commandline) - - #remove temp pattern file - os.unlink( pattern_file_name ) - - #return error code - return errorcode - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/grep.xml --- a/tools/filters/grep.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ - - lines that match an expression - grep.py -i $input -o $out_file1 -pattern '$pattern' -v $invert - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -The select tool searches the data for lines containing or not containing a match to the given pattern. Regular Expression is introduced in this tool. A Regular Expression is a pattern describing a certain amount of text. - -- **( ) { } [ ] . * ? + \ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for. -- **\\A** matches the beginning of a string(but not an internal line). -- **\\d** matches a digit, same as [0-9]. -- **\\D** matches a non-digit. -- **\\s** matches a whitespace character. -- **\\S** matches anything BUT a whitespace. -- **\\t** matches a tab. -- **\\w** matches an alphanumeric character. -- **\\W** matches anything but an alphanumeric character. -- **(** .. **)** groups a particular pattern. -- **\\Z** matches the end of a string(but not a internal line). -- **{** n or n, or n,m **}** specifies an expected number of repetitions of the preceding pattern. - - - **{n}** The preceding item is matched exactly n times. - - **{n,}** The preceding item is matched n or more times. - - **{n,m}** The preceding item is matched at least n times but not more than m times. - -- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**. -- **.** Matches any single character except a newline. -- ***** The preceding item will be matched zero or more times. -- **?** The preceding item is optional and matched at most once. -- **+** The preceding item will be matched one or more times. -- **^** has two meaning: - - matches the beginning of a line or string. - - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets. -- **$** matches the end of a line or string. -- **\|** Separates alternate possibilities. - ------ - -**Example** - -- **^chr([0-9A-Za-z])+** would match lines that begin with chromosomes, such as lines in a BED format file. -- **(ACGT){1,5}** would match at least 1 "ACGT" and at most 5 "ACGT" consecutively. -- **([^,][0-9]{1,3})(,[0-9]{3})\*** would match a large integer that is properly separated with commas such as 23,078,651. -- **(abc)|(def)** would match either "abc" or "def". 
-- **^\\W+#** would match any line that is a comment. - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gtf2bedgraph.xml --- a/tools/filters/gtf2bedgraph.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ - - converter - gtf_to_bedgraph_converter.py $input $out_file1 $attribute_name - - - - - - - - - - - - - - - - -**What it does** - -This tool converts data from GTF format to BEDGraph format (scroll down for format description). - --------- - -**Example** - -The following data in GFF format:: - - chr22 GeneA enhancer 10000000 10001000 500 + . gene_id "GeneA"; transcript_id "TranscriptAlpha"; FPKM "2.75"; frac "1.000000"; - chr22 GeneA promoter 10010000 10010100 900 + . gene_id "GeneA"; transcript_id "TranscriptsAlpha"; FPKM "2.25"; frac "1.000000"; - -using the attribute name 'FPKM' will be converted to BEDGraph (**note** that 1 is subtracted from the start coordinate):: - - - chr22 9999999 10001000 2.75 - chr22 10009999 10010100 2.25 - ------- - -.. class:: infomark - -**About formats** - -**GTF format** Gene Transfer Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GTF lines have nine tab-separated fields:: - - 1. seqname - Must be a chromosome or scaffold. - 2. source - The program that generated this feature. - 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. end - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. group - The group field is a list of attributes. Each attribute consists of a type/value pair. Attributes must end in a semi-colon, and be separated from any following attribute by exactly one space. The attribute list must begin with the two mandatory attributes: (i) gene_id value - A globally unique identifier for the genomic source of the sequence and (ii) transcript_id value - A globally unique identifier for the predicted transcript. - -**BEDGraph format** - -The bedGraph format is line-oriented. Bedgraph data are preceeded by a track definition line, which adds a number of options for controlling the default display of this track. 
- -For the track definition line, all options are placed in a single line separated by spaces: - track type=bedGraph name=track_label description=center_label - visibility=display_mode color=r,g,b altColor=r,g,b - priority=priority autoScale=on|off alwaysZero=on|off - gridDefault=on|off maxHeightPixels=max:default:min - graphType=bar|points viewLimits=lower:upper - yLineMark=real-value yLineOnOff=on|off - windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16 - -The track type is REQUIRED, and must be bedGraph: - type=bedGraph - -Following the track definition line are the track data in four column BED format:: - - chromA chromStartA chromEndA dataValueA - chromB chromStartB chromEndB dataValueB - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gtf_to_bedgraph_converter.py --- a/tools/filters/gtf_to_bedgraph_converter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ -#!/usr/bin/env python -import os, sys, tempfile - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - # Read parms. - input_name = sys.argv[1] - output_name = sys.argv[2] - attribute_name = sys.argv[3] - - # Create temp files. - tmp_name1 = tempfile.NamedTemporaryFile().name - tmp_name2 = tempfile.NamedTemporaryFile().name - - # Do conversion. - skipped_lines = 0 - first_skipped_line = 0 - out = open( tmp_name1, 'w' ) - - # Write track data to temporary file. - i = 0 - for i, line in enumerate( file( input_name ) ): - line = line.rstrip( '\r\n' ) - - if line and not line.startswith( '#' ): - try: - elems = line.split( '\t' ) - start = str( int( elems[3] ) - 1 ) # GTF coordinates are 1-based, BedGraph are 0-based. - strand = elems[6] - if strand not in ['+', '-']: - strand = '+' - attributes_list = elems[8].split(";") - attributes = {} - for name_value_pair in attributes_list: - pair = name_value_pair.strip().split(" ") - name = pair[0].strip() - if name == '': - continue - # Need to strip double quote from values - value = pair[1].strip(" \"") - attributes[name] = value - value = attributes[ attribute_name ] - # GTF format: chrom source, name, chromStart, chromEnd, score, strand, frame, attributes. - # BedGraph format: chrom, chromStart, chromEnd, value - out.write( "%s\t%s\t%s\t%s\n" %( elems[0], start, elems[4], value ) ) - except: - skipped_lines += 1 - if not first_skipped_line: - first_skipped_line = i + 1 - else: - skipped_lines += 1 - if not first_skipped_line: - first_skipped_line = i + 1 - out.close() - - # Sort tmp file by chromosome name and chromosome start to create ordered track data. - cmd = "sort -k1,1 -k2,2n < %s > %s" % ( tmp_name1, tmp_name2 ) - try: - os.system(cmd) - os.remove(tmp_name1) - except Exception, ex: - sys.stderr.write( "%s\n" % ex ) - sys.exit(1) - - # Create bedgraph file by combining track definition with ordered track data. - cmd = "echo 'track type=bedGraph' | cat - %s > %s " % ( tmp_name2, output_name ) - try: - os.system(cmd) - os.remove(tmp_name2) - except Exception, ex: - sys.stderr.write( "%s\n" % ex ) - sys.exit(1) - - info_msg = "%i lines converted to BEDGraph. " % ( i + 1 - skipped_lines ) - if skipped_lines > 0: - info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line ) - print info_msg - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/headWrapper.pl --- a/tools/filters/headWrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -#! 
/usr/bin/perl -w - -use strict; -use warnings; - -# a wrapper for head for use in galaxy -# headWrapper.pl [filename] [# lines to show] [output] - -die "Check arguments" unless @ARGV == 3; -die "Line number must be an integer\n" unless $ARGV[1]=~ m/^\d+$/; - -open (OUT, ">$ARGV[2]") or die "Cannot create $ARGV[2]:$!\n"; -open (HEAD, "head -n $ARGV[1] $ARGV[0]|") or die "Cannot run head:$!\n"; -while () { - print OUT; -} -close OUT; -close HEAD; - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/headWrapper.xml --- a/tools/filters/headWrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - lines from a dataset - headWrapper.pl $input $lineNum $out_file1 - - - - - - - - - - - - - - - - -**What it does** - -This tool outputs specified number of lines from the **beginning** of a dataset - ------ - -**Example** - -Selecting 2 lines from this:: - - chr7 56632 56652 D17003_CTCF_R6 310 + - chr7 56736 56756 D17003_CTCF_R7 354 + - chr7 56761 56781 D17003_CTCF_R4 220 + - chr7 56772 56792 D17003_CTCF_R7 372 + - chr7 56775 56795 D17003_CTCF_R4 207 + - -will produce:: - - chr7 56632 56652 D17003_CTCF_R6 310 + - chr7 56736 56756 D17003_CTCF_R7 354 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/join.py --- a/tools/filters/join.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,370 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -""" -Script to Join Two Files on specified columns. - -Takes two tab delimited files, two column numbers (base 1) and outputs a new tab delimited file with lines joined by tabs. -User can also opt to have have non-joining rows of file1 echoed. - -""" - -import optparse, os, sys, tempfile, struct -import psyco_full - -try: - simple_json_exception = None - from galaxy import eggs - from galaxy.util.bunch import Bunch - from galaxy.util import stringify_dictionary_keys - import pkg_resources - pkg_resources.require("simplejson") - import simplejson -except Exception, e: - simplejson_exception = e - simplejson = None - - -class OffsetList: - def __init__( self, filesize = 0, fmt = None ): - self.file = tempfile.NamedTemporaryFile( 'w+b' ) - if fmt: - self.fmt = fmt - elif filesize and filesize <= sys.maxint * 2: - self.fmt = 'I' - else: - self.fmt = 'Q' - self.fmt_size = struct.calcsize( self.fmt ) - @property - def size( self ): - self.file.flush() - return self.file_size / self.fmt_size - @property - def file_size( self ): - self.file.flush() - return os.stat( self.file.name ).st_size - def add_offset( self, offset ): - if not isinstance( offset, list ): - offset = [offset] - self.file.seek( self.file_size ) - for off in offset: - self.file.write( struct.pack( self.fmt, off ) ) - def get_offsets( self, start = 0 ): - self.file.seek( start * self.fmt_size ) - while True: - packed = self.file.read( self.fmt_size ) - if not packed: break - yield struct.unpack( self.fmt, packed )[0] - def get_offset_by_index( self, index ): - self.file.seek( index * self.fmt_size ) - return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0] - def set_offset_at_index( self, index, offset ): - if not isinstance( offset, list ): - offset = [offset] - if index >= self.size: - self.add_offset( offset ) - else: - temp_file = tempfile.NamedTemporaryFile( 'w+b' ) - self.file.seek( 0 ) - temp_file.write( self.file.read( ( index ) * self.fmt_size ) ) - for off in offset: - temp_file.write( struct.pack( self.fmt, off ) ) - temp_file.write( self.file.read() ) - self.file = temp_file - -class 
SortedOffsets( OffsetList ): - def __init__( self, indexed_filename, column, split = None ): - OffsetList.__init__( self, os.stat( indexed_filename ).st_size ) - self.indexed_filename = indexed_filename - self.indexed_file = open( indexed_filename, 'rb' ) - self.column = column - self.split = split - self.last_identifier = None - self.last_identifier_merged = None - self.last_offset_merged = 0 - def merge_with_dict( self, new_offset_dict ): - if not new_offset_dict: return #no items to merge in - keys = new_offset_dict.keys() - keys.sort() - identifier2 = keys.pop( 0 ) - - result_offsets = OffsetList( fmt = self.fmt ) - offsets1 = enumerate( self.get_offsets() ) - try: - index1, offset1 = offsets1.next() - identifier1 = self.get_identifier_by_offset( offset1 ) - except StopIteration: - offset1 = None - identifier1 = None - index1 = 0 - - while True: - if identifier1 is None and identifier2 is None: - self.file = result_offsets.file #self is now merged results - return - elif identifier1 is None or ( identifier2 and identifier2 < identifier1 ): - result_offsets.add_offset( new_offset_dict[identifier2] ) - if keys: - identifier2 = keys.pop( 0 ) - else: - identifier2 = None - elif identifier2 is None: - result_offsets.file.seek( result_offsets.file_size ) - self.file.seek( index1 * self.fmt_size ) - result_offsets.file.write( self.file.read() ) - identifier1 = None - offset1 = None - else: - result_offsets.add_offset( offset1 ) - try: - index1, offset1 = offsets1.next() - identifier1 = self.get_identifier_by_offset( offset1 ) - except StopIteration: - offset1 = None - identifier1 = None - index1 += 1 -#methods to help link offsets to lines, ids, etc - def get_identifier_by_line( self, line ): - if isinstance( line, str ): - fields = line.rstrip( '\r\n' ).split( self.split ) - if self.column < len( fields ): - return fields[self.column] - return None - def get_line_by_offset( self, offset ): - self.indexed_file.seek( offset ) - return self.indexed_file.readline() - def get_identifier_by_offset( self, offset ): - return self.get_identifier_by_line( self.get_line_by_offset( offset ) ) - -#indexed set of offsets, index is built on demand -class OffsetIndex: - def __init__( self, filename, column, split = None, index_depth = 3 ): - self.filename = filename - self.file = open( filename, 'rb' ) - self.column = column - self.split = split - self._offsets = {} - self._index = None - self.index_depth = index_depth - def _build_index( self ): - self._index = {} - for start_char, sorted_offsets in self._offsets.items(): - self._index[start_char]={} - for i, offset in enumerate( sorted_offsets.get_offsets() ): - identifier = sorted_offsets.get_identifier_by_offset( offset ) - if identifier[0:self.index_depth] not in self._index[start_char]: - self._index[start_char][identifier[0:self.index_depth]] = i - def get_lines_by_identifier( self, identifier ): - if not identifier: return - #if index doesn't exist, build it - if self._index is None: self._build_index() - - #identifier cannot exist - if identifier[0] not in self._index or identifier[0:self.index_depth] not in self._index[identifier[0]]: - return - #identifier might exist, search for it - offset_index = self._index[identifier[0]][identifier[0:self.index_depth]] - while True: - if offset_index >= self._offsets[identifier[0]].size: - return - offset = self._offsets[identifier[0]].get_offset_by_index( offset_index ) - identifier2 = self._offsets[identifier[0]].get_identifier_by_offset( offset ) - if not identifier2 or identifier2 > identifier: - return 
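-            # Offsets within each first-character bucket are kept sorted by
-            # identifier, so the scan stops as soon as it passes the target;
-            # every duplicate of the target is yielded on the way.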
- if identifier2 == identifier: - yield self._offsets[identifier[0]].get_line_by_offset( offset ) - offset_index += 1 - def get_offsets( self ): - keys = self._offsets.keys() - keys.sort() - for key in keys: - for offset in self._offsets[key].get_offsets(): - yield offset - def get_line_by_offset( self, offset ): - self.file.seek( offset ) - return self.file.readline() - def get_identifiers_offsets( self ): - keys = self._offsets.keys() - keys.sort() - for key in keys: - for offset in self._offsets[key].get_offsets(): - yield self._offsets[key].get_identifier_by_offset( offset ), offset - def get_identifier_by_line( self, line ): - if isinstance( line, str ): - fields = line.rstrip( '\r\n' ).split( self.split ) - if self.column < len( fields ): - return fields[self.column] - return None - def merge_with_dict( self, d ): - if not d: return #no data to merge - self._index = None - keys = d.keys() - keys.sort() - identifier = keys.pop( 0 ) - first_char = identifier[0] - temp = { identifier: d[identifier] } - while True: - if not keys: - if first_char not in self._offsets: - self._offsets[first_char] = SortedOffsets( self.filename, self.column, self.split ) - self._offsets[first_char].merge_with_dict( temp ) - return - identifier = keys.pop( 0 ) - if identifier[0] == first_char: - temp[identifier] = d[identifier] - else: - if first_char not in self._offsets: - self._offsets[first_char] = SortedOffsets( self.filename, self.column, self.split ) - self._offsets[first_char].merge_with_dict( temp ) - temp = { identifier: d[identifier] } - first_char = identifier[0] - -class BufferedIndex: - def __init__( self, filename, column, split = None, buffer = 1000000, index_depth = 3 ): - self.index = OffsetIndex( filename, column, split, index_depth ) - self.buffered_offsets = {} - f = open( filename, 'rb' ) - offset = f.tell() - identified_offset_count = 1 - while True: - offset = f.tell() - line = f.readline() - if not line: break #EOF - identifier = self.index.get_identifier_by_line( line ) - if identifier: - #flush buffered offsets, if buffer size reached - if buffer and identified_offset_count % buffer == 0: - self.index.merge_with_dict( self.buffered_offsets ) - self.buffered_offsets = {} - if identifier not in self.buffered_offsets: - self.buffered_offsets[identifier] = [] - self.buffered_offsets[identifier].append( offset ) - identified_offset_count += 1 - f.close() - - def get_lines_by_identifier( self, identifier ): - for line in self.index.get_lines_by_identifier( identifier ): - yield line - if identifier in self.buffered_offsets: - for offset in self.buffered_offsets[identifier]: - yield self.index.get_line_by_offset( offset ) - - -def fill_empty_columns( line, split, fill_values ): - if not fill_values: - return line - filled_columns = [] - for i, field in enumerate( line.split( split ) ): - if field or i >= len( fill_values ): - filled_columns.append( field ) - else: - filled_columns.append( fill_values[i] ) - if len( fill_values ) > len( filled_columns ): - filled_columns.extend( fill_values[ len( filled_columns ) : ] ) - return split.join( filled_columns ) - - -def join_files( filename1, column1, filename2, column2, out_filename, split = None, buffer = 1000000, keep_unmatched = False, keep_partial = False, index_depth = 3, fill_options = None ): - #return identifier based upon line - def get_identifier_by_line( line, column, split = None ): - if isinstance( line, str ): - fields = line.rstrip( '\r\n' ).split( split ) - if column < len( fields ): - return fields[column] - return None - if 
fill_options is None: - fill_options = Bunch( fill_unjoined_only = True, file1_columns = None, file2_columns = None ) - out = open( out_filename, 'w+b' ) - index = BufferedIndex( filename2, column2, split, buffer, index_depth ) - for line1 in open( filename1, 'rb' ): - identifier = get_identifier_by_line( line1, column1, split ) - if identifier: - written = False - for line2 in index.get_lines_by_identifier( identifier ): - if not fill_options.fill_unjoined_only: - out.write( "%s%s%s\n" % ( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ), split, fill_empty_columns( line2.rstrip( '\r\n' ), split, fill_options.file2_columns ) ) ) - else: - out.write( "%s%s%s\n" % ( line1.rstrip( '\r\n' ), split, line2.rstrip( '\r\n' ) ) ) - written = True - if not written and keep_unmatched: - out.write( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ) ) - if fill_options: - if fill_options.file2_columns: - out.write( "%s%s" % ( split, fill_empty_columns( "", split, fill_options.file2_columns ) ) ) - out.write( "\n" ) - elif keep_partial: - out.write( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ) ) - if fill_options: - if fill_options.file2_columns: - out.write( "%s%s" % ( split, fill_empty_columns( "", split, fill_options.file2_columns ) ) ) - out.write( "\n" ) - out.close() - -def main(): - parser = optparse.OptionParser() - parser.add_option( - '-b','--buffer', - dest='buffer', - type='int',default=1000000, - help='Number of lines to buffer at a time. Default: 1,000,000 lines. A buffer of 0 will attempt to use memory only.' - ) - parser.add_option( - '-d','--index_depth', - dest='index_depth', - type='int',default=3, - help='Depth to use on filebased offset indexing. Default: 3.' - ) - parser.add_option( - '-p','--keep_partial', - action='store_true', - dest='keep_partial', - default=False, - help='Keep rows in first input which are missing identifiers.') - parser.add_option( - '-u','--keep_unmatched', - action='store_true', - dest='keep_unmatched', - default=False, - help='Keep rows in first input which are not joined with the second input.') - parser.add_option( - '-f','--fill_options_file', - dest='fill_options_file', - type='str',default=None, - help='Fill empty columns with a values from a JSONified file.') - - - options, args = parser.parse_args() - - fill_options = None - if options.fill_options_file is not None: - try: - if simplejson is None: - raise simplejson_exception - fill_options = Bunch( **stringify_dictionary_keys( simplejson.load( open( options.fill_options_file ) ) ) ) #simplejson.load( open( options.fill_options_file ) ) - except Exception, e: - print "Warning: Ignoring fill options due to simplejson error (%s)." % e - if fill_options is None: - fill_options = Bunch() - if 'fill_unjoined_only' not in fill_options: - fill_options.fill_unjoined_only = True - if 'file1_columns' not in fill_options: - fill_options.file1_columns = None - if 'file2_columns' not in fill_options: - fill_options.file2_columns = None - - - try: - filename1 = args[0] - filename2 = args[1] - column1 = int( args[2] ) - 1 - column2 = int( args[3] ) - 1 - out_filename = args[4] - except: - print >> sys.stderr, "Error parsing command line." 
- sys.exit() - - #Character for splitting fields and joining lines - split = "\t" - - return join_files( filename1, column1, filename2, column2, out_filename, split, options.buffer, options.keep_unmatched, options.keep_partial, options.index_depth, fill_options = fill_options ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/joinWrapper.pl --- a/tools/filters/joinWrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; -use File::Temp "tempfile"; - -my ($input1, $input2, $field1, $field2, $mode, $OOption, $out_file1) = @ARGV; - -die "No arguments\n" unless @ARGV == 7; - -my ($fh1, $file1) = tempfile(); -my ($fh2, $file2) = tempfile(); - -`sort -k $field1 $input1 > $file1`; -`sort -k $field2 $input2 > $file2`; - -my $option = ""; -my @fields = (); -my $line = ""; - -if ($OOption eq "Y") { - if (defined($fh1)) { - $line = <$fh1>; - } else { - die "Failed to create file $file1\n"; - } - @fields = split /\t/, $line; - die "The field you selected does not exist in the input file" if (@fields < $field1); - my @optionO = (); - my $i = 0; - foreach (@fields) { - ++$i; - push(@optionO, "1.$i"); - } - $option = "-o " . join(",", @optionO); -} else { - $option = ""; -} - -$ENV{'LC_ALL'} = 'POSIX'; - -if ($mode eq "V") { - `join -v 1 $option -1 $field1 -2 $field2 $file1 $file2 | tr " " "\t" > $out_file1`; -} else { - `join $option -1 $field1 -2 $field2 $file1 $file2 | tr " " "\t" > $out_file1`; -} - -`rm $file1 ; rm $file2`; - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/joinWrapper.py --- a/tools/filters/joinWrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda -""" -This tool provides the UNIX "join" functionality. -""" -import sys, os, tempfile, subprocess - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - infile1 = sys.argv[1] - infile2 = sys.argv[2] - field1 = int(sys.argv[3]) - field2 = int(sys.argv[4]) - mode =sys.argv[5] - outfile = sys.argv[6] - - tmpfile1 = tempfile.NamedTemporaryFile() - tmpfile2 = tempfile.NamedTemporaryFile() - - try: - #Sort the two files based on specified fields - os.system("sort -t ' ' -k %d,%d -o %s %s" %(field1, field1, tmpfile1.name, infile1)) - os.system("sort -t ' ' -k %d,%d -o %s %s" %(field2, field2, tmpfile2.name, infile2)) - except Exception, exc: - stop_err( 'Initialization error -> %s' %str(exc) ) - - option = "" - for line in file(tmpfile1.name): - line = line.strip() - if line: - elems = line.split('\t') - for j in range(1,len(elems)+1): - if j == 1: - option = "1.1" - else: - option = option + ",1." + str(j) - break - - #check if join has --version option. BSD join doens't have this option, while GNU join does. - #The return value in the latter case will be 0, and non-zero in the latter case. - ret = subprocess.call('join --version 2>/dev/null', shell=True) - # check if we are a version later than 7 of join. If so, we want to skip - # checking the order since join will raise an error with duplicated items in - # the two files being joined. 
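-    # (GNU join answers --version with exit status 0; BSD join does not,
-    # which is how the two implementations are distinguished here.)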
- if ret == 0: - cl = subprocess.Popen(["join", "--version"], stdout=subprocess.PIPE) - (stdout, _) = cl.communicate() - version_line = stdout.split("\n")[0] - (version, _) = version_line.split()[-1].split(".") - if int(version) >= 7: - flags = "--nocheck-order" - else: - flags = "" - else: - flags = "" - - if mode == "V": - cmdline = "join %s -t ' ' -v 1 -o %s -1 %d -2 %d %s %s > %s" %(flags, option, field1, field2, tmpfile1.name, tmpfile2.name, outfile) - else: - cmdline = "join %s -t ' ' -o %s -1 %d -2 %d %s %s > %s" %(flags, option, field1, field2, tmpfile1.name, tmpfile2.name, outfile) - - try: - os.system(cmdline) - except Exception, exj: - stop_err('Error joining the two datasets -> %s' %str(exj)) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/joiner.xml --- a/tools/filters/joiner.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,180 +0,0 @@ - - side by side on a specified field - join.py $input1 $input2 $field1 $field2 $out_file1 $unmatched $partial --index_depth=3 --buffer=50000000 --fill_options_file=$fill_options_file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - <% -import simplejson -%> -#set $__fill_options = {} -#if $fill_empty_columns['fill_empty_columns_switch'] == 'fill_empty': - #set $__fill_options['fill_unjoined_only'] = $fill_empty_columns['fill_columns_by'].value == 'fill_unjoined_only' - #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'single_fill_value': - #set $__start_fill = $fill_empty_columns['do_fill_empty_columns']['fill_value'].value - #else: - #set $__start_fill = "" - #end if - #set $__fill_options['file1_columns'] = [ __start_fill for i in range( int( $input1.metadata.columns ) ) ] - #set $__fill_options['file2_columns'] = [ __start_fill for i in range( int( $input2.metadata.columns ) ) ] - #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'fill_value_by_column': - #for column_fill1 in $fill_empty_columns['do_fill_empty_columns']['column_fill1']: - #set $__fill_options['file1_columns'][ int( column_fill1['column_number1'].value ) - 1 ] = column_fill1['fill_value1'].value - #end for - #for column_fill2 in $fill_empty_columns['do_fill_empty_columns']['column_fill2']: - #set $__fill_options['file2_columns'][ int( column_fill2['column_number2'].value ) - 1 ] = column_fill2['fill_value2'].value - #end for - #end if -#end if -${simplejson.dumps( __fill_options )} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**This tool will attempt to reuse the metadata from your first input.** To change metadata assignments click on the "edit attributes" link of the history item generated by this tool. - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool joins lines of two datasets on a common field. An empty string ("") is not a valid identifier. -You may choose to include lines of your first input that do not join with your second input. - -- Columns are referenced with a **number**. For example, **3** refers to the 3rd column of a tab-delimited file. 
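-
-For reference, the --fill_options_file handed to join.py above is a small
-JSON document; a hand-written equivalent of what the tool generates (the
-values below are hypothetical) would be::
-
-    {"fill_unjoined_only": true,
-     "file1_columns": null,
-     "file2_columns": ["", "", "0"]}
-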
- ------ - -**Example** - -Dataset1:: - - chr1 10 20 geneA - chr1 50 80 geneB - chr5 10 40 geneL - -Dataset2:: - - geneA tumor-supressor - geneB Foxp2 - geneC Gnas1 - geneE INK4a - -Joining the 4th column of Dataset1 with the 1st column of Dataset2 will yield:: - - chr1 10 20 geneA geneA tumor-suppressor - chr1 50 80 geneB geneB Foxp2 - -Joining the 4th column of Dataset1 with the 1st column of Dataset2, while keeping all lines from Dataset1, will yield:: - - chr1 10 20 geneA geneA tumor-suppressor - chr1 50 80 geneB geneB Foxp2 - chr5 10 40 geneL - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/joiner2.xml --- a/tools/filters/joiner2.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ - - two datasets a specific column of which has the same value - sort -k $col1 $input1 > $input1.tmp; sort -k $col2 $input2 > $input2.tmp; join -1 $col1 -2 $col2 $input1.tmp $input2.tmp | tr " " "\t" > $out_file1; rm -rf $input1.tmp $input2.tmp - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/lav_to_bed.py --- a/tools/filters/lav_to_bed.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -#!/usr/bin/env python -#Reads a LAV file and writes two BED files. -import sys -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import bx.align.lav - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - try: - lav_file = open(sys.argv[1],'r') - bed_file1 = open(sys.argv[2],'w') - bed_file2 = open(sys.argv[3],'w') - except Exception, e: - stop_err( str( e ) ) - - lavsRead = 0 - bedsWritten = 0 - species = {} - # TODO: this is really bad since everything is read into memory. Can we eliminate this tool? - for lavBlock in bx.align.lav.Reader( lav_file ): - lavsRead += 1 - for c in lavBlock.components: - spec, chrom = bx.align.lav.src_split( c.src ) - if bedsWritten < 1: - if len( species )==0: - species[spec]=bed_file1 - elif len( species )==1: - species[spec]=bed_file2 - else: - continue #this is a pairwise alignment... - if spec in species: - species[spec].write( "%s\t%i\t%i\t%s_%s\t%i\t%s\n" % ( chrom, c.start, c.end, spec, str( bedsWritten ), 0, c.strand ) ) - bedsWritten += 1 - - - for spec,file in species.items(): - print "#FILE\t%s\t%s" % (file.name, spec) - - lav_file.close() - bed_file1.close() - bed_file2.close() - - print "%d lav blocks read, %d regions written\n" % (lavsRead,bedsWritten) - - - -if __name__ == "__main__": main() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/lav_to_bed.xml --- a/tools/filters/lav_to_bed.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ - - Converts a LAV formatted file to BED format - lav_to_bed.py $lav_file $bed_file1 $bed_file2 - - - - - - - - - - - - - - - - -**Syntax** - -This tool converts a LAV formatted file to the BED format. - -- **LAV format** LAV is an alignment format developed by Webb Miller's group at Penn State University. It is the primary output format for BLASTZ. - -- **BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. 
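-
-A minimal reading loop with bx-python, mirroring the converter above (the
-input file name is a placeholder)::
-
-    import bx.align.lav
-
-    for block in bx.align.lav.Reader(open('alignment.lav')):
-        for c in block.components:
-            spec, chrom = bx.align.lav.src_split(c.src)
-            print "%s\t%i\t%i\t%s\t0\t%s" % (chrom, c.start, c.end, spec, c.strand)
-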
- ------ - -**Example** - -- Convert LAV format:: - - #:lav - s { - "/galaxy/data/hg16/seq/chr19.nib" 1 63811651 0 1 - "/galaxy/data/mm5/seq/chr11.nib" 1 121648857 0 1 - } - h { - "> hg16.chr19" - "> mm5.chr11 (reverse complement)" - } - a { - s 3500 - b 3001012 70568380 - e 3001075 70568443 - l 3001012 70568380 3001075 70568443 81 - } - a { - s 3900 - b 3008279 70573976 - e 3008357 70574054 - l 3008279 70573976 3008357 70574054 78 - } - #:eof - -- To two BED formatted files:: - - chr19 3001011 3001075 hg16_0 0 + - chr19 3008278 3008357 hg16_1 0 + - - **and**:: - - chr11 70568379 70568443 mm5_0 0 + - chr11 70573975 70574054 mm5_1 0 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/lav_to_bed_code.py --- a/tools/filters/lav_to_bed_code.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -#Set build, name, and info for each output BED file -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - new_stdout = "" - filename_to_build = {} - for line in stdout.split("\n"): - if line.startswith("#FILE"): - fields = line.split("\t") - filename_to_build[fields[1]]=fields[2].strip() - else: - new_stdout = "%s%s" % ( new_stdout, line ) - for name,data in out_data.items(): - try: - data.info = "%s\n%s" % ( new_stdout, stderr ) - data.dbkey = filename_to_build[data.file_name] - data.name = "%s (%s)" % ( data.name, data.dbkey ) - app.model.context.add( data ) - app.model.context.flush() - except: - continue diff -r c2a356708570 -r 33c067c3ae34 tools/filters/mergeCols.py --- a/tools/filters/mergeCols.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -import sys, re - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def __main__(): - try: - infile = open ( sys.argv[1], 'r') - outfile = open ( sys.argv[2], 'w') - except: - stop_err( 'Cannot open or create a file\n' ) - - if len( sys.argv ) < 4: - stop_err( 'No columns to merge' ) - else: - cols = sys.argv[3:] - - skipped_lines = 0 - - for line in infile: - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - fields = line.split( '\t' ) - line += '\t' - for col in cols: - try: - line += fields[ int( col ) -1 ] - except: - skipped_lines += 1 - - print >>outfile, line - - if skipped_lines > 0: - print 'Skipped %d invalid lines' % skipped_lines - -if __name__ == "__main__" : __main__() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/mergeCols.xml --- a/tools/filters/mergeCols.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ - - together - - mergeCols.py - $input1 - $out_file1 - $col1 - $col2 - #for $col in $columns - ${col.datacol} - #end for - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**What it does** - -This tool merges columns together. Any number of valid columns can be merged in any order. - ------ - -**Example** - -Input dataset (five columns: c1, c2, c3, c4, and c5):: - - 1 10 1000 gene1 chr - 2 100 1500 gene2 chr - -merging columns "**c5,c1**" will return:: - - 1 10 1000 gene1 chr chr1 - 2 100 1500 gene2 chr chr2 - -.. class:: warningmark - -Note that all original columns are preserved and the result of merge is added as the rightmost column. 
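-
-The merge itself is a one-liner over 1-based column numbers; a sketch of
-the step applied to each line (the column order shown is hypothetical)::
-
-    def merge_columns(fields, cols):
-        # fields = ['1', '10', '1000', 'gene1', 'chr'], cols = [5, 1]
-        # returns 'chr1', which is appended as the new rightmost column
-        return ''.join(fields[c - 1] for c in cols)
-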
- - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/pasteWrapper.pl --- a/tools/filters/pasteWrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; -my $command = ""; -# a wrapper for paste for use in galaxy -# pasteWrapper.pl [filename1] [filename2] [delimiter] [output] - -die "Check arguments" unless @ARGV == 4; - -if ($ARGV[2] eq 'T') { - $command = "paste $ARGV[0] $ARGV[1]"; -} elsif ($ARGV[2] eq 'C') { - $command = "paste -d \",\" $ARGV[0] $ARGV[1]"; -} elsif ($ARGV[2] eq 'D') { - $command = "paste -d \"-\" $ARGV[0] $ARGV[1]"; -} elsif ($ARGV[2] eq 'U') { - $command = "paste -d \"_\" $ARGV[0] $ARGV[1]"; -} elsif ($ARGV[2] eq 'P') { - $command = "paste -d \"|\" $ARGV[0] $ARGV[1]"; -} elsif ($ARGV[2] eq 'Dt') { - $command = "paste -d \".\" $ARGV[0] $ARGV[1]"; -} elsif ($ARGV[2] eq 'Sp') { - $command = "paste -d \" \" $ARGV[0] $ARGV[1]"; -} - -open (OUT, ">$ARGV[3]") or die "Cannot create $ARGV[2]:$!\n"; -open (PASTE, "$command |") or die "Cannot run paste:$!\n"; - -while () { - print OUT; -} -close OUT; -close PASTE; - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/pasteWrapper.xml --- a/tools/filters/pasteWrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ - - two files side by side - pasteWrapper.pl $input1 $input2 $delimiter $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -Paste preserves column assignments of the first dataset. - ------ - -**What it does** - -This tool merges two datasets side by side. If the first (left) dataset contains column assignments such as chromosome, start, end and strand, these will be preserved. However, if you would like to change column assignments, click the pencil icon in the history item. - ------ - -**Example** - -First dataset:: - - a 1 - a 2 - a 3 - -Second dataset:: - - 20 - 30 - 40 - -Pasting them together will produce:: - - a 1 20 - a 2 30 - a 3 40 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/randomlines.py --- a/tools/filters/randomlines.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -#!/usr/bin/env python -# Kanwei Li, 2010 -# Selects N random lines from a file and outputs to another file - -import random, sys - -def main(): - infile = open(sys.argv[1], 'r') - total_lines = int(sys.argv[2]) - - if total_lines < 1: - sys.stderr.write( "Must select at least one line." ) - sys.exit() - - kept = [] - n = 0 - for line in infile: - line = line.rstrip("\n") - n += 1 - if (n <= total_lines): - kept.append(line) - elif random.randint(1, n) <= total_lines: - kept.pop(random.randint(0, total_lines-1)) - kept.append(line) - - if n < total_lines: - sys.stderr.write( "Error: asked to select more lines than there were in the file." ) - sys.exit() - - open(sys.argv[3], 'w').write( "\n".join(kept) ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/randomlines.xml --- a/tools/filters/randomlines.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - from a file - randomlines.py $input $num_lines $out_file1 - - - - - - - - - - - - - - - - -**What it does** - -This tool selects N random lines from a file, with no repeats, and preserving ordering. 
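
randomlines.py gets all three properties from single-pass reservoir sampling:
the first N lines fill the reservoir, and line number n > N then enters it
with probability N/n. Because the evicted survivor is popped at a random index
while new lines are always appended in arrival order, the reservoir stays in
file order. A condensed sketch of the same scheme (illustrative; the tool
itself reads its arguments from the command line)::

    import random

    def sample_lines(lines, k):
        kept = []
        for n, line in enumerate(lines, 1):
            if n <= k:
                kept.append(line)               # fill the reservoir
            elif random.randint(1, n) <= k:     # keep with probability k/n
                kept.pop(random.randrange(k))   # evict a uniform survivor
                kept.append(line)               # appending keeps file order
        if len(kept) < k:
            raise ValueError('fewer than k lines in the input')
        return kept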
- ------ - -**Example** - -Input File:: - - chr7 56632 56652 D17003_CTCF_R6 310 + - chr7 56736 56756 D17003_CTCF_R7 354 + - chr7 56761 56781 D17003_CTCF_R4 220 + - chr7 56772 56792 D17003_CTCF_R7 372 + - chr7 56775 56795 D17003_CTCF_R4 207 + - -Selecting 2 random lines might return this:: - - chr7 56736 56756 D17003_CTCF_R7 354 + - chr7 56775 56795 D17003_CTCF_R4 207 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/remove_beginning.pl --- a/tools/filters/remove_beginning.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; - -# Removes the specified number of lines from the beginning of the file. -# remove_beginning.pl [input] [num_lines] [output] - -die "Check arguments" unless @ARGV == 3; - -my $inputfile = $ARGV[0]; -my $num_lines = $ARGV[1]; -my $outputfile = $ARGV[2]; - -my $curCount=0; - -my $fhIn; -open ($fhIn, "< $inputfile") or die "Cannot open source file"; - -my $fhOut; -open ($fhOut, "> $outputfile"); - -while (<$fhIn>) -{ - $curCount++; - if ($curCount<=$num_lines) - { - next; - } - print $fhOut $_; -} -close ($fhIn) or die "Cannot close source file"; -close ($fhOut) or die "Cannot close output file"; diff -r c2a356708570 -r 33c067c3ae34 tools/filters/remove_beginning.xml --- a/tools/filters/remove_beginning.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - of a file - remove_beginning.pl $input $num_lines $out_file1 - - - - - - - - - - - - - - - - -**What it does** - -This tool removes a specified number of lines from the beginning of a dataset. - ------ - -**Example** - -Input File:: - - chr7 56632 56652 D17003_CTCF_R6 310 + - chr7 56736 56756 D17003_CTCF_R7 354 + - chr7 56761 56781 D17003_CTCF_R4 220 + - chr7 56772 56792 D17003_CTCF_R7 372 + - chr7 56775 56795 D17003_CTCF_R4 207 + - -After removing the first 3 lines the dataset will look like this:: - - chr7 56772 56792 D17003_CTCF_R7 372 + - chr7 56775 56795 D17003_CTCF_R4 207 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/sff_extract.py --- a/tools/filters/sff_extract.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1505 +0,0 @@ -#!/usr/bin/python -'''This software extracts the seq, qual and ancillary information from an sff -file, like the ones used by the 454 sequencer. - -Optionally, it can also split paired-end reads if given the linker sequence. -The splitting is done with maximum match, i.e., every occurence of the linker -sequence will be removed, even if occuring multiple times.''' - -#copyright Jose Blanca and Bastien Chevreux -#COMAV institute, Universidad Politecnica de Valencia (UPV) -#Valencia, Spain - -# additions to handle paired end reads by Bastien Chevreux -# bugfixes for linker specific lengths: Lionel Guy - -#This program is free software: you can redistribute it and/or modify -#it under the terms of the GNU General Public License as published by -#the Free Software Foundation, either version 3 of the License, or -#(at your option) any later version. -#This program is distributed in the hope that it will be useful, -#but WITHOUT ANY WARRANTY; without even the implied warranty of -#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -#GNU General Public License for more details. -#You should have received a copy of the GNU General Public License -#along with this program. If not, see . 
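# --- Editor's sketch, not part of the original sff_extract.py ------------
# "Maximum match" splitting, as described in the module docstring above,
# means every linker hit is excised and all flanking fragments are kept.
# The real tool locates hits via SSAHA2 alignments, so sequencing errors in
# the linker are tolerated; the exact-match version here is illustrative.
import re

def split_on_linker_sketch(read, linker):
    '''Drop every occurrence of linker and keep the non-empty fragments.'''
    return [p for p in re.split(re.escape(linker), read) if p]

# split_on_linker_sketch('ACGTTTLINKERGGGACTLINKERAA', 'LINKER')
# -> ['ACGTTT', 'GGGACT', 'AA']
# --- end editor's sketch --------------------------------------------------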
- -__author__ = 'Jose Blanca and Bastien Chevreux' -__copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux' -__license__ = 'GPLv3 or later' -__version__ = '0.2.8' -__email__ = 'jblanca@btc.upv.es' -__status__ = 'beta' - -import struct -import sys -import os -import subprocess -import tempfile - - -fake_sff_name = 'fake_sff_name' - - -# readname as key: lines with matches from SSAHA, one best match -ssahapematches = {} -# linker readname as key: length of linker sequence -linkerlengths = {} - -# set to true if something really fishy is going on with the sequences -stern_warning = True - -def read_bin_fragment(struct_def, fileh, offset=0, data=None, - byte_padding=None): - '''It reads a chunk of a binary file. - - You have to provide the struct, a file object, the offset (where to start - reading). - Also you can provide an optional dict that will be populated with the - extracted data. - If a byte_padding is given the number of bytes read will be a multiple of - that number, adding the required pad at the end. - It returns the number of bytes reads and the data dict. - ''' - if data is None: - data = {} - - #we read each item - bytes_read = 0 - for item in struct_def: - #we go to the place and read - fileh.seek(offset + bytes_read) - n_bytes = struct.calcsize(item[1]) - buffer = fileh.read(n_bytes) - read = struct.unpack('>' + item[1], buffer) - if len(read) == 1: - read = read[0] - data[item[0]] = read - bytes_read += n_bytes - - #if there is byte_padding the bytes_to_read should be a multiple of the - #byte_padding - if byte_padding is not None: - pad = byte_padding - bytes_read = ((bytes_read + pad - 1) // pad) * pad - - return (bytes_read, data) - - -def check_magic(magic): - '''It checks that the magic number of the file matches the sff magic.''' - if magic != 779314790: - raise RuntimeError('This file does not seems to be an sff file.') - -def check_version(version): - '''It checks that the version is supported, otherwise it raises an error.''' - supported = ('\x00', '\x00', '\x00', '\x01') - i = 0 - for item in version: - if version[i] != supported[i]: - raise RuntimeError('SFF version not supported. 
Please contact the author of the software.') - i += 1 - -def read_header(fileh): - '''It reads the header from the sff file and returns a dict with the - information''' - #first we read the first part of the header - head_struct = [ - ('magic_number', 'I'), - ('version', 'cccc'), - ('index_offset', 'Q'), - ('index_length', 'I'), - ('number_of_reads', 'I'), - ('header_length', 'H'), - ('key_length', 'H'), - ('number_of_flows_per_read', 'H'), - ('flowgram_format_code', 'B'), - ] - data = {} - first_bytes, data = read_bin_fragment(struct_def=head_struct, fileh=fileh, - offset=0, data=data) - check_magic(data['magic_number']) - check_version(data['version']) - #now that we know the number_of_flows_per_read and the key_length - #we can read the second part of the header - struct2 = [ - ('flow_chars', str(data['number_of_flows_per_read']) + 'c'), - ('key_sequence', str(data['key_length']) + 'c') - ] - read_bin_fragment(struct_def=struct2, fileh=fileh, offset=first_bytes, - data=data) - return data - - -def read_sequence(header, fileh, fposition): - '''It reads one read from the sff file located at the fposition and - returns a dict with the information.''' - header_length = header['header_length'] - index_offset = header['index_offset'] - index_length = header['index_length'] - - #the sequence struct - read_header_1 = [ - ('read_header_length', 'H'), - ('name_length', 'H'), - ('number_of_bases', 'I'), - ('clip_qual_left', 'H'), - ('clip_qual_right', 'H'), - ('clip_adapter_left', 'H'), - ('clip_adapter_right', 'H'), - ] - def read_header_2(name_length): - '''It returns the struct definition for the second part of the header''' - return [('name', str(name_length) +'c')] - def read_data(number_of_bases): - '''It returns the struct definition for the read data section.''' - #size = {'c': 1, 'B':1, 'H':2, 'I':4, 'Q':8} - if header['flowgram_format_code'] == 1: - flow_type = 'H' - else: - raise Error('file version not supported') - number_of_bases = str(number_of_bases) - return [ - ('flowgram_values', str(header['number_of_flows_per_read']) + - flow_type), - ('flow_index_per_base', number_of_bases + 'B'), - ('bases', number_of_bases + 'c'), - ('quality_scores', number_of_bases + 'B'), - ] - - data = {} - #we read the first part of the header - bytes_read, data = read_bin_fragment(struct_def=read_header_1, - fileh=fileh, offset=fposition, data=data) - - read_bin_fragment(struct_def=read_header_2(data['name_length']), - fileh=fileh, offset=fposition + bytes_read, data=data) - #we join the letters of the name - data['name'] = ''.join(data['name']) - offset = data['read_header_length'] - #we read the sequence and the quality - read_data_st = read_data(data['number_of_bases']) - bytes_read, data = read_bin_fragment(struct_def=read_data_st, - fileh=fileh, offset=fposition + offset, - data=data, byte_padding=8) - #we join the bases - data['bases'] = ''.join(data['bases']) - - #print data - #print "pre cqr: ", data['clip_qual_right'] - #print "pre car: ", data['clip_adapter_right'] - #print "pre cql: ", data['clip_qual_left'] - #print "pre cal: ", data['clip_adapter_left'] - - # correct for the case the right clip is <= than the left clip - # in this case, left clip is 0 are set to 0 (right clip == 0 means - # "whole sequence") - if data['clip_qual_right'] <= data['clip_qual_left'] : - data['clip_qual_right'] = 0 - data['clip_qual_left'] = 0 - if data['clip_adapter_right'] <= data['clip_adapter_left'] : - data['clip_adapter_right'] = 0 - data['clip_adapter_left'] = 0 - - #the clipping section follows the 
NCBI's guidelines Trace Archive RFC - #http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=rfc&m=doc&s=rfc - #if there's no adapter clip: qual -> vector - #else: qual-> qual - # adapter -> vector - - if not data['clip_adapter_left']: - data['clip_adapter_left'], data['clip_qual_left'] = data['clip_qual_left'], data['clip_adapter_left'] - if not data['clip_adapter_right']: - data['clip_adapter_right'], data['clip_qual_right'] = data['clip_qual_right'], data['clip_adapter_right'] - - # see whether we have to override the minimum left clips - if config['min_leftclip'] > 0: - if data['clip_adapter_left'] >0 and data['clip_adapter_left'] < config['min_leftclip']: - data['clip_adapter_left'] = config['min_leftclip'] - if data['clip_qual_left'] >0 and data['clip_qual_left'] < config['min_leftclip']: - data['clip_qual_left'] = config['min_leftclip'] - - - #print "post cqr: ", data['clip_qual_right'] - #print "post car: ", data['clip_adapter_right'] - #print "post cql: ", data['clip_qual_left'] - #print "post cal: ", data['clip_adapter_left'] - - - # for handling the -c (clip) option gently, we already clip here - # and set all clip points to the sequence end points - if config['clip']: - data['bases'], data['quality_scores'] = clip_read(data) - - data['number_of_bases']=len(data['bases']) - data['clip_qual_right'] = data['number_of_bases'] - data['clip_adapter_right'] = data['number_of_bases'] - data['clip_qual_left'] = 0 - data['clip_adapter_left'] = 0 - - return data['read_header_length'] + bytes_read, data - - -def sequences(fileh, header): - '''It returns a generator with the data for each read.''' - #now we can read all the sequences - fposition = header['header_length'] #position in the file - reads_read = 0 - while True: - if fposition == header['index_offset']: - #we have to skip the index section - fposition += index_length - continue - else: - bytes_read, seq_data = read_sequence(header=header, fileh=fileh, - fposition=fposition) - yield seq_data - fposition += bytes_read - reads_read += 1 - if reads_read >= header['number_of_reads']: - break - - -def remove_last_xmltag_in_file(fname, tag=None): - '''Given an xml file name and a tag, it removes the last tag of the - file if it matches the given tag. Tag removal is performed via file - truncation. - - It the given tag is not the last in the file, a RunTimeError will be - raised. - - The resulting xml file will be not xml valid. This function is a hack - that allows to append records to xml files in a quick and dirty way. - ''' - - fh = open(fname, 'r+') - #we have to read from the end to the start of the file and keep the - #string enclosed by - i = -1 - last_tag = [] #the chars that form the last tag - start_offset = None #in which byte does the last tag starts? - end_offset = None #in which byte does the last tag ends? - while True: - fh.seek(i, 2) - char = fh.read(1) - if not char.isspace(): - last_tag.append(char) - if char == '>': - end_offset = i - if char == '<': - start_offset = i - break - i -= 1 - - #we have read the last tag backwards - last_tag = ''.join(last_tag[::-1]) - #we remove the - last_tag = last_tag.rstrip('>').lstrip('\n'] - to_print.append(' ') - to_print.append(readname) - to_print.append('\n') - - #extra information - #do we have extra info for this file? - info = None - if config['xml_info']: - #with this name? - if fname in config['xml_info']: - info = config['xml_info'][fname] - else: - #with no name? 
- try: - info = config['xml_info'][fake_sff_name] - except KeyError: - pass - #we print the info that we have - if info: - for key in info: - to_print.append(' <' + key + '>' + info[key] + \ - '\n') - - return ''.join(to_print) - - -def create_clip_xml_info(readlen, adapl, adapr, quall, qualr): - '''Takes the clip values of the read and formats them into XML - Corrects "wrong" values that might have resulted through - simplified calculations earlier in the process of conversion - (especially during splitting of paired-end reads) - ''' - - to_print = [""] - - # if right borders are >= to read length, they don't need - # to be printed - if adapr >= readlen: - adapr = 0 - if qualr >= readlen: - qualr = 0 - - # BaCh - # when called via split_paired_end(), some values may be < 0 - # (when clip values were 0 previously) - # instead of putting tons of if clauses for different calculations there, - # I centralise corrective measure here - # set all values <0 to 0 - - if adapr < 0: - adapr = 0 - if qualr <0: - qualr = 0 - if adapl < 0: - adapl = 0 - if quall <0: - quall = 0 - - if quall: - to_print.append(' ') - to_print.append(str(quall)) - to_print.append('\n') - if qualr: - to_print.append(' ') - to_print.append(str(qualr)) - to_print.append('\n') - if adapl: - to_print.append(' ') - to_print.append(str(adapl)) - to_print.append('\n') - if adapr: - to_print.append(' ') - to_print.append(str(adapr)) - to_print.append('\n') - return ''.join(to_print) - - -def create_xml_for_unpaired_read(data, fname): - '''Given the data for one read it returns an str with the xml ancillary - data.''' - to_print = [create_basic_xml_info(data['name'],fname)] - #clippings in the XML only if we do not hard clip - if not config['clip']: - to_print.append(create_clip_xml_info(data['number_of_bases'],data['clip_adapter_left'], data['clip_adapter_right'], data['clip_qual_left'], data['clip_qual_right'])); - to_print.append(' \n') - return ''.join(to_print) - - -def format_as_fasta(name,seq,qual): - name_line = ''.join(('>', name,'\n')) - seqstring = ''.join((name_line, seq, '\n')) - qual_line = ' '.join([str(q) for q in qual]) - qualstring = ''.join((name_line, qual_line, '\n')) - return seqstring, qualstring - -def format_as_fastq(name,seq,qual): - qual_line = ''.join([chr(q+33) for q in qual]) - #seqstring = ''.join(('@', name,'\n', seq, '\n+', name,'\n', qual_line, '\n')) - seqstring = ''.join(('@', name,'\n', seq, '\n+\n', qual_line, '\n')) - return seqstring - - -def get_read_data(data): - '''Given the data for one read it returns 2 strs with the fasta seq - and fasta qual.''' - #seq and qual - if config['mix_case']: - seq = sequence_case(data) - qual = data['quality_scores'] - else : - seq = data['bases'] - qual = data['quality_scores'] - - return seq, qual - -def extract_read_info(data, fname): - '''Given the data for one read it returns 3 strs with the fasta seq, fasta - qual and xml ancillary data.''' - - seq,qual = get_read_data(data) - seqstring, qualstring = format_as_fasta(data['name'],seq,qual) - - #name_line = ''.join(('>', data['name'],'\n')) - #seq = ''.join((name_line, seq, '\n')) - #qual_line = ' '.join([str(q) for q in qual]) - #qual = ''.join((name_line, qual_line, '\n')) - - xmlstring = create_xml_for_unpaired_read(data, fname) - - return seqstring, qualstring, xmlstring - -def write_sequence(name,seq,qual,seq_fh,qual_fh): - '''Write sequence and quality FASTA and FASTA qual filehandles - (or into FASTQ and XML) - if sequence length is 0, don't write''' - - if len(seq) == 0 : return - - if qual_fh 
is None: - seq_fh.write(format_as_fastq(name,seq,qual)) - else: - seqstring, qualstring = format_as_fasta(name,seq,qual) - seq_fh.write(seqstring) - qual_fh.write(qualstring) - return - -def write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh): - '''Writes an unpaired read into FASTA, FASTA qual and XML filehandles - (or into FASTQ and XML) - if sequence length is 0, don't write''' - - seq,qual = get_read_data(data) - if len(seq) == 0 : return - - write_sequence(data['name'],seq,qual,seq_fh,qual_fh) - - anci = create_xml_for_unpaired_read(data, sff_fh.name) - if anci is not None: - xml_fh.write(anci) - return - - -def reverse_complement(seq): - '''Returns the reverse complement of a DNA sequence as string''' - - compdict = { - 'a': 't', - 'c': 'g', - 'g': 'c', - 't': 'a', - 'u': 't', - 'm': 'k', - 'r': 'y', - 'w': 'w', - 's': 's', - 'y': 'r', - 'k': 'm', - 'v': 'b', - 'h': 'd', - 'd': 'h', - 'b': 'v', - 'x': 'x', - 'n': 'n', - 'A': 'T', - 'C': 'G', - 'G': 'C', - 'T': 'A', - 'U': 'T', - 'M': 'K', - 'R': 'Y', - 'W': 'W', - 'S': 'S', - 'Y': 'R', - 'K': 'M', - 'V': 'B', - 'H': 'D', - 'D': 'H', - 'B': 'V', - 'X': 'X', - 'N': 'N', - '*': '*' - } - - complseq = ''.join([compdict[base] for base in seq]) - # python hack to reverse a list/string/etc - complseq = complseq[::-1] - return complseq - - -def mask_sequence(seq, maskchar, fpos, tpos): - '''Given a sequence, mask it with maskchar starting at fpos (including) and - ending at tpos (excluding) - ''' - - if len(maskchar) > 1: - raise RuntimeError("Internal error: more than one character given to mask_sequence") - if fpos<0: - fpos = 0 - if tpos > len(seq): - tpos = len(seq) - - newseq = ''.join((seq[:fpos],maskchar*(tpos-fpos), seq[tpos:])) - - return newseq - - -def fragment_sequences(sequence, qualities, splitchar): - '''Works like split() on strings, except it does this on a sequence - and the corresponding list with quality values. - Returns a tuple for each fragment, each sublist has the fragment - sequence as first and the fragment qualities as second elemnt''' - - # this is slow (due to zip and list appends... use an iterator over - # the sequence find find variations and splices on seq and qual - - if len(sequence) != len(qualities): - print sequence, qualities - raise RuntimeError("Internal error: length of sequence and qualities don't match???") - - retlist = ([]) - if len(sequence) == 0: - return retlist - - actseq = ([]) - actqual = ([]) - if sequence[0] != splitchar: - inseq = True - else: - inseq = False - for char,qual in zip(sequence,qualities): - if inseq: - if char != splitchar: - actseq.append(char) - actqual.append(qual) - else: - retlist.append((''.join(actseq), actqual)) - actseq = ([]) - actqual = ([]) - inseq = False - else: - if char != splitchar: - inseq = True - actseq.append(char) - actqual.append(qual) - - if inseq and len(actseq): - retlist.append((''.join(actseq), actqual)) - - return retlist - - -def calc_subseq_boundaries(maskedseq, maskchar): - '''E.g.: - ........xxxxxxxx..........xxxxxxxxxxxxxxxxxxxxx......... 
- to - (0,8),(8,16),(16,26),(26,47),(47,56) - ''' - - blist = ([]) - if len(maskedseq) == 0: - return blist - - inmask = True - if maskedseq[0] != maskchar: - inmask = False - - start = 0 - for spos in range(len(maskedseq)): - if inmask and maskedseq[spos] != maskchar: - blist.append(([start,spos])) - start = spos - inmask = False - elif not inmask and maskedseq[spos] == maskchar: - blist.append(([start,spos])) - start = spos - inmask = True - - blist.append(([start,spos+1])) - - return blist - - -def correct_for_smallhits(maskedseq, maskchar, linkername): - '''If partial hits were found, take preventive measure: grow - the masked areas by 20 bases in each direction - Returns either unchanged "maskedseq" or a new sequence - with some more characters masked. - ''' - global linkerlengths - - CEBUG = 0 - - if CEBUG : print "correct_for_smallhits" - if CEBUG : print "Masked seq\n", maskedseq - if CEBUG : print "Linkername: ", linkername - - if len(maskedseq) == 0: - return maskedseq - - growl=40 - growl2=growl/2 - - boundaries = calc_subseq_boundaries(maskedseq,maskchar) - if CEBUG : print "Boundaries: ", boundaries - - foundpartial = False - for bounds in boundaries: - if CEBUG : print "\tbounds: ", bounds - left, right = bounds - if left != 0 and right != len(maskedseq): - if maskedseq[left] == maskchar: - # allow 10% discrepancy - # -linkerlengths[linkername]/10 - # that's a kind of safety net if there are slight sequencing - # errors in the linker itself - if right-left < linkerlengths[linkername]-linkerlengths[linkername]/10: - if CEBUG : print "\t\tPartial: found " + str(right-left) + " gaps, " + linkername + " is " + str(linkerlengths[linkername]) + " nt long." - foundpartial = True - - if not foundpartial: - return maskedseq - - # grow - newseq = "" - for bounds in boundaries: - if CEBUG : print "Bounds: ", bounds - left, right = bounds - if maskedseq[left] == maskchar: - newseq += maskedseq[left:right] - else: - clearstart = 0 - if left > 0 : - clearstart = left+growl2 - clearstop = len(maskedseq) - if right < len(maskedseq): - clearstop = right-growl2 - - if CEBUG : print "clearstart, clearstop: ",clearstart, clearstop - - if clearstop <= clearstart: - newseq += maskchar * (right-left) - else: - if clearstart != left: - newseq += maskchar * growl2 - newseq += maskedseq[clearstart:clearstop] - if clearstop != right: - newseq += maskchar * growl2 - - #print "newseq\n",newseq - - return newseq - - -def split_paired_end(data, sff_fh, seq_fh, qual_fh, xml_fh): - '''Splits a paired end read and writes sequences into FASTA, FASTA qual - and XML traceinfo file. Returns the number of sequences created. - - As the linker sequence may be anywhere in the read, including the ends - and overlapping with bad quality sequence, we need to perform some - computing and eventually set new clip points. - - If the resulting split yields only one sequence (because linker - was not present or overlapping with left or right clip), only one - sequence will be written with ".fn" appended to the name. - - If the read can be split, two reads will be written. The side left of - the linker will be named ".r" and will be written in reverse complement - into the file to conform with what approximately all assemblers expect - when reading paired-end data: reads in forward direction in file. The side - right of the linker will be named ".f" - - If SSAHA found partial linker (linker sequences < length of linker), - the sequences will get a "_pl" furthermore be cut back thoroughly. 
- - If SSAHA found multiple occurences of the linker, the names will get an - additional "_mlc" within the name to show that there was "multiple - linker contamination". - - For multiple or partial linker, the "good" parts of the reads are - stored with a ".part" name, additionally they will not get - template information in the XML - ''' - - global ssahapematches - - CEBUG = 0 - - maskchar = "#" - - if CEBUG : print "Need to split: " + data['name'] - - numseqs = 0; - readname = data['name'] - readlen = data['number_of_bases'] - - leftclip, rightclip = return_merged_clips(data) - seq, qual = get_read_data(data) - - if CEBUG : print "Original read:\n",seq - - maskedseq = seq - if leftclip > 0: - maskedseq = mask_sequence(maskedseq, maskchar, 0, leftclip-1) - if rightclip < len(maskedseq): - maskedseq = mask_sequence(maskedseq, maskchar, rightclip, len(maskedseq)) - - leftclip, rightclip = return_merged_clips(data) - readlen = data['number_of_bases'] - - if CEBUG : print "Readname:", readname - if CEBUG : print "Readlen:", readlen - if CEBUG : print "Num matches:", str(len(ssahapematches[data['name']])) - if CEBUG : print "matches:", ssahapematches[data['name']] - - for match in ssahapematches[data['name']]: - score = int(match[0]) - linkername = match[2] - leftreadhit = int(match[3]) - rightreadhit = int(match[4]) - #leftlinkerhit = int(match[5]) - #rightlinkerhit = int(match[6]) - #direction = match[7] - #hitlen = int(match[8]) - #hitidentity = float(match[9]) - - if CEBUG : print match - if CEBUG : print "Match with score:", score - if CEBUG : print "Read before:\n", maskedseq - maskedseq = mask_sequence(maskedseq, maskchar, leftreadhit-1, rightreadhit) - if CEBUG : print "Masked seq:\n", maskedseq - - correctedseq = correct_for_smallhits(maskedseq, maskchar, linkername) - - if len(maskedseq) != len(correctedseq): - raise RuntimeError("Internal error: maskedseq != correctedseq") - - partialhits = False - if correctedseq != maskedseq: - if CEBUG : print "Partial hits in", readname - if CEBUG : print "Original seq:\n", seq - if CEBUG : print "Masked seq:\n", maskedseq - if CEBUG : print "Corrected seq\n", correctedseq - partialhits = True - readname += "_pl" - maskedseq = correctedseq - - fragments = fragment_sequences(maskedseq, qual, maskchar) - - if CEBUG : print "Fragments (", len(fragments), "): ", fragments - - mlcflag = False - #if len(ssahapematches[data['name']]) > 1: - # #print "Multi linker contamination" - # mlcflag = True - # readname += "_mlc" - - if len(fragments) > 2: - if CEBUG : print "Multi linker contamination" - mlcflag = True - readname += "_mlc" - - - #print fragments - if mlcflag or partialhits: - fragcounter = 1 - readname += ".part" - for frag in fragments: - actseq = frag[0] - if len(actseq) >= 20: - actqual = frag[1] - oname = readname + str(fragcounter) - #seq_fh.write(">"+oname+"\n") - #seq_fh.write(actseq+"\n") - #qual_fh.write(">"+oname+"\n") - #qual_fh.write(' '.join((str(q) for q in actqual))) - #qual_fh.write("\n") - write_sequence(oname,actseq,actqual,seq_fh,qual_fh) - to_print = [create_basic_xml_info(oname,sff_fh.name)] - # No clipping in XML ... the multiple and partial fragments - # are clipped "hard" - # No template ID and trace_end: we don't know the - # orientation of the frahments. 
Even if it were - # only two, the fact we had multiple linkers - # says something went wrong, so simply do not - # write any paired-end information for all these fragments - to_print.append(' \n') - xml_fh.write(''.join(to_print)) - numseqs += 1 - fragcounter += 1 - else: - if len(fragments) >2: - raise RuntimeError("Unexpected: more than two fragments detected in " + readname + ". please contact the authors.") - # nothing will happen for 0 fragments - if len(fragments) == 1: - #print "Tada1" - boundaries = calc_subseq_boundaries(maskedseq,maskchar) - if len(boundaries) < 1 or len(boundaries) >3: - raise RuntimeError("Unexpected case: ", str(len(boundaries)), "boundaries for 1 fragment of ", readname) - if len(boundaries) == 3: - # case: mask char on both sides of sequence - #print "bounds3" - data['clip_adapter_left']=1+boundaries[0][1] - data['clip_adapter_right']=boundaries[2][0] - elif len(boundaries) == 2: - # case: mask char left or right of sequence - #print "bounds2", - if maskedseq[0] == maskchar : - # case: mask char left - #print "left" - data['clip_adapter_left']=1+boundaries[0][1] - else: - # case: mask char right - #print "right" - data['clip_adapter_right']=boundaries[1][0] - data['name'] = data['name'] + ".fn" - write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh) - numseqs = 1 - elif len(fragments) == 2: - #print "Tada2" - oname = readname + ".r" - seq, qual = get_read_data(data) - - startsearch = False - for spos in range(len(maskedseq)): - if maskedseq[spos] != maskchar: - startsearch = True; - else: - if startsearch: - break - - #print "\nspos: ", spos - lseq=seq[:spos] - #print "lseq:", lseq - actseq = reverse_complement(lseq) - lreadlen = len(actseq) - lqual = qual[:spos]; - # python hack to reverse a list/string/etc - lqual = lqual[::-1]; - - #seq_fh.write(">"+oname+"\n") - #seq_fh.write(actseq+"\n") - #qual_fh.write(">"+oname+"\n") - #qual_fh.write(' '.join((str(q) for q in lqual))) - #qual_fh.write("\n") - - write_sequence(oname,actseq,lqual,seq_fh,qual_fh) - - to_print = [create_basic_xml_info(oname,sff_fh.name)] - to_print.append(create_clip_xml_info(lreadlen, 0, lreadlen+1-data['clip_adapter_left'], 0, lreadlen+1-data['clip_qual_left'])); - to_print.append(' ') - to_print.append(readname) - to_print.append('\n') - to_print.append(' r\n') - to_print.append(' \n') - xml_fh.write(''.join(to_print)) - - oname = readname + ".f" - startsearch = False - for spos in range(len(maskedseq)-1,-1,-1): - if maskedseq[spos] != maskchar: - startsearch = True; - else: - if startsearch: - break - - actseq = seq[spos+1:] - actqual = qual[spos+1:]; - - #print "\nspos: ", spos - #print "rseq:", actseq - - #seq_fh.write(">"+oname+"\n") - #seq_fh.write(actseq+"\n") - #qual_fh.write(">"+oname+"\n") - #qual_fh.write(' '.join((str(q) for q in actqual))) - #qual_fh.write("\n") - write_sequence(oname,actseq,actqual,seq_fh,qual_fh) - - rreadlen = len(actseq) - to_print = [create_basic_xml_info(oname,sff_fh.name)] - to_print.append(create_clip_xml_info(rreadlen, 0, rreadlen-(readlen-data['clip_adapter_right']), 0, rreadlen-(readlen-data['clip_qual_right']))); - to_print.append(' ') - to_print.append(readname) - to_print.append('\n') - to_print.append(' f\n') - to_print.append(' \n') - xml_fh.write(''.join(to_print)) - numseqs = 2 - - return numseqs - - - -def extract_reads_from_sff(config, sff_files): - '''Given the configuration and the list of sff_files it writes the seqs, - qualities and ancillary data into the output file(s). 
- - If file for paired-end linker was given, first extracts all sequences - of an SFF and searches these against the linker(s) with SSAHA2 to - create needed information to split reads. - ''' - - global ssahapematches - - - if len(sff_files) == 0 : - raise RuntimeError("No SFF file given?") - - #we go through all input files - for sff_file in sff_files: - if not os.path.getsize(sff_file): - raise RuntimeError('Empty file? : ' + sff_file) - fh = open(sff_file, 'r') - fh.close() - - openmode = 'w' - if config['append']: - openmode = 'a' - - seq_fh = open(config['seq_fname'], openmode) - xml_fh = open(config['xml_fname'], openmode) - if config['want_fastq']: - qual_fh = None - try: - os.remove(config['qual_fname']) - except : - python_formattingwithoutbracesisdumb_dummy = 1 - else: - qual_fh = open(config['qual_fname'], openmode) - - if not config['append']: - xml_fh.write('\n\n') - else: - remove_last_xmltag_in_file(config['xml_fname'], "trace_volume") - - #we go through all input files - for sff_file in sff_files: - #print "Working on '" + sff_file + "':" - ssahapematches.clear() - - seqcheckstore = ([]) - - debug = 0 - - if not debug and config['pelinker_fname']: - #print "Creating temporary sequences from reads in '" + sff_file + "' ... ", - sys.stdout.flush() - - if 0 : - # for debugging - pid = os.getpid() - tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta' - tmpfasta_fh = open(tmpfasta_fname, 'w') - else: - tmpfasta_fh = tempfile.NamedTemporaryFile(prefix = 'sffeseqs_', - suffix = '.fasta') - - sff_fh = open(sff_file, 'rb') - header_data = read_header(fileh=sff_fh) - for seq_data in sequences(fileh=sff_fh, header=header_data): - seq,qual = get_read_data(seq_data) - seqstring, qualstring = format_as_fasta(seq_data['name'],seq,qual) - tmpfasta_fh.write(seqstring) - #seq, qual, anci = extract_read_info(seq_data, sff_fh.name) - #tmpfasta_fh.write(seq) - #print "done." - tmpfasta_fh.seek(0) - - if 0 : - # for debugging - tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2' - tmpssaha_fh = open(tmpssaha_fname, 'w+') - else: - tmpssaha_fh = tempfile.NamedTemporaryFile(prefix = 'sffealig_', - suffix = '.ssaha2') - - launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh) - tmpfasta_fh.close() - - tmpssaha_fh.seek(0) - read_ssaha_data(tmpssaha_fh) - tmpssaha_fh.close() - - if debug: - tmpssaha_fh = open("sffe.tmp.10634.ssaha2", 'r') - read_ssaha_data(tmpssaha_fh) - - #print "Converting '" + sff_file + "' ... ", - sys.stdout.flush() - sff_fh = open(sff_file, 'rb') - #read_header(infile) - header_data = read_header(fileh=sff_fh) - - #now convert all reads - nseqs_sff = 0 - nseqs_out = 0 - for seq_data in sequences(fileh=sff_fh, header=header_data): - nseqs_sff += 1 - - seq, qual = clip_read(seq_data) - seqcheckstore.append(seq[0:50]) - - #if nseqs_sff >1000: - # check_for_dubious_startseq(seqcheckstore,sff_file,seq_data) - # sys.exit() - - if ssahapematches.has_key(seq_data['name']): - #print "Paired end:",seq_data['name'] - nseqs_out += split_paired_end(seq_data, sff_fh, seq_fh, qual_fh, xml_fh) - else: - #print "Normal:",seq_data['name'] - if config['pelinker_fname']: - seq_data['name'] = seq_data['name'] + ".fn" - write_unpaired_read(seq_data, sff_fh, seq_fh, qual_fh, xml_fh) - nseqs_out += 1 - #print "done." - #print 'Converted', str(nseqs_sff), 'reads into', str(nseqs_out), 'sequences.' 
- sff_fh.close() - - check_for_dubious_startseq(seqcheckstore,sff_file,seq_data) - seqcheckstore = ([]) - - xml_fh.write('\n') - - xml_fh.close() - seq_fh.close() - if qual_fh is not None: - qual_fh.close() - - return - -def check_for_dubious_startseq(seqcheckstore, sffname,seqdata): - - global stern_warning - - foundproblem = "" - for checklen in range(1,len(seqcheckstore[0])): - foundinloop = False - seqdict = {} - for seq in seqcheckstore: - shortseq = seq[0:checklen] - if shortseq in seqdict: - seqdict[shortseq] += 1 - else: - seqdict[shortseq] = 1 - - for shortseq, count in seqdict.items(): - if float(count)/len(seqcheckstore) >= 0.5: - foundinloop = True - stern_warning - foundproblem = "\n"+"*" * 80 - foundproblem += "\nWARNING: " - foundproblem += "weird sequences in file " + sffname + "\n\n" - foundproblem += "After applying left clips, " + str(count) + " sequences (=" - foundproblem += '%.0f'%(100.0*float(count)/len(seqcheckstore)) - foundproblem += "%) start with these bases:\n" + shortseq - foundproblem += "\n\nThis does not look sane.\n\n" - foundproblem += "Countermeasures you *probably* must take:\n" - foundproblem += "1) Make your sequence provider aware of that problem and ask whether this can be\n corrected in the SFF.\n" - foundproblem += "2) If you decide that this is not normal and your sequence provider does not\n react, use the --min_left_clip of sff_extract.\n" - left,right = return_merged_clips(seqdata) - foundproblem += " (Probably '--min_left_clip="+ str(left+len(shortseq))+"' but you should cross-check that)\n" - foundproblem += "*" * 80 + "\n" - if not foundinloop : - break - if len(foundproblem): - print foundproblem - - -def parse_extra_info(info): - '''It parses the information that will go in the xml file. - - There are two formats accepted for the extra information: - key1:value1, key2:value2 - or: - file1.sff{key1:value1, key2:value2};file2.sff{key3:value3} - ''' - if not info: - return info - finfos = info.split(';') #information for each file - data_for_files = {} - for finfo in finfos: - #we split the file name from the rest - items = finfo.split('{') - if len(items) == 1: - fname = fake_sff_name - info = items[0] - else: - fname = items[0] - info = items[1] - #now we get each key,value pair in the info - info = info.replace('}', '') - data = {} - for item in info.split(','): - key, value = item.strip().split(':') - key = key.strip() - value = value.strip() - data[key] = value - data_for_files[fname] = data - return data_for_files - - -def return_merged_clips(data): - '''It returns the left and right positions to clip.''' - def max(a, b): - '''It returns the max of the two given numbers. - - It won't take into account the zero values. - ''' - if not a and not b: - return None - if not a: - return b - if not b: - return a - if a >= b: - return a - else: - return b - def min(a, b): - '''It returns the min of the two given numbers. - - It won't take into account the zero values. - ''' - if not a and not b: - return None - if not a: - return b - if not b: - return a - if a <= b: - return a - else: - return b - left = max(data['clip_adapter_left'], data['clip_qual_left']) - right = min(data['clip_adapter_right'], data['clip_qual_right']) - #maybe both clips where zero - if left is None: - left = 1 - if right is None: - right = data['number_of_bases'] - return left, right - -def sequence_case(data): - '''Given the data for one read it returns the seq with mixed case. - - The regions to be clipped will be lower case and the rest upper case. 
- ''' - left, right = return_merged_clips(data) - seq = data['bases'] - new_seq = ''.join((seq[:left-1].lower(), seq[left-1:right], seq[right:].lower())) - return new_seq - -def clip_read(data): - '''Given the data for one read it returns clipped seq and qual.''' - - qual = data['quality_scores'] - left, right = return_merged_clips(data) - seq = data['bases'] - qual = data['quality_scores'] - new_seq = seq[left-1:right] - new_qual = qual[left-1:right] - - return new_seq, new_qual - - - -def tests_for_ssaha(linker_fname): - '''Tests whether SSAHA2 can be successfully called.''' - - try: - print "Testing whether SSAHA2 is installed and can be launched ... ", - sys.stdout.flush() - fh = open('/dev/null', 'w') - retcode = subprocess.call(["ssaha2", "-v"], stdout = fh) - fh.close() - print "ok." - except : - print "nope? Uh oh ...\n\n" - raise RuntimeError('Could not launch ssaha2. Have you installed it? Is it in your path?') - - -def load_linker_sequences(linker_fname): - '''Loads all linker sequences into memory, storing only the length - of each linker.''' - - global linkerlengths - - if not os.path.getsize(linker_fname): - raise RuntimeError("File empty? '" + linker_fname + "'") - fh = open(linker_fname, 'r') - linkerseqs = read_fasta(fh) - if len(linkerseqs) == 0: - raise RuntimeError(linker_fname + ": no sequence found?") - for i in linkerseqs: - if linkerlengths.has_key(i.name): - raise RuntimeError(linker_fname + ": sequence '" + i.name + "' present multiple times. Aborting.") - linkerlengths[i.name] = len(i.sequence) - fh.close() - - -def launch_ssaha(linker_fname, query_fname, output_fh): - '''Launches SSAHA2 on the linker and query file, string SSAHA2 output - into the output filehandle''' - - try: - print "Searching linker sequences with SSAHA2 (this may take a while) ... ", - sys.stdout.flush() - retcode = subprocess.call(["ssaha2", "-output", "ssaha2", "-solexa", "-kmer", "4", "-skip", "1", linker_fname, query_fname], stdout = output_fh) - if retcode: - raise RuntimeError('Ups.') - else: - print "ok." - except: - print "\n" - raise RuntimeError('An error occured during the SSAHA2 execution, aborting.') - -def read_ssaha_data(ssahadata_fh): - '''Given file handle, reads file generated with SSAHA2 (with default - output format) and stores all matches as list ssahapematches - (ssaha paired-end matches) dictionary''' - - global ssahapematches - - print "Parsing SSAHA2 result file ... ", - sys.stdout.flush() - - for line in ssahadata_fh: - if line.startswith('ALIGNMENT'): - ml = line.split() - if len(ml) != 12 : - print "\n", line, - raise RuntimeError('Expected 12 elements in the SSAHA2 line with ALIGMENT keyword, but found ' + str(len(ml))) - if not ssahapematches.has_key(ml[2]) : - ssahapematches[ml[2]] = ([]) - if ml[8] == 'F': - #print line, - - # store everything except the first element (output - # format name (ALIGNMENT)) and the last element - # (length) - ssahapematches[ml[2]].append(ml[1:-1]) - else: - #print ml - ml[4],ml[5] = ml[5],ml[4] - #print ml - ssahapematches[ml[2]].append(ml[1:-1]) - - print "done." 
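# --- Editor's sketch, not part of the original sff_extract.py ------------
# return_merged_clips() above combines the adapter and quality clip pairs
# stored in an SFF record, where 0 means "no clip on this side": the merged
# left clip is the larger of the two left values, the merged right clip the
# smaller of the two right values, and clip_read() keeps the 1-based
# inclusive range [left, right]. The same rule, condensed:
def merged_clips_sketch(adap_l, qual_l, adap_r, qual_r, nbases):
    '''Return (left, right), 1-based inclusive bounds of the good region.'''
    lefts = [c for c in (adap_l, qual_l) if c]
    rights = [c for c in (adap_r, qual_r) if c]
    left = max(lefts) if lefts else 1           # no left clip: start at base 1
    right = min(rights) if rights else nbases   # no right clip: run to the end
    return left, right

def clip_sketch(seq, left, right):
    return seq[left - 1:right]   # 1-based inclusive bounds -> Python slice

# merged_clips_sketch(4, 7, 0, 95, 100) -> (7, 95)
# --- end editor's sketch --------------------------------------------------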
- - -########################################################################## -# -# BaCh: This block was shamelessly copied from -# http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/ -# and then subsequently modified to read fasta correctly -# It's still not fool proof, but should be good enough -# -########################################################################## - -class Fasta: - def __init__(self, name, sequence): - self.name = name - self.sequence = sequence - -def read_fasta(file): - items = [] - aninstance = Fasta('', '') - linenum = 0 - for line in file: - linenum += 1 - if line.startswith(">"): - if len(aninstance.sequence): - items.append(aninstance) - aninstance = Fasta('', '') - # name == all characters until the first whitespace - # (split()[0]) but without the starting ">" ([1:]) - aninstance.name = line.split()[0][1:] - aninstance.sequence = '' - if len(aninstance.name) == 0: - raise RuntimeError(file.name + ': no name in line ' + str(linenum) + '?') - - else: - if len(aninstance.name) == 0: - raise RuntimeError(file.name + ': no sequence header at line ' + str(linenum) + '?') - aninstance.sequence += line.strip() - - if len(aninstance.name) and len(aninstance.sequence): - items.append(aninstance) - - return items -########################################################################## - -def version_string (): - return "sff_extract " + __version__ - -def read_config(): - '''It reads the configuration options from the command line arguments and - it returns a dict with them.''' - from optparse import OptionParser, OptionGroup - usage = "usage: %prog [options] sff1 sff2 ..." - desc = "Extract sequences from 454 SFF files into FASTA, FASTA quality"\ - " and XML traceinfo format. When a paired-end linker sequence"\ - " is given (-l), use SSAHA2 to scan the sequences for the linker,"\ - " then split the sequences, removing the linker." 
- parser = OptionParser(usage = usage, version = version_string(), description = desc) - parser.add_option('-a', '--append', action="store_true", dest='append', - help='append output to existing files', default=False) - parser.add_option('-i', '--xml_info', dest='xml_info', - help='extra info to write in the xml file') - parser.add_option("-l", "--linker_file", dest="pelinker_fname", - help="FASTA file with paired-end linker sequences", metavar="FILE") - - group = OptionGroup(parser, "File name options","") - group.add_option('-c', '--clip', action="store_true", dest='clip', - help='clip (completely remove) ends with low qual and/or adaptor sequence', default=False) - group.add_option('-u', '--upper_case', action="store_false", dest='mix_case', - help='all bases in upper case, including clipped ends', default=True) - group.add_option('', '--min_left_clip', dest='min_leftclip', - metavar="INTEGER", type = "int", - help='if the left clip coming from the SFF is smaller than this value, override it', default=0) - group.add_option('-Q', '--fastq', action="store_true", dest='want_fastq', - help='store as FASTQ file instead of FASTA + FASTA quality file', default=False) - parser.add_option_group(group) - - group = OptionGroup(parser, "File name options","") - group.add_option("-o", "--out_basename", dest="basename", - help="base name for all output files") - group.add_option("-s", "--seq_file", dest="seq_fname", - help="output sequence file name", metavar="FILE") - group.add_option("-q", "--qual_file", dest="qual_fname", - help="output quality file name", metavar="FILE") - group.add_option("-x", "--xml_file", dest="xml_fname", - help="output ancillary xml file name", metavar="FILE") - parser.add_option_group(group) - - #default fnames - #is there an sff file? 
- basename = 'reads' - if sys.argv[-1][-4:].lower() == '.sff': - basename = sys.argv[-1][:-4] - def_seq_fname = basename + '.fasta' - def_qual_fname = basename + '.fasta.qual' - def_xml_fname = basename + '.xml' - def_pelinker_fname = '' - parser.set_defaults(seq_fname = def_seq_fname) - parser.set_defaults(qual_fname = def_qual_fname) - parser.set_defaults(xml_fname = def_xml_fname) - parser.set_defaults(pelinker_fname = def_pelinker_fname) - - #we parse the cmd line - (options, args) = parser.parse_args() - - #we put the result in a dict - global config - config = {} - for property in dir(options): - if property[0] == '_' or property in ('ensure_value', 'read_file', - 'read_module'): - continue - config[property] = getattr(options, property) - - if config['basename'] is None: - config['basename']=basename - - #if we have not set a file name with -s, -q or -x we set the basename - #based file name - if config['want_fastq']: - config['qual_fname'] = '' - if config['seq_fname'] == def_seq_fname: - config['seq_fname'] = config['basename'] + '.fastq' - else: - if config['seq_fname'] == def_seq_fname: - config['seq_fname'] = config['basename'] + '.fasta' - if config['qual_fname'] == def_qual_fname: - config['qual_fname'] = config['basename'] + '.fasta.qual' - - if config['xml_fname'] == def_xml_fname: - config['xml_fname'] = config['basename'] + '.xml' - - #we parse the extra info for the xml file - config['xml_info'] = parse_extra_info(config['xml_info']) - return config, args - - - -########################################################################## - - -def testsome(): - sys.exit() - return - - -def debug(): - try: - dummy = 1 - #debug() - #testsome() - - config, args = read_config() - load_linker_sequences(config['pelinker_fname']) - - #pid = os.getpid() - pid = 15603 - - #tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta' - #tmpfasta_fh = open(tmpfasta_fname, 'w') - tmpfasta_fname = 'FLVI58L05.fa' - tmpfasta_fh = open(tmpfasta_fname, 'r') - - tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2' - tmpssaha_fh = open(tmpssaha_fname, 'w') - - launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh) - - tmpssaha_fh = open("sffe.tmp.15603.ssaha2", 'r') - read_ssaha_data(tmpssaha_fh) - - sys.exit() - - extract_reads_from_sff(config, args) - - except (OSError, IOError, RuntimeError), errval: - print errval - sys.exit() - - sys.exit() - - -def main(): - - argv = sys.argv - if len(argv) == 1: - sys.argv.append('-h') - read_config() - sys.exit() - try: - #debug(); - - config, args = read_config() - - if config['pelinker_fname']: - #tests_for_ssaha(config['pelinker_fname']) - load_linker_sequences(config['pelinker_fname']) - if len(args) == 0: - raise RuntimeError("No SFF file given?") - extract_reads_from_sff(config, args) - except (OSError, IOError, RuntimeError), errval: - print errval - return 1 - - if stern_warning: - return 1 - - return 0 - - - -if __name__ == "__main__": - sys.exit(main()) diff -r c2a356708570 -r 33c067c3ae34 tools/filters/sff_extractor.xml --- a/tools/filters/sff_extractor.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - - - - #if str($fastq_output) == "fastq_false" #sff_extract.py $clip --seq_file=$out_file3 --qual_file=$out_file4 --xml_file=$out_file2 $input - #elif str($fastq_output) == "fastq_true" #sff_extract.py $clip --fastq --seq_file=$out_file1 --xml_file=$out_file2 $input - #end if# - - - - - - - - - - - - fastq_output is True - - - - - fastq_output is False - - - fastq_output is False - - - - - - - - - 
- - - - - - - - - - - - -**What it does** - -This tool extracts data from the 454 Sequencer SFF format and creates three files containing the: -Sequences (FASTA), -Qualities (QUAL) and -Clippings (XML) - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/sorter.py --- a/tools/filters/sorter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -""" -Sorts tabular data on one or more columns. - -usage: %prog [options] - -i, --input=i: Tabular file to be sorted - -o, --out_file1=o: Sorted output file - -c, --column=c: First column to sort on - -s, --style=s: Sort style (numerical or alphabetical) - -r, --order=r: Order (ASC or DESC) - -usage: %prog input out_file1 column style order [column style ...] -""" - -import os, re, string, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def main(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - try: - inputfile = options.input - outputfile = '-o %s' % options.out_file1 - columns = [options.column] - styles = [('','n')[options.style == 'num']] - orders = [('','r')[options.order == 'DESC']] - col_style_orders = sys.argv[6:] - if len(col_style_orders) > 1: - columns.extend([col_style_orders[i] for i in range(0,len(col_style_orders),3)]) - styles.extend([('','n')[col_style_orders[i] == 'num'] for i in range(1,len(col_style_orders),3)]) - orders.extend([('','r')[col_style_orders[i] == 'DESC'] for i in range(2,len(col_style_orders),3)]) - cols = [ '-k%s,%s%s%s'%(columns[i], columns[i], styles[i], orders[i]) for i in range(len(columns)) ] - except Exception, ex: - stop_err('Error parsing input parameters\n' + str(ex)) - - # Launch sort. - cmd = "sort -f -t ' ' %s %s %s" % (' '.join(cols), outputfile, inputfile) - try: - os.system(cmd) - except Exception, ex: - stop_err('Error running sort command\n' + str(ex)) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/sorter.xml --- a/tools/filters/sorter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,130 +0,0 @@ - - data in ascending or descending order - - sorter.py - --input=$input - --out_file1=$out_file1 - --column=$column - --style=$style - --order=$order - #for $col in $column_set: - ${col.other_column} - ${col.other_style} - ${col.other_order} - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool sorts the dataset on any number of columns in either ascending or descending order. - -* Numerical sort orders numbers by their magnitude, ignores all characters besides numbers, and evaluates a string of numbers to the value they signify. -* Alphabetical sort is a phonebook type sort based on the conventional order of letters in an alphabet. Each nth letter is compared with the nth letter of other words in the list, starting at the first letter of each word and advancing to the second, third, fourth, and so on, until the order is established. Therefore, in an alphabetical sort, 2 comes after 100 (1 < 2). - ------ - -**Examples** - -The list of numbers 4,17,3,5 collates to 3,4,5,17 by numerical sorting, while it collates to 17,3,4,5 by alphabetical sorting. 
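
Under the hood, sorter.py assembles a single GNU sort invocation with one
"-k col,col" key per requested column, adding "n" for numerical and "r" for
descending keys. The same multi-key ordering can be reproduced in Python by
sorting on the minor keys first and relying on sort stability; a sketch under
that assumption (not the wrapper's actual code; "input.tsv" is a placeholder
file name)::

    def sort_rows(rows, keys):
        # keys: (1-based column, 'num' or 'alpha', 'ASC' or 'DESC'),
        # listed with the primary key first, as in the tool form
        for col, style, order in reversed(keys):
            def keyfunc(fields, c=col - 1, num=(style == 'num')):
                return float(fields[c]) if num else fields[c]
            rows.sort(key=keyfunc, reverse=(order == 'DESC'))
        return rows

    rows = [line.rstrip('\n').split('\t') for line in open('input.tsv')]
    sort_rows(rows, [(1, 'alpha', 'ASC'), (3, 'num', 'ASC'), (6, 'num', 'ASC')])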
- -Sorting the following:: - - Q d 7 II jhu 45 - A kk 4 I h 111 - Pd p 1 ktY WS 113 - A g 10 H ZZ 856 - A edf 4 tw b 234 - BBB rt 10 H ZZ 100 - A rew 10 d b 1111 - C sd 19 YH aa 10 - Hah c 23 ver bb 467 - MN gtr 1 a X 32 - N j 9 a T 205 - BBB rrf 10 b Z 134 - odfr ws 6 Weg dew 201 - C f 3 WW SW 34 - A jhg 4 I b 345 - Pd gf 7 Gthe de 567 - rS hty 90 YY LOp 89 - A g 10 H h 43 - A g 4 I h 500 - -on columns 1 (alpha), 3 (num), and 6 (num) in ascending order will yield:: - - A kk 4 I h 111 - A edf 4 tw b 234 - A jhg 4 I b 345 - A g 4 I h 500 - A g 10 H h 43 - A g 10 H ZZ 856 - A rew 10 d b 1111 - BBB rt 10 H ZZ 100 - BBB rrf 10 b Z 134 - C f 3 WW SW 34 - C sd 19 YH aa 10 - Hah c 23 ver bb 467 - MN gtr 1 a X 32 - N j 9 a T 205 - odfr ws 6 Weg dew 201 - Pd p 1 ktY WS 113 - Pd gf 7 Gthe de 567 - Q d 7 II jhu 45 - rS hty 90 YY LOp 89 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/tailWrapper.pl --- a/tools/filters/tailWrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; - -# a wrapper for tail for use in galaxy -# lessWrapper.pl [filename] [# lines to show] [output] - -die "Check arguments" unless @ARGV == 3; -die "Line number should be an integer\n" unless $ARGV[1]=~ m/^\d+$/; - -open (OUT, ">$ARGV[2]") or die "Cannot create $ARGV[2]:$!\n"; -open (TAIL, "tail -n $ARGV[1] $ARGV[0]|") or die "Cannot run tail:$!\n"; -while () { - print OUT; -} -close OUT; -close TAIL; - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/tailWrapper.xml --- a/tools/filters/tailWrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - lines from a dataset - tailWrapper.pl $input $lineNum $out_file1 - - - - - - - - - - - - - - - - -**What it does** - -This tool outputs specified number of lines from the **end** of a dataset - ------ - -**Example** - -- Input File:: - - chr7 57134 57154 D17003_CTCF_R7 356 - - chr7 57247 57267 D17003_CTCF_R4 207 + - chr7 57314 57334 D17003_CTCF_R5 269 + - chr7 57341 57361 D17003_CTCF_R7 375 + - chr7 57457 57477 D17003_CTCF_R3 188 + - -- Show last two lines of above file. The result is:: - - chr7 57341 57361 D17003_CTCF_R7 375 + - chr7 57457 57477 D17003_CTCF_R3 188 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/trimmer.py --- a/tools/filters/trimmer.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ -#!/usr/bin/env python - -import sys -import optparse - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - usage = """%prog [options] - -options (listed below) default to 'None' if omitted - """ - parser = optparse.OptionParser(usage=usage) - - parser.add_option( - '-a','--ascii', - dest='ascii', - action='store_true', - default = False, - help='Use ascii codes to defined ignored beginnings instead of raw characters') - - parser.add_option( - '-q','--fastq', - dest='fastq', - action='store_true', - default = False, - help='The input data in fastq format. 
It selected the script skips every even line since they contain sequence ids') - - parser.add_option( - '-i','--ignore', - dest='ignore', - help='A comma separated list on ignored beginnings (e.g., ">,@"), or its ascii codes (e.g., "60,42") if option -a is enabled') - - parser.add_option( - '-s','--start', - dest='start', - default = '0', - help='Trim from beginning to here (1-based)') - - parser.add_option( - '-e','--end', - dest='end', - default = '0', - help='Trim from here to the ned (1-based)') - - parser.add_option( - '-f','--file', - dest='input_txt', - default = False, - help='Name of file to be chopped. STDIN is default') - - parser.add_option( - '-c','--column', - dest='col', - default = '0', - help='Column to chop. If 0 = chop the whole line') - - - options, args = parser.parse_args() - invalid_starts = [] - - if options.input_txt: - infile = open ( options.input_txt, 'r') - else: - infile = sys.stdin - - if options.ignore and options.ignore != "None": - invalid_starts = options.ignore.split(',') - - if options.ascii and options.ignore and options.ignore != "None": - for i, item in enumerate( invalid_starts ): - invalid_starts[i] = chr( int( item ) ) - - col = int( options.col ) - - for i, line in enumerate( infile ): - line = line.rstrip( '\r\n' ) - if line: - - if options.fastq and i % 2 == 0: - print line - continue - - - if line[0] not in invalid_starts: - if col == 0: - if int( options.end ) > 0: - line = line[ int( options.start )-1 : int( options.end ) ] - else: - line = line[ int( options.start )-1 : ] - else: - fields = line.split( '\t' ) - if col-1 > len( fields ): - stop_err('Column %d does not exist. Check input parameters\n' % col) - - if int( options.end ) > 0: - fields[col - 1] = fields[col - 1][ int( options.start )-1 : int( options.end ) ] - else: - fields[col - 1] = fields[col - 1][ int( options.start )-1 : ] - line = '\t'.join(fields) - print line - -if __name__ == "__main__": main() - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/trimmer.xml --- a/tools/filters/trimmer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,120 +0,0 @@ - - leading or trailing characters - - trimmer.py -a -f $input1 -c $col -s $start -e $end -i $ignore $fastq > $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Trims specified number of characters from a dataset or its field (if dataset is tab-delimited). - ------ - -**Example 1** - -Trimming this dataset:: - - 1234567890 - abcdefghijk - -by setting **Trim from the beginning to this position** to *2* and **Remove everything from this position to the end** to *6* will produce:: - - 23456 - bcdef - ------ - -**Example 2** - -Trimming column 2 of this dataset:: - - abcde 12345 fghij 67890 - fghij 67890 abcde 12345 - -by setting **Trim content of this column only** to *2*, **Trim from the beginning to this position** to *2*, and **Remove everything from this position to the end** to *4* will produce:: - - abcde 234 fghij 67890 - fghij 789 abcde 12345 - ------ - -**Trimming FASTQ datasets** - -This tool can be used to trim sequences and quality strings in fastq datasets. This is done by selected *Yes* from the **Is input dataset in fastq format?** dropdown. If set to *Yes*, the tool will skip all even numbered lines (see warning below). 
For example, trimming the last 5 bases of this dataset:: - - @081017-and-081020:1:1:1715:1759 - GGACTCAGATAGTAATCCACGCTCCTTTAAAATATC - + - II#IIIIIII$5+.(9IIIIIII$%*$G$A31I&&B - -can be done by setting **Remove everything from this position to the end** to 31:: - - @081017-and-081020:1:1:1715:1759 - GGACTCAGATAGTAATCCACGCTCCTTTAAA - + - II#IIIIIII$5+.(9IIIIIII$%*$G$A3 - -**Note** that headers are skipped. - -.. class:: warningmark - -**WARNING:** This tool will only work on properly formatted fastq datasets where (1) each read and quality string occupy one line and (2) '@' (read header) and "+" (quality header) lines are evenly numbered, as in the above example. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/ucsc_gene_bed_to_exon_bed.py --- a/tools/filters/ucsc_gene_bed_to_exon_bed.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,152 +0,0 @@ -#!/usr/bin/env python - -""" -Read a table dump in the UCSC gene table format and print a tab separated -list of intervals corresponding to requested features of each gene. - -usage: ucsc_gene_table_to_intervals.py [options] - -options: - -h, --help show this help message and exit - -rREGION, --region=REGION - Limit to region: one of coding, utr3, utr5, codon, intron, transcribed [default] - -e, --exons Only print intervals overlapping an exon - -i, --input=inputfile input file - -o, --output=outputfile output file -""" - -import optparse, string, sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - - # Parse command line - parser = optparse.OptionParser( usage="%prog [options] " ) - parser.add_option( "-r", "--region", dest="region", default="transcribed", - help="Limit to region: one of coding, utr3, utr5, transcribed [default]" ) - parser.add_option( "-e", "--exons", action="store_true", dest="exons", - help="Only print intervals overlapping an exon" ) - parser.add_option( "-s", "--strand", action="store_true", dest="strand", - help="Print strand after interval" ) - parser.add_option( "-i", "--input", dest="input", default=None, - help="Input file" ) - parser.add_option( "-o", "--output", dest="output", default=None, - help="Output file" ) - options, args = parser.parse_args() - assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed', 'intron', 'codon' ), "Invalid region argument" - - try: - out_file = open (options.output,"w") - except: - print >> sys.stderr, "Bad output file." - sys.exit(0) - - try: - in_file = open (options.input) - except: - print >> sys.stderr, "Bad input file." 
- sys.exit(0) - - print "Region:", options.region+";" - """print "Only overlap with Exons:", - if options.exons: - print "Yes" - else: - print "No" - """ - - # Read table and handle each gene - for line in in_file: - try: - if line[0:1] == "#": - continue - # Parse fields from gene tabls - fields = line.split( '\t' ) - chrom = fields[0] - tx_start = int( fields[1] ) - tx_end = int( fields[2] ) - name = fields[3] - strand = fields[5].replace(" ","_") - cds_start = int( fields[6] ) - cds_end = int( fields[7] ) - - # Determine the subset of the transcribed region we are interested in - if options.region == 'utr3': - if strand == '-': region_start, region_end = tx_start, cds_start - else: region_start, region_end = cds_end, tx_end - elif options.region == 'utr5': - if strand == '-': region_start, region_end = cds_end, tx_end - else: region_start, region_end = tx_start, cds_start - elif options.region == 'coding' or options.region == 'codon': - region_start, region_end = cds_start, cds_end - else: - region_start, region_end = tx_start, tx_end - - # If only interested in exons, print the portion of each exon overlapping - # the region of interest, otherwise print the span of the region - # options.exons is always TRUE - if options.exons: - exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) - exon_starts = map((lambda x: x + tx_start ), exon_starts) - exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) - exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); - - #for Intron regions: - if options.region == 'intron': - i=0 - while i < len(exon_starts)-1: - intron_starts = exon_ends[i] - intron_ends = exon_starts[i+1] - if strand: print_tab_sep(out_file, chrom, intron_starts, intron_ends, name, "0", strand ) - else: print_tab_sep(out_file, chrom, intron_starts, intron_ends ) - i+=1 - #for non-intron regions: - else: - for start, end in zip( exon_starts, exon_ends ): - start = max( start, region_start ) - end = min( end, region_end ) - if start < end: - if options.region == 'codon': - start += (3 - ((start-region_start)%3))%3 - c_start = start - while c_start+3 <= end: - if strand: - print_tab_sep(out_file, chrom, c_start, c_start+3, name, "0", strand ) - else: - print_tab_sep(out_file, chrom, c_start, c_start+3) - c_start += 3 - else: - if strand: - print_tab_sep(out_file, chrom, start, end, name, "0", strand ) - else: - print_tab_sep(out_file, chrom, start, end ) - """ - else: - if options.region == 'codon': - c_start = start - c_end = end - if c_start > c_end: - t = c_start - c_start = c_end - c_end = t - while c_start+3 <= c_end: - if strand: - print_tab_sep(out_file, chrom, c_start, c_start+3, name, "0", strand ) - else: - print_tab_sep(out_file, chrom, c_start, c_start+3) - c_start += 3 - else: - if strand: - print_tab_sep(out_file, chrom, region_start, region_end, name, "0", strand ) - else: - print_tab_sep(out_file, chrom, region_start, region_end ) - """ - except: - continue - -def print_tab_sep(out_file, *args ): - """Print items in `l` to stdout separated by tabs""" - print >>out_file, string.join( [ str( f ) for f in args ], '\t' ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/ucsc_gene_bed_to_exon_bed.xml --- a/tools/filters/ucsc_gene_bed_to_exon_bed.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ - -expander - ucsc_gene_bed_to_exon_bed.py --input=$input1 --output=$out_file1 --region=$region "--exons" - - - - - - - - - - - - - - - - - - - - - - - - -.. 
class:: warningmark - -This tool works only on a BED file that contains at least 12 fields (see **Example** and **About formats** below). The output will be empty if applied to a BED file with 3 or 6 fields. - ------- - -**What it does** - -BED format can be used to represent a single gene in just one line, which contains the information about exons, coding sequence location (CDS), and positions of untranslated regions (UTRs). This tool *unpacks* this information by converting a single line describing a gene into a collection of lines representing individual exons, introns, UTRs, etc. - -------- - -**Example** - -Extracting **Coding Exons + UTR Exons** from the following two BED lines:: - - chr7 127475281 127491632 NM_000230 0 + 127486022 127488767 0 3 29,172,3225, 0,10713,13126 - chr7 127486011 127488900 D49487 0 + 127486022 127488767 0 2 155,490, 0,2399 - -will return:: - - chr7 127475281 127475310 NM_000230 0 + - chr7 127485994 127486166 NM_000230 0 + - chr7 127488407 127491632 NM_000230 0 + - chr7 127486011 127486166 D49487 0 + - chr7 127488410 127488900 D49487 0 + - ------- - -.. class:: infomark - -**About formats** - -**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and additional optional ones. In the specific case of this tool the following fields must be present:: - - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) - 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). - 4. name - The name of the BED line. - 5. score - A score between 0 and 1000. - 6. strand - Defines the strand - either '+' or '-'. - 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. - 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. - 9. reserved - This should always be set to zero. - 10. blockCount - The number of blocks (exons) in the BED line. - 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. - 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/ucsc_gene_bed_to_intron_bed.py --- a/tools/filters/ucsc_gene_bed_to_intron_bed.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ -#!/usr/bin/env python - -""" -Read a table dump in the UCSC gene table format and print a tab separated -list of intervals corresponding to requested features of each gene. 
- -usage: ucsc_gene_table_to_intervals.py [options] - -options: - -h, --help show this help message and exit - -rREGION, --region=REGION - Limit to region: one of coding, utr3, utr5, transcribed [default] - -e, --exons Only print intervals overlapping an exon - -i, --input=inputfile input file - -o, --output=outputfile output file -""" - -import optparse, string, sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - - # Parse command line - parser = optparse.OptionParser( usage="%prog [options] " ) - #parser.add_option( "-r", "--region", dest="region", default="transcribed", - # help="Limit to region: one of coding, utr3, utr5, transcribed [default]" ) - #parser.add_option( "-e", "--exons", action="store_true", dest="exons", - # help="Only print intervals overlapping an exon" ) - parser.add_option( "-s", "--strand", action="store_true", dest="strand", - help="Print strand after interval" ) - parser.add_option( "-i", "--input", dest="input", default=None, - help="Input file" ) - parser.add_option( "-o", "--output", dest="output", default=None, - help="Output file" ) - options, args = parser.parse_args() - #assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument" - - try: - out_file = open (options.output,"w") - except: - print >> sys.stderr, "Bad output file." - sys.exit(0) - - try: - in_file = open (options.input) - except: - print >> sys.stderr, "Bad input file." - sys.exit(0) - - #print "Region:", options.region+";" - #print "Only overlap with Exons:", - #if options.exons: - # print "Yes" - #else: - # print "No" - - # Read table and handle each gene - - for line in in_file: - try: - #print ("len: %d", len(line)) - if line[0:1] == "#": - continue - - # Parse fields from gene tabls - fields = line.split( '\t' ) - chrom = fields[0] - tx_start = int( fields[1] ) - tx_end = int( fields[2] ) - name = fields[3] - strand = fields[5].replace(" ","_") - cds_start = int( fields[6] ) - cds_end = int( fields[7] ) - - exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) - exon_starts = map((lambda x: x + tx_start ), exon_starts) - exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) - exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); - - i=0 - while i < len(exon_starts)-1: - intron_starts = exon_ends[i] + 1 - intron_ends = exon_starts[i+1] - 1 - if strand: print_tab_sep(out_file, chrom, intron_starts, intron_ends, name, "0", strand ) - else: print_tab_sep(out_file, chrom, intron_starts, intron_ends ) - i+=1 - # If only interested in exons, print the portion of each exon overlapping - # the region of interest, otherwise print the span of the region - - except: - continue - -def print_tab_sep(out_file, *args ): - """Print items in `l` to stdout separated by tabs""" - print >>out_file, string.join( [ str( f ) for f in args ], '\t' ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/ucsc_gene_bed_to_intron_bed.xml --- a/tools/filters/ucsc_gene_bed_to_intron_bed.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ - -expander - ucsc_gene_bed_to_intron_bed.py --input=$input1 --output=$out_file1 - - - - - - - - - - - - - - - -**Syntax** - -This tool converts a UCSC gene bed format file to a list of bed format lines corresponding to requested features of each gene. - -- **BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. 
It has three required fields and twelve additional optional ones:: - - The first three BED fields (required) are: - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) - 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). - - The twelve additional BED fields (optional) are: - 4. name - The name of the BED line. - 5. score - A score between 0 and 1000. - 6. strand - Defines the strand - either '+' or '-'. - 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. - 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. - 9. reserved - This should always be set to zero. - 10. blockCount - The number of blocks (exons) in the BED line. - 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. - 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. - 13. expCount - The number of experiments. - 14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount. - 15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount. - ------ - -**Example** - -- A UCSC gene bed format file:: - - chr7 127475281 127491632 NM_000230 0 + 127486022 127488767 0 3 29,172,3225, 0,10713,13126 - chr7 127486011 127488900 D49487 0 + 127486022 127488767 0 2 155,490, 0,2399 - -- Converts the above file to a list of bed lines, which has the introns:: - - chr7 127475311 127475993 NM_000230 0 + - chr7 127486167 127488406 NM_000230 0 + - chr7 127486167 127488409 D49487 0 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/ucsc_gene_table_to_intervals.py --- a/tools/filters/ucsc_gene_table_to_intervals.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ -#!/usr/bin/env python - -""" -Read a table dump in the UCSC gene table format and print a tab separated -list of intervals corresponding to requested features of each gene. 
- -usage: ucsc_gene_table_to_intervals.py [options] - -options: - -h, --help show this help message and exit - -rREGION, --region=REGION - Limit to region: one of coding, utr3, utr5, transcribed [default] - -e, --exons Only print intervals overlapping an exon - -i, --input=inputfile input file - -o, --output=outputfile output file -""" - -import optparse, string, sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - - # Parse command line - parser = optparse.OptionParser( usage="%prog [options] " ) - parser.add_option( "-r", "--region", dest="region", default="transcribed", - help="Limit to region: one of coding, utr3, utr5, transcribed [default]" ) - parser.add_option( "-e", "--exons", action="store_true", dest="exons", - help="Only print intervals overlapping an exon" ) - parser.add_option( "-s", "--strand", action="store_true", dest="strand", - help="Print strand after interval" ) - parser.add_option( "-i", "--input", dest="input", default=None, - help="Input file" ) - parser.add_option( "-o", "--output", dest="output", default=None, - help="Output file" ) - options, args = parser.parse_args() - assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument" - - try: - out_file = open (options.output,"w") - except: - print >> sys.stderr, "Bad output file." - sys.exit(0) - - try: - in_file = open (options.input) - except: - print >> sys.stderr, "Bad input file." - sys.exit(0) - - print "Region:", options.region+";" - print "Only overlap with Exons:", - if options.exons: - print "Yes" - else: - print "No" - - # Read table and handle each gene - for line in in_file: - try: - if line[0:1] == "#": - continue - # Parse fields from gene tabls - fields = line.split( '\t' ) - name = fields[0] - chrom = fields[1] - strand = fields[2].replace(" ","_") - tx_start = int( fields[3] ) - tx_end = int( fields[4] ) - cds_start = int( fields[5] ) - cds_end = int( fields[6] ) - - # Determine the subset of the transcribed region we are interested in - if options.region == 'utr3': - if strand == '-': region_start, region_end = tx_start, cds_start - else: region_start, region_end = cds_end, tx_end - elif options.region == 'utr5': - if strand == '-': region_start, region_end = cds_end, tx_end - else: region_start, region_end = tx_start, cds_start - elif options.region == 'coding': - region_start, region_end = cds_start, cds_end - else: - region_start, region_end = tx_start, tx_end - - # If only interested in exons, print the portion of each exon overlapping - # the region of interest, otherwise print the span of the region - if options.exons: - exon_starts = map( int, fields[8].rstrip( ',\n' ).split( ',' ) ) - exon_ends = map( int, fields[9].rstrip( ',\n' ).split( ',' ) ) - for start, end in zip( exon_starts, exon_ends ): - start = max( start, region_start ) - end = min( end, region_end ) - if start < end: - if strand: print_tab_sep(out_file, chrom, start, end, name, "0", strand ) - else: print_tab_sep(out_file, chrom, start, end ) - else: - if strand: print_tab_sep(out_file, chrom, region_start, region_end, name, "0", strand ) - else: print_tab_sep(out_file, chrom, region_start, region_end ) - except: - continue - -def print_tab_sep(out_file, *args ): - """Print items in `l` to stdout separated by tabs""" - print >>out_file, string.join( [ str( f ) for f in args ], '\t' ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/ucsc_gene_table_to_intervals.xml --- a/tools/filters/ucsc_gene_table_to_intervals.xml Fri Mar 09 19:45:42 2012 
-0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ - -Parse a UCSC Gene Table dump - ucsc_gene_table_to_intervals.py --input=$input1 --output=$out_file1 --region=$region $exon - - - - - - - - - - - - - - - - - - - -Read a table dump in the UCSC gene table format and create a BED file corresponding to the requested feature of each gene. - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/uniq.py --- a/tools/filters/uniq.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,117 +0,0 @@ -# Filename: uniq.py -# Author: Ian N. Schenck -# Version: 19/12/2005 -# -# This script accepts an input file, an output file, a column -# delimiter, and a list of columns. The script then grabs unique -# lines based on the columns, and returns those records with a count -# of occurrences of each unique column, inserted before the columns. -# -# This executes the command pipeline: -# cut -f $fields | sort | uniq -c -# -# -i Input file -# -o Output file -# -d Delimiter -# -c Column list (Comma Separated) - -import sys -import re -import string -import commands - -# This function is exceedingly useful, perhaps package for reuse? -def getopts(argv): - opts = {} - while argv: - if argv[0][0] == '-': - opts[argv[0]] = argv[1] - argv = argv[2:] - else: - argv = argv[1:] - return opts - -def main(): - args = sys.argv[1:] - - try: - opts = getopts(args) - except IndexError: - print "Usage:" - print " -i Input file" - print " -o Output file" - print " -c Column list (comma separated)" - print " -d Delimiter:" - print " T Tab" - print " C Comma" - print " D Dash" - print " U Underscore" - print " P Pipe" - print " Dt Dot" - print " Sp Space" - return 0 - - outputfile = opts.get("-o") - if outputfile == None: - print "No output file specified." - return -1 - - inputfile = opts.get("-i") - if inputfile == None: - print "No input file specified." - return -2 - - delim = opts.get("-d") - if delim == None: - print "Field delimiter not specified." - return -3 - - columns = opts.get("-c") - if columns == None or columns == 'None': - print "Columns not specified." - return -4 - - # All inputs have been specified at this point, now validate. - fileRegEx = re.compile("^[A-Za-z0-9./\-_]+$") - columnRegEx = re.compile("([0-9]{1,},?)+") - - if not columnRegEx.match(columns): - print "Illegal column specification." - return -4 - if not fileRegEx.match(outputfile): - print "Illegal output filename." - return -5 - if not fileRegEx.match(inputfile): - print "Illegal input filename." 
- return -6 - - column_list = re.split(",",columns) - columns_for_display = "" - for col in column_list: - columns_for_display += "c"+col+", " - - commandline = "cut " - # Set delimiter - if delim=='C': - commandline += "-d \",\" " - if delim=='D': - commandline += "-d \"-\" " - if delim=='U': - commandline += "-d \"_\" " - if delim=='P': - commandline += "-d \"|\" " - if delim=='Dt': - commandline += "-d \".\" " - if delim=='Sp': - commandline += "-d \" \" " - - # set columns - commandline += "-f " + columns - commandline += " " + inputfile + " | sed s/\ //g | sort | uniq -c | sed s/^\ *// | tr \" \" \"\t\" > " + outputfile - errorcode, stdout = commands.getstatusoutput(commandline) - - print "Count of unique values in " + columns_for_display - return errorcode - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/uniq.xml --- a/tools/filters/uniq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ - - occurrences of each record - uniq.py -i $input -o $out_file1 -c "$column" -d $delim - - - - - - - - - - - - - - - - - - - - - - - - - - - .. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool counts occurrences of unique values in selected column(s). - -- If multiple columns are selected, counting is performed on each unique group of all values in the selected columns. -- The first column of the resulting dataset will be the count of unique values in the selected column(s) and will be followed by each value. - ------ - -**Example** - -- Input file:: - - chr1 10 100 gene1 - chr1 105 200 gene2 - chr1 205 300 gene3 - chr2 10 100 gene4 - chr2 1000 1900 gene5 - chr3 15 1656 gene6 - chr4 10 1765 gene7 - chr4 10 1765 gene8 - -- Counting unique values in column c1 will result in:: - - 3 chr1 - 2 chr2 - 1 chr3 - 2 chr4 - -- Counting unique values in the grouping of columns c2 and c3 will result in:: - - 2 10 100 - 2 10 1765 - 1 1000 1900 - 1 105 200 - 1 15 1656 - 1 205 300 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/wc_gnu.xml --- a/tools/filters/wc_gnu.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,66 +0,0 @@ - - of a dataset - - #set $word_to_arg = { 'characters':'m', 'words':'w', 'lines':'l' } - #set $arg_order = [ 'lines', 'words', 'characters' ] - #if not isinstance( $options.value, list ): - #set $args = [ $options.value ] - #else: - #set $args = $options.value - #end if - #if $include_header.value: - echo "#${ "\t".join( [ i for i in $arg_order if i in $args ] ) }" > $out_file1 - && - #end if - wc - #for $option in $args: - -${ word_to_arg[ str(option) ] } - #end for - $input1 | awk '{ print ${ '"\\t"'.join( [ "$%i" % ( i+1 ) for i in range( len( $args ) ) ] ) } }' - >> $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool outputs counts of specified attributes (lines, words, characters) of a dataset. 
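The counting itself is delegated to GNU wc, with awk only reordering the selected columns. As a rough illustration of what is being counted (a standalone Python sketch assuming a hypothetical dataset.txt, not code from this wrapper)::

    # Count lines, words, and characters the way "wc -l -w -m" does
    # for plain ASCII text.
    lines = words = chars = 0
    with open("dataset.txt") as handle:     # hypothetical input dataset
        for line in handle:
            lines += 1
            words += len(line.split())
            chars += len(line)              # newline included, as wc counts it
    print("#lines\twords\tcharacters")
    print("%d\t%d\t%d" % (lines, words, chars))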
- ----- - -**Example Output** - -:: - - #lines words characters - 7499 41376 624971 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/wig_to_bigwig.xml --- a/tools/filters/wig_to_bigwig.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ - - converter - grep -v "^track" $input1 | wigToBigWig stdin $chromInfo $out_file1 - #if $settings.settingsType == "full": - -blockSize=${settings.blockSize} -itemsPerSlot=${settings.itemsPerSlot} ${settings.clip} ${settings.unc} - #else: - -clip - #end if - 2>&1 || echo "Error running wigToBigWig." >&2 - - ucsc_tools - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Syntax** - -This tool converts wiggle data into bigWig type. - -- **Wiggle format**: The .wig format is line-oriented. Wiggle data is preceded by a UCSC track definition line. Following the track definition line is the track data, which can be entered in three different formats described below. - - - **BED format** with no declaration line and four columns of data:: - - chromA chromStartA chromEndA dataValueA - chromB chromStartB chromEndB dataValueB - - - **variableStep** two column data; started by a declaration line and followed with chromosome positions and data values:: - - variableStep chrom=chrN [span=windowSize] - chromStartA dataValueA - chromStartB dataValueB - - - **fixedStep** single column data; started by a declaration line and followed with data values:: - - fixedStep chrom=chrN start=position step=stepInterval [span=windowSize] - dataValue1 - dataValue2 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/wiggle_to_simple.py --- a/tools/filters/wiggle_to_simple.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -#!/usr/bin/env python - -""" -Read a wiggle track and print out a series of lines containing -"chrom position score". Ignores track lines, handles bed, variableStep -and fixedStep wiggle lines. -""" -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.wiggle -from galaxy.tools.exception_handling import * - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - if len( sys.argv ) > 1: - in_file = open( sys.argv[1] ) - else: - in_file = sys.stdin - - if len( sys.argv ) > 2: - out_file = open( sys.argv[2], "w" ) - else: - out_file = sys.stdout - - try: - for fields in bx.wiggle.IntervalReader( UCSCOutWrapper( in_file ) ): - out_file.write( "%s\n" % "\t".join( map( str, fields ) ) ) - except UCSCLimitException: - # Wiggle data was truncated, at the very least need to warn the user. - print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.' - except ValueError, e: - in_file.close() - out_file.close() - stop_err( str( e ) ) - - in_file.close() - out_file.close() - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/wiggle_to_simple.xml --- a/tools/filters/wiggle_to_simple.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,88 +0,0 @@ - - converter - wiggle_to_simple.py $input $out_file1 - - - - - - - - - - - - - - - - - -**Syntax** - -This tool converts wiggle data into interval type. - -- **Wiggle format**: The .wig format is line-oriented. Wiggle data is preceded by a UCSC track definition line. Following the track definition line is the track data, which can be entered in three different formats described below. 
- - - **BED format** with no declaration line and four columns of data:: - - chromA chromStartA chromEndA dataValueA - chromB chromStartB chromEndB dataValueB - - - **variableStep** two column data; started by a declaration line and followed with chromosome positions and data values:: - - variableStep chrom=chrN [span=windowSize] - chromStartA dataValueA - chromStartB dataValueB - - - **fixedStep** single column data; started by a declaration line and followed with data values:: - - fixedStep chrom=chrN start=position step=stepInterval [span=windowSize] - dataValue1 - dataValue2 - ----- - -**Example** - -- input wiggle format file:: - - #track type=wiggle_0 name="Bed Format" description="BED format" - chr19 59302000 59302300 -1.0 - chr19 59302300 59302600 -0.75 - chr19 59302600 59302900 -0.50 - chr19 59302900 59303200 -0.25 - chr19 59303200 59303500 0.0 - #track type=wiggle_0 name="variableStep" description="variableStep format" - variableStep chrom=chr19 span=150 - 59304701 10.0 - 59304901 12.5 - 59305401 15.0 - 59305601 17.5 - #track type=wiggle_0 name="fixedStep" description="fixed step" visibility=full - fixedStep chrom=chr19 start=59307401 step=300 span=200 - 1000 - 900 - 800 - 700 - 600 - -- convert the above file to an interval file:: - - chr19 59302000 59302300 + -1.0 - chr19 59302300 59302600 + -0.75 - chr19 59302600 59302900 + -0.5 - chr19 59302900 59303200 + -0.25 - chr19 59303200 59303500 + 0.0 - chr19 59304701 59304851 + 10.0 - chr19 59304901 59305051 + 12.5 - chr19 59305401 59305551 + 15.0 - chr19 59305601 59305751 + 17.5 - chr19 59307701 59307901 + 1000.0 - chr19 59308001 59308201 + 900.0 - chr19 59308301 59308501 + 800.0 - chr19 59308601 59308801 + 700.0 - chr19 59308901 59309101 + 600.0 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/galaxy-loc.tar.gz Binary file tools/galaxy-loc.tar.gz has changed diff -r c2a356708570 -r 33c067c3ae34 tools/gatk/analyze_covariates.xml --- a/tools/gatk/analyze_covariates.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,92 +0,0 @@ - - - plot residual error versus covariates -gatk_wrapper.py - --stdout "${output_log}" - --html_report_from_directory "${output_html}" "${output_html.files_path}" - -p 'java - -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/AnalyzeCovariates.jar" - -recalFile "${input_recal}" - -outputDir "${output_html.files_path}" - ##-log "${output_log}" - ##-Rscript,--path_to_Rscript path_to_Rscript; on path is good enough - -resources "${GALAXY_DATA_INDEX_DIR}/gatk/R" - #if $analysis_param_type.analysis_param_type_selector == "advanced": - --ignoreQ "${analysis_param_type.ignore_q}" - --numRG "${analysis_param_type.num_read_groups}" - --max_quality_score "${analysis_param_type.max_quality_score}" - --max_histogram_value "${analysis_param_type.max_histogram_value}" - ${analysis_param_type.do_indel_quality} - #end if - ' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - - Create collapsed versions of the recal csv file and call R scripts to plot residual error versus the various covariates. - - ------- - -Please cite the website "http://addlink.here" as well as: - -Add citation here 2011. - ------- - -**Input formats** - -GenomeAnalysisTK: AnalyzeCovariates accepts a recal CSV file. - ------- - -**Outputs** - -The output is an HTML file with links to PDF graphs and data files, see http://addlink.here for more details. 
- -------- - -**Settings**:: - - recal_file The input recal csv file to analyze - output_dir The directory in which to output all the plots and intermediate data files - path_to_Rscript The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript - path_to_resources Path to resources folder holding the Sting R scripts. - ignoreQ Ignore bases with reported quality less than this number. - numRG Only process N read groups. Default value: -1 (process all read groups) - max_quality_score The integer value at which to cap the quality scores, default is 50 - max_histogram_value If supplied, this value will be the max value of the histogram plots - do_indel_quality If supplied, this value will be the max value of the histogram plots - - - diff -r c2a356708570 -r 33c067c3ae34 tools/gatk/count_covariates.xml --- a/tools/gatk/count_covariates.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,431 +0,0 @@ - - on BAM files - gatk_wrapper.py - --stdout "${output_log}" - -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" - -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index - -p 'java - -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar" - -T "CountCovariates" - --num_threads 4 ##hard coded, for now - -et "NO_ET" ##ET no phone home - ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout - #if $reference_source.reference_source_selector != "history": - -R "${reference_source.ref_file.fields.path}" - #end if - --recal_file "${output_recal}" - ${standard_covs} - #if $covariates.value: - #for $cov in $covariates.value: - -cov "${cov}" - #end for - #end if - ' - - #set $snp_dataset_provided = False - #if str( $input_dbsnp_rod ) != "None": - -d "-D" "${input_dbsnp_rod}" "${input_dbsnp_rod.ext}" "dbsnp_rod" - #set $snp_dataset_provided = True - #end if - #set $rod_binding_names = dict() - #for $rod_binding in $rod_bind: - #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': - #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name - #else - #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector - #end if - #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'snps': - #set $snp_dataset_provided = True - #end if - #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 - -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" - #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ): - -p '--rodToIntervalTrackName "${rod_bind_name}"' - #end if - #end for - - ##start standard gatk options - #if $gatk_param_type.gatk_param_type_selector == "advanced": - #for $sample_metadata in $gatk_param_type.sample_metadata: - -p '--sample_metadata "${sample_metadata.sample_metadata_file}"' - #end for - #for $read_filter in $gatk_param_type.read_filter: - -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" - ###raise Exception( str( dir( $read_filter ) ) ) - #for $name, $param in $read_filter.read_filter_type.iteritems(): - #if $name not in [ "__current_case__", "read_filter_type_selector" ]: - --${name} "${param}" - #end if - #end for - ' - #end for - #if str( $gatk_param_type.input_intervals ) != "None": - -d "-L" 
"${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals" - #end if - #if str( $gatk_param_type.input_exclude_intervals ) != "None": - -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals" - #end if - - -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"' - -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' - #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": - -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' - #end if - -p ' - --baq "${gatk_param_type.baq}" - --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" - ${gatk_param_type.use_original_qualities} - --defaultBaseQualities "${gatk_param_type.default_base_qualities}" - --validation_strictness "${gatk_param_type.validation_strictness}" - --interval_merging "${gatk_param_type.interval_merging}" - ' - #if str( $gatk_param_type.read_group_black_list ) != "None": - -d "-read_group_black_list" "${gatk_param_type.read_group_black_list}" "txt" "input_read_group_black_list" - #end if - #end if - #if str( $reference_source.reference_source_selector ) == "history": - -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" - #end if - ##end standard gatk options - - ##start analysis specific options - #if $analysis_param_type.analysis_param_type_selector == "advanced": - -p ' - #if $analysis_param_type.default_read_group_type.default_read_group_type_selector == "set": - --default_read_group "${analysis_param_type.default_read_group_type.default_read_group}" - #end if - #if str( $analysis_param_type.default_platform ) != "default": - --default_platform "${analysis_param_type.default_platform}" - #end if - #if str( $analysis_param_type.force_read_group_type.force_read_group_type_selector ) == "set": - --force_read_group "${analysis_param_type.force_read_group_type.force_read_group}" - #end if - #if str( $analysis_param_type.force_platform ) != "default": - --force_platform "${analysis_param_type.force_platform}" - #end if - ${analysis_param_type.exception_if_no_tile} - #if str( $analysis_param_type.solid_options_type.solid_options_type_selector ) == "set": - #if str( $analysis_param_type.solid_options_type.solid_recal_mode ) != "default": - --solid_recal_mode "${analysis_param_type.solid_options_type.solid_recal_mode}" - #end if - #if str( $analysis_param_type.solid_options_type.solid_nocall_strategy ) != "default": - --solid_nocall_strategy "${analysis_param_type.solid_options_type.solid_nocall_strategy}" - #end if - #end if - --window_size_nqs "${analysis_param_type.window_size_nqs}" - --homopolymer_nback "${analysis_param_type.homopolymer_nback}" - ' - #end if - #if not $snp_dataset_provided: - -p '--run_without_dbsnp_potentially_ruining_quality' - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -"This calculation is critically dependent on being able to skip over known variant sites. Please provide a dbSNP ROD or a VCF file containing known sites of genetic variation." -However, if you do not provide this file, the '--run_without_dbsnp_potentially_ruining_quality' flag will be automatically used, and the command will be allowed to run. 
- -**What it does** - - This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal - operating only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors - and indicative of poor base quality. This walker generates tables based on various user-specified covariates (such - as read group, reported quality score, cycle, and dinucleotide) Since there is a large amount of data one can then - calculate an empirical probability of error given the particular covariates seen at this site, where p(error) = num - mismatches / num observations The output file is a CSV list of (the several covariate values, num observations, num - mismatches, empirical quality score) The first non-comment line of the output file gives the name of the covariates - that were used for this calculation. Note: ReadGroupCovariate and QualityScoreCovariate are required covariates - and will be added for the user regardless of whether or not they were specified Note: This walker is designed to be - used in conjunction with TableRecalibrationWalker. - - ------- - -Please cite the website "http://addlink.here" as well as: - -Add citation here 2011. - ------- - -**Input formats** - -GenomeAnalysisTK: CountCovariates accepts an aligned BAM input file. - ------- - -**Outputs** - -The output is in CSV format, see http://addlink.here for more details. - -------- - -**Settings**:: - - - default_read_group If a read has no read group then default to the provided String. - default_platform If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid. - force_read_group If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group. - force_platform If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid. - window_size_nqs The window size used by MinimumNQSCovariate for its calculation - homopolymer_nback The number of previous bases to look at in HomopolymerCovariate - exception_if_no_tile If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1 - solid_recal_mode How should we recalibrate solid bases in whichthe reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS (DO_NOTHING|SET_Q_ZERO|SET_Q_ZERO_BASE_N|REMOVE_REF_BIAS) - solid_nocall_strategy Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ (THROW_EXCEPTION|LEAVE_READ_UNRECALIBRATED|PURGE_READ) - recal_file Filename for the input covariates table recalibration .csv file - out The output CSV file - recal_file Filename for the outputted covariates table recalibration file - standard_covs Use the standard set of covariates in addition to the ones listed using the -cov argument - run_without_dbsnp_potentially_ruining_quality If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/gatk/gatk_wrapper.py --- a/tools/gatk/gatk_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,97 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg - -""" -A wrapper script for running the GenomeAnalysisTK.jar commands. 
-""" - -import sys, optparse, os, tempfile, subprocess, shutil -from string import Template - -GALAXY_EXT_TO_GATK_EXT = { 'gatk_interval':'intervals', 'bam_index':'bam.bai', 'gatk_dbsnp':'dbsnp', 'picard_interval_list':'interval_list' } #items not listed here, will use the galaxy extension as-is -GALAXY_EXT_TO_GATK_FILE_TYPE = GALAXY_EXT_TO_GATK_EXT #for now, these are the same, but could be different if needed -DEFAULT_GATK_PREFIX = "gatk_file" -CHUNK_SIZE = 2**20 #1mb - - -def cleanup_before_exit( tmp_dir ): - if tmp_dir and os.path.exists( tmp_dir ): - shutil.rmtree( tmp_dir ) - -def gatk_filename_from_galaxy( galaxy_filename, galaxy_ext, target_dir = None, prefix = None ): - suffix = GALAXY_EXT_TO_GATK_EXT.get( galaxy_ext, galaxy_ext ) - if prefix is None: - prefix = DEFAULT_GATK_PREFIX - if target_dir is None: - target_dir = os.getcwd() - gatk_filename = os.path.join( target_dir, "%s.%s" % ( prefix, suffix ) ) - os.symlink( galaxy_filename, gatk_filename ) - return gatk_filename - -def gatk_filetype_argument_substitution( argument, galaxy_ext ): - return argument % dict( file_type = GALAXY_EXT_TO_GATK_FILE_TYPE.get( galaxy_ext, galaxy_ext ) ) - -def open_file_from_option( filename, mode = 'rb' ): - if filename: - return open( filename, mode = mode ) - return None - -def html_report_from_directory( html_out, dir ): - html_out.write( '\n\nGalaxy - GATK Output\n\n\n
</head>\n<body>\n<p/>\n<ul>\n' ) - for fname in sorted( os.listdir( dir ) ): - html_out.write( '<li><a href="%s">%s</a></li>\n' % ( fname, fname ) ) - html_out.write( '</ul>\n</body>\n</html>
\n\n\n' ) - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - parser.add_option( '-p', '--pass_through', dest='pass_through_options', action='append', type="string", help='These options are passed through directly to GATK, without any modification.' ) - parser.add_option( '-d', '--dataset', dest='datasets', action='append', type="string", nargs=4, help='"-argument" "original_filename" "galaxy_filetype" "name_prefix"' ) - parser.add_option( '', '--stdout', dest='stdout', action='store', type="string", default=None, help='If specified, the output of stdout will be written to this file.' ) - parser.add_option( '', '--stderr', dest='stderr', action='store', type="string", default=None, help='If specified, the output of stderr will be written to this file.' ) - parser.add_option( '', '--html_report_from_directory', dest='html_report_from_directory', action='append', type="string", nargs=2, help='"Target HTML File" "Directory"') - (options, args) = parser.parse_args() - - tmp_dir = tempfile.mkdtemp() - if options.pass_through_options: - cmd = ' '.join( options.pass_through_options ) - else: - cmd = '' - if options.datasets: - for ( dataset_arg, filename, galaxy_ext, prefix ) in options.datasets: - gatk_filename = gatk_filename_from_galaxy( filename, galaxy_ext, target_dir = tmp_dir, prefix = prefix ) - if dataset_arg: - cmd = '%s %s "%s"' % ( cmd, gatk_filetype_argument_substitution( dataset_arg, galaxy_ext ), gatk_filename ) - #set up stdout and stderr output options - stdout = open_file_from_option( options.stdout, mode = 'wb' ) - stderr = open_file_from_option( options.stderr, mode = 'wb' ) - #if no stderr file is specified, we'll use our own - if stderr is None: - stderr = tempfile.NamedTemporaryFile( dir=tmp_dir ) - stderr.close() - stderr = open( stderr.name, 'w+b' ) - - proc = subprocess.Popen( args=cmd, stdout=stdout, stderr=stderr, shell=True, cwd=tmp_dir ) - return_code = proc.wait() - - if return_code: - stderr_target = sys.stderr - else: - stderr_target = sys.stdout - stderr.flush() - stderr.seek(0) - while True: - chunk = stderr.read( CHUNK_SIZE ) - if chunk: - stderr_target.write( chunk ) - else: - break - stderr.close() - #generate html reports - if options.html_report_from_directory: - for ( html_filename, html_dir ) in options.html_report_from_directory: - html_report_from_directory( open( html_filename, 'wb' ), html_dir ) - - cleanup_before_exit( tmp_dir ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/gatk/indel_realigner.xml --- a/tools/gatk/indel_realigner.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,332 +0,0 @@ - - - perform local realignment - gatk_wrapper.py - --stdout "${output_log}" - -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" - -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index - -p 'java - -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar" - -T "IndelRealigner" - ##-quiet ##this appears to have no effect...confirmed by gatk programmers - -o "${output_bam}" - -et "NO_ET" ##ET no phone home - ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout - #if $reference_source.reference_source_selector != "history": - -R "${reference_source.ref_file.fields.path}" - #end if - -LOD "${lod_threshold}" - ${knowns_only} - ' - - #set $rod_binding_names = dict() - #if str( $input_dbsnp_rod ) != "None": - -d "-D" 
"${input_dbsnp_rod}" "${input_dbsnp_rod.ext}" "dbsnp_rod" - #end if - #for $rod_binding in $rod_bind: - #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': - #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name - #else - #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector - #end if - #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 - -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" - #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ): - -p '--rodToIntervalTrackName "${rod_bind_name}"' - #end if - #end for - - ##start standard gatk options - #if $gatk_param_type.gatk_param_type_selector == "advanced": - #for $sample_metadata in $gatk_param_type.sample_metadata: - -p '--sample_metadata "${sample_metadata.sample_metadata_file}"' - #end for - #for $read_filter in $gatk_param_type.read_filter: - -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" - ###raise Exception( str( dir( $read_filter ) ) ) - #for $name, $param in $read_filter.read_filter_type.iteritems(): - #if $name not in [ "__current_case__", "read_filter_type_selector" ]: - --${name} "${param}" - #end if - #end for - ' - #end for - #if str( $gatk_param_type.input_intervals ) != "None": - -d "-L" "${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals" - #end if - #if str( $gatk_param_type.input_exclude_intervals ) != "None": - -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals" - #end if - -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"' - -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' - #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": - -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' - #end if - -p ' - --baq "${gatk_param_type.baq}" - --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" - ${gatk_param_type.use_original_qualities} - --defaultBaseQualities "${gatk_param_type.default_base_qualities}" - --validation_strictness "${gatk_param_type.validation_strictness}" - --interval_merging "${gatk_param_type.interval_merging}" - ' - #if str( $gatk_param_type.read_group_black_list ) != "None": - -d "-read_group_black_list" "${gatk_param_type.read_group_black_list}" "txt" "input_read_group_black_list" - #end if - #end if - #if $reference_source.reference_source_selector == "history": - -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" - #end if - ##end standard gatk options - ##start analysis specific options - -d "-targetIntervals" "${target_intervals}" "${target_intervals.ext}" "gatk_target_intervals" - -p ' - -targetNotSorted ##always resort input intervals - --disable_bam_indexing - ' - #if $analysis_param_type.analysis_param_type_selector == "advanced": - -p ' - --entropyThreshold "${analysis_param_type.entropy_threshold}" - ${analysis_param_type.simplify_bam} - --maxIsizeForMovement "${analysis_param_type.max_insert_size_for_movement}" - --maxPositionalMoveAllowed "${analysis_param_type.max_positional_move_allowed}" - --maxConsensuses "${analysis_param_type.max_consensuses}" - --maxReadsForConsensuses 
"${analysis_param_type.max_reads_for_consensuses}" - --maxReadsForRealignment "${analysis_param_type.max_reads_for_realignment}" - "${analysis_param_type.no_original_alignment_tags}" - ' - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - - Performs local realignment of reads based on misalignments due to the presence of indels. Unlike most mappers, this - walker uses the full alignment context to determine whether an appropriate alternate reference (i.e. indel) exists - and updates SAMRecords accordingly. - ------- - -Please cite the website "http://addlink.here" as well as: - -Add citation here 2011. - ------- - -**Input formats** - -GenomeAnalysisTK: IndelRealigner accepts an aligned BAM and a list of intervals to realign as input files. - ------- - -**Outputs** - -The output is in the BAM format, see http://addlink.here for more details. - -------- - -**Settings**:: - - targetIntervals intervals file output from RealignerTargetCreator - LODThresholdForCleaning LOD threshold above which the cleaner will clean - entropyThreshold percentage of mismatches at a locus to be considered having high entropy - out Output bam - bam_compression Compression level to use for writing BAM files - disable_bam_indexing Turn off on-the-fly creation of indices for output BAM files. - simplifyBAM If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well stripping all extended tags from the kept reads except the read group identifier - useOnlyKnownIndels Don't run 'Smith-Waterman' to generate alternate consenses; use only known indels provided as RODs for constructing the alternate references. - maxReadsInMemory max reads allowed to be kept in memory at a time by the SAMFileWriter. Keep it low to minimize memory consumption (but the tool may skip realignment on regions with too much coverage. If it is too low, it may generate errors during realignment); keep it high to maximize realignment (but make sure to give Java enough memory). - maxIsizeForMovement maximum insert size of read pairs that we attempt to realign - maxPositionalMoveAllowed maximum positional move in basepairs that a read can be adjusted during realignment - maxConsensuses max alternate consensuses to try (necessary to improve performance in deep coverage) - maxReadsForConsensuses max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage) - maxReadsForRealignment max reads allowed at an interval for realignment; if this value is exceeded, realignment is not attempted and the reads are passed to the output file(s) as-is - noOriginalAlignmentTags Don't output the original cigar or alignment start tags for each realigned read in the output bam. - targetIntervalsAreNotSorted This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, it will throw an exception. Use this argument when your interval list is not sorted to instruct the Realigner to first sort it in memory. 
- - - - diff -r c2a356708570 -r 33c067c3ae34 tools/gatk/realigner_target_creator.xml --- a/tools/gatk/realigner_target_creator.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,296 +0,0 @@ - - for use in local realignment - gatk_wrapper.py - --stdout "${output_log}" - -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" - -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index - -p 'java - -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar" - -T "RealignerTargetCreator" - -o "${output_interval}" - -et "NO_ET" ##ET no phone home - ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout - #if $reference_source.reference_source_selector != "history": - -R "${reference_source.ref_file.fields.path}" - #end if - ' - #set $rod_binding_names = dict() - #if str( $input_dbsnp_rod ) != "None": - -d "-D" "${input_dbsnp_rod}" "${input_dbsnp_rod.ext}" "dbsnp_rod" - #end if - #for $rod_binding in $rod_bind: - #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': - #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name - #else - #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector - #end if - #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 - -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" - #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ): - -p '--rodToIntervalTrackName "${rod_bind_name}"' - #end if - #end for - - ##start standard gatk options - #if $gatk_param_type.gatk_param_type_selector == "advanced": - #for $sample_metadata in $gatk_param_type.sample_metadata: - -p '--sample_metadata "${sample_metadata.sample_metadata_file}"' - #end for - #for $read_filter in $gatk_param_type.read_filter: - -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" - ###raise Exception( str( dir( $read_filter ) ) ) - #for $name, $param in $read_filter.read_filter_type.iteritems(): - #if $name not in [ "__current_case__", "read_filter_type_selector" ]: - --${name} "${param}" - #end if - #end for - ' - #end for - #if str( $gatk_param_type.input_intervals ) != "None": - -d "-L" "${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals" - #end if - #if str( $gatk_param_type.input_exclude_intervals ) != "None": - -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals" - #end if - - -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"' - - -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' - #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": - -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' - #end if - -p ' - --baq "${gatk_param_type.baq}" - --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" - ${gatk_param_type.use_original_qualities} - --defaultBaseQualities "${gatk_param_type.default_base_qualities}" - --validation_strictness "${gatk_param_type.validation_strictness}" - --interval_merging "${gatk_param_type.interval_merging}" - ' - #if str( 
$gatk_param_type.read_group_black_list ) != "None": - -d "-read_group_black_list" "${gatk_param_type.read_group_black_list}" "txt" "input_read_group_black_list" - #end if - #end if - #if $reference_source.reference_source_selector == "history": - -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" - #end if - ##end standard gatk options - ##start analysis specific options - #if $analysis_param_type.analysis_param_type_selector == "advanced": - -p ' - --minReadsAtLocus "${analysis_param_type.minReadsAtLocus}" - --windowSize "${analysis_param_type.windowSize}" - --mismatchFraction "${analysis_param_type.mismatchFraction}" - --maxIntervalSize "${analysis_param_type.maxIntervalSize}" - ' - #end if - - -**What it does** - -Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads, MQ0 reads, and reads with consecutive indel operators in the CIGAR string. - ------ - -Please cite the website "http://addlink.here" as well as: - -Add citation here 2011. - ------ - -**Input formats** - -GenomeAnalysisTK: RealignerTargetCreator accepts an aligned BAM input file. - ------ - -**Outputs** - -The output is in GATK Interval format, see http://addlink.here for more details. - ------- - -**Settings**:: - - windowSize window size for calculating entropy or SNP clusters - mismatchFraction fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1 - minReadsAtLocus minimum reads at a locus to enable using the entropy calculation - maxIntervalSize maximum interval size - - - diff -r c2a356708570 -r 33c067c3ae34 tools/gatk/table_recalibration.xml --- a/tools/gatk/table_recalibration.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,400 +0,0 @@ - - on BAM files - gatk_wrapper.py - --stdout "${output_log}" - -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" - -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index - -p 'java - -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar" - -T "TableRecalibration" - -o "${output_bam}" - -et "NO_ET" ##ET no phone home - ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout - #if $reference_source.reference_source_selector != "history": - -R "${reference_source.ref_file.fields.path}" - #end if - --recal_file "${input_recal}" - --disable_bam_indexing - ' - ##start standard gatk options - #if $gatk_param_type.gatk_param_type_selector == "advanced": - #for $sample_metadata in $gatk_param_type.sample_metadata: - -p '--sample_metadata "${sample_metadata.sample_metadata_file}"' - #end for - #for $read_filter in $gatk_param_type.read_filter: - -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" - ###raise Exception( str( dir( $read_filter ) ) ) - #for $name, $param in $read_filter.read_filter_type.iteritems(): - #if $name not in [ "__current_case__", "read_filter_type_selector" ]: - --${name} "${param}" - #end if - #end for - ' - #end for - #if str( $gatk_param_type.input_intervals ) != "None": - -d "-L"
"${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals" - #end if - #if str( $gatk_param_type.input_exclude_intervals ) != "None": - -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals" - #end if - #set $rod_binding_names = dict() - #for $rod_binding in $gatk_param_type.rod_bind: - #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': - #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name - #else - #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector - #end if - #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 - -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" - #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ): - -p '--rodToIntervalTrackName "${rod_bind_name}"' - #end if - #end for - -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"' - #if str( $gatk_param_type.input_dbsnp_rod ) != "None": - -d "-D" "${gatk_param_type.input_dbsnp_rod}" "${gatk_param_type.input_dbsnp_rod.ext}" "dbsnp_rod" - #end if - -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' - #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": - -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' - #end if - -p ' - --baq "${gatk_param_type.baq}" - --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" - ${gatk_param_type.use_original_qualities} - --defaultBaseQualities "${gatk_param_type.default_base_qualities}" - --validation_strictness "${gatk_param_type.validation_strictness}" - --interval_merging "${gatk_param_type.interval_merging}" - ' - #if str( $gatk_param_type.read_group_black_list ) != "None": - -d "-read_group_black_list" "${gatk_param_type.read_group_black_list}" "txt" "input_read_group_black_list" - #end if - #end if - #if str( $reference_source.reference_source_selector ) == "history": - -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" - #end if - ##end standard gatk options - - ##start analysis specific options - #if $analysis_param_type.analysis_param_type_selector == "advanced": - -p ' - #if $analysis_param_type.default_read_group_type.default_read_group_type_selector == "set": - --default_read_group "${analysis_param_type.default_read_group_type.default_read_group}" - #end if - #if str( $analysis_param_type.default_platform ) != "default": - --default_platform "${analysis_param_type.default_platform}" - #end if - #if str( $analysis_param_type.force_read_group_type.force_read_group_type_selector ) == "set": - --force_read_group "${analysis_param_type.force_read_group_type.force_read_group}" - #end if - #if str( $analysis_param_type.force_platform ) != "default": - --force_platform "${analysis_param_type.force_platform}" - #end if - ${analysis_param_type.exception_if_no_tile} - #if str( $analysis_param_type.solid_options_type.solid_options_type_selector ) == "set": - #if str( $analysis_param_type.solid_options_type.solid_recal_mode ) != "default": - --solid_recal_mode "${analysis_param_type.solid_options_type.solid_recal_mode}" - #end if - #if str( $analysis_param_type.solid_options_type.solid_nocall_strategy ) != "default": - 
--solid_nocall_strategy "${analysis_param_type.solid_options_type.solid_nocall_strategy}" - #end if - #end if - ${analysis_param_type.simplify_bam} - --preserve_qscores_less_than "${analysis_param_type.preserve_qscores_less_than}" - --smoothing "${analysis_param_type.smoothing}" - --max_quality_score "${analysis_param_type.max_quality_score}" - --window_size_nqs "${analysis_param_type.window_size_nqs}" - --homopolymer_nback "${analysis_param_type.homopolymer_nback}" - ${analysis_param_type.do_not_write_original_quals} - ' - #end if - - -**What it does** - - This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For - each base in each read this walker calculates various user-specified covariates (such as read group, reported - quality score, cycle, and dinuc). Using these values as a key in a large hashmap, the walker calculates an empirical - base quality score and overwrites the quality score currently in the read. This walker then outputs a new bam file - with these updated (recalibrated) reads. Note: This walker expects as input the recalibration table file generated - previously by CovariateCounterWalker. Note: This walker is designed to be used in conjunction with - CovariateCounterWalker. - ------ - -Please cite the website "http://addlink.here" as well as: - -Add citation here 2011. - ------ - -**Input formats** - -GenomeAnalysisTK: TableRecalibration accepts aligned BAM and recalibration CSV input files. - ------ - -**Outputs** - -The output is in BAM format, see http://addlink.here for more details. - ------- - -**Settings**:: - - default_read_group If a read has no read group then default to the provided String. - default_platform If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid. - force_read_group If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group. - force_platform If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid. - window_size_nqs The window size used by MinimumNQSCovariate for its calculation - homopolymer_nback The number of previous bases to look at in HomopolymerCovariate - exception_if_no_tile If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1 - solid_recal_mode How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS (DO_NOTHING|SET_Q_ZERO|SET_Q_ZERO_BASE_N|REMOVE_REF_BIAS) - solid_nocall_strategy Defines the behavior of the recalibrator when it encounters no calls in the color space.
Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ (THROW_EXCEPTION|LEAVE_READ_UNRECALIBRATED|PURGE_READ) - recal_file Filename for the input covariates table recalibration .csv file - out The output BAM file - bam_compression Compression level to use for writing BAM files - disable_bam_indexing Turn off on-the-fly creation of indices for output BAM files. - simplifyBAM If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well as stripping all extended tags from the kept reads except the read group identifier - preserve_qscores_less_than Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change quality scores below 5, since base callers use these values to indicate random or bad bases - smoothing Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1 - max_quality_score The integer value at which to cap the quality scores, default=50 - doNotWriteOriginalQuals If true, we will not write the original quality (OQ) tag for each read - - - diff -r c2a356708570 -r 33c067c3ae34 tools/gatk/unified_genotyper.xml --- a/tools/gatk/unified_genotyper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,414 +0,0 @@ - - SNP and indel caller - gatk_wrapper.py - --stdout "${output_log}" - #for $i, $input_bam in enumerate( $reference_source.input_bams ): - -d "-I" "${input_bam.input_bam}" "${input_bam.input_bam.ext}" "gatk_input_${i}" - -d "" "${input_bam.input_bam.metadata.bam_index}" "bam_index" "gatk_input_${i}" ##hardcode galaxy ext type as bam_index - #end for - -p 'java - -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar" - -T "UnifiedGenotyper" - -o "${output_vcf}" - -et "NO_ET" ##ET no phone home - ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout - #if $reference_source.reference_source_selector != "history": - -R "${reference_source.ref_file.fields.path}" - #end if - --standard_min_confidence_threshold_for_calling "${standard_min_confidence_threshold_for_calling}" - --standard_min_confidence_threshold_for_emitting "${standard_min_confidence_threshold_for_emitting}" - ' - #set $rod_binding_names = dict() - #if str( $input_dbsnp_rod ) != "None": - -d "-D" "${input_dbsnp_rod}" "${input_dbsnp_rod.ext}" "dbsnp_rod" - #end if - #for $rod_binding in $rod_bind: - #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': - #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name - #else - #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector - #end if - #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 - -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" - #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ): - -p '--rodToIntervalTrackName "${rod_bind_name}"' - #end if - #end for - - ##start standard gatk options - #if $gatk_param_type.gatk_param_type_selector == "advanced": - #for $sample_metadata in $gatk_param_type.sample_metadata: - -p '--sample_metadata "${sample_metadata.sample_metadata_file}"' - #end for - #for $read_filter in $gatk_param_type.read_filter: - -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" - ###raise
Exception( str( dir( $read_filter ) ) ) - #for $name, $param in $read_filter.read_filter_type.iteritems(): - #if $name not in [ "__current_case__", "read_filter_type_selector" ]: - --${name} "${param}" - #end if - #end for - ' - #end for - #if str( $gatk_param_type.input_intervals ) != "None": - -d "-L" "${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals" - #end if - #if str( $gatk_param_type.input_exclude_intervals ) != "None": - -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals" - #end if - - -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"' - - -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' - #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": - -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' - #end if - -p ' - --baq "${gatk_param_type.baq}" - --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" - ${gatk_param_type.use_original_qualities} - --defaultBaseQualities "${gatk_param_type.default_base_qualities}" - --validation_strictness "${gatk_param_type.validation_strictness}" - --interval_merging "${gatk_param_type.interval_merging}" - ' - #if str( $gatk_param_type.read_group_black_list ) != "None": - -d "-read_group_black_list" "${gatk_param_type.read_group_black_list}" "txt" "input_read_group_black_list" - #end if - #end if - #if $reference_source.reference_source_selector == "history": - -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" - #end if - ##end standard gatk options - ##start analysis specific options - #if $analysis_param_type.analysis_param_type_selector == "advanced": - -p ' - --genotype_likelihoods_model "${analysis_param_type.genotype_likelihoods_model}" - --p_nonref_model "${analysis_param_type.p_nonref_model}" - --heterozygosity "${analysis_param_type.heterozygosity}" - --pcr_error_rate "${analysis_param_type.pcr_error_rate}" - --genotyping_mode "${analysis_param_type.genotyping_mode}" - --output_mode "${analysis_param_type.output_mode}" - ${analysis_param_type.noSLOD} - --min_base_quality_score "${analysis_param_type.min_base_quality_score}" - --min_mapping_quality_score "${analysis_param_type.min_mapping_quality_score}" - --max_deletion_fraction "${analysis_param_type.max_deletion_fraction}" - --min_indel_count_for_genotyping "${analysis_param_type.min_indel_count_for_genotyping}" - --indel_heterozygosity "${analysis_param_type.indel_heterozygosity}" - --indelGapContinuationPenalty "${analysis_param_type.indelGapContinuationPenalty}" - --indelGapOpenPenalty "${analysis_param_type.indelGapOpenPenalty}" - --indelHaplotypeSize "${analysis_param_type.indelHaplotypeSize}" - ${analysis_param_type.doContextDependentGapPenalties} - #if $analysis_param_type.annotation.value: - #for $annotation in $analysis_param_type.annotation.value: - --annotation "${annotation}" - #end for - #end if - #if $analysis_param_type.group.value: - #for $group in $analysis_param_type.group.value: - --group "${group}" - #end for - #end if - ' - #end if - -
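Flattened, the template above assembles a single java invocation of the GenomeAnalysisTK jar. A minimal Python sketch of the kind of command line that results; the file names and threshold values are placeholders, not the tool's defaults::

    import subprocess

    # Placeholder inputs; in Galaxy, gatk_wrapper.py wires these in from the form.
    subprocess.check_call([
        "java", "-jar", "shared/jars/gatk/GenomeAnalysisTK.jar",
        "-T", "UnifiedGenotyper",
        "-R", "ref.fa",              # reference FASTA
        "-I", "sample.bam",          # aligned, indexed BAM
        "-o", "calls.vcf",
        "-et", "NO_ET",              # ET no phone home, as in the template
        "--standard_min_confidence_threshold_for_calling", "30.0",
        "--standard_min_confidence_threshold_for_emitting", "10.0",
    ])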
- -**What it does** - - A variant caller which unifies the approaches of several disparate callers. Works for single-sample and - multi-sample data. The user can choose from several different incorporated calculation models. - ------ - -Please cite the website "http://addlink.here" as well as: - -Add citation here 2011. - ------ - -**Input formats** - -GenomeAnalysisTK: UnifiedGenotyper accepts an aligned BAM input file. - ------ - -**Outputs** - -The output is in VCF format, see http://addlink.here for more details. - ------- - -**Settings**:: - - genotype_likelihoods_model Genotype likelihoods calculation model to employ -- BOTH is the default option, while INDEL is also available for calling indels and SNP is available for calling SNPs only (SNP|INDEL|BOTH) - p_nonref_model Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available. (EXACT|GRID_SEARCH) - heterozygosity Heterozygosity value used to compute prior likelihoods for any locus - pcr_error_rate The PCR error rate to be used for computing fragment-based likelihoods - genotyping_mode Specifies how to determine the alternate alleles to use for genotyping (DISCOVERY|GENOTYPE_GIVEN_ALLELES) - output_mode Should we output confident genotypes (i.e. including ref calls) or just the variants? (EMIT_VARIANTS_ONLY|EMIT_ALL_CONFIDENT_SITES|EMIT_ALL_SITES) - standard_min_confidence_threshold_for_calling The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called - standard_min_confidence_threshold_for_emitting The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold) - noSLOD If provided, we will not calculate the SLOD - min_base_quality_score Minimum base quality required to consider a base for calling - min_mapping_quality_score Minimum read mapping quality required to consider a read for calling - max_deletion_fraction Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05] - min_indel_count_for_genotyping Minimum number of consensus indels required to trigger genotyping run - indel_heterozygosity Heterozygosity for indel calling - indelGapContinuationPenalty Indel gap continuation penalty - indelGapOpenPenalty Indel gap open penalty - indelHaplotypeSize Indel haplotype size - doContextDependentGapPenalties Vary gap penalties by context - indel_recal_file Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NOT USE - indelDebug Output indel debug info - out File to which variants should be written - annotation One or more specific annotations to apply to variant calls - group One or more classes/groups of annotations to apply to variant calls - - - diff -r c2a356708570 -r 33c067c3ae34 tools/genetrack/genetrack_indexer.py --- a/tools/genetrack/genetrack_indexer.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -#!/usr/bin/env python - -""" -Wraps genetrack.scripts.tabs2genetrack so the tool can be executed from Galaxy.
- -usage: %prog input output shift -""" - -import sys, shutil, os -from galaxy import eggs -import pkg_resources -pkg_resources.require( "GeneTrack" ) - -from genetrack.scripts import tabs2genetrack -from genetrack import logger - -if __name__ == "__main__": - import os - os.environ[ 'LC_ALL' ] = 'C' - #os.system( 'export' ) - - parser = tabs2genetrack.option_parser() - - options, args = parser.parse_args() - - # uppercase the format - options.format = options.format.upper() - - if options.format not in ('BED', 'GFF'): - sys.stdout = sys.stderr - parser.print_help() - sys.exit(-1) - - logger.disable(options.verbosity) - - # missing file names - if not (options.inpname and options.outname and options.format): - parser.print_help() - sys.exit(-1) - else: - tabs2genetrack.transform(inpname=options.inpname, outname=options.outname,\ - format=options.format, shift=options.shift, index=options.index, options=options) diff -r c2a356708570 -r 33c067c3ae34 tools/genetrack/genetrack_indexer.xml --- a/tools/genetrack/genetrack_indexer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ - - - on a BED file - - - genetrack_indexer.py -i $input -o $output -s $shift -v 0 -f BED -x - - -**Help** - -This tool will create a GeneTrack index of the selected BED file. - -**Parameters** - -- **Shift at 5' end** should be used when the location of interest is at a fixed distance from - the 5' end for **all sequenced fragments**! - - For example if the sequenced sample consists of - mono-nucleosomal DNA (146bp) we should expect that - each nucleosome midpoint is located at 73 bp from the 5' end of the fragment. - Therefore we would enter 73 as the shift parameter. Once corrected the reads - on each strand will coincide and indicate the actual midpoints - of the nucleosomes. - - When shifting, the averaging process in GeneTrack is able to correct for longer or shorter - than expected fragment sizes as long as the errors are reasonably random. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/genetrack/genetrack_peak_prediction.py --- a/tools/genetrack/genetrack_peak_prediction.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -#!/usr/bin/env python - -""" -Wraps genetrack.scripts.peakpred so the tool can be executed from Galaxy.
- -usage: %prog input output level sigma mode exclusion strand -""" - -import sys -from galaxy import eggs -import pkg_resources -pkg_resources.require( "GeneTrack" ) - -from genetrack.scripts import peakpred -from genetrack import logger - -if __name__ == "__main__": - - parser = peakpred.option_parser() - - options, args = parser.parse_args() - - logger.disable(options.verbosity) - - from genetrack import conf - - # trigger test mode - if options.test: - options.inpname = conf.testdata('test-hdflib-input.gtrack') - options.outname = conf.testdata('predictions.bed') - - # missing input file name - if not (options.inpname and options.outname): - parser.print_help() - else: - print 'Sigma = %s' % options.sigma - print 'Minimum peak = %s' % options.level - print 'Peak-to-peak = %s' % options.exclude - - peakpred.predict(options.inpname, options.outname, options) diff -r c2a356708570 -r 33c067c3ae34 tools/genetrack/genetrack_peak_prediction.xml --- a/tools/genetrack/genetrack_peak_prediction.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ - - - on GeneTrack index - - - genetrack_peak_prediction.py -i $input -o $output --level=$level --sigma=$sigma --mode=$mode --exclusion=$exclusion --strand=$strand -v 0 -x - - -**Help** - -This tool will generate genome-wide peak predictions from an index file. - -**Parameters** - -- **Smoothing method** the function used to average nearby values - -- **Smoothing value** the factor used in the method - -- **Prediction method** the function used to predict peaks from the smoothed data - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/cdblib.py --- a/tools/genome_diversity/cdblib.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,230 +0,0 @@ -#!/usr/bin/env python2.5 - -''' -Manipulate DJB's Constant Databases. These are 2 level disk-based hash tables -that efficiently handle many keys, while remaining space-efficient. - - http://cr.yp.to/cdb.html - -When generated databases are only used with Python code, consider using hash() -rather than djb_hash() for a tidy speedup. -''' - -from _struct import Struct -from itertools import chain - - -def py_djb_hash(s): - '''Return the value of DJB's hash function for the given 8-bit string.''' - h = 5381 - for c in s: - h = (((h << 5) + h) ^ ord(c)) & 0xffffffff - return h - -try: - from _cdblib import djb_hash -except ImportError: - djb_hash = py_djb_hash - -read_2_le4 = Struct('<LL').unpack -write_2_le4 = Struct('<LL').pack - - -class Reader(object): - '''A dictionary-like object for reading a Constant Database accessed - through a string or string-like sequence, such as mmap.mmap().''' - - def __init__(self, data, hashfn=djb_hash): - '''Create an instance reading from a sequence and hash keys using - hashfn.''' - if len(data) < 2048: - raise IOError('CDB too small') - - self.data = data - self.hashfn = hashfn - - self.index = [read_2_le4(data[i:i+8]) for i in xrange(0, 2048, 8)] - self.table_start = min(p[0] for p in self.index) - # Assume load factor is 0.5 like official CDB. - self.length = sum(p[1] >> 1 for p in self.index) - - def iteritems(self): - '''Like dict.iteritems().
Items are returned in insertion order.''' - pos = 2048 - while pos < self.table_start: - klen, dlen = read_2_le4(self.data[pos:pos+8]) - pos += 8 - - key = self.data[pos:pos+klen] - pos += klen - - data = self.data[pos:pos+dlen] - pos += dlen - - yield key, data - - def items(self): - '''Like dict.items().''' - return list(self.iteritems()) - - def iterkeys(self): - '''Like dict.iterkeys().''' - return (p[0] for p in self.iteritems()) - __iter__ = iterkeys - - def itervalues(self): - '''Like dict.itervalues().''' - return (p[1] for p in self.iteritems()) - - def keys(self): - '''Like dict.keys().''' - return [p[0] for p in self.iteritems()] - - def values(self): - '''Like dict.values().''' - return [p[1] for p in self.iteritems()] - - def __getitem__(self, key): - '''Like dict.__getitem__().''' - value = self.get(key) - if value is None: - raise KeyError(key) - return value - - def has_key(self, key): - '''Return True if key exists in the database.''' - return self.get(key) is not None - __contains__ = has_key - - def __len__(self): - '''Return the number of records in the database.''' - return self.length - - def gets(self, key): - '''Yield values for key in insertion order.''' - # Truncate to 32 bits and remove sign. - h = self.hashfn(key) & 0xffffffff - start, nslots = self.index[h & 0xff] - - if nslots: - end = start + (nslots << 3) - slot_off = start + (((h >> 8) % nslots) << 3) - - for pos in chain(xrange(slot_off, end, 8), - xrange(start, slot_off, 8)): - rec_h, rec_pos = read_2_le4(self.data[pos:pos+8]) - - if not rec_h: - break - elif rec_h == h: - klen, dlen = read_2_le4(self.data[rec_pos:rec_pos+8]) - rec_pos += 8 - - if self.data[rec_pos:rec_pos+klen] == key: - rec_pos += klen - yield self.data[rec_pos:rec_pos+dlen] - - def get(self, key, default=None): - '''Get the first value for key, returning default if missing.''' - # Avoid exception catch when handling default case; much faster. 
- return chain(self.gets(key), (default,)).next() - - def getint(self, key, default=None, base=0): - '''Get the first value for key converted it to an int, returning - default if missing.''' - value = self.get(key, default) - if value is not default: - return int(value, base) - return value - - def getints(self, key, base=0): - '''Yield values for key in insertion order after converting to int.''' - return (int(v, base) for v in self.gets(key)) - - def getstring(self, key, default=None, encoding='utf-8'): - '''Get the first value for key decoded as unicode, returning default if - not found.''' - value = self.get(key, default) - if value is not default: - return value.decode(encoding) - return value - - def getstrings(self, key, encoding='utf-8'): - '''Yield values for key in insertion order after decoding as - unicode.''' - return (v.decode(encoding) for v in self.gets(key)) - - -class Writer(object): - '''Object for building new Constant Databases, and writing them to a - seekable file-like object.''' - - def __init__(self, fp, hashfn=djb_hash): - '''Create an instance writing to a file-like object, using hashfn to - hash keys.''' - self.fp = fp - self.hashfn = hashfn - - fp.write('\x00' * 2048) - self._unordered = [[] for i in xrange(256)] - - def put(self, key, value=''): - '''Write a string key/value pair to the output file.''' - assert type(key) is str and type(value) is str - - pos = self.fp.tell() - self.fp.write(write_2_le4(len(key), len(value))) - self.fp.write(key) - self.fp.write(value) - - h = self.hashfn(key) & 0xffffffff - self._unordered[h & 0xff].append((h, pos)) - - def puts(self, key, values): - '''Write more than one value for the same key to the output file. - Equivalent to calling put() in a loop.''' - for value in values: - self.put(key, value) - - def putint(self, key, value): - '''Write an integer as a base-10 string associated with the given key - to the output file.''' - self.put(key, str(value)) - - def putints(self, key, values): - '''Write zero or more integers for the same key to the output file. - Equivalent to calling putint() in a loop.''' - self.puts(key, (str(value) for value in values)) - - def putstring(self, key, value, encoding='utf-8'): - '''Write a unicode string associated with the given key to the output - file after encoding it as UTF-8 or the given encoding.''' - self.put(key, unicode.encode(value, encoding)) - - def putstrings(self, key, values, encoding='utf-8'): - '''Write zero or more unicode strings to the output file. Equivalent to - calling putstring() in a loop.''' - self.puts(key, (unicode.encode(value, encoding) for value in values)) - - def finalize(self): - '''Write the final hash tables to the output file, and write out its - index. 
The output file remains open upon return.''' - index = [] - for tbl in self._unordered: - length = len(tbl) << 1 - ordered = [(0, 0)] * length - for pair in tbl: - where = (pair[0] >> 8) % length - for i in chain(xrange(where, length), xrange(0, where)): - if not ordered[i][0]: - ordered[i] = pair - break - - index.append((self.fp.tell(), length)) - for pair in ordered: - self.fp.write(write_2_le4(*pair)) - - self.fp.seek(0) - for pair in index: - self.fp.write(write_2_le4(*pair)) - self.fp = None # prevent double finalize() diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/extract_flanking_dna.py --- a/tools/genome_diversity/extract_flanking_dna.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ -#!/usr/bin/env python2.5 - -import os -import sys -import traceback # used by the exception handler in decorated_main below -from optparse import OptionParser -import genome_diversity as gd - -def main_function( parse_arguments=None ): - if parse_arguments is None: - parse_arguments = lambda arguments: ( None, arguments ) - def main_decorator( to_decorate ): - def decorated_main( arguments=None ): - if arguments is None: - arguments = sys.argv - options, arguments = parse_arguments( arguments ) - rc = 1 - try: - rc = to_decorate( options, arguments ) - except Exception, err: - sys.stderr.write( 'ERROR: %s\n' % str( err ) ) - traceback.print_exc() - finally: - sys.exit( rc ) - return decorated_main - return main_decorator - -def parse_arguments( arguments ): - parser = OptionParser() - parser.add_option('--input', - type='string', dest='input', - help='file of selected SNPs') - parser.add_option('--output', - type='string', dest='output', - help='output file') - parser.add_option('--snps_loc', - type='string', dest='snps_loc', - help='snps .loc file') - parser.add_option('--scaffold_col', - type="int", dest='scaffold_col', - help='scaffold column in the input file') - parser.add_option('--pos_col', - type="int", dest='pos_col', - help='position column in the input file') - parser.add_option('--output_format', - type="string", dest='output_format', - help='output format, fasta or primer3') - parser.add_option('--species', - type="string", dest='species', - help='species') - return parser.parse_args( arguments[1:] ) - - -@main_function( parse_arguments ) -def main( options, arguments ): - if not options.input: - raise RuntimeError( 'missing --input option' ) - if not options.output: - raise RuntimeError( 'missing --output option' ) - if not options.snps_loc: - raise RuntimeError( 'missing --snps_loc option' ) - if not options.scaffold_col: - raise RuntimeError( 'missing --scaffold_col option' ) - if not options.pos_col: - raise RuntimeError( 'missing --pos_col option' ) - if not options.output_format: - raise RuntimeError( 'missing --output_format option' ) - if not options.species: - raise RuntimeError( 'missing --species option' ) - - snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) ) - - out_fh = gd._openfile( options.output, 'w' ) - - snpcalls_file = gd.get_filename_from_loc( options.species, options.snps_loc ) - file_root, file_ext = os.path.splitext( snpcalls_file ) - snpcalls_index_file = file_root + ".cdb" - snpcalls = gd.SnpcallsFile( data_file=snpcalls_file, index_file=snpcalls_index_file ) - - while snps.next(): - seq, pos = snps.get_seq_pos() - flanking_dna = snpcalls.get_flanking_dna( sequence=seq, position=pos, format=options.output_format ) - if flanking_dna: - out_fh.write( flanking_dna ) - - out_fh.close() - -if __name__ == "__main__": -
main() - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/extract_flanking_dna.xml --- a/tools/genome_diversity/extract_flanking_dna.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,93 +0,0 @@ - - DNA flanking chosen SNPs - - - extract_flanking_dna.py "--input=$input" "--output=$output" "--snps_loc=${GALAXY_DATA_INDEX_DIR}/gd.snps.loc" - #if $override_metadata.choice == "0": - "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}" - #else - "--scaffold_col=$scaf_col" "--pos_col=$pos_col" "--species=$species" - #end if - "--output_format=$output_format" - - -**What it does** - - It reports a DNA segment containing each SNP, with up to 200 nucleotides on - either side of the SNP position, which is indicated by "n". Fewer nucleotides - are reported if the SNP is near an end of the assembled genome fragment. - ----- - -**Example** - -- input file:: - - chr2_75111355_75112576 314 A C L F chr2 75111676 C F 15 4 53 2 9 48 Y 96 0.369 0.355 0.396 0 - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr10_7434473_7435447 524 T C S S chr10 7435005 T S 11 5 90 14 0 69 Y 626 0.066 0.406 0.727 0 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr15_64470252_64471048 89 G A Y Y chr15 64470341 G Y 5 6 109 14 0 69 Y 312 0.247 0.998 0.393 0 - chr18_48070585_48071386 514 C T E K chr18 48071100 T K 7 7 46 14 0 69 Y 2 0.200 0.032 0.163 0 - chr18_50154905_50155664 304 A G Y C chr18 50155208 A Y 4 2 17 5 1 22 Y 8 0.022 0.996 0.128 0 - chr18_57379354_57380496 315 C T V V chr18 57379669 G V 11 0 60 9 6 62 Y 726 0.118 0.048 0.014 1 - chr19_14240610_14242055 232 C T A V chr19 14240840 C A 18 8 56 15 5 42 Y 73 0.003 0.153 0.835 0 - chr19_39866997_39874915 3117 C T P P chr19 39870110 C P 3 7 65 14 2 32 Y 6 0.321 0.911 0.462 4 - etc.
- -- output file:: - - > chr2_75111355_75112576 314 A C - TATCTTCATTTTTATTATAGACTCTCTGAACCAATTTGCCCTGAGGCAGACTTTTTAAAGTACTGTGTAATGTATGAAGTCCTTCTGCTCAAGCAAATCATTGGCATGAAAACAGTTGCAAACTTATTGTGAGAGAAGAGTCCAAGAGTTTTAACAGTCTGTAAGTATATAGCCTGTGAGTTTGATTTCCTTCTTGTTTTTnTTCCAGAAACATGATCAGGGGCAAGTTCTATTGGATATAGTCTTCAAGCATCTTGATTTGACTGAGCGTGACTATTTTGGTTTGCAGTTGACTGACGATTCCACTGATAACCCAGTAAGTTTAAGCTGTTGTCTTTCATTGTCATTGCAATTTTTCTGTCTTTATACTAGGTCCTTTCTGATTTACATTGTTCACTGATT - > chr8_93901796_93905612 2471 A C - GCTGCCGCTGGATTTACTTCTGCTTGGGTCGAGAGCGGGCTGGATGGGTGAAGAGTGGGCTCCCCGGCCCCTGACCAGGCAGGTGCAGACAAGTCGGAAGAAGGCCCGCCGCATCTCCTTGCTGGCCAGCGTGTAGATGACGGGGTTCATGGCAGAGTTGAGCACGGCCAGCACGATGAACCACTGGGCCTTGAACAGGATnGCGCACTCCTTCACCTTGCAGGCCACATCCACAAGGAAAAGGATGAAGAGTGGGGACCAGCAGGCGATGAACACGCTCACCACGATCACCACGGTCCGCAGCAGGGCCATGGACCGCTCTGAGTTGTGCGGGCTGGCCACCCTGCGGCTGCTGGACTTCACCAGGAAGTAGATGCGTGCGTACAGGATCACGATGGTCAC - > chr10_7434473_7435447 524 T C - ATTATTAACAGAAACATTTCTTTTTCATTACCCAGGGGTTACACTGGTCGTTGATGTTAATCAGTTTTTGGAGAAGGAGAAGCAAAGTGATATTTTGTCTGTTCTGAAGCCTGCCGTTGGTAATACAAATGACGTAATCCCTGAATGTGCTGACAGGTACCATGACGCCCTGGCAAAAGCAAAAGAGCAAAAATCTAGAAGnGGTAAGCATCTTCACTGTTTAGCACAAATTAAATAGCACTTTGAATATGATGATTTCTGTGGTATTGTGTTATCTTACTTTTGAGACAAATAATCGCTTTCAAATGAATATTTCTGAATGTTTGTCATCTCTGGCAAGGAAATTTTTTAGTGTTTCTTTTCCTTTTTTGTCTTTTGGAAATCTGTGATTAACTTGGTGGC - > chr14_80021455_80022064 138 G A - ACCCAGGGATCAAACCCAGGTCTCCCGCATTGCAGGCGGATTCTTTACTGTCTGAGCCTCCAGGGAAGCCCTCGGGGCTGAAGGGATGGTTATGAAGGTGAGAAACAGGGGCCACCTGTCCCCAAGGTACCTTGCGACnTGCCATCTGCGCTCCACCAGTAAATGGACGTCTTCGATCCTTCTGTTGTTGGCGTAGTGCAAACGTTTGGGAAGGTGCTGTTTCAAGTAAGGCTTAAAGTGCTGGTCTGGTTTTTTACACTGAAATATAAATGGACATTGGATTTTGCAATGGAGAGTCTTCTAGAAGAGTCCAAGACATTCTCTCCAGAAAGCTGAAGG - > chr15_64470252_64471048 89 G A - TGTGTGTGTGTGTGTGTGTGTGTGCCTGTGTCTGTACATGCACACCACGTGGCCTCACCCAGTGCCCTCAGCTCCATGGTGATGTCCACnTAGCCGTGCTCCGCGCTGTAGTACATGGCCTCCTGGAGGGCCTTGGTGCGCGTCCGGCTCAGGCGCATGGGCCCCTCGCTGCCGCTGCCCTGGCTGGATGCATCGCTCTCTTCCACGCCCTCAGCCAGGATCTCCTCCAGGGACAGCACATCTGCTTTGGCCTGCTGTGGCTGAGTCAGGAGCTTCCTCAGGACGTTCCT - etc. 
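Each record in the output above pairs a "> scaffold position allele1 allele2" header with a sequence line in which the SNP base is masked as "n". A small illustrative Python sketch of walking such a file; the parse_records name and the one-sequence-line-per-header assumption are mine, not part of the tool::

    def parse_records(path):
        # Yield (scaffold, position, alleles, snp_offset, sequence) tuples from
        # the fasta-style flanking-DNA output; snp_offset is the index of the
        # masked SNP base ("n") within the sequence line.
        with open(path) as fh:
            header = None
            for line in fh:
                line = line.strip()
                if line.startswith('>'):
                    header = line[1:].split()
                elif header:
                    scaffold, pos, a1, a2 = header[:4]
                    yield scaffold, int(pos), (a1, a2), line.find('n'), line
                    header = None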
- - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/extract_primers.py --- a/tools/genome_diversity/extract_primers.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ -#!/usr/bin/env python2.5 - -import os -import sys -import traceback # used by the exception handler in decorated_main below -from optparse import OptionParser -import genome_diversity as gd - -def main_function( parse_arguments=None ): - if parse_arguments is None: - parse_arguments = lambda arguments: ( None, arguments ) - def main_decorator( to_decorate ): - def decorated_main( arguments=None ): - if arguments is None: - arguments = sys.argv - options, arguments = parse_arguments( arguments ) - rc = 1 - try: - rc = to_decorate( options, arguments ) - except Exception, err: - sys.stderr.write( 'ERROR: %s\n' % str( err ) ) - traceback.print_exc() - finally: - sys.exit( rc ) - return decorated_main - return main_decorator - -def parse_arguments( arguments ): - parser = OptionParser() - parser.add_option('--input', - type='string', dest='input', - help='file of selected SNPs') - parser.add_option('--output', - type='string', dest='output', - help='output file') - parser.add_option('--primers_loc', - type='string', dest='primers_loc', - help='primers .loc file') - parser.add_option('--scaffold_col', - type="int", dest='scaffold_col', - help='scaffold column in the input file') - parser.add_option('--pos_col', - type="int", dest='pos_col', - help='position column in the input file') - parser.add_option('--species', - type="string", dest='species', - help='species') - return parser.parse_args( arguments[1:] ) - - -@main_function( parse_arguments ) -def main( options, arguments ): - if not options.input: - raise RuntimeError( 'missing --input option' ) - if not options.output: - raise RuntimeError( 'missing --output option' ) - if not options.primers_loc: - raise RuntimeError( 'missing --primers_loc option' ) - if not options.scaffold_col: - raise RuntimeError( 'missing --scaffold_col option' ) - if not options.pos_col: - raise RuntimeError( 'missing --pos_col option' ) - if not options.species: - raise RuntimeError( 'missing --species option' ) - - snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) ) - - out_fh = gd._openfile( options.output, 'w' ) - - primer_data_file = gd.get_filename_from_loc( options.species, options.primers_loc ) - file_root, file_ext = os.path.splitext( primer_data_file ) - primer_index_file = file_root + ".cdb" - primers = gd.PrimersFile( data_file=primer_data_file, index_file=primer_index_file ) - - while snps.next(): - seq, pos = snps.get_seq_pos() - primer = primers.get_entry( seq, pos ) - if primer: - out_fh.write( primer ) - - out_fh.close() - -if __name__ == "__main__": - main() - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/extract_primers.xml --- a/tools/genome_diversity/extract_primers.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ - - for selected SNPs - - - extract_primers.py "--input=$input" "--output=$output" "--primers_loc=${GALAXY_DATA_INDEX_DIR}/gd.primers.loc" - #if $override_metadata.choice == "0": - "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}" - #else - "--scaffold_col=$scaf_col" "--pos_col=$pos_col" "--species=$species" - #end if - - -**What it does** - - This tool extracts primers for SNPs in the dataset using the Primer3 program.
- The first line of output for a given SNP reports the name of the assembled - contig, the SNP's position in the contig, the two variant nucleotides, and - Primer3's "pair penalty". The next line, if not blank, names restriction - enzymes (from the user-adjustable list) that differentially cut at that - site, but do not cut at any other position between and including the - primer positions. The next lines show the SNP's flanking regions, with - the SNP position indicated by "n", including the primer positions and an - additional 3 nucleotides. - ----- - -**Example** - -- input file:: - - chr5_30800874_30802049 734 G A chr5 30801606 A 24 0 99 4 11 97 Y 496 0.502 0.033 0.215 6 - chr8_55117827_55119487 994 A G chr8 55118815 G 25 0 102 4 11 96 Y 22 0.502 0.025 2.365 1 - chr9_100484836_100485311 355 C T chr9 100485200 T 27 0 108 6 17 100 Y 190 0.512 0.880 2.733 4 - chr12_3635530_3637738 2101 T C chr12 3637630 T 25 0 102 4 13 93 Y 169 0.554 0.024 0.366 4 - -- output file:: - - chr5_30800874_30802049 734 G A 0.352964 - BglII,MboI,Sau3AI,Tru9I,XhoII - 1 CTGAAGGTGAGCAGGATTCAGGAGACAGAAAACAAAGCCCAGGCCTGCCCAAGGTGGAAA - >>>>>>>>>>>>>>>>>>>> - - 61 AGTCTAACAACTCGCCCTCTGCTTAnATCTGAGACTCACAGGGATAATAACACACTTGGT - - - 121 CAAGGAATAAACTAGATATTATTCACTCCTCTAGAAGGCTGCCAGGAAAATTGCCTGACT - <<<<<<< - - 181 TGAACCTTGGCTCTGA - <<<<<<<<<<<<< - etc. - - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/genome_diversity.py --- a/tools/genome_diversity/genome_diversity.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,266 +0,0 @@ -#!/usr/bin/env python2.5 - -import sys -import cdblib - -def _openfile( filename=None, mode='r' ): - try: - fh = open( filename, mode ) - except IOError, err: - raise RuntimeError( "can't open file: %s\n" % str( err ) ) - return fh - -def get_filename_from_loc( species=None, filename=None ): - fh = _openfile( filename ) - for line in fh: - if line and not line.startswith( '#' ): - line = line.rstrip( '\r\n' ) - if line: - elems = line.split( '\t' ) - if len( elems ) >= 2 and elems[0] == species: - return elems[1] - return None - - -class SnpFile( object ): - def __init__( self, filename=None, seq_col=1, pos_col=2, ref_seq_col=7, ref_pos_col=8 ): - self.filename = filename - self.fh = _openfile( filename ) - self.seq_col = seq_col - self.pos_col = pos_col - self.ref_seq_col = ref_seq_col - self.ref_pos_col = ref_pos_col - self.elems = None - self.line = None - self.comments = [] - - def next( self ): - while self.fh: - try: - self.line = self.fh.next() - except StopIteration: - self.line = None - self.elems = None - return None - if self.line: - self.line = self.line.rstrip( '\r\n' ) - if self.line: - if self.line.startswith( '#' ): - self.comments.append( self.line ) - else: - self.elems = self.line.split( '\t' ) - return 1 - - def get_seq_pos( self ): - if self.elems: - return self.elems[ self.seq_col - 1 ], self.elems[ self.pos_col - 1 ] - else: - return None, None - - def get_ref_seq_pos( self ): - if self.elems: - return self.elems[ self.ref_seq_col - 1 ], self.elems[ self.ref_pos_col - 1 ] - else: - return None, None - - -class IndexedFile( object ): - - def __init__( self, data_file=None, index_file=None ): - self.data_file = data_file - self.index_file = index_file - self.data_fh = _openfile( data_file ) - self.index_fh = _openfile( index_file ) - self._reader = cdblib.Reader( self.index_fh.read(), hash ) - - def get_indexed_line( self, key=None ): - line = None - if key in self._reader: - offset = self._reader.getint( key ) -
self.data_fh.seek( offset ) - try: - line = self.data_fh.next() - except StopIteration: - raise RuntimeError( 'index file out of sync for %s' % key ) - return line - -class PrimersFile( IndexedFile ): - def get_primer_header( self, sequence=None, position=None ): - key = "%s %s" % ( str( sequence ), str( position ) ) - header = self.get_indexed_line( key ) - if header: - if header.startswith( '>' ): - elems = header.split() - if len( elems ) < 3: - raise RuntimeError( 'short primers header for %s' % key ) - if sequence != elems[1] or str( position ) != elems[2]: - raise RuntimeError( 'primers index for %s finds %s %s' % ( key, elems[1], elems[2] ) ) - else: - raise RuntimeError( 'primers index out of sync for %s' % key ) - return header - - def get_entry( self, sequence=None, position=None ): - entry = self.get_primer_header( sequence, position ) - if entry: - while self.data_fh: - try: - line = self.data_fh.next() - except StopIteration: - break - if line.startswith( '>' ): - break - entry += line - return entry - - def get_enzymes( self, sequence=None, position=None ): - entry = self.get_primer_header( sequence, position ) - enzyme_list = [] - if entry: - try: - line = self.data_fh.next() - except StopIteration: - raise RuntimeError( 'primers entry for %s %s is truncated' % ( str( sequence ), str( position ) ) ) - if line.startswith( '>' ): - raise RuntimeError( 'primers entry for %s %s is truncated' % ( str( sequence ), str( position ) ) ) - line = line.rstrip( '\r\n' ) - if line: - enzymes = line.split( ',' ) - for enzyme in enzymes: - enzyme = enzyme.strip() - if enzyme: - enzyme_list.append( enzyme ) - return enzyme_list - -class SnpcallsFile( IndexedFile ): - def get_snp_seq( self, sequence=None, position=None ): - key = "%s %s" % ( str( sequence ), str( position ) ) - line = self.get_indexed_line( key ) - if line: - elems = line.split( '\t' ) - if len( elems ) < 3: - raise RuntimeError( 'short snpcalls line for %s' % key ) - if sequence != elems[0] or str( position ) != elems[1]: - raise RuntimeError( 'snpcalls index for %s finds %s %s' % ( key, elems[0], elems[1] ) ) - return elems[2] - else: - return None - - def get_flanking_dna( self, sequence=None, position=None, format='fasta' ): - if format != 'fasta' and format != 'primer3': - raise RuntimeError( 'invalid format for flanking dna: %s' % str( format ) ) - seq = self.get_snp_seq( sequence, position ) - if seq: - p = seq.find('[') - if p == -1: - raise RuntimeError( 'snpcalls entry for %s %s missing left bracket: %s' % ( str( sequence ), str( position ), seq ) ) - q = seq.find(']', p + 1) - if q == -1: - raise RuntimeError( 'snpcalls entry for %s %s missing right bracket: %s' % ( str( sequence ), str( position ), seq ) ) - q += 1 - - if format == 'fasta': - flanking_seq = '> ' - else: - flanking_seq = 'SEQUENCE_ID=' - - flanking_seq += "%s %s %s %s\n" % ( str( sequence ), str( position ), seq[p+1], seq[p+3] ) - - if format == 'primer3': - flanking_seq += 'SEQUENCE_TEMPLATE=' - - flanking_seq += "%sn%s\n" % ( seq[0:p], seq[q:] ) - - if format == 'primer3': - flanking_seq += "SEQUENCE_TARGET=%d,11\n=\n" % ( p - 5 ) - - return flanking_seq - else: - return None - - - -class LocationFile( object ): - def __init__(self, filename): - self.build_map(filename) - - def build_map(self, filename): - self.map = {} - self.open_file(filename) - for line in self.read_lines(): - elems = line.split('\t', 1) - if len(elems) == 2: - self.map[ elems[0].strip() ] = elems[1].strip() - self.close_file() - - def read_lines(self): - for line in self.fh: - if
not line.startswith('#'): - line = line.rstrip('\r\n') - yield line - - def open_file(self, filename): - self.filename = filename - try: - self.fh = open(filename, 'r') - except IOError, err: - print >> sys.stderr, "Error opening location file '%s': %s" % (filename, str(err)) - sys.exit(1) - - def close_file(self): - self.fh.close() - - def loc_file( self, key ): - if key in self.map: - return self.map[key] - else: - print >> sys.stderr, "'%s' does not appear in location file '%s'" % (key, self.filename) - sys.exit(1) - -class ChrLens( object ): - def __init__( self, location_file, species ): - self.chrlen_loc = LocationFile( location_file ) - self.chrlen_filename = self.chrlen_loc.loc_file( species ) - self.build_map() - - def build_map(self): - self.map = {} - self.open_file(self.chrlen_filename) - for line in self.read_lines(): - elems = line.split('\t', 1) - if len(elems) == 2: - chrom = elems[0].strip() - chrom_len_text = elems[1].strip() - try: - chrom_len = int( chrom_len_text ) - except ValueError: - print >> sys.stderr, "Bad length '%s' for chromosome '%s' in '%s'" % (chrom_len_text, chrom, self.chrlen_filename) - sys.exit(1) - self.map[ chrom ] = chrom_len - self.close_file() - - def read_lines(self): - for line in self.fh: - if not line.startswith('#'): - line = line.rstrip('\r\n') - yield line - - def open_file(self, filename): - self.filename = filename - try: - self.fh = open(filename, 'r') - except IOError, err: - print >> sys.stderr, "Error opening chromosome length file '%s': %s" % (filename, str(err)) - sys.exit(1) - - def close_file(self): - self.fh.close() - - def length( self, key ): - if key in self.map: - return self.map[key] - else: - return None - - def __iter__( self ): - for chrom in self.map: - yield chrom - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/select_restriction_enzymes.py --- a/tools/genome_diversity/select_restriction_enzymes.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,103 +0,0 @@ -#!/usr/bin/env python2.5 - -import os -import sys -import traceback # used by the exception handler in decorated_main below -from optparse import OptionParser -import genome_diversity as gd - -def main_function( parse_arguments=None ): - if parse_arguments is None: - parse_arguments = lambda arguments: ( None, arguments ) - def main_decorator( to_decorate ): - def decorated_main( arguments=None ): - if arguments is None: - arguments = sys.argv - options, arguments = parse_arguments( arguments ) - rc = 1 - try: - rc = to_decorate( options, arguments ) - except Exception, err: - sys.stderr.write( 'ERROR: %s\n' % str( err ) ) - traceback.print_exc() - finally: - sys.exit( rc ) - return decorated_main - return main_decorator - -def parse_arguments( arguments ): - parser = OptionParser() - parser.add_option('--input', - type='string', dest='input', - help='file of selected SNPs') - parser.add_option('--output', - type='string', dest='output', - help='output file') - parser.add_option('--primers_loc', - type='string', dest='primers_loc', - help='primers .loc file') - parser.add_option('--scaffold_col', - type="int", dest='scaffold_col', - help='scaffold column in the input file') - parser.add_option('--pos_col', - type="int", dest='pos_col', - help='position column in the input file') - parser.add_option('--enzyme_list', - type="string", dest='enzyme_list_string', - help='comma separated list of enzymes') - parser.add_option('--species', - type="string", dest='species', - help='species') - return parser.parse_args( arguments[1:] ) - - -@main_function( parse_arguments ) -def main( options, arguments ): - if not
options.input: - raise RuntimeError( 'missing --input option' ) - if not options.output: - raise RuntimeError( 'missing --output option' ) - if not options.primers_loc: - raise RuntimeError( 'missing --primers_loc option' ) - if not options.scaffold_col: - raise RuntimeError( 'missing --scaffold_col option' ) - if not options.pos_col: - raise RuntimeError( 'missing --pos_col option' ) - if not options.enzyme_list_string: - raise RuntimeError( 'missing --enzyme_list option' ) - if not options.species: - raise RuntimeError( 'missing --species option' ) - - snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) ) - - out_fh = gd._openfile( options.output, 'w' ) - - enzyme_dict = {} - for enzyme in options.enzyme_list_string.split( ',' ): - enzyme = enzyme.strip() - if enzyme: - enzyme_dict[enzyme] = 1 - - primer_data_file = gd.get_filename_from_loc( options.species, options.primers_loc ) - file_root, file_ext = os.path.splitext( primer_data_file ) - primer_index_file = file_root + ".cdb" - primers = gd.PrimersFile( data_file=primer_data_file, index_file=primer_index_file ) - - comments_printed = False - - while snps.next(): - seq, pos = snps.get_seq_pos() - enzyme_list = primers.get_enzymes( seq, pos ) - for enzyme in enzyme_list: - if enzyme in enzyme_dict: - if not comments_printed: - for comment in snps.comments: - out_fh.write( "%s\n" % comment ) - comments_printed = True - out_fh.write( "%s\n" % snps.line ) - break - - out_fh.close() - -if __name__ == "__main__": - main() - - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/select_restriction_enzymes.xml --- a/tools/genome_diversity/select_restriction_enzymes.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ - - a set of restriction enzymes - - - select_restriction_enzymes.py "--input=$input" "--output=$output" "--primers_loc=${GALAXY_DATA_INDEX_DIR}/gd.primers.loc" - #if $override_metadata.choice == "0": - "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}" - #else - "--scaffold_col=$scaf_col" "--pos_col=$pos_col" "--species=$species" - #end if - "--enzyme_list=$enzymes" - - -**What it does** - - It selects the SNPs that are differentially cut by at least one of the - specified restriction enzymes. The enzymes are required to cut the amplified - segment (for the specified PCR primers) only at the SNP.
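At its core, the selection loop above is a set-membership test over the enzymes that the primer index reports for each SNP. A stripped-down sketch of just that logic; the enzymes_for callable is a hypothetical stand-in for gd.PrimersFile.get_enzymes::

    # chosen mirrors the --enzyme_list option; enzymes_for(seq, pos) stands in
    # for gd.PrimersFile.get_enzymes and returns a list of enzyme names.
    def select_lines(snp_lines, enzymes_for, chosen):
        for line in snp_lines:
            elems = line.split('\t')
            seq, pos = elems[0], elems[1]  # scaffold/position columns
            if any(enzyme in chosen for enzyme in enzymes_for(seq, pos)):
                yield line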
- ------ - -**Example** - -- input file:: - - chr2_75111355_75112576 314 A C L F chr2 75111676 C F 15 4 53 2 9 48 Y 96 0.369 0.355 0.396 0 - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr10_7434473_7435447 524 T C S S chr10 7435005 T S 11 5 90 14 0 69 Y 626 0.066 0.406 0.727 0 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr15_64470252_64471048 89 G A Y Y chr15 64470341 G Y 5 6 109 14 0 69 Y 312 0.247 0.998 0.393 0 - chr18_48070585_48071386 514 C T E K chr18 48071100 T K 7 7 46 14 0 69 Y 2 0.200 0.032 0.163 0 - chr18_50154905_50155664 304 A G Y C chr18 50155208 A Y 4 2 17 5 1 22 Y 8 0.022 0.996 0.128 0 - chr18_57379354_57380496 315 C T V V chr18 57379669 G V 11 0 60 9 6 62 Y 726 0.118 0.048 0.014 1 - chr19_14240610_14242055 232 C T A V chr19 14240840 C A 18 8 56 15 5 42 Y 73 0.003 0.153 0.835 0 - chr19_39866997_39874915 3117 C T P P chr19 39870110 C P 3 7 65 14 2 32 Y 6 0.321 0.911 0.462 4 - etc. - -- output file:: - - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr18_57379354_57380496 315 C T V V chr18 57379669 G V 11 0 60 9 6 62 Y 726 0.118 0.048 0.014 1 - chr19_39866997_39874915 3117 C T P P chr19 39870110 C P 3 7 65 14 2 32 Y 6 0.321 0.911 0.462 4 - etc. - - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/select_snps.py --- a/tools/genome_diversity/select_snps.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,148 +0,0 @@ -#!/usr/bin/env python - -import sys -import math -from optparse import OptionParser -import genome_diversity as gd - -def main_function(parse_arguments=None): - if parse_arguments is None: - parse_arguments = lambda arguments: (None, arguments) - def main_decorator(to_decorate): - def decorated_main(arguments=None): - if arguments is None: - arguments = sys.argv - options, arguments = parse_arguments(arguments) - sys.exit(to_decorate(options, arguments)) - return decorated_main - return main_decorator - -def parse_arguments(arguments): - parser = OptionParser() - parser.add_option('--input', dest='input') - parser.add_option('--output', dest='output') - parser.add_option('--chrlens_loc', dest='chrlens_loc') - parser.add_option('--num_snps', dest='num_snps') - parser.add_option('--ref_chrom_col', dest='ref_chrom_col') - parser.add_option('--ref_pos_col', dest='ref_pos_col') - parser.add_option('--species', dest='species') - return parser.parse_args(arguments[1:]) - -@main_function(parse_arguments) -def main(options, arguments): - - ref_chrom_idx = to_int( options.ref_chrom_col ) -1 - ref_pos_idx = to_int( options.ref_pos_col ) -1 - - if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx): - print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset." 
- sys.exit(1) - - chrlens = gd.ChrLens( options.chrlens_loc, options.species ) - - total_len = 0 - for chrom in chrlens: - total_len += chrlens.length(chrom) - - total_requested = int( options.num_snps ) - lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx ) - selected = select_snps( data, total_len, total_requested ) - out_data = fix_selection_and_order_like_input(data, selected, total_requested) - write_selected_snps( options.output, out_data, lines, comments ) - -def to_int( value ): - try: - int_value = int( value ) - except ValueError: - int_value = 0 - return int_value - -def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ): - fh = open( filename, 'r' ) - if (chrom_idx >= pos_idx): - needed = chrom_idx + 1 - else: - needed = pos_idx + 1 - lines = [] - data = [] - comments = [] - line_idx = 0 - line_num = 0 - for line in fh: - line_num += 1 - line = line.rstrip('\r\n') - if line: - if line.startswith('#'): - comments.append(line) - else: - elems = line.split('\t') - if len(elems) >= needed: - chrom = elems[chrom_idx] - try: - pos = int(elems[pos_idx]) - except ValueError: - sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) ) - sys.exit(1) - lines.append(line) - chrom_sort = chrom.lstrip('chr') - data.append( [chrom_sort, chrom, pos, line_num, line_idx] ) - line_idx += 1 - fh.close() - data = sorted( data, key=lambda x: (x[0], x[2]) ) - return lines, data, comments - -def select_snps( data, total_len, requested ): - old_chrom = None - next_print = 0 - selected = [] - space = total_len / requested - for data_idx, datum in enumerate( data ): - chrom = datum[1] - pos = datum[2] - if chrom != old_chrom: - old_chrom = chrom - next_print = 0 - if pos >= next_print: - selected.append(data_idx) - next_print += space - return selected - -def fix_selection_and_order_like_input(data, selected, requested): - total_selected = len( selected ) - a = float( total_selected ) / requested - b = a / 2 - - idx_list = [] - for i in range( requested ): - idx = int( math.ceil( i * a + b ) - 1 ) - idx_list.append( idx ) - - out_data = [] - - for i, data_idx in enumerate(selected): - if total_selected > requested: - if i in idx_list: - out_data.append(data[data_idx]) - else: - out_data.append(data[data_idx]) - - out_data = sorted( out_data, key=lambda x: x[3] ) - - return out_data - -def write_selected_snps( filename, data, lines, comments ): - fh = open( filename, 'w' ) - - for comment in comments: - fh.write("%s\n" % comment ) - - for datum in data: - line_idx = datum[4] - fh.write("%s\n" % lines[line_idx]) - - fh.close() - -if __name__ == "__main__": - main() - - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/select_snps.xml --- a/tools/genome_diversity/select_snps.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ - - a specified number of SNPs - - - select_snps.py "--input=$input" "--output=$output" "--chrlens_loc=${GALAXY_DATA_INDEX_DIR}/gd.chrlens.loc" "--num_snps=$num_snps" - #if $override_metadata.choice == "0": - "--ref_chrom_col=${input.metadata.ref}" "--ref_pos_col=${input.metadata.rPos}" "--species=${input.metadata.species}" - #else - "--ref_chrom_col=$ref_col" "--ref_pos_col=$rpos_col" "--species=$species" - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - - It attempts to select a specified number of SNPs from the dataset, making them - approximately 
uniformly spaced relative to the reference genome. The number - actually selected may be slightly more than the specified number. - ------ - -**Example** - -- input file:: - - chr2_75111355_75112576 314 A C L F chr2 75111676 C F 15 4 53 2 9 48 Y 96 0.369 0.355 0.396 0 - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr10_7434473_7435447 524 T C S S chr10 7435005 T S 11 5 90 14 0 69 Y 626 0.066 0.406 0.727 0 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr15_64470252_64471048 89 G A Y Y chr15 64470341 G Y 5 6 109 14 0 69 Y 312 0.247 0.998 0.393 0 - chr18_48070585_48071386 514 C T E K chr18 48071100 T K 7 7 46 14 0 69 Y 2 0.200 0.032 0.163 0 - chr18_50154905_50155664 304 A G Y C chr18 50155208 A Y 4 2 17 5 1 22 Y 8 0.022 0.996 0.128 0 - chr18_57379354_57380496 315 C T V V chr18 57379669 G V 11 0 60 9 6 62 Y 726 0.118 0.048 0.014 1 - chr19_14240610_14242055 232 C T A V chr19 14240840 C A 18 8 56 15 5 42 Y 73 0.003 0.153 0.835 0 - chr19_39866997_39874915 3117 C T P P chr19 39870110 C P 3 7 65 14 2 32 Y 6 0.321 0.911 0.462 4 - etc. - -- output file:: - - chr2_75111355_75112576 314 A C L F chr2 75111676 C F 15 4 53 2 9 48 Y 96 0.369 0.355 0.396 0 - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr10_7434473_7435447 524 T C S S chr10 7435005 T S 11 5 90 14 0 69 Y 626 0.066 0.406 0.727 0 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr15_64470252_64471048 89 G A Y Y chr15 64470341 G Y 5 6 109 14 0 69 Y 312 0.247 0.998 0.393 0 - chr18_48070585_48071386 514 C T E K chr18 48071100 T K 7 7 46 14 0 69 Y 2 0.200 0.032 0.163 0 - chr19_14240610_14242055 232 C T A V chr19 14240840 C A 18 8 56 15 5 42 Y 73 0.003 0.153 0.835 0 - etc. - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/BEAM2_wrapper.sh --- a/tools/human_genome_variation/BEAM2_wrapper.sh Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ -#!/usr/bin/env bash -# -# Galaxy wrapper for Yu Zhang's BEAM2 adds two new options -# significance=foo renames significance.txt to foo after BEAM2 is run -# posterior=bar renames posterior.txt to bar after BEAM2 is run -# - -set -e - -export PATH=$PATH:$(dirname $0) - -## options -significance= -posterior= -new_args= -map= -ped= - -TFILE="/tmp/BEAM2.$$.tmp" - -## separate significance and posterior arguments from arguments to BEAM2 -until [ $# -eq 0 ] -do - case $1 in - significance=*) - significance=${1#significance=} - ;; - posterior=*) - posterior=${1#posterior=} - ;; - map=*) - map=${1#map=} - ;; - ped=*) - ped=${1#ped=} - ;; - *) - if [ -z "$new_args" ]; then - new_args=$1 - else - new_args="$new_args $1" - fi - ;; - esac - - shift -done - -## convert input for use with BEAM2 -lped_to_geno.pl $map $ped > $TFILE -if [ $? -ne 0 ]; then - echo "failed: lped_to_geno.pl $map $ped > $TFILE" - exit 1 -fi - -## run BEAM2 -BEAM2 $TFILE $new_args 1>/dev/null -if [ $? -ne 0 ]; then - echo "failed: BEAM2 $TFILE $new_args" - exit 1 -fi - -mergeSnps.pl significance.txt $TFILE -if [ $? 
-ne 0 ]; then - echo "failed: mergeSnps.pl significance.txt $TFILE" - exit 1 -fi - -## move output files -mv significance.txt $significance -mv posterior.txt $posterior - -## cleanup -rm -f $TFILE - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/beam.xml --- a/tools/human_genome_variation/beam.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,137 +0,0 @@ - - significant single- and multi-locus SNP associations in case-control studies - - - BEAM2_wrapper.sh map=${input.extra_files_path}/${input.metadata.base_name}.map ped=${input.extra_files_path}/${input.metadata.base_name}.ped $burnin $mcmc $pvalue significance=$significance posterior=$posterior - - - - - - - - - - - - - - - - beam - mv - rm - - - - - -.. class:: infomark - -This tool can take a long time to run, depending on the number of SNPs, the -sample size, and the number of MCMC steps specified. If you have hundreds -of thousands of SNPs, it may take over a day. The main tasks that slow down -this tool are searching for interactions and dynamically partitioning the -SNPs into blocks. Optimization is certainly possible, but hasn't been done -yet. **If your only interest is to detect SNPs with primary effects (i.e., -single-SNP associations), please use the GPASS tool instead.** - ------ - -**Dataset formats** - -The input dataset must be in lped_ format. The output datasets are both tabular_. -(`Dataset missing?`_) - -.. _lped: ./static/formatHelp.html#lped -.. _tabular: ./static/formatHelp.html#tabular -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -BEAM (Bayesian Epistasis Association Mapping) uses a Markov Chain Monte Carlo (MCMC) method to infer SNP block structures and detect both single-marker -and interaction effects from case-control SNP data. -This tool also partitions SNPs into blocks based on linkage disequilibrium (LD). The method utilized is Bayesian, so the outputs are posterior probabilities of association, along with block partitions. An advantage of this method is that it provides uncertainty measures for the associations and block partitions, and it scales well from small to large sample sizes. It is powerful in detecting gene-gene interactions, although slow for large datasets. - ------ - -**Example** - -- input map file:: - - 1 rs0 0 738547 - 1 rs1 0 5597094 - 1 rs2 0 9424115 - etc. - -- input ped file:: - - 1 1 0 0 1 1 G G A A A A A A A A A G A A G G G G A A G G G G G G A A A A A G A A G G A G A G A A G G A A G G A A G G A G A A G G A A G G A A A G A G G G A G G G G G A A A G A A G G G G G G G G A G A A A A A A A A - 1 1 0 0 1 1 G G A G G G A A A A A G A A G G G G G G A A G G A G A G G G G G A G G G A G A A G G A G G G A A G G G G A G A G G G A G A A A A G G G G A G A G G G A G A A A A A G G G A G G G A G G G G G A A G G A G - etc. - -- first output file, significance.txt:: - - ID chr position results - rs0 chr1 738547 10 20 score= 45.101397 , df= 8 , p= 0.000431 , N=1225 - -- second output file, posterior.txt:: - - id: chr position marginal + interaction = total posterior - 0: 1 738547 0.0000 + 0.0000 = 0.0000 - 1: 1 5597094 0.0000 + 0.0000 = 0.0000 - 2: 1 9424115 0.0000 + 0.0000 = 0.0000 - 3: 1 13879818 0.0000 + 0.0000 = 0.0000 - 4: 1 13934751 0.0000 + 0.0000 = 0.0000 - 5: 1 16803491 0.0000 + 0.0000 = 0.0000 - 6: 1 17236854 0.0000 + 0.0000 = 0.0000 - 7: 1 18445387 0.0000 + 0.0000 = 0.0000 - 8: 1 21222571 0.0000 + 0.0000 = 0.0000 - etc. 
- - id: chr position block_boundary | allele counts in cases and controls - 0: 1 738547 1.000 | 156 93 251 | 169 83 248 - 1: 1 5597094 1.000 | 323 19 158 | 328 16 156 - 2: 1 9424115 1.000 | 366 6 128 | 369 11 120 - 3: 1 13879818 1.000 | 252 31 217 | 278 32 190 - 4: 1 13934751 1.000 | 246 64 190 | 224 58 218 - 5: 1 16803491 1.000 | 91 160 249 | 91 174 235 - 6: 1 17236854 1.000 | 252 43 205 | 249 44 207 - 7: 1 18445387 1.000 | 205 66 229 | 217 56 227 - 8: 1 21222571 1.000 | 353 9 138 | 352 8 140 - etc. - - The "id" field is an internally used index. - ------ - -**References** - -Zhang Y, Liu JS. (2007) -Bayesian inference of epistatic interactions in case-control studies. -Nat Genet. 39(9):1167-73. Epub 2007 Aug 26. - -Zhang Y, Zhang J, Liu JS. (2010) -Block-based bayesian epistasis association mapping with application to WTCCC type 1 diabetes data. -Submitted. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/ctd.pl --- a/tools/human_genome_variation/ctd.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -use LWP::UserAgent; -require HTTP::Cookies; - -####################################################### -# ctd.pl -# Submit a batch query to CTD and fetch results into galaxy history -# usage: ctd.pl inFile idCol inputType resultType actionType outFile -####################################################### - -if (!@ARGV or scalar @ARGV != 6) { - print "usage: ctd.pl inFile idCol inputType resultType actionType outFile\n"; - exit; -} - -my $in = shift @ARGV; -my $col = shift @ARGV; -if ($col < 1) { - print "The column number is with a 1 start\n"; - exit 1; -} -my $type = shift @ARGV; -my $resType = shift @ARGV; -my $actType = shift @ARGV; -my $out = shift @ARGV; - -my @data; -open(FH, $in) or die "Couldn't open $in, $!\n"; -while () { - chomp; - my @f = split(/\t/); - if (scalar @f < $col) { - print "ERROR the requested column is not in the file $col\n"; - exit 1; - } - push(@data, $f[$col-1]); -} -close FH or die "Couldn't close $in, $!\n"; - -my $url = 'http://ctd.mdibl.org/tools/batchQuery.go'; -#my $url = 'http://globin.bx.psu.edu/cgi-bin/print-query'; -my $d = join("\n", @data); -#list maintains order, where hash doesn't -#order matters at ctd -#to use input file (gives error can't find file) -#my @form = ('inputType', $type, 'inputTerms', '', 'report', $resType, - #'queryFile', [$in, ''], 'queryFileColumn', $col, 'format', 'tsv', 'action', 'Submit'); -my @form = ('inputType', $type, 'inputTerms', $d, 'report', $resType, - 'queryFile', '', 'format', 'tsv', 'action', 'Submit'); -if ($resType eq 'cgixns') { #only add if this type - push(@form, 'actionTypes', $actType); -} -my $ua = LWP::UserAgent->new; -$ua->cookie_jar(HTTP::Cookies->new( () )); -$ua->agent('Mozilla/5.0'); -my $page = $ua->post($url, \@form, 'Content_Type'=>'form-data'); -if ($page->is_success) { - open(FH, ">", $out) or die "Couldn't open $out, $!\n"; - print FH "#"; - print FH $page->content, "\n"; - close FH or die "Couldn't close $out, $!\n"; -}else { - print "ERROR failed to get page from CTD, ", $page->status_line, "\n"; - print $page->content, "\n"; - my $req = $page->request(); - print "Requested \n"; - foreach my $k(keys %$req) { - if ($k eq '_headers') { - my $t = $req->{$k}; - foreach my $k2 (keys %$t) { print "$k2 => $t->{$k2}\n"; } - }else { print "$k => $req->{$k}\n"; } - } - exit 1; -} -exit; - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/ctd.xml --- 
a/tools/human_genome_variation/ctd.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,320 +0,0 @@ - - analysis of chemicals, diseases, or genes - - - ctd.pl $input $numerical_column $inType.inputType - #if $inType.inputType == "disease" - $inType.report ANY - #else if $inType.reportType.report == "cgixns" - $inType.reportType.report $inType.reportType.actType - #else - $inType.reportType.report ANY - #end if - $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Dataset formats** - -The input and output datasets are tabular_. -(`Dataset missing?`_) - -.. _tabular: ./static/formatHelp.html#tab -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -This tool extracts data related to the provided list of identifiers -from the Comparative Toxicogenomics Database (CTD). The fields -extracted vary with the type of data requested; the first row -of the output identifies the columns. - -For the curated chemical-gene interactions, you can also choose the -interaction type from the search-and-select box. The choices that -start with '-' are a subset of a choice above them; you can chose -either the general interaction type or a more specific one. - -Website: http://ctd.mdibl.org/ - ------ - -**Examples** - -- input data file: - HBB - -- select Column = c1, Identifier type = Genes, and Data to extract = All disease relationships - -- output file:: - - #Input GeneSymbol GeneName GeneID DiseaseName DiseaseID GeneDiseaseRelation OmimIDs PubMedIDs - hbb HBB hemoglobin, beta 3043 Abnormalities, Drug-Induced MESH:D000014 inferred via Ethanol 17676605|18926900 - hbb HBB hemoglobin, beta 3043 Abnormalities, Drug-Induced MESH:D000014 inferred via Valproic Acid 8875741 - etc. - -Another example: - -- same input file: - HBB - -- select Column = c1, Identifier type = Genes, Data to extract = Curated chemical-gene interactions, and Interaction type = ANY - -- output file:: - - #Input GeneSymbol GeneName GeneID ChemicalName ChemicalID CasRN Organism OrganismID Interaction InteractionTypes PubMedIDs - hbb HBB hemoglobin, beta 3043 1-nitronaphthalene C016614 86-57-7 Macaca mulatta 9544 1-nitronaphthalene metabolite binds to HBB protein binding 16453347 - hbb HBB hemoglobin, beta 3043 2,6-diisocyanatotoluene C026942 91-08-7 Cavia porcellus 10141 2,6-diisocyanatotoluene binds to HBB protein binding 8728499 - etc. - ------ - -**Reference** - -Davis AP, Murphy CG, Saraceni-Richards CA, Rosenstein MC, Wiegers TC, Mattingly CJ. (2009) -Comparative Toxicogenomics Database: a knowledgebase and discovery tool for -chemical-gene-disease networks. -Nucleic Acids Res. 37(Database issue):D786-92. Epub 2008 Sep 9. 
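For reference, the batch query that ctd.pl submits can be reproduced outside Galaxy. The sketch below is an approximation in Python: the endpoint and field names are the ones hard-coded in ctd.pl, but the identifier and report values are illustrative, and ctd.pl actually posts multipart form data rather than a urlencoded body, so details may need adjusting::

    import urllib.parse
    import urllib.request

    # Field names as in ctd.pl; values shown here are examples only.
    form = [
        ("inputType", "gene"),
        ("inputTerms", "HBB"),     # newline-separated identifiers
        ("report", "cgixns"),      # curated chemical-gene interactions
        ("actionTypes", "ANY"),    # ctd.pl sends this only for the cgixns report
        ("queryFile", ""),
        ("format", "tsv"),
        ("action", "Submit"),
    ]
    data = urllib.parse.urlencode(form).encode()
    reply = urllib.request.urlopen("http://ctd.mdibl.org/tools/batchQuery.go", data)
    print(reply.read()[:200])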
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/disease_ontology_gene_fuzzy_selector.pl --- a/tools/human_genome_variation/disease_ontology_gene_fuzzy_selector.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -################################################################## -# Select genes that are associated with the diseases listed in the -# disease ontology. -# ontology: http://do-wiki.nubic.northwestern.edu/index.php/Main_Page -# gene associations by FunDO: http://projects.bioinformatics.northwestern.edu/do_rif/ -# Sept 2010, switch to doLite -# input: build outfile sourceFileLoc.loc term or partial term -################################################################## - -if (!@ARGV or @ARGV < 3) { - print "usage: disease_ontology_gene_selector.pl build outfile.txt sourceFile.loc [list of terms]\n"; - exit; -} - -my $build = shift @ARGV; -my $out = shift @ARGV; -my $in = shift @ARGV; -my $term = shift @ARGV; -$term =~ s/^'//; #remove quotes protecting from shell -$term =~ s/'$//; -my $data; -open(LOC, $in) or die "Couldn't open $in, $!\n"; -while () { - chomp; - if (/^\s*#/) { next; } - my @f = split(/\t/); - if ($f[0] eq $build) { - if ($f[1] eq 'disease associated genes') { - $data = $f[2]; - } - } -} -close LOC or die "Couldn't close $in, $!\n"; -if (!$data) { - print "Error $build not found in $in\n"; - exit; -} -if (!defined $term) { - print "No disease term entered\n"; - exit; -} - -#start with just fuzzy term matches -open(OUT, ">", $out) or die "Couldn't open $out, $!\n"; -open(FH, $data) or die "Couldn't open data file $data, $!\n"; -$term =~ s/\s+/|/g; #use OR between words -while () { - chomp; - my @f = split(/\t/); #chrom start end strand geneName geneID disease - if ($f[6] =~ /($term)/i) { - print OUT join("\t", @f), "\n"; - }elsif ($term eq 'disease') { #print all with disease - print OUT join("\t", @f), "\n"; - } -} -close FH or die "Couldn't close data file $data, $!\n"; -close OUT or die "Couldn't close $out, $!\n"; - -exit; diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/freebayes.xml --- a/tools/human_genome_variation/freebayes.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,124 +0,0 @@ - - - - freebayes - - Bayesian genetic variant detector - - ln -s $reference localref.fa; - ln -s $bamfile localbam.bam; - samtools faidx localref.fa; - samtools sort localbam.bam localbam.bam; - samtools index localbam.bam; - freebayes --fasta-reference localref.fa localbam.bam --vcf $output - #if $params.source_select == "full": - $params.showRefRepeats - -T $params.theta - -p $params.ploidy - $params.pooled - $params.mnps - $params.nosnps - -n $params.bestAlleles - $params.allAlleles - $params.duplicateReads - -M $params.refMapQuality - $params.ignoreRefAllele - $params.haploidReference - -m $params.minMapQuality - -q $params.minBaseQuality - $params.noFilters - -x $params.indelExclusionWindow - - -V $params.diffusionPriorScalar - -W $params.postIntegBandwidth - -Y $params.postIntegBanddepth - -F $params.minAltFraction - -C $params.minAltCount - -G $params.minAltTotal - --min-coverage $params.minCoverage - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool uses Freebayes to call SNPS given a reference sequence and a BAM alignment file. 
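The wrapper's preparation steps are worth spelling out: index the reference, coordinate-sort and index the BAM, then run the caller. A rough Python equivalent of those steps, assuming samtools of that era and freebayes are on $PATH and using illustrative file names::

    import subprocess

    ref, bam, vcf = "localref.fa", "localbam.bam", "out.vcf"   # illustrative names

    for cmd in (
        ["samtools", "faidx", ref],                 # writes localref.fa.fai
        ["samtools", "sort", bam, "sorted"],        # old-style sort: writes sorted.bam
        ["samtools", "index", "sorted.bam"],
        ["freebayes", "--fasta-reference", ref, "sorted.bam", "--vcf", vcf],
    ):
        subprocess.run(cmd, check=True)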
- - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/funDo.xml --- a/tools/human_genome_variation/funDo.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ - - human genes associated with disease terms - - - disease_ontology_gene_fuzzy_selector.pl $build $out_file1 ${GALAXY_DATA_INDEX_DIR}/funDo.loc '$term' - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Dataset formats** - -There is no input dataset. The output is in interval_ format. - -.. _interval: ./static/formatHelp.html#interval - ------ - -**What it does** - -This tool searches the disease-term field of the DOLite mappings -used by the FunDO project and returns a set of genes that -are associated with terms matching the specified pattern. (This is the -reverse of what FunDO's own server does.) - -The search is case insensitive, and selects terms that contain any of -the given words, either exactly or within a longer word (e.g. "nemia" -selects not only "anemia", but also "hyperglycinemia", "tyrosinemias", -and many other things). Multiple words should be separated by spaces, -not commas. As a special case, entering the word "disease" returns all -genes associated with any disease, even if that word does not actually -appear in the term field. - -Website: http://django.nubic.northwestern.edu/fundo/ - ------ - -**Example** - -Typing:: - - carcinoma - -results in:: - - 1. 2. 3. 4. 5. 6. 7. - chr11 89507465 89565427 + NAALAD2 10003 Adenocarcinoma - chr15 50189113 50192264 - BCL2L10 10017 Carcinoma - chr7 150535855 150555250 - ABCF2 10061 Clear cell carcinoma - chr7 150540508 150555250 - ABCF2 10061 Clear cell carcinoma - chr10 134925911 134940397 - ADAM8 101 Adenocarcinoma - chr10 134925911 134940397 - ADAM8 101 Adenocarcinoma - etc. - -where the column contents are as follows:: - - 1. chromosome name - 2. start position of the gene - 3. end position of the gene - 4. strand - 4. gene name - 6. Entrez Gene ID - 7. disease term - ------ - -**References** - -Du P, Feng G, Flatow J, Song J, Holko M, Kibbe WA, Lin SM. (2009) -From disease ontology to disease-ontology lite: statistical methods to adapt a general-purpose -ontology for the test of gene-ontology associations. -Bioinformatics. 25(12):i63-8. - -Osborne JD, Flatow J, Holko M, Lin SM, Kibbe WA, Zhu LJ, Danila MI, Feng G, Chisholm RL. (2009) -Annotating the human genome with Disease Ontology. -BMC Genomics. 10 Suppl 1:S6. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/gpass.pl --- a/tools/human_genome_variation/gpass.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; -use File::Basename; -use File::Temp qw/ tempfile /; - -$ENV{'PATH'} .= ':' . 
dirname($0); - -#this is a wrapper for gpass that converts a linkage pedigree file to input -#for this program - -my($map, $ped, $out, $fdr) = @ARGV; - -if (!$map or !$ped or !$out or !$fdr) { die "missing args\n"; } - -my($fh, $name) = tempfile(); -#by default this file is removed when these variable go out of scope -print $fh "map=$map ped=$ped\n"; -close $fh; #converter will overwrite, just keep name - -#run converter -system("lped_to_geno.pl $map $ped > $name") == 0 - or die "system lped_to_geno.pl $map $ped > $name failed\n"; - -#system("cp $name tmp.middle"); - -#run GPASS -system("gpass $name -o $out -fdr $fdr 1>/dev/null") == 0 - or die "system gpass $name -o $out -fdr $fdr, failed\n"; - -#merge SNP data with results -merge(); - -exit; - -######################################## - -#merge the input and output files so have SNP data with result -sub merge { - open(FH, $out) or die "Couldn't open $out, $!\n"; - my %res; - my @ind; - while () { - chomp; - my $line = $_; - if ($line =~ /^(\d+)/) { $res{$1} = $line; push(@ind, $1); } - else { $res{'index'} = $line; } - } - close FH; - if (!@ind) { return; } #no results, leave alone - @ind = sort { $a <=> $b } @ind; - $res{'index'} =~ s/Index/#ID\tchr\tposition/; - #read input file to get SNP data - open(FH, $name) or die "Couldn't open $name, $!\n"; - my $i = 0; #index is 0 based not counting header line - my $c = shift @ind; - while () { - chomp; - if (/^ID/) { next; } - my @f = split(/\s+/); - if ($i == $c) { - $res{$i} =~ s/^$i/$f[0]\t$f[1]\t$f[2]/; - if (!@ind) { last; } - $c = shift @ind; - } - $i++; - } - close FH; - #now reprint results with SNP data included - open(FH, ">", $out) or die "Couldn't write to $out, $!\n"; - print FH $res{'index'}, "\n"; - delete $res{'index'}; - foreach $i (keys %res) { - print FH $res{$i}, "\n"; - } - close FH; -} - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/gpass.xml --- a/tools/human_genome_variation/gpass.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ - - significant single-SNP associations in case-control studies - - - gpass.pl ${input1.extra_files_path}/${input1.metadata.base_name}.map ${input1.extra_files_path}/${input1.metadata.base_name}.ped $output $fdr - - - - - - - - - - - - - gpass - - - - - -**Dataset formats** - -The input dataset must be in lped_ format, and the output is tabular_. -(`Dataset missing?`_) - -.. _lped: ./static/formatHelp.html#lped -.. _tabular: ./static/formatHelp.html#tab -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -GPASS (Genome-wide Poisson Approximation for Statistical Significance) -detects significant single-SNP associations in case-control studies at a user-specified FDR. Unlike previous methods, this tool can accurately approximate the genome-wide significance and FDR of SNP associations, while adjusting for millions of multiple comparisons, within seconds or minutes. - -The program has two main functionalities: - -1. Detect significant single-SNP associations at a user-specified false - discovery rate (FDR). - - *Note*: a "typical" definition of FDR could be - FDR = E(# of false positive SNPs / # of significant SNPs) - - This definition however is very inappropriate for association mapping, since SNPs are - highly correlated. Our FDR is - defined differently to account for SNP correlations, and thus will obtain - a proper FDR in terms of "proportion of false positive loci". - -2. 
Approximate the significance of a list of candidate SNPs, adjusting for - multiple comparisons. If you have isolated a few SNPs of interest and want - to know their significance in a GWAS, you can supply the GWAS data and let - the program specifically test those SNPs. - - -*Also note*: the number of SNPs in a study cannot be both too small and at the same -time too clustered in a local region. A few hundreds of SNPs, or tens of SNPs -spread in different regions, will be fine. The sample size cannot be too small -either; around 100 or more individuals (case + control combined) will be fine. -Otherwise use permutation. - ------ - -**Example** - -- input map file:: - - 1 rs0 0 738547 - 1 rs1 0 5597094 - 1 rs2 0 9424115 - etc. - -- input ped file:: - - 1 1 0 0 1 1 G G A A A A A A A A A G A A G G G G A A G G G G G G A A A A A G A A G G A G A G A A G G A A G G A A G G A G A A G G A A G G A A A G A G G G A G G G G G A A A G A A G G G G G G G G A G A A A A A A A A - 1 1 0 0 1 1 G G A G G G A A A A A G A A G G G G G G A A G G A G A G G G G G A G G G A G A A G G A G G G A A G G G G A G A G G G A G A A A A G G G G A G A G G G A G A A A A A G G G A G G G A G G G G G A A G G A G - etc. - -- output dataset, showing significant SNPs and their p-values and FDR:: - - #ID chr position Statistics adj-Pvalue FDR - rs35 chr1 136606952 4.890849 0.991562 0.682138 - rs36 chr1 137748344 4.931934 0.991562 0.795827 - rs44 chr2 14423047 7.712832 0.665086 0.218776 - etc. - ------ - -**Reference** - -Zhang Y, Liu JS. (2010) -Fast and accurate significance approximation for genome-wide association studies. -Submitted. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/hilbertvis.sh --- a/tools/human_genome_variation/hilbertvis.sh Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,109 +0,0 @@ -#!/usr/bin/env bash - -input_file="$1" -output_file="$2" -chromInfo_file="$3" -chrom="$4" -score_col="$5" -hilbert_curve_level="$6" -summarization_mode="$7" -chrom_col="$8" -start_col="$9" -end_col="${10}" -strand_col="${11}" - -## use first sequence if chrom filed is empty -if [ -z "$chrom" ]; then - chrom=$( head -n 1 "$input_file" | cut -f$chrom_col ) -fi - -## get sequence length -if [ ! 
-r "$chromInfo_file" ]; then - echo "Unable to read chromInfo_file $chromInfo_file" 1>&2 - exit 1 -fi - -chrom_len=$( awk '$1 == chrom {print $2}' chrom=$chrom $chromInfo_file ) - -## error if we can't find the chrom_len -if [ -z "$chrom_len" ]; then - echo "Can't find length for sequence \"$chrom\" in chromInfo_file $chromInfo_file" 1>&2 - exit 1 -fi - -## make sure chrom_len is positive -if [ $chrom_len -le 0 ]; then - echo "sequence \"$chrom\" length $chrom_len <= 0" 1>&2 - exit 1 -fi - -## modify R script depending on the inclusion of a score column, strand information -input_cols="\$${start_col}, \$${end_col}" -col_types='beg=0, end=0, strand=""' - -# if strand_col == 0 (strandCol metadata is not set), assume everything's on the plus strand -if [ $strand_col -ne 0 ]; then - input_cols="${input_cols}, \$${strand_col}" -else - input_cols="${input_cols}, \\\"+\\\"" -fi - -# set plot value (either from data or use a constant value) -if [ $score_col -eq -1 ]; then - value=1 -else - input_cols="${input_cols}, \$${score_col}" - col_types="${col_types}, score=0" - value='chunk$score[i]' -fi - -R --vanilla &> /dev/null < - visualization of genomic data with the Hilbert curve - - - hilbertvis.sh $input $output $chromInfo "$chrom" $plot_value.score_col $level $mode - #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__) - 1 4 5 7 - #else - ${input.metadata.chromCol} ${input.metadata.startCol} ${input.metadata.endCol} ${input.metadata.strandCol} - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Dataset formats** - -The input format is interval_, and the output is an image in PDF format. -(`Dataset missing?`_) - -.. _interval: ./static/formatHelp.html#interval -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -HilbertVis uses the Hilbert space-filling curve to visualize the structure of -position-dependent data. It maps the traditional one-dimensional line -visualization onto a two-dimensional square. For example, here is a diagram -showing the path of a level-2 Hilbert curve. - -.. image:: ./static/images/hilbertvisDiagram.png - -The shade of each pixel represents the value for the corresponding bin of -consecutive genomic positions, calculated according to the specified -summarization mode. The pixels are arranged so that bins that are close -to each other on the data vector are represented by pixels that are close -to each other in the plot. In particular, adjacent bins are mapped to -adjacent pixels. Hence, dark spots in a figure represent a peak; the area -of the spot in the two-dimensional plot is proportional to the width of the -peak in the one-dimensional data, and the darkness of the spot corresponds to -the height of the peak. - -The input file is in interval format, and typically contains a column with -scores or other numbers, such as conservation scores, SNP density, the -coverage of aligned reads from ChIP-Seq data, etc. - -Website: http://www.ebi.ac.uk/huber-srv/hilbert/ - ------ - -**Examples** - -Here are some examples from the HilbertVis homepage, using ChIP-Seq data. - -.. image:: ./static/images/hilbertvis1.png - ------ - -.. image:: ./static/images/hilbertvis2.png - ------ - -**Reference** - -Anders S. (2009) -Visualization of genomic data with the Hilbert curve. -Bioinformatics. 25(10):1231-5. Epub 2009 Mar 17. 
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/ldtools.xml --- a/tools/human_genome_variation/ldtools.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,111 +0,0 @@ - - linkage disequilibrium and tag SNPs - - - ldtools_wrapper.sh rsquare=$rsquare freq=$freq input=$input output=$output - - - - - - - - - - - - - - - - - - - - - - - - - - -**Dataset formats** - -The input and output datasets are tabular_. -(`Dataset missing?`_) - -.. _tabular: ./static/formatHelp.html#tab -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -This tool can be used to analyze the patterns of linkage disequilibrium -(LD) between polymorphic sites in a locus. SNPs are grouped based on the -threshold level of LD as measured by r\ :sup:`2` (regardless of genomic -position), and a representative "tag SNP" is reported for each group. -The other SNPs in the group are in LD with the tag SNP, but not necessarily -with each other. - -The underlying algorithm is the same as the one used in ldSelect (Carlson -et al. 2004). However, this tool is implemented to be much faster and more -efficient than ldSelect. - -The input is a tabular file with genotype information for each individual -at each SNP site, in exactly four columns: site ID, sample ID, and the -two allele nucleotides. - ------ - -**Example** - -- input file:: - - rs2334386 NA20364 G T - rs2334386 NA20363 G G - rs2334386 NA20360 G G - rs2334386 NA20359 G G - rs2334386 NA20358 G G - rs2334386 NA20356 G G - rs2334386 NA20357 G G - rs2334386 NA20350 G G - rs2334386 NA20349 G G - rs2334386 NA20348 G G - rs2334386 NA20347 G G - rs2334386 NA20346 G G - rs2334386 NA20345 G G - rs2334386 NA20344 G G - rs2334386 NA20342 G G - etc. - -- output file:: - - rs2238748 rs2793064,rs6518516,rs6518517,rs2283641,rs5993533,rs715590,rs2072123,rs2105421,rs2800954,rs1557847,rs807750,rs807753,rs5993488,rs8138035,rs2800980,rs2525079,rs5992353,rs712966,rs2525036,rs807743,rs1034727,rs807744,rs2074003 - rs2871023 rs1210715,rs1210711,rs5748189,rs1210709,rs3788298,rs7284649,rs9306217,rs9604954,rs1210703,rs5748179,rs5746727,rs5748190,rs5993603,rs2238766,rs885981,rs2238763,rs5748165,rs9605996,rs9606001,rs5992398 - rs7292006 rs13447232,rs5993665,rs2073733,rs1057457,rs756658,rs5992395,rs2073760,rs739369,rs9606017,rs739370,rs4493360,rs2073736 - rs2518840 rs1061325,rs2283646,rs362148,rs1340958,rs361956,rs361991,rs2073754,rs2040771,rs2073740,rs2282684 - rs2073775 rs10160,rs2800981,rs807751,rs5993492,rs2189490,rs5747997,rs2238743 - rs5747263 rs12159924,rs2300688,rs4239846,rs3747025,rs3747024,rs3747023,rs2300691 - rs433576 rs9605439,rs1109052,rs400509,rs401099,rs396012,rs410456,rs385105 - rs2106145 rs5748131,rs2013516,rs1210684,rs1210685,rs2238767,rs2277837 - rs2587082 rs2257083,rs2109659,rs2587081,rs5747306,rs2535704,rs2535694 - rs807667 rs2800974,rs756651,rs762523,rs2800973,rs1018764 - rs2518866 rs1206542,rs807467,rs807464,rs807462,rs712950 - rs1110661 rs1110660,rs7286607,rs1110659,rs5992917,rs1110662 - rs759076 rs5748760,rs5748755,rs5748752,rs4819925,rs933461 - rs5746487 rs5992895,rs2034113,rs2075455,rs1867353 - rs5748212 rs5746736,rs4141527,rs5748147,rs5748202 - etc. - ------ - -**Reference** - -Carlson CS, Eberle MA, Rieder MJ, Yi Q, Kruglyak L, Nickerson DA. (2004) -Selecting a maximally informative set of single-nucleotide polymorphisms for -association analyses using linkage disequilibrium. -Am J Hum Genet. 74(1):106-20. Epub 2003 Dec 15. 
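The grouping described above is essentially a greedy set cover over the LD neighborhoods: repeatedly pick the SNP whose neighborhood covers the most not-yet-tagged SNPs and report it as the tag. The sketch below illustrates that idea only; it is not the senatag implementation, which additionally honours excluded/required SNP lists::

    # neighborhood: SNP -> set of SNPs in LD with it (r^2 above threshold)
    neighborhood = {
        "rs1": {"rs2", "rs3"},
        "rs2": {"rs1"},
        "rs3": {"rs1"},
        "rs4": set(),
    }

    untagged = set(neighborhood)
    tags = {}
    while untagged:
        # pick the SNP covering the most untagged sites (itself included)
        best = max(untagged, key=lambda s: len((neighborhood[s] | {s}) & untagged))
        group = (neighborhood[best] | {best}) & untagged
        tags[best] = sorted(group - {best})
        untagged -= group
    print(tags)   # {'rs1': ['rs2', 'rs3'], 'rs4': []}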
-
-
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/ldtools_wrapper.sh
--- a/tools/human_genome_variation/ldtools_wrapper.sh Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,64 +0,0 @@
-#!/usr/bin/env bash
-#
-# Galaxy wrapper for Aakrosh Ratan's ldtools
-#
-set -e
-
-export PATH=$PATH:$(dirname $0)
-
-## pagetag options
-input=
-rsquare=0.64
-freq=0.00
-sample=###
-
-## senatag options
-excluded=###
-required=###
-output=
-
-until [ $# -eq 0 ]
-do
-    case $1 in
-        rsquare=*)
-            rsquare=${1#rsquare=}
-            ;;
-        freq=*)
-            freq=${1#freq=}
-            ;;
-        input=*)
-            input=${1#input=}
-            ;;
-        output=*)
-            output=${1#output=}
-            ;;
-        *)
-            if [ -z "$new_args" ]; then
-                new_args=$1
-            else
-                new_args="$new_args $1"
-            fi
-            ;;
-    esac
-
-    shift
-done
-
-## run pagetag
-pagetag.py --rsquare $rsquare --freq $freq $input snps.txt neighborhood.txt &> /dev/null
-if [ $? -ne 0 ]; then
-    echo "failed: pagetag.py --rsquare $rsquare --freq $freq $input snps.txt neighborhood.txt"
-    exit 1
-fi
-
-## run senatag
-senatag.py neighborhood.txt snps.txt > $output 2> /dev/null
-if [ $? -ne 0 ]; then
-    echo "failed: senatag.py neighborhood.txt snps.txt"
-    exit 1
-fi
-
-## cleanup
-rm -f snps.txt neighborhood.txt
-
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/linkToDavid.pl
--- a/tools/human_genome_variation/linkToDavid.pl Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,59 +0,0 @@
-#!/usr/bin/env perl
-
-use strict;
-use warnings;
-
-###################################################
-# linkToDavid.pl
-# Generates a link to DAVID for a list of gene IDs.
-###################################################
-
-if (!@ARGV or scalar @ARGV != 4) {
-    print "usage: linkToDavid.pl infile.tab 1basedCol idType outfile\n";
-    exit 1;
-}
-
-my $in   = shift @ARGV;
-my $col  = shift @ARGV;
-my $type = shift @ARGV;
-my $out  = shift @ARGV;
-
-if ($col < 1) {
-    print "ERROR the column number should use 1-based counting\n";
-    exit 1;
-}
-my @gene;
-open(FH, $in) or die "Couldn't open $in, $!\n";
-while (<FH>) {
-    chomp;
-    my @f = split(/\t/);
-    if (scalar @f < $col) {
-        print "ERROR there is no column $col in $in\n";
-        exit 1;
-    }
-    if ($f[$col-1]) { push(@gene, $f[$col-1]); }
-}
-close FH or die "Couldn't close $in, $!\n";
-
-if (scalar @gene > 400) {
-    print "ERROR DAVID only allows 400 genes submitted via a link\n";
-    exit 1;
-}
-
-my $link = 'http://david.abcc.ncifcrf.gov/api.jsp?type=TYPE&ids=GENELIST&tool=summary';
-
-my $g = join(",", @gene);
-$link =~ s/GENELIST/$g/;
-$link =~ s/TYPE/$type/;
-#print output (HTML markup reconstructed; the original tags were stripped in extraction)
-if (length $link > 2048) {
-    print "ERROR too many genes to fit in URL, please select a smaller set\n";
-    exit;
-}
-open(FH, ">", $out) or die "Couldn't open $out, $!\n";
-print FH '<html><head><title>DAVID link</title></head><body>', "\n",
-    '<a href="', $link, '" target="_blank">click here to send list of identifiers to DAVID</a>', "\n",
-    '</body></html>', "\n";
-close FH or die "Couldn't close $out, $!\n";
-
-exit;
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/linkToDavid.xml
--- a/tools/human_genome_variation/linkToDavid.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,110 +0,0 @@
-
- functional annotation for a list of genes
-
-
- linkToDavid.pl $input $numerical_column $type $out_file1
-
- [tool interface XML (params, outputs) lost in extraction]
-
- .. class:: infomark
-
-The list is limited to 400 IDs.
-
------
-
-**Dataset formats**
-
-The input dataset is in tabular_ format. The output dataset is html_ with
-a link to the DAVID website as described below.
-(`Dataset missing?`_)
-
-.. _tabular: ./static/formatHelp.html#tab
-.. _html: ./static/formatHelp.html#html
-.. _Dataset missing?: ./static/formatHelp.html
-
------
-
-**What it does**
-
-This tool creates a link to the Database for Annotation,
-Visualization, and Integrated Discovery (DAVID) website at NIH,
-sending a list of IDs from the selected column of a tabular
-Galaxy dataset. To follow the created link, click on the
-eye icon once the Galaxy tool has finished running.
-
-DAVID provides a comprehensive set of functional annotation tools
-to help investigators discover biological meaning behind large
-lists of genes.
-
------
-
-**References**
-
-Huang DW, Sherman BT, Lempicki RA. (2009) Systematic and integrative analysis
-of large gene lists using DAVID bioinformatics resources.
-Nat Protoc. 4(1):44-57.
-
-Dennis G, Sherman BT, Hosack DA, Yang J, Gao W, Lane HC, Lempicki RA. (2003)
-DAVID: database for annotation, visualization, and integrated discovery.
-Genome Biol. 4(5):P3. Epub 2003 Apr 3.
-
-
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/linkToGProfile.pl
--- a/tools/human_genome_variation/linkToGProfile.pl Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,55 +0,0 @@
-#!/usr/bin/env perl
-
-use strict;
-use warnings;
-
-###################################################
-# linkToGProfile.pl
-# Generates a link to g:Profiler for a list of gene IDs.
-# g:Profiler: a web-based toolset for functional profiling of gene lists from large-scale experiments (2007) NAR 35 W193-W200
-###################################################
-
-if (!@ARGV or scalar @ARGV != 4) {
-    print "usage: linkToGProfile.pl infile.tab 1basedCol idType outfile\n";
-    exit 1;
-}
-
-my $in   = shift @ARGV;
-my $col  = shift @ARGV;
-my $type = shift @ARGV;
-my $out  = shift @ARGV;
-
-if ($col < 1) {
-    print "ERROR the column number should use 1-based counting\n";
-    exit 1;
-}
-my @gene;
-open(FH, $in) or die "Couldn't open $in, $!\n";
-while (<FH>) {
-    chomp;
-    my @f = split(/\t/);
-    if (scalar @f < $col) {
-        print "ERROR there is no column $col in $in\n";
-        exit 1;
-    }
-    if ($f[$col-1]) { push(@gene, $f[$col-1]); }
-}
-close FH or die "Couldn't close $in, $!\n";
-
-my $link = 'http://biit.cs.ut.ee/gprofiler/index.cgi?organism=hsapiens&query=GENELIST&r_chr=1&r_start=start&r_end=end&analytical=1&domain_size_type=annotated&term=&significant=1&sort_by_structure=1&user_thr=1.00&output=png&prefix=TYPE';
-$link =~ s/TYPE/$type/;
-my $g = join("+", @gene);
-$link =~ s/GENELIST/$g/;
-#print output (HTML markup reconstructed; the original tags were stripped in extraction)
-if (length $link > 2048) {
-    print "ERROR too many genes to fit in URL, please select a smaller set\n";
-    exit;
-}
-open(FH, ">", $out) or die "Couldn't open $out, $!\n";
-print FH '<html><head><title>g:Profiler link</title></head><body>', "\n",
-    '<a href="', $link, '" target="_blank">click here to send list of identifiers to g:Profiler</a>', "\n",
-    '</body></html>', "\n";
-close FH or die "Couldn't close $out, $!\n";
-
-#also consider a link that prints text that could be pulled back into Galaxy?
-exit;
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/linkToGProfile.xml
--- a/tools/human_genome_variation/linkToGProfile.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,83 +0,0 @@
-
- tools for functional profiling of gene lists
-
-
- linkToGProfile.pl $input $numerical_column $type $out_file1
-
- [tool interface XML (params, outputs) lost in extraction]
-
-**Dataset formats**
-
-The input dataset is tabular_ with a column of identifiers.
-The output dataset is html_ with a link to g:Profiler. -(`Dataset missing?`_) - -.. _tabular: ./static/formatHelp.html#tab -.. _html: ./static/formatHelp.html#html -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -This tool creates a link to the g:GOSt tool (Gene Group Functional -Profiling), which is part of the g:Profiler site at the University -of Tartu in Estonia. g:GOSt retrieves the most significant Gene -Ontology (GO) terms, KEGG and REACTOME pathways, and TRANSFAC motifs -for a user-specified group of genes, proteins, or microarray probes. -g:GOSt also allows analysis of ranked or ordered lists of genes, -visual browsing of GO graph structure, interactive visualization of -retrieved results, and many other features. Multiple testing -corrections are applied to extract only statistically important -results. - -The g:GOSt form is pre-filled with gene, protein, or microarray probe -IDs from the selected column of a tabular Galaxy dataset. To follow -the created link, click on the eye icon when the Galaxy tool has -finished running. Once at the g:Profiler site, scroll down to see -the g:GOSt results. You can also adjust the options in the g:GOSt -form to your liking, or use the row of links between the form and -the results to run other g:Profiler tools using the same list of IDs. - ------ - -**Reference** - -Reimand J, Kull M, Peterson H, Hansen J, Vilo J. (2007) g:Profiler -- a web-based -toolset for functional profiling of gene lists from large-scale experiments. -Nucleic Acids Res. 35(Web Server issue):W193-200. Epub 2007 May 3. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/lped_to_geno.pl --- a/tools/human_genome_variation/lped_to_geno.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -#convert from a MAP and PED file to a genotype file -#assumes not many SNPs but lots of individuals -# transposed formats are used when lots of SNPs (TPED, TFAM) - -if (!@ARGV or scalar @ARGV ne 2) { - print "usage: lped_to_geno.pl infile.map infile.ped > outfile\n"; - exit; -} - -my $map = shift @ARGV; -my $ped = shift @ARGV; - -my @snp; #array to hold SNPs from map file -open(FH, $map) or die "Couldn't open $map, $!\n"; -while () { - chomp; - my @f = split(/\s+/); #3 or 4 columns - #chrom ID [distance|morgans] position - if (!exists $f[3]) { $f[3] = $f[2]; } #only 3 columns - #have to leave in so know which to skip later - #if ($f[3] < 0) { next; } #way of excluding SNPs - #if ($f[0] eq '0') { next; } #unplaced SNP - $f[0] = "chr$f[0]"; - push(@snp, "$f[0]:$f[3]:$f[1]"); -} -close FH or die "Couldn't finish $map, $!\n"; - -#rows are individuals, columns are SNPs (7 & up) -#need to print row per SNP -my @allele; #alleles to go with @snp -my @pheno; #marker for phenotype -open(FH, $ped) or die "Couldn't open $ped, $!\n"; -while () { - chomp; - my @f = split(/\s+/); - if (!defined $f[5]) { die "ERROR undefined phenotype $f[0] $f[1] $f[2] $f[3] $f[4]\n"; } - push(@pheno, $f[5]); - my $j = 0; - for(my $i = 6; $i< $#f; $i+=2) { - if (!$allele[$j]) { $allele[$j] = ''; } - #can be ACTG or 1234 (for haploview etc) or 0 for missing - if ($f[$i] eq '1') { $f[$i] = 'A'; } - elsif ($f[$i] eq '2') { $f[$i] = 'C'; } - elsif ($f[$i] eq '3') { $f[$i] = 'G'; } - elsif ($f[$i] eq '4') { $f[$i] = 'T'; } - if ($f[$i+1] eq '1') { $f[$i+1] = 'A'; } - elsif ($f[$i+1] eq '2') { $f[$i+1] = 'C'; } - elsif ($f[$i+1] eq '3') { $f[$i+1] = 'G'; } - elsif ($f[$i+1] eq '4') { 
$f[$i+1] = 'T'; } - $f[$i] = uc($f[$i]); - $f[$i+1] = uc($f[$i+1]); - $allele[$j] .= " $f[$i]$f[$i+1]"; - $j++; - } -} -close FH or die "Couldn't close $ped, $!\n"; - -print "ID Chr Pos"; -foreach (@pheno) { if ($_ > 0) { print " ", $_ - 1; }} #go from 1/2 to 0/1 -print "\n"; -for(my $i =0; $i <= $#snp; $i++) { #foreach snp - $allele[$i] =~ /(\w)/; - my $nt = $1; - my $j = 0; - my @t = split(/:/, $snp[$i]); - if ($t[0] eq 'chr0' or $t[1] < 0) { next; } #skip this SNP - if ($t[0] eq 'chrX') { $t[0] = 'chr23'; } - elsif ($t[0] eq 'chrY') { $t[0] = 'chr24'; } - elsif ($t[0] eq 'chrXY') { $t[0] = 'chr23'; } - elsif ($t[0] eq 'chrMT') { $t[0] = 'chr25'; } - print "$t[2] $t[0] $t[1]"; - $allele[$i] =~ s/^\s+//; - foreach my $p (split(/ +/, $allele[$i])) { - if ($pheno[$j] > 0) { #pheno 0 or -9 skip - #change AA BB AB to 2 0 1 - if ($p eq "$nt$nt") { print " 2"; } - elsif ($p =~ /$nt/) { print " 1"; } - else { print " 0"; } - } - $j++; - } - print "\n"; -} - -exit; diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/lps.xml --- a/tools/human_genome_variation/lps.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,304 +0,0 @@ - - LASSO-Patternsearch algorithm - - - lps_tool_wrapper.sh $lambda_fac $input_file $label_column $output_file $log_file - Initialization 0 - #if $advanced.options == "true": - Sample $advanced.sample - Verbosity $advanced.verbosity - Standardize $advanced.standardize - initialLambda $advanced.initialLambda - #if $advanced.continuation.continuation == "1": - Continuation $advanced.continuation.continuation - continuationSteps $advanced.continuation.continuationSteps - accurateIntermediates $advanced.continuation.accurateIntermediates - #end if - printFreq $advanced.printFreq - #if $advanced.newton.newton == "1": - Newton $advanced.newton.newton - NewtonThreshold $advanced.newton.newtonThreshold - #end if - HessianSampleFraction $advanced.hessianSampleFraction - BB 0 - Monotone 0 - FullGradient $advanced.fullGradient - GradientFraction $advanced.gradientFraction - InitialAlpha $advanced.initialAlpha - AlphaIncrease $advanced.alphaIncrease - AlphaDecrease $advanced.alphaDecrease - AlphaMax $advanced.alphaMax - c1 $advanced.c1 - MaxIter $advanced.maxIter - StopTol $advanced.stopTol - IntermediateTol $advanced.intermediateTol - FinalOnly $advanced.finalOnly - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - lps_tool - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Dataset formats** - -The input and output datasets are tabular_. The columns are described below. -There is a second output dataset (a log) that is in text_ format. -(`Dataset missing?`_) - -.. _tabular: ./static/formatHelp.html#tab -.. _text: ./static/formatHelp.html#text -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -The LASSO-Patternsearch algorithm fits your dataset to an L1-regularized -logistic regression model. A benefit of using L1-regularization is -that it typically yields a weight vector with relatively few non-zero -coefficients. - -For example, say you have a dataset containing M rows (subjects) -and N columns (attributes) where one of these N attributes is binary, -indicating whether or not the subject has some property of interest P. -In simple terms, LPS calculates a weight for each of the other attributes -in your dataset. 
This weight indicates how "relevant" that attribute -is for predicting whether or not a given subject has property P. -The L1-regularization causes most of these weights to be equal to zero, -which means LPS will find a "small" subset of the remaining N-1 attributes -in your dataset that can be used to predict P. - -In other words, LPS can be used for feature selection. - -The input dataset is tabular, and must contain a label column which -indicates whether or not a given row has property P. In the current -version of this tool, P must be encoded using +1 and -1. The Lambda_fac -parameter ranges from 0 to 1, and controls how sparse the weight -vector will be. At the low end, when Lambda_fac = 0, there will be -no regularization. At the high end, when Lambda_fac = 1, there will be -"too much" regularization, and all of the weights will equal zero. - -The LPS tool creates two output datasets. The first, called the results -file, is a tabular dataset containing one column of weights for each -value of the regularization parameter lambda that was tried. The weight -columns are in order from left to right by decreasing values of lambda. -The first N-1 rows in each column are the weights for the N-1 attributes -in your input dataset. The final row is a constant, the intercept. - -Let **x** be a row from your input dataset and let **b** be a column -from the results file. To compute the probability that row **x** has -a label value of +1: - - Probability(row **x** has label value = +1) = 1 / [1 + exp{**x** \* **b**\[1..N-1\] + **b**\[N\]}] - -where **x** \* **b**\[1..N-1\] represents matrix multiplication. - -The second output dataset, called the log file, is a text file which -contains additional data about the fitted L1-regularized logistic -regression model. These data include the number of features, the -computed value of lambda_max, the actual values of lambda used, the -optimal values of the log-likelihood and regularized log-likelihood -functions, the number of non-zeros, and the number of iterations. - -Website: http://pages.cs.wisc.edu/~swright/LPS/ - ------ - -**Example** - -- input file:: - - +1 1 0 0 0 0 1 0 1 1 ... - +1 1 1 1 0 0 1 0 1 1 ... - +1 1 0 1 0 1 0 1 0 1 ... - etc. - -- output results file:: - - 0 - 0 - 0 - 0 - 0.025541 - etc. - -- output log file:: - - Data set has 100 vectors with 50 features. - calculateLambdaMax: n=50, m=100, m+=50, m-=50 - computed value of lambda_max: 5.0000e-01 - - lambda=2.96e-02 solution: - optimal log-likelihood function value: 6.46e-01 - optimal *regularized* log-likelihood function value: 6.79e-01 - number of nonzeros at the optimum: 5 - number of iterations required: 43 - etc. - ------ - -**References** - -Koh K, Kim S-J, Boyd S. (2007) -An interior-point method for large-scale l1-regularized logistic regression. -Journal of Machine Learning Research. 8:1519-1555. - -Shi W, Wahba G, Wright S, Lee K, Klein R, Klein B. (2008) -LASSO-Patternsearch algorithm with application to ophthalmology and genomic data. -Stat Interface. 1(1):137-153. - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/lps_tool_wrapper.sh --- a/tools/human_genome_variation/lps_tool_wrapper.sh Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -#!/usr/bin/env bash -# script for execution of deployed applications -# -# Sets up the MCR environment for the current $ARCH and executes -# the specified command. 
-# - -export PATH=$PATH:$(dirname $0) - -MCRROOT=${MCRROOT:-/galaxy/software/linux2.6-x86_64/bin/MCR-7.11/v711} -MWE_ARCH=glnxa64 - -if [ "$MWE_ARCH" = "sol64" ] ; then - LD_LIBRARY_PATH=.:/usr/lib/lwp:${MCRROOT}/runtime/glnxa64 -else - LD_LIBRARY_PATH=.:${MCRROOT}/runtime/glnxa64 -fi - -LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRROOT}/bin/glnxa64 -LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRROOT}/sys/os/glnxa64 - -if [ "$MWE_ARCH" = "maci" -o "$MWE_ARCH" = "maci64" ]; then - DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/System/Library/Frameworks/JavaVM.framework/JavaVM:/System/Library/Frameworks/JavaVM.framework/Libraries -else - MCRJRE=${MCRROOT}/sys/java/jre/glnxa64/jre/lib/amd64 - LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/native_threads - LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/server - LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/client - LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE} -fi - -XAPPLRESDIR=${MCRROOT}/X11/app-defaults - -export LD_LIBRARY_PATH XAPPLRESDIR - -lps_tool $* - -exit 0 diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/mergeSnps.pl --- a/tools/human_genome_variation/mergeSnps.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -#this merges the significance output with the SNPs so users get more than an index - -my($out, $snp) = @ARGV; - -if (!$out or !$snp) { die "missing args\n"; } - -#merge SNP data with results -merge(); - -exit; - -######################################## - -#merge the input and output files so have SNP data with result -sub merge { - open(FH, $out) or die "Couldn't open $out, $!\n"; - my %res; - my @ind; - while () { - chomp; - my $line = $_; - #0: 10 score= 14.224153 , df= 2 , p= 0.040760 , N=50 - if ($line =~ /^(\d+):\s+(.*)/) { $res{$1} = $2; push(@ind, $1); } - } - close FH; - if (!@ind) { return; } #no results, leave alone - @ind = sort { $a <=> $b } @ind; - #read input file to get SNP data - open(FH, $snp) or die "Couldn't open $snp, $!\n"; - my $i = 0; #0 based, not counting ID line - my $c = shift @ind; - while () { - chomp; - if (/^ID/) { next; } - my @f = split(/\s+/); - if ($i == $c) { - $res{$i} = "$f[0]\t$f[1]\t$f[2]\t$res{$i}"; - if (!@ind) { last; } - $c = shift @ind; - } - $i++; - } - close FH; - #now reprint results with SNP data included - open(FH, ">", $out) or die "Couldn't write to $out, $!\n"; - print FH "ID\tchr\tposition\tresults\n"; - foreach $i (keys %res) { - print FH $res{$i}, "\n"; - } - close FH; -} - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/pagetag.py --- a/tools/human_genome_variation/pagetag.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,297 +0,0 @@ -#!/usr/bin/env python - -""" -This accepts as input a file of the following format: - - Site Sample Allele1 Allele2 - -for example: - - 000834 D001 G G - 000834 D002 G G - 000834 D003 G G - 000834 D004 G G - 000834 D005 N N - 000834 E001 G G - 000834 E002 G G - 000834 E003 G G - 000834 E004 G G - 000834 E005 G G - 000963 D001 T T - 000963 D002 T T - 000963 D003 T T - 000963 D004 T T - 000963 D005 N N - 000963 E001 T T - 000963 E002 N N - 000963 E003 G T - 000963 E004 G G - 000963 E005 G T - -and a rsquare threshold and outputs two files: - -a) a file of input snps (one on each line). 
A SNP is identified by the "Site" -column in the input file - -b) a file where each line has the following: - SNP list -where SNP is one of the SNPs and the "list" is a comma separated list of SNPs -that exceed the rsquare threshold with the first SNP. -""" - -from sys import argv, stderr, exit -from getopt import getopt, GetoptError - -__author__ = "Aakrosh Ratan" -__email__ = "ratan@bx.psu.edu" - -# do we want the debug information to be printed? -debug_flag = False - -# denote different combos of alleles in code -HOMC = str(1) -HOMR = str(2) -HETE = str(3) -OTHER = str(4) - -indexcalculator = {(HOMC,HOMC) : 0, - (HOMC,HOMR) : 1, - (HOMC,HETE) : 2, - (HOMR,HOMC) : 3, - (HOMR,HOMR) : 4, - (HOMR,HETE) : 5, - (HETE,HOMC) : 6, - (HETE,HOMR) : 7, - (HETE,HETE) : 8} - -def read_inputfile(filename, samples): - input = {} - - file = open(filename, "r") - - for line in file: - position,sample,allele1,allele2 = line.split() - - # if the user specified a list of samples, then only use those samples - if samples != None and sample not in samples: continue - - if position in input: - v = input[position] - v[sample] = (allele1,allele2) - else: - v = {sample : (allele1, allele2)} - input[position] = v - - file.close() - return input - -def annotate_locus(input, minorallelefrequency, snpsfile): - locus = {} - for k,v in input.items(): - genotypes = [x for x in v.values()] - alleles = [y for x in genotypes for y in x] - alleleset = list(set(alleles)) - alleleset = list(set(alleles) - set(["N","X"])) - - if len(alleleset) == 2: - genotypevec = "" - num1 = len([x for x in alleles if x == alleleset[0]]) - num2 = len([x for x in alleles if x == alleleset[1]]) - - if num1 > num2: - major = alleleset[0] - minor = alleleset[1] - minorfreq = (num2 * 1.0)/(num1 + num2) - else: - major = alleleset[1] - minor = alleleset[0] - minorfreq = (num1 * 1.0)/(num1 + num2) - - if minorfreq < minorallelefrequency: continue - - for gen in genotypes: - if gen == (major,major): - genotypevec += HOMC - elif gen == (minor,minor): - genotypevec += HOMR - elif gen == (major, minor) or gen == (minor, major): - genotypevec += HETE - else: - genotypevec += OTHER - - locus[k] = genotypevec,minorfreq - elif len(alleleset) > 2: - print >> snpsfile, k - return locus - -def calculateLD(loci, rsqthreshold): - snps = list(loci) - rsquare = {} - - for index,loc1 in enumerate(snps): - for loc2 in snps[index + 1:]: - matrix = [0]*9 - - vec1 = loci[loc1][0] - vec2 = loci[loc2][0] - - for gen in zip(vec1,vec2): - if gen[0] == OTHER or gen[1] == OTHER: continue - matrix[indexcalculator[gen]] += 1 - - n = sum(matrix) - x11 = 2*matrix[0] + matrix[2] + matrix[6] - x12 = 2*matrix[1] + matrix[2] + matrix[7] - x21 = 2*matrix[3] + matrix[6] + matrix[5] - x22 = 2*matrix[4] + matrix[6] + matrix[5] - - p = (x11 + x12 + matrix[8] * 1.0) / (2 * n) - q = (x11 + x21 + matrix[8] * 1.0) / (2 * n) - - p11 = p * q - - oldp11 = p11 - range = 0.0 - converged = False - convergentcounter = 0 - if p11 > 0.0: - while converged == False and convergentcounter < 100: - if (1.0 - p - q + p11) != 0.0 and oldp11 != 0.0: - num = matrix[8] * p11 * (1.0 - p - q + p11) - den = p11 * (1.0 - p - q + p11) + (p - p11)*(q - p11) - p11 = (x11 + (num/den))/(2.0*n) - range = p11/oldp11 - if range >= 0.9999 and range <= 1.001: - converged = True - oldp11 = p11 - convergentcounter += 1 - else: - converged = True - - dvalue = 0.0 - if converged == True: - dvalue = p11 - (p * q) - - if dvalue != 0.0: - rsq = (dvalue**2)/(p*q*(1-p)*(1-q)) - if rsq >= rsqthreshold: - rsquare["%s %s" % 
(loc1,loc2)] = rsq
-
-    return rsquare
-
-def main(inputfile, snpsfile, neighborhoodfile, \
-         rsquare, minorallelefrequency, samples):
-    # read the input file
-    input = read_inputfile(inputfile, samples)
-    print >> stderr, "Read %d locations" % len(input)
-
-    # open the snpsfile to print
-    file = open(snpsfile, "w")
-
-    # annotate the inputs, remove the abnormal loci (which do not have 2 alleles)
-    # and add the major and minor alleles to each locus
-    loci = annotate_locus(input, minorallelefrequency, file)
-    print >> stderr, "Kept %d interesting locations" % len(loci)
-
-    # print all the interesting loci as candidate snps
-    for k in loci.keys(): print >> file, k
-    file.close()
-    print >> stderr, "Finished creating the snpsfile"
-
-    # calculate the LD values and store them if they exceed the threshold
-    lds = calculateLD(loci, rsquare)
-    print >> stderr, "Calculated all the LD values"
-
-    # create a list of SNPs
-    snps = {}
-    ldvals = {}
-    for k,v in lds.items():
-        s1,s2 = k.split()
-        if s1 in snps: snps[s1].append(s2)
-        else         : snps[s1] = [s2]
-        if s2 in snps: snps[s2].append(s1)
-        else         : snps[s2] = [s1]
-
-        if s1 in ldvals: ldvals[s1].append(str(v))
-        else           : ldvals[s1] = [str(v)]
-        if s2 in ldvals: ldvals[s2].append(str(v))
-        else           : ldvals[s2] = [str(v)]
-
-    # print the snps to the output file
-    file = open(neighborhoodfile, "w")
-
-    for k,v in snps.items():
-        ldv = ldvals[k]
-        if debug_flag == True:
-            print >> file, "%s\t%s\t%s" % (k, ",".join(v), ",".join(ldv))
-        else:
-            print >> file, "%s\t%s" % (k, ",".join(v))
-
-    file.close()
-
-
-def read_list(filename):
-    file = open(filename, "r")
-    list = {}
-
-    for line in file:
-        list[line.strip()] = 1
-
-    file.close()
-    return list
-
-def usage():
-    f = stderr
-    print >> f, "usage:"
-    print >> f, "pagetag [options] input.txt snps.txt neighborhood.txt"
-    print >> f, "where input.txt is the prettybase file"
-    print >> f, "where snps.txt is the first output file with the snps"
-    print >> f, "where neighborhood.txt is the output neighborhood file"
-    print >> f, "where the options are:"
-    print >> f, "-h,--help : print usage and quit"
-    print >> f, "-d,--debug: print debug information"
-    print >> f, "-r,--rsquare: the rsquare threshold (default : 0.64)"
-    print >> f, "-f,--freq : the minimum MAF required (default: 0.0)"
-    print >> f, "-s,--sample : a list of samples to be clustered"
-
-if __name__ == "__main__":
-    try:
-        opts, args = getopt(argv[1:], "hds:r:f:",\
-                     ["help", "debug", "rsquare=","freq=", "sample="])
-    except GetoptError, err:
-        print str(err)
-        usage()
-        exit(2)
-
-    rsquare = 0.64
-    minorallelefrequency = 0.0
-    samples = None
-
-    for o, a in opts:
-        if o in ("-h", "--help"):
-            usage()
-            exit()
-        elif o in ("-d", "--debug"):
-            debug_flag = True
-        elif o in ("-r", "--rsquare"):
-            rsquare = float(a)
-        elif o in ("-f", "--freq"):
-            minorallelefrequency = float(a)
-        elif o in ("-s", "--sample"):
-            samples = read_list(a)
-        else:
-            assert False, "unhandled option"
-
-    if rsquare < 0.00 or rsquare > 1.00:
-        print >> stderr, "input value of rsquare should be in [0.00, 1.00]"
-        exit(3)
-
-    if minorallelefrequency < 0.0 or minorallelefrequency > 0.5:
-        print >> stderr, "input value of MAF should be in [0.00, 0.50]"
-        exit(4)
-
-    if len(args) != 3:
-        usage()
-        exit(5)
-
-    main(args[0], args[1], args[2], rsquare, minorallelefrequency, samples)
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/pass.xml
--- a/tools/human_genome_variation/pass.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,126 +0,0 @@
-
-
significant transcription factor binding sites from ChIP data - - - pass_wrapper.sh "$input" "$min_window" "$max_window" "$false_num" "$output" - - - - - - - - - - - - - - - pass - sed - - - - - -**Dataset formats** - -The input is in GFF_ format, and the output is tabular_. -(`Dataset missing?`_) - -.. _GFF: ./static/formatHelp.html#gff -.. _tabular: ./static/formatHelp.html#tab -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -PASS (Poisson Approximation for Statistical Significance) detects -significant transcription factor binding sites in the genome from -ChIP data. This is probably the only peak-calling method that -accurately controls the false-positive rate and FDR in ChIP data, -which is important given the huge discrepancy in results obtained -from different peak-calling algorithms. At the same time, this -method achieves a similar or better power than previous methods. - - - ------ - -**Hints** - -- ChIP-Seq data: - - If the data is from ChIP-Seq, you need to convert the ChIP-Seq values - into z-scores before using this program. It is also recommended that - you group read counts within a neighborhood together, e.g. in tiled - windows of 30bp. In this way, the ChIP-Seq data will resemble - ChIP-chip data in format. - -- Choosing window size options: - - The window size is related to the probe tiling density. For example, - if the probes are tiled at every 100bp, then setting the smallest - window = 2 and largest window = 6 is appropriate, because the DNA - fragment size is around 300-500bp. - ------ - -**Example** - -- input file:: - - chr7 Nimblegen ID 40307603 40307652 1.668944 . . . - chr7 Nimblegen ID 40307703 40307752 0.8041307 . . . - chr7 Nimblegen ID 40307808 40307865 -1.089931 . . . - chr7 Nimblegen ID 40307920 40307969 1.055044 . . . - chr7 Nimblegen ID 40308005 40308068 2.447853 . . . - chr7 Nimblegen ID 40308125 40308174 0.1638694 . . . - chr7 Nimblegen ID 40308223 40308275 -0.04796628 . . . - chr7 Nimblegen ID 40308318 40308367 0.9335709 . . . - chr7 Nimblegen ID 40308526 40308584 0.5143972 . . . - chr7 Nimblegen ID 40308611 40308660 -1.089931 . . . - etc. - - In GFF, a value of dot '.' is used to mean "not applicable". - -- output file:: - - ID Chr Start End WinSz PeakValue # of FPs FDR - 1 chr7 40310931 40311266 4 1.663446 0.248817 0.248817 - ------ - -**References** - -Zhang Y. (2008) -Poisson approximation for significance in genome-wide ChIP-chip tiling arrays. -Bioinformatics. 24(24):2825-31. Epub 2008 Oct 25. - -Chen KB, Zhang Y. (2010) -A varying threshold method for ChIP peak calling using multiple sources of information. -Submitted. 
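-
------
-
-**A sketch of the ChIP-Seq hint**
-
-The z-score conversion suggested under **Hints** can be illustrated in a few
-lines. The sketch below is not part of PASS; the per-base "chrom TAB pos TAB
-count" input layout and the 30bp window size are assumptions made for the
-example. It bins read counts into tiled windows and emits GFF-like rows of
-z-scores::
-
-    #!/usr/bin/env python
-    # Hypothetical preprocessing: sum per-base ChIP-Seq read counts into
-    # 30bp windows and z-score the sums so the data resembles ChIP-chip.
-    import sys, math
-
-    WINDOW = 30
-    counts = {}  # (chrom, window index) -> summed read count
-    for row in open(sys.argv[1]):  # assumed format: chrom TAB pos TAB count
-        chrom, pos, count = row.split()
-        key = (chrom, int(pos) // WINDOW)
-        counts[key] = counts.get(key, 0) + int(count)
-
-    vals = counts.values()
-    mean = sum(vals) / float(len(vals))
-    sd = math.sqrt(sum((v - mean) ** 2 for v in vals) / len(vals)) or 1.0
-    for (chrom, win), v in sorted(counts.items()):
-        start, end = win * WINDOW + 1, (win + 1) * WINDOW
-        print "%s\tbinned\tID\t%d\t%d\t%f\t.\t.\t." % (chrom, start, end, (v - mean) / sd)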
-
-
-
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/pass_wrapper.sh
--- a/tools/human_genome_variation/pass_wrapper.sh Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-#!/usr/bin/env bash
-
-export PATH=$PATH:$(dirname $0)
-
-input=$1
-min_window=$2
-max_window=$3
-false_num=$4
-output=$5
-
-pass "$input" "$min_window" "$max_window" "$false_num" "$output" >/dev/null
-sed -i -e 's/\t\t*/\t/g' "$output"
-
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/senatag.py
--- a/tools/human_genome_variation/senatag.py Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,243 +0,0 @@
-#!/usr/bin/env python
-
-"""
-This tool takes the following pair of files as input:
-a) input_snp : A file with identifiers for SNPs (one on each line)
-b) ldfile : A file where each line has the following
-    snp list
-    where "snp" is an identifier for one SNP and the "list" is a
-    comma-separated list of all the other snps that are in LD with
-    it (as per some threshold of rsquare)
-
-The output is a set of tag SNPs for the given datasets.
-
-The algorithm is as follows:
-
-a) Construct a graph for each population, where each node is a SNP and two nodes
-are connected by an edge iff they are in LD.
-b) For each SNP, count the number of connected nodes that have not yet
-been visited.
-c) Find the SNP with the highest count and assign it to be a tag SNP.
-d) Mark that SNP and all the snps connected to it as "visited". This should be
-done for each population.
-e) Repeat steps b-d until all SNPs in all populations have been visited.
-"""
-
-from sys import argv, stderr, exit
-from getopt import getopt, GetoptError
-
-import os
-import heapq
-
-__author__ = "Aakrosh Ratan"
-__email__ = "ratan@bx.psu.edu"
-
-# do we want the debug information to be printed?
-debug_flag = False
-
-class node:
-    def __init__(self, name):
-        self.name = name
-        self.edges = []
-        self.visited = False
-
-    # return the number of nodes connected to this node that have yet to be
-    # visited
-    def num_not_visited(self):
-        num = 0
-        for n in self.edges:
-            if n.visited == False: num += 1
-        return num
-
-    def __cmp__(self, other):
-        return other.num_not_visited() - self.num_not_visited()
-
-    def __str__(self):
-        return self.name
-
-class graph:
-    def __init__(self):
-        self.nodes = {}
-
-    def __str__(self):
-        string = ""
-        for n1 in self.nodes.values():
-            n2s = [x.name for x in n1.edges]
-            string += "%s %s\n" % (n1.name, ",".join(n2s))
-        return string[:-1]
-
-    def add_node(self, n):
-        self.nodes[n.name] = n
-
-    def add_edges(self, n1, n2):
-        assert n1.name in self.nodes
-        assert n2.name in self.nodes
-        n1.edges.append(n2)
-        n2.edges.append(n1)
-
-    def check_graph(self):
-        for n in self.nodes.values():
-            ms = [x for x in n.edges]
-            for m in ms:
-                if n not in m.edges:
-                    print >> stderr, "check : %s - %s" % (n,m)
-
-def construct_graph(ldfile, snpfile):
-    # construct the initial graph.
add all the SNPs as nodes - g = graph() - file = open(snpfile, "r") - - for line in file: - # ignore empty lines and add the remainder to the graph - if len(line.strip()) == 0: continue - n = node(line.strip()) - g.add_node(n) - - file.close() - print >> stderr, "Added %d nodes to a graph" % len(g.nodes) - - # now add all the edges - file = open(ldfile, "r") - - for line in file: - tokens = line.split() - assert len(tokens) == 2 - - # if this node is in the graph, then we need to construct an edge from - # this node to all the nodes which are highly related to it - if tokens[0] in g.nodes: - n1 = g.nodes[tokens[0]] - n2s = [g.nodes[x] for x in tokens[1].split(",")] - - for n2 in n2s: - g.add_edges(n1, n2) - - file.close() - print >> stderr, "Added all edges to the graph" - - return g - -def check_output(g, tagsnps): - # find all the nodes in the graph - allsnps = [x.name for x in g.nodes.values()] - - # find the nodes that are covered by our tagsnps - mysnps = [x.name for x in tagsnps] - - for n in tagsnps: - for m in n.edges: - mysnps.append(m.name) - - mysnps = list(set(mysnps)) - - if set(allsnps) != set(mysnps): - diff = list(set(allsnps) - set(mysnps)) - print >> stderr, "%s are not covered" % ",".join(diff) - -def main(ldfile, snpsfile, required, excluded): - # construct the graph - g = construct_graph(ldfile, snpsfile) - if debug_flag == True: g.check_graph() - - tagsnps = [] - neighbors = {} - - # take care of the SNPs that are required to be TagSNPs - for s in required: - t = g.nodes[s] - - t.visited = True - ns = [] - - for n in t.edges: - if n.visited == False: ns.append(n.name) - n.visited = True - - tagsnps.append(t) - neighbors[t.name] = list(set(ns)) - - # find the tag SNPs for this graph - data = [x for x in g.nodes.values()] - heapq.heapify(data) - - while data: - s = heapq.heappop(data) - - if s.visited == True or s.name in excluded: continue - - s.visited = True - ns = [] - - for n in s.edges: - if n.visited == False: ns.append(n.name) - n.visited = True - - tagsnps.append(s) - neighbors[s.name] = list(set(ns)) - - heapq.heapify(data) - - for s in tagsnps: - if len(neighbors[s.name]) > 0: - print "%s\t%s" % (s, ",".join(neighbors[s.name])) - continue - print s - - if debug_flag == True: check_output(g, tagsnps) - -def read_list(filename): - assert os.path.exists(filename) == True - file = open(filename, "r") - list = {} - - for line in file: - list[line.strip()] = 1 - - file.close() - return list - -def usage(): - f = stderr - print >> f, "usage:" - print >> f, "senatag [options] neighborhood.txt inputsnps.txt" - print >> f, "where inputsnps.txt is a file of snps from one population" - print >> f, "where neighborhood.txt is neighborhood details for the pop." 
- print >> f, "where the options are:" - print >> f, "-h,--help : print usage and quit" - print >> f, "-d,--debug: print debug information" - print >> f, "-e,--excluded : file with names of SNPs that cannot be TagSNPs" - print >> f, "-r,--required : file with names of SNPs that should be TagSNPs" - -if __name__ == "__main__": - try: - opts, args = getopt(argv[1:], "hdr:e:",\ - ["help", "debug", "required=", "excluded="]) - except GetoptError, err: - print str(err) - usage() - exit(2) - - required = {} - excluded = {} - - for o, a in opts: - if o in ("-h", "--help"): - usage() - exit() - elif o in ("-d", "--debug"): - debug_flag = True - elif o in ("-r", "--required"): - required = read_list(a) - elif o in ("-e", "--excluded"): - excluded = read_list(a) - else: - assert False, "unhandled option" - - if len(args) != 2: - usage() - exit(3) - - assert os.path.exists(args[0]) == True - assert os.path.exists(args[1]) == True - - main(args[0], args[1], required, excluded) diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/sift.xml --- a/tools/human_genome_variation/sift.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,174 +0,0 @@ - - predictions of functional sites - - - sift_variants_wrapper.sh "$input" "$output" "${input.metadata.dbkey}" "${GALAXY_DATA_INDEX_DIR}/sift_db.loc" "$chrom_col" "$pos_col" "$base" "$allele_col" "$strand_source.strand_col" "$comment_source.comment_col" "$output_opts" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - awk - rm - sed - - - - - - - - - - - - - - - - - -.. class:: warningmark - -This currently works only for builds hg18 or hg19. - ------ - -**Dataset formats** - -The input and output datasets are tabular_. -(`Dataset missing?`_) - -.. _tabular: ./static/formatHelp.html#tab -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -SIFT predicts whether an amino-acid substitution affects protein function, -based on sequence homology and the physical properties of amino acids. -SIFT can be applied to naturally occurring non-synonymous polymorphisms -and laboratory-induced missense mutations. This tool uses SQLite databases -containing pre-computed SIFT scores and annotations for all possible nucleotide -substitutions at each position in the human exome. Allele frequency data -are from the HapMap frequency database, and additional transcript and -gene-level data are from Ensembl BioMart. - -The input dataset must contain columns for the chromosome, position, and -alleles. The alleles must be two nucleotides separated by '/', -usually the reference allele and the allele of interest. -The strand must either be in another column or all the same. -The output contains a standard set of columns plus the additional ones that -have been selected from the list above. - -Website: http://sift.jcvi.org/ - ------ - -**Example** - -- input file:: - - chr3 81780820 + T/C - chr2 230341630 + G/A - chr2 43881517 + A/T - chr2 43857514 + T/C - chr6 88375602 + G/A - chr22 29307353 - T/A - chr10 115912482 - G/T - chr10 115900918 - C/T - chr16 69875502 + G/T - etc. 
- -- output file:: - - #Chrom Position Strand Allele Codons Transcript ID Protein ID Substitution Region dbSNP ID SNP Type Prediction Score Median Info Num seqs at position User Comment - chr3 81780820 + T/C AGA-gGA ENST00000264326 ENSP00000264326 R190G EXON CDS rs2229519:C Nonsynonymous DAMAGING 0.04 3.06 149 - chr2 230341630 + G/T - ENST00000389045 ENSP00000373697 NA EXON CDS rs1803846:A Unknown Not scored NA NA NA - chr2 43881517 + A/T ATA-tTA ENST00000260605 ENSP00000260605 I230L EXON CDS rs11556157:T Nonsynonymous TOLERATED 0.47 3.19 7 - chr2 43857514 + T/C TTT-TcT ENST00000260605 ENSP00000260605 F33S EXON CDS rs2288709:C Nonsynonymous TOLERATED 0.61 3.33 6 - chr6 88375602 + G/A GTT-aTT ENST00000257789 ENSP00000257789 V217I EXON CDS rs2307389:A Nonsynonymous TOLERATED 0.75 3.17 13 - chr22 29307353 + T/A ACC-tCC ENST00000335214 ENSP00000334612 T264S EXON CDS rs42942:A Nonsynonymous TOLERATED 0.4 3.14 23 - chr10 115912482 + C/A CGA-CtA ENST00000369285 ENSP00000358291 R179L EXON CDS rs12782946:T Nonsynonymous TOLERATED 0.06 4.32 2 - chr10 115900918 + G/A CAA-tAA ENST00000369287 ENSP00000358293 Q271* EXON CDS rs7095762:T Nonsynonymous N/A N/A N/A N/A - chr16 69875502 + G/T ACA-AaA ENST00000338099 ENSP00000337512 T608K EXON CDS rs3096381:T Nonsynonymous TOLERATED 0.12 3.41 3 - etc. - ------ - -**References** - -Ng PC, Henikoff S. (2001) Predicting deleterious amino acid substitutions. -Genome Res. 11(5):863-74. - -Ng PC, Henikoff S. (2002) Accounting for human polymorphisms predicted to affect protein function. -Genome Res. 12(3):436-46. - -Ng PC, Henikoff S. (2003) SIFT: Predicting amino acid changes that affect protein function. -Nucleic Acids Res. 31(13):3812-4. - -Kumar P, Henikoff S, Ng PC. (2009) Predicting the effects of coding non-synonymous variants -on protein function using the SIFT algorithm. -Nat Protoc. 4(7):1073-81. Epub 2009 Jun 25. 
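-
------
-
-**Checking the input columns**
-
-Because the wrapper maps user-chosen column numbers onto the chromosome,
-position, strand and allele fields, a quick pre-flight scan can catch
-malformed rows before submission. The sketch below is an illustration only,
-not part of SIFT or of this wrapper, and it assumes the fixed four-column
-order of the example above::
-
-    #!/usr/bin/env python
-    # Hypothetical validator for rows like: chr3 TAB 81780820 TAB + TAB T/C
-    import re, sys
-
-    row = re.compile(r'^chr[0-9XYM]+\t\d+\t[+-]\t[ACGT]/[ACGT]$')
-    for num, line in enumerate(open(sys.argv[1]), 1):
-        if not row.match(line.rstrip('\n')):
-            print >> sys.stderr, "line %d is malformed: %s" % (num, line.rstrip())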
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/sift_variants_wrapper.sh --- a/tools/human_genome_variation/sift_variants_wrapper.sh Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,184 +0,0 @@ -#!/usr/bin/env bash - -input_file=$1 -output_file=$2 -org=$3 -db_loc=$4 -chrom_col=$5 -pos_col=$6 -base=$7 -allele_col=$8 -strand_col=$9 -comment_col=${10} -output_opts=${11} - -working_dir=$PWD -sift_input="$working_dir/sift_input.txt" -sift_output="$working_dir/sift_output.txt" - -################################################################################ -## make sure input file column selections are mutually exclusive ## -################################################################################ -ERROR=0 -declare -a col_use - -function check_col () { - local col=$1 - local use=$2 - local int=$3 - - if [ -n "${col//[0-9]}" ]; then - if [ $int -eq 1 ]; then - echo "ERROR: invalid value for $use column: $col" 1>&2 - ERROR=1 - fi - return - fi - - local cur=${col_use[$col]} - if [ -n "$cur" ]; then - echo "ERROR: $use column is the same as $cur column" 1>&2 - col_use[$col]="${cur},$use" - ERROR=1 - else - col_use[$col]=$use - fi -} - -check_col $chrom_col 'chromosome' 1 -check_col $pos_col 'position' 1 -check_col $allele_col 'allele' 1 -check_col $strand_col 'strand' 0 -check_col $comment_col 'comment' 0 - -if [ $ERROR -ne 0 ]; then - exit 1 -fi - -################################################################################ -## get/check the db directory from the argument org,db_loc ## -################################################################################ -db_dir=$( awk '$1 == org { print $2 }' org=$org $db_loc ) - -if [ -z "$db_dir" ]; then - echo "Can't find dbkey \"$org\" in loc file \"$db_loc\"" 1>&2 - exit 1 -fi - -if [ ! -d "$db_dir" ]; then - echo "Can't access SIFT database directory \"$db_dir\"" 1>&2 - exit 1 -fi - -################################################################################ -## create input file for SIFT_exome_nssnvs.pl ## -################################################################################ -if [ ! 
-r "$input_file" ]; then - echo "Can't read input file \"$input_file\"" 1>&2 - exit 1 -fi - -if [ $base -eq 0 ]; then - beg_col="$pos_col" - end_col="$pos_col + 1" - pos_adj='$2 = $2 - 1' -else - beg_col="$pos_col - 1" - end_col="$pos_col" - pos_adj='' -fi - -strand_cvt='' -if [ \( "$strand_col" = "+" \) ]; then - strand='"1"' -elif [ \( "$strand_col" = "-" \) ]; then - strand='"-1"' -else - strand="\$$strand_col" - strand_cvt='if ('"${strand}"' == "+") {'"${strand}"' = "1"} else if ('"${strand}"' == "-") {'"${strand}"' = "-1"}' -fi - -print_row='print $'"${chrom_col}"', $'"${beg_col}"', $'"${end_col}"', '"${strand}"', $'"${allele_col}"'' -if [ "$comment_col" != "-" ]; then - print_row=''"${print_row}"', $'"${comment_col}"'' -fi - -awk ' -BEGIN {FS="\t";OFS=","} -$'"${chrom_col}"' ~ /^[cC][hH][rR]/ {$'"${chrom_col}"' = substr($'"${chrom_col}"',4)} -{ - '"${strand_cvt}"' - '"${print_row}"' -} -' "$input_file" > "$sift_input" - -################################################################################ -## run SIFT_exome_nssnvs.pl command line program ## -################################################################################ -if [ "$output_opts" = "None" ]; then - output_opts="" -else - output_opts=$( echo "$output_opts" | sed -e 's/,/ 1 -/g' ) - output_opts="-$output_opts 1" -fi - -SIFT_exome_nssnvs.pl -i "$sift_input" -d "$db_dir" -o "$working_dir" $output_opts &> "$sift_output" -if [ $? -ne 0 ]; then - echo "failed: SIFT_exome_nssnvs.pl -i \"$sift_input\" -d \"$db_dir\" -o \"$working_dir\" $output_opts" - exit 1 -fi - -################################################################################ -## locate the SIFT_exome_nssnvs.pl output file ## -################################################################################ -sift_pid=$( sed -n -e 's/^.*Your job id is \([0-9][0-9]*\) and is currently running.*$/\1/p' "$sift_output" ) - -if [ -z "$sift_pid" ]; then - echo "Can't find SIFT pid in \"$sift_output\"" 1>&2 - exit 1 -fi - -sift_outdir="$working_dir/$sift_pid" -if [ ! -d "$sift_outdir" ]; then - echo "Can't access SIFT output directory \"$sift_outdir\"" 1>&2 - exit 1 -fi - -sift_outfile="$sift_outdir/${sift_pid}_predictions.tsv" -if [ ! 
-r "$sift_outfile" ]; then - echo "Can't access SIFT output file \"$sift_outfile\"" 1>&2 - exit 1 -fi - -################################################################################ -## create galaxy output file ## -################################################################################ -awk ' -BEGIN {FS="\t";OFS="\t"} -NR == 1 { - $12 = "Num seqs at position" - $1 = "Chrom\tPosition\tStrand\tAllele" - print -} -NR != 1 { - $1 = "chr" $1 - gsub(/,/, "\t", $1) - print -} -' "$sift_outfile" | awk ' -BEGIN {FS="\t";OFS="\t"} -NR == 1 { - print "#" $0 -} -NR != 1 { - if ($3 == "1") {$3 = "+"} else if ($3 == "-1") {$3 = "-"} - '"${pos_adj}"' - print -} -' > "$output_file" - -################################################################################ -## cleanup ## -################################################################################ -rm -rf "$sift_outdir" "$sift_input" "$sift_output" - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/snpFreq.xml --- a/tools/human_genome_variation/snpFreq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ - - significant SNPs in case-control data - - - snpFreq2.pl $input $group1_1 $group1_2 $group1_3 $group2_1 $group2_2 $group2_3 0.05 $output - - - - - - - - - - - - - - - - - - R - - - - - - - - - - - - - - - - - -**Dataset formats** - -The input is tabular_, with six columns of allele counts. The output is also tabular, -and includes all of the input data plus the additional columns described below. -(`Dataset missing?`_) - -.. _tabular: ./static/formatHelp.html#tab -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -This tool performs a basic analysis of bi-allelic SNPs in case-control -data, using the R statistical environment and Fisher's exact test to -identify SNPs with a significant difference in the allele frequencies -between the two groups. R's "qvalue" package is used to correct for -multiple testing. - -The input file includes counts for each allele combination (AA aa Aa) -for each group at each SNP position. The assignment of codes (1 2 3) -to these genotypes is arbitrary, as long as it is consistent for both -groups. Any other input columns are ignored in the computation, but -are copied to the output. The output appends eight additional columns, -namely the minimum expected counts of the three genotypes for each -group, the p-value, and the q-value. - ------ - -**Example** - -- input file:: - - chr1 210 211 38 4 15 56 0 1 x - chr1 228 229 55 0 2 56 0 1 x - chr1 230 231 46 0 11 55 0 2 x - chr1 234 235 43 0 14 55 0 2 x - chr1 236 237 55 0 2 13 10 34 x - chr1 437 438 55 0 2 46 0 11 x - chr1 439 440 56 0 1 55 0 2 x - chr1 449 450 56 0 1 13 20 24 x - chr1 518 519 56 0 1 38 4 15 x - -Here the group 1 genotype counts are in columns 4 - 6, while those -for group 2 are in columns 7 - 9. - -Note that the "x" column has no meaning. It was added to this example -to show that extra columns can be included, and to make it easier -to see where the new columns are appended in the output. 
- -- output file:: - - chr1 210 211 38 4 15 56 0 1 x 47 2 8 47 2 8 1.50219088598917e-05 6.32501425679652e-06 - chr1 228 229 55 0 2 56 0 1 x 55.5 0 1.5 55.5 0 1.5 1 0.210526315789474 - chr1 230 231 46 0 11 55 0 2 x 50.5 0 6.5 50.5 0 6.5 0.0155644201009862 0.00409590002657532 - chr1 234 235 43 0 14 55 0 2 x 49 0 8 49 0 8 0.00210854461554067 0.000739840215979182 - chr1 236 237 55 0 2 13 10 34 x 34 5 18 34 5 18 6.14613878554783e-17 4.31307984950725e-17 - chr1 437 438 55 0 2 46 0 11 x 50.5 0 6.5 50.5 0 6.5 0.0155644201009862 0.00409590002657532 - chr1 439 440 56 0 1 55 0 2 x 55.5 0 1.5 55.5 0 1.5 1 0.210526315789474 - chr1 449 450 56 0 1 13 20 24 x 34.5 10 12.5 34.5 10 12.5 2.25757007974134e-18 2.37638955762246e-18 - chr1 518 519 56 0 1 38 4 15 x 47 2 8 47 2 8 1.50219088598917e-05 6.32501425679652e-06 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/snpFreq2.pl --- a/tools/human_genome_variation/snpFreq2.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,107 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -#expected input: path to file, cols of counts (2 sets of 3), threshold -if (!@ARGV or scalar @ARGV != 9) { - print "usage snpFreq.pl /path/to/snps.txt <6 column numbers(1 based) with counts for alleles, first one group then another> #threshold outfile\n"; - exit 1; -} - -#get and verify inputs -my $file = shift @ARGV; -my $a1 = shift @ARGV; -if ($a1 =~ /\D/ or $a1 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $a1\n"; - exit 1; -} -my $a2 = shift @ARGV; -if ($a2 =~ /\D/ or $a2 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $a2\n"; - exit 1; -} -my $a3 = shift @ARGV; -if ($a3 =~ /\D/ or $a3 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $a3\n"; - exit 1; -} -my $b1 = shift @ARGV; -if ($b1 =~ /\D/ or $b1 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $b1\n"; - exit 1; -} -my $b2 = shift @ARGV; -if ($b2 =~ /\D/ or $b2 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $b2\n"; - exit 1; -} -my $b3 = shift @ARGV; -if ($b3 =~ /\D/ or $b3 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $b3\n"; - exit 1; -} -my $thresh = shift @ARGV; -if ($thresh !~ /^\d*\.?\d+$/) { - print "Error the threshold must be a number. 
Got $thresh\n"; - exit 1; -}elsif ($thresh > .3) { - print "Error the threshold can not be greater than 0.3 got $thresh\n"; - exit 1; -} -my $outfile = shift @ARGV; - -#run a fishers exact test (using R) on whole table -my $cmd = qq|options(warn=-1) - tab <- read.table('$file', sep="\t") - size <- length(tab[,1]) - width <- length(tab[1,]) - x <- 1:size - y <- matrix(data=0, nr=size, nc=6) - for(i in 1:size) { - m <- matrix(c(tab[i,$a1], tab[i,$b1], tab[i,$a2], tab[i,$b2], tab[i,$a3], tab[i,$b3]), nrow=2) - t <- fisher.test(m) - x[i] <- t\$p.value - if (x[i] >= 1) { - x[i] <- .999 - } - n <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3] + tab[i,$b1] + tab[i,$b2] + tab[i,$b3]) - n_a <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3]) - y[i,1] <- ((tab[i,$a1] + tab[i,$b1])*(n_a))/n - y[i,1] <- round(y[i,1],3) - y[i,2] <- ((tab[i,$a2] + tab[i,$b2])*(n_a))/n - y[i,2] <- round(y[i,2],3) - y[i,3] <- ((tab[i,$a3] + tab[i,$b3])*(n_a))/n - y[i,3] <- round(y[i,3],3) - n_b <- (tab[i,$b1] + tab[i,$b2] + tab[i,$b3]) - y[i,4] <- ((tab[i,$a1] + tab[i,$b1])*(n_b))/n - y[i,4] <- round(y[i,4],3) - y[i,5] <- ((tab[i,$a2] + tab[i,$b2])*(n_b))/n - y[i,5] <- round(y[i,5],3) - y[i,6] <- ((tab[i,$a3] + tab[i,$b3])*(n_b))/n - y[i,6] <- round(y[i,6],3) - }|; - #results <- data.frame(tab[1:size,1:width], x[1:size]) - #write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") - #q()|; - -my $cmd2 = qq|suppressPackageStartupMessages(library(qvalue)) - qobj <- qvalue(x[1:size], lambda=seq(0,0.90,$thresh), pi0.method="bootstrap", fdr.level=0.1, robust=FALSE, smooth.log.pi0 = FALSE) - q <- qobj\$qvalues - results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size], q[1:size]) - write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") - q()|; - -#for TESTING -my $pr = qq|results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size]) - write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") - q()|; - -open(FT, "| R --slave --vanilla") - or die "Couldn't call fisher.text, $!\n"; -print FT $cmd, "\n"; #fisher test -print FT $cmd2, "\n"; #qvalues and results -#print FT $pr, "\n"; -close FT or die "Couldn't finish fisher.test, $!\n"; - -exit; diff -r c2a356708570 -r 33c067c3ae34 tools/hyphy/hyphy_branch_lengths_wrapper.py --- a/tools/hyphy/hyphy_branch_lengths_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ -#Dan Blankenberg -#takes commandline tree def and input multiple fasta alignment file and runs the branch length ananlysis -import os, sys -from galaxy import eggs -from galaxy.tools.util import hyphy_util - -#Retrieve hyphy path, this will need to be the same across the cluster -tool_data = sys.argv.pop() -HYPHY_PATH = os.path.join( tool_data, "HYPHY" ) -HYPHY_EXECUTABLE = os.path.join( HYPHY_PATH, "HYPHY" ) - -#Read command line arguments -input_filename = os.path.abspath(sys.argv[1].strip()) -output_filename = os.path.abspath(sys.argv[2].strip()) -tree_contents = sys.argv[3].strip() -nuc_model = sys.argv[4].strip() -base_freq = sys.argv[5].strip() -model_options = sys.argv[6].strip() - -#Set up Temporary files for hyphy run -#set up tree file -tree_filename = hyphy_util.get_filled_temp_filename(tree_contents) - -#Guess if this is a single or multiple FASTA input file -found_blank = False -is_multiple = False -for line in open(input_filename): - line = line.strip() - if line == "": found_blank = True - elif line.startswith(">") and 
found_blank: - is_multiple = True - break - else: found_blank = False - -#set up BranchLengths file -BranchLengths_filename = hyphy_util.get_filled_temp_filename(hyphy_util.BranchLengths) -if is_multiple: - os.unlink(BranchLengths_filename) - BranchLengths_filename = hyphy_util.get_filled_temp_filename(hyphy_util.BranchLengthsMF) - print "Multiple Alignment Analyses" -else: print "Single Alignment Analyses" - -#setup Config file -config_filename = hyphy_util.get_branch_lengths_config_filename(input_filename, nuc_model, model_options, base_freq, tree_filename, output_filename, BranchLengths_filename) - -#Run Hyphy -hyphy_cmd = "%s BASEPATH=%s USEPATH=/dev/null %s" % (HYPHY_EXECUTABLE, HYPHY_PATH, config_filename) -hyphy = os.popen(hyphy_cmd, 'r') -#print hyphy.read() -hyphy.close() - -#remove temporary files -os.unlink(BranchLengths_filename) -os.unlink(tree_filename) -os.unlink(config_filename) diff -r c2a356708570 -r 33c067c3ae34 tools/hyphy/hyphy_branch_lengths_wrapper.xml --- a/tools/hyphy/hyphy_branch_lengths_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,95 +0,0 @@ - - - - Estimation - - hyphy_branch_lengths_wrapper.py $input1 $out_file1 "$tree" "$model" "$base_freq" "Global" ${GALAXY_DATA_INDEX_DIR} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool takes a single or multiple FASTA alignment file and estimates branch lengths using HYPHY_, a maximum likelihood analyses package. - -For the tree definition, you only need to specify the species build names. For example, you could use the tree *((hg17,panTro1),(mm5,rn3),canFam1)*, if your FASTA file looks like this:: - - >hg17.chr7(+):26907301-26907310|hg17_0 - GTGGGAGGT - >panTro1.chr6(+):28037319-28037328|panTro1_0 - GTGGGAGGT - >mm5.chr6(+):52104022-52104031|mm5_0 - GTGGGAGGT - >rn3.chr4(+):80734395-80734404|rn3_0 - GTGGGAGGT - >canFam1.chr14(+):42826409-42826418|canFam1_0 - GTGGGAGGT - - >hg17.chr7(+):26907310-26907326|hg17_1 - AGTCAGAGTGTCTGAG - >panTro1.chr6(+):28037328-28037344|panTro1_1 - AGTCAGAGTGTCTGAG - >mm5.chr6(+):52104031-52104047|mm5_1 - AGTCAGAGTGTCTGAG - >rn3.chr4(+):80734404-80734420|rn3_1 - AGTCAGAGTATCTGAG - >canFam1.chr14(+):42826418-42826434|canFam1_1 - AGTCAGAGTGTCTGAG - - >hg17.chr7(+):26907326-26907338|hg17_2 - GTAGAAGACCCC - >panTro1.chr6(+):28037344-28037356|panTro1_2 - GTAGAAGACCCC - >mm5.chr6(+):52104047-52104059|mm5_2 - GTAGACGATGCC - >rn3.chr4(+):80734420-80734432|rn3_2 - GTAGATGATGCG - >canFam1.chr14(+):42826434-42826446|canFam1_2 - GTAGAAGACCCC - - >hg17.chr7(+):26907338-26907654|hg17_3 - GGGGAAGGAACGCAGGGCGAAGAGCTGGACTTCTCTGAGGAT---TCCTCGGCCTTCTCGT-----CGTTTCCTGG----CGGGGTGGCCGGAGAGATGGGCAAGAGACCCTCCTTCTCACGTTTCTTTTGCTTCATTCGGCGGTTCTGGAACCAGATCTTCACTTGGGTCTCGTTGAGCTGCAGGGATGCAGCGATCTCCACCCTGCGGGCGCGCGTCAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGTTCCGTGAGCTGCTTGGTAGTGAAGTTGGTGCGCACCGCGTTGGGTTGACCCAGGTAGCCGTACTCTCCAACTTTCC - >panTro1.chr6(+):28037356-28037672|panTro1_3 - GGGGAAGGAACGCAGGGCGAAGAGCTGGACTTCTCTGAGGAT---TCCTCGGCCTTCTCGT-----CGTTTCCTGG----CGGGGTGGCCGGAGAGATGGGCAAGAGACCCTCCTTCTCACGTTTCTTTTGCTTCATTCGGCGGTTCTGGAACCAGATCTTCACTTGGGTCTCGTTGAGCTGCAGGGATGCAGCGATCTCCACCCTGCGGGCGCGCGTCAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGTTCCGTGAGCTGCTTGGTAGTGAAGTTGGTGCGCACCGCGTTGGGTTGACCCAGGTAGCCGTACTCTCCAACTTTCC - >mm5.chr6(+):52104059-52104375|mm5_3 - 
GGAGAAGGGGCACTGGGCGAGGGGCTAGATTTCTCAGATGAT---TCTTCCGTTTTCTCAT-----CGCTGCCAGG----AGGAGTGGCAGGGGAGATGGGCAGGAGCCCCTCCTTCTCACGCTTCTTCTGCTTCATGCGGCGATTCTGGAACCAGATCTTCACCTGGGTCTCATTGAGCTGTAGGGACGCGGCAATCTCCACCCTGCGCGCTCGTGTAAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGCTCTGTGAGCTGCTTGGTGGTGAAATTGGTGCGCACTGCGTTGGGTTGACCCACGTAGCCGTACTCTCCAACTTTCC - >rn3.chr4(+):80734432-80734748|rn3_3 - GGAGAAGGGGCGCTGGGCGAGGAGCTGGATTTCTCAGATGAT---TCTTCAGTTTTCTCAT-----CGCTTCCAGG----AGGGGTGGCGGGTGAAATGGGCAAGAGCCCCTCTTTCTCGCGCTTCTTCTGCTTCATGCGGCGATTCTGGAACCAGATCTTCACCTGGGTCTCATTGAGTTGCAGGGACGCGGCTATCTCCACCCTGCGGGCTCTTGTTAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGCTCTGTGAGCTGCTTGGTGGTGAAGTTGGTGCGCACTGCGTTGGGTTGACCCACGTAGCCATACTCTCCAACTTTCC - >canFam1.chr14(+):42826446-42826762|canFam1_3 - GGAGACGGAATGCAGGGCGAGGAGCTGGATTTCTCTGAAGAT---TCCTCCGCCTTCTCCT-----CACTTCCTGG----CGGGGTGGCAGGGGAGATGGGCAAAAGGCCCTCTTTCTCTCGTTTCTTCTGCTTCATCCGGCGGTTCTGGAACCAGATCTTCACCTGGGTCTCGTTGAGCTGCAGGGATGCTGCGATCTCCACCCTGCGGGCGCGGGTCAGATACTTATTGAAGTGGAACTCCTTTTCCAGCTCGGTGAGCTGCTTGGTGGTGAAGTTGGTACGCACTGCATTCGGTTGACCCACGTAGCCGTACTCTCCAACTTTCC - - - -.. _HYPHY: http://www.hyphy.org - - - diff -r c2a356708570 -r 33c067c3ae34 tools/hyphy/hyphy_dnds_wrapper.py --- a/tools/hyphy/hyphy_dnds_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ -#Guru -#takes fasta alignments, a distance metric and builds neighbor joining trees -import os, sys -from galaxy import eggs -from galaxy.tools.util import hyphy_util - -#Retrieve hyphy path, this will need to be the same across the cluster -tool_data = sys.argv.pop() -HYPHY_PATH = os.path.join( tool_data, "HYPHY" ) -HYPHY_EXECUTABLE = os.path.join( HYPHY_PATH, "HYPHY" ) - -#Read command line arguments -input_filename = os.path.abspath(sys.argv[1].strip()) -output_filename = os.path.abspath(sys.argv[2].strip()) -tree_contents = sys.argv[3].strip() -nuc_model = sys.argv[4].strip() -analysis = sys.argv[5].strip() - -if tree_contents == "": - print >> sys.stderr, "Please specify a valid tree definition." 
-    sys.exit(1)
-
-tree_filename = hyphy_util.get_filled_temp_filename(tree_contents)
-
-if analysis == "local":
-    fitter_filename = hyphy_util.get_filled_temp_filename(hyphy_util.SimpleLocalFitter)
-else:
-    fitter_filename = hyphy_util.get_filled_temp_filename(hyphy_util.SimpleGlobalFitter)
-
-tabwriter_filename = hyphy_util.get_filled_temp_filename(hyphy_util.TabWriter)
-FastaReader_filename = hyphy_util.get_filled_temp_filename(hyphy_util.FastaReader)
-#setup Config file
-config_filename = hyphy_util.get_dnds_config_filename(fitter_filename, tabwriter_filename, "Universal", tree_filename, input_filename, nuc_model, output_filename, FastaReader_filename)
-
-#Run Hyphy
-hyphy_cmd = "%s BASEPATH=%s USEPATH=/dev/null %s" % (HYPHY_EXECUTABLE, HYPHY_PATH, config_filename)
-hyphy = os.popen(hyphy_cmd, 'r')
-#print hyphy.read()
-hyphy.close()
-
-#remove temporary files
-os.unlink(fitter_filename)
-os.unlink(tabwriter_filename)
-os.unlink(tree_filename)
-os.unlink(FastaReader_filename)
-os.unlink(config_filename)
-
-if nuc_model == "000000":
-    model = "F81"
-elif nuc_model == "010010":
-    model = "HKY85"
-else:
-    model = "REV"
-
-print "Analysis: %s; Model: %s; Tree: %s" %(analysis, model, tree_contents)
diff -r c2a356708570 -r 33c067c3ae34 tools/hyphy/hyphy_dnds_wrapper.xml
--- a/tools/hyphy/hyphy_dnds_wrapper.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,101 +0,0 @@
-
-
-
-    Estimation
-
-    hyphy_dnds_wrapper.py $input1 $out_file1 "$tree" "$model" $analysis ${GALAXY_DATA_INDEX_DIR}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-This tool takes a FASTA alignment file and estimates dN/dS ratio using HYPHY_, a maximum likelihood analysis package.
-
------
-
-.. class:: warningmark
-
-The tool returns an error message if no tree definition or an invalid tree definition is supplied.
-Any blocks not containing as many species as mentioned in the tree definition will be omitted from the output.
-
------
-
-For the tree definition, you only need to specify the species build names. For example, you could use the tree *((hg17,panTro1),(mm5,rn3),canFam1)*, if your FASTA file looks like the example below.
You may also use **Neighbor Joining Tree Builder** tool to obtain the tree definition:: - - >hg17.chr7(+):26907301-26907310|hg17_0 - GTGGGAGGT - >panTro1.chr6(+):28037319-28037328|panTro1_0 - GTGGGAGGT - >mm5.chr6(+):52104022-52104031|mm5_0 - GTGGGAGGT - >rn3.chr4(+):80734395-80734404|rn3_0 - GTGGGAGGT - >canFam1.chr14(+):42826409-42826418|canFam1_0 - GTGGGAGGT - - >hg17.chr7(+):26907310-26907326|hg17_1 - AGTCAGAGTGTCTGAG - >panTro1.chr6(+):28037328-28037344|panTro1_1 - AGTCAGAGTGTCTGAG - >mm5.chr6(+):52104031-52104047|mm5_1 - AGTCAGAGTGTCTGAG - >rn3.chr4(+):80734404-80734420|rn3_1 - AGTCAGAGTATCTGAG - >canFam1.chr14(+):42826418-42826434|canFam1_1 - AGTCAGAGTGTCTGAG - - >hg17.chr7(+):26907326-26907338|hg17_2 - GTAGAAGACCCC - >panTro1.chr6(+):28037344-28037356|panTro1_2 - GTAGAAGACCCC - >mm5.chr6(+):52104047-52104059|mm5_2 - GTAGACGATGCC - >rn3.chr4(+):80734420-80734432|rn3_2 - GTAGATGATGCG - >canFam1.chr14(+):42826434-42826446|canFam1_2 - GTAGAAGACCCC - - >hg17.chr7(+):26907338-26907654|hg17_3 - GGGGAAGGAACGCAGGGCGAAGAGCTGGACTTCTCTGAGGAT---TCCTCGGCCTTCTCGT-----CGTTTCCTGG----CGGGGTGGCCGGAGAGATGGGCAAGAGACCCTCCTTCTCACGTTTCTTTTGCTTCATTCGGCGGTTCTGGAACCAGATCTTCACTTGGGTCTCGTTGAGCTGCAGGGATGCAGCGATCTCCACCCTGCGGGCGCGCGTCAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGTTCCGTGAGCTGCTTGGTAGTGAAGTTGGTGCGCACCGCGTTGGGTTGACCCAGGTAGCCGTACTCTCCAACTTTCC - >panTro1.chr6(+):28037356-28037672|panTro1_3 - GGGGAAGGAACGCAGGGCGAAGAGCTGGACTTCTCTGAGGAT---TCCTCGGCCTTCTCGT-----CGTTTCCTGG----CGGGGTGGCCGGAGAGATGGGCAAGAGACCCTCCTTCTCACGTTTCTTTTGCTTCATTCGGCGGTTCTGGAACCAGATCTTCACTTGGGTCTCGTTGAGCTGCAGGGATGCAGCGATCTCCACCCTGCGGGCGCGCGTCAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGTTCCGTGAGCTGCTTGGTAGTGAAGTTGGTGCGCACCGCGTTGGGTTGACCCAGGTAGCCGTACTCTCCAACTTTCC - >mm5.chr6(+):52104059-52104375|mm5_3 - GGAGAAGGGGCACTGGGCGAGGGGCTAGATTTCTCAGATGAT---TCTTCCGTTTTCTCAT-----CGCTGCCAGG----AGGAGTGGCAGGGGAGATGGGCAGGAGCCCCTCCTTCTCACGCTTCTTCTGCTTCATGCGGCGATTCTGGAACCAGATCTTCACCTGGGTCTCATTGAGCTGTAGGGACGCGGCAATCTCCACCCTGCGCGCTCGTGTAAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGCTCTGTGAGCTGCTTGGTGGTGAAATTGGTGCGCACTGCGTTGGGTTGACCCACGTAGCCGTACTCTCCAACTTTCC - >rn3.chr4(+):80734432-80734748|rn3_3 - GGAGAAGGGGCGCTGGGCGAGGAGCTGGATTTCTCAGATGAT---TCTTCAGTTTTCTCAT-----CGCTTCCAGG----AGGGGTGGCGGGTGAAATGGGCAAGAGCCCCTCTTTCTCGCGCTTCTTCTGCTTCATGCGGCGATTCTGGAACCAGATCTTCACCTGGGTCTCATTGAGTTGCAGGGACGCGGCTATCTCCACCCTGCGGGCTCTTGTTAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGCTCTGTGAGCTGCTTGGTGGTGAAGTTGGTGCGCACTGCGTTGGGTTGACCCACGTAGCCATACTCTCCAACTTTCC - >canFam1.chr14(+):42826446-42826762|canFam1_3 - GGAGACGGAATGCAGGGCGAGGAGCTGGATTTCTCTGAAGAT---TCCTCCGCCTTCTCCT-----CACTTCCTGG----CGGGGTGGCAGGGGAGATGGGCAAAAGGCCCTCTTTCTCTCGTTTCTTCTGCTTCATCCGGCGGTTCTGGAACCAGATCTTCACCTGGGTCTCGTTGAGCTGCAGGGATGCTGCGATCTCCACCCTGCGGGCGCGGGTCAGATACTTATTGAAGTGGAACTCCTTTTCCAGCTCGGTGAGCTGCTTGGTGGTGAAGTTGGTACGCACTGCATTCGGTTGACCCACGTAGCCGTACTCTCCAACTTTCC - - - -.. 
_HYPHY: http://www.hyphy.org - - - diff -r c2a356708570 -r 33c067c3ae34 tools/hyphy/hyphy_nj_tree_wrapper.py --- a/tools/hyphy/hyphy_nj_tree_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ -#Dan Blankenberg -#takes fasta alignments, a distance metric and builds neighbor joining trees -import os, sys -from galaxy import eggs -from galaxy.tools.util import hyphy_util - -#Retrieve hyphy path, this will need to be the same across the cluster -tool_data = sys.argv.pop() -HYPHY_PATH = os.path.join( tool_data, "HYPHY" ) -HYPHY_EXECUTABLE = os.path.join( HYPHY_PATH, "HYPHY" ) - -#Read command line arguments -input_filename = os.path.abspath(sys.argv[1].strip()) -output_filename1 = os.path.abspath(sys.argv[2].strip()) -output_filename2 = os.path.abspath(sys.argv[3].strip()) -distance_metric = sys.argv[4].strip() -temp_ps_filename = hyphy_util.get_filled_temp_filename("") - -#Guess if this is a single or multiple FASTA input file -found_blank = False -is_multiple = False -for line in open(input_filename): - line = line.strip() - if line == "": found_blank = True - elif line.startswith(">") and found_blank: - is_multiple = True - break - else: found_blank = False - -NJ_tree_shared_ibf = hyphy_util.get_filled_temp_filename(hyphy_util.NJ_tree_shared_ibf) - -#set up NJ_tree file -NJ_tree_filename = hyphy_util.get_filled_temp_filename(hyphy_util.get_NJ_tree(NJ_tree_shared_ibf)) -#setup Config file -config_filename = hyphy_util.get_nj_tree_config_filename(input_filename, distance_metric, output_filename1, temp_ps_filename, NJ_tree_filename) -if is_multiple: - os.unlink(NJ_tree_filename) - os.unlink(config_filename) - NJ_tree_filename = hyphy_util.get_filled_temp_filename(hyphy_util.get_NJ_treeMF(NJ_tree_shared_ibf)) - config_filename = hyphy_util.get_nj_treeMF_config_filename(input_filename, output_filename1, temp_ps_filename, distance_metric, NJ_tree_filename) - print "Multiple Alignment Analyses" -else: print "Single Alignment Analyses" - - -#Run Hyphy -hyphy_cmd = "%s BASEPATH=%s USEPATH=/dev/null %s" % (HYPHY_EXECUTABLE, HYPHY_PATH, config_filename) -hyphy = os.popen(hyphy_cmd, 'r') -#print hyphy.read() -hyphy.close() - -#remove temporary files -os.unlink(NJ_tree_filename) -os.unlink(config_filename) - - -#Convert PS to PDF -if os.path.getsize(temp_ps_filename)>0: temp = os.popen("ps2pdf %s %s" % (temp_ps_filename, output_filename2), 'r').close() -os.unlink(temp_ps_filename) diff -r c2a356708570 -r 33c067c3ae34 tools/hyphy/hyphy_nj_tree_wrapper.xml --- a/tools/hyphy/hyphy_nj_tree_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ - - - - Builder - - hyphy_nj_tree_wrapper.py $input1 $out_file1 $out_file2 $distance_metric ${GALAXY_DATA_INDEX_DIR} - - - - - - - - - - - - - - - - - - - - - - - - ps2pdf - - - - - - - - - - -This tool takes a single or multiple FASTA alignment file and builds Neighbor Joining Trees using HYPHY_, a maximum likelihood analyses package. - -.. _HYPHY: http://www.hyphy.org - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/abyss.xml --- a/tools/ilmn_pacbio/abyss.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - Short-read de Bruijn assembly - - quake_wrapper.py -k $k -r $input1 -p 8 > $output1 - - - - - - - - - - -**What it does** - -TBD. 
Calls ABySS assembler - -**Parameter list** - -k - -**Output** - -Corrected reads - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/assembly_stats.py --- a/tools/ilmn_pacbio/assembly_stats.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,83 +0,0 @@ -#!/usr/bin/env python -# -#Copyright (c) 2011, Pacific Biosciences of California, Inc. -# -#All rights reserved. -# -#Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -# * Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. -# -#THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY -#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# -import sys, os -from optparse import OptionParser -from galaxy import eggs -import pkg_resources -pkg_resources.require( 'bx-python' ) -from bx.seq.fasta import FastaReader - -def getStats( fastaFile, genomeLength, minContigLength ): - lengths = [] - stats = { "Num" : 0, - "Sum" : 0, - "Max" : 0, - "Avg" : 0, - "N50" : 0, - "99%" : 0 } - fasta_reader = FastaReader( open( fastaFile, 'rb' ) ) - while True: - seq = fasta_reader.next() - if not seq: - break - if seq.length < minContigLength: - continue - lengths.append( seq.length ) - if lengths: - stats[ 'Num' ] = len( lengths ) - stats[ 'Sum' ] = sum( lengths ) - stats[ 'Max' ] = max( lengths ) - stats[ 'Avg' ] = int( sum( lengths ) / float( len( lengths ) ) ) - stats[ 'N50' ] = 0 - stats[ '99%' ] = 0 - if genomeLength == 0: - genomeLength = sum( lengths ) - lengths.sort() - lengths.reverse() - lenSum = 0 - stats[ "99%" ] = len( lengths ) - for idx, length in enumerate( lengths ): - lenSum += length - if ( lenSum > genomeLength / 2 ): - stats[ "N50" ] = length - break - lenSum = 0 - for idx, length in enumerate( lengths ): - lenSum += length - if lenSum > genomeLength * 0.99: - stats[ "99%" ] = idx + 1 - break - return stats - -def __main__(): - #Parse Command Line - usage = 'Usage: %prog input output --minContigLength' - parser = OptionParser( usage=usage ) - parser.add_option( "--minContigLength", dest="minContigLength", help="Minimum length of contigs to analyze" ) - parser.add_option( "--genomeLength", dest="genomeLength", help="Length of genome for which to calculate N50s" ) - parser.set_defaults( minContigLength=0, genomeLength=0 ) - options, args = parser.parse_args() - input_fasta_file = args[ 0 ] - output_tabular_file = args[ 1 ] - statKeys = "Num Sum Max Avg N50 99%".split( " " ) - stats = getStats( input_fasta_file, int( options.genomeLength ), int( options.minContigLength ) ) - fout = open( output_tabular_file, "w" ) - fout.write( "%s\n" % "\t".join( map( lambda key: str( stats[ key ] ), statKeys ) ) ) - fout.close() - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/assembly_stats.xml --- a/tools/ilmn_pacbio/assembly_stats.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ - - Calculate common measures of assembly quality - - assembly_stats.py $input1 $output1 --minContigLength=${minLength} - - - - - - - - - - - - - - - - - -**What it does** - -Reports standard measures of *de novo* assembly quality such as number of contigs, sum of contigs, mean contig length, and N50. - -**Parameter list** - -Minimum length - Only include contigs of this size or greater for calculating statistics. - -**Output** - -Num contigs - Total number of contigs in the assembly - -Sum of contig lengths - Total sum of contig lengths - -Maximum contig length - Maximum of the contig lengths - -Mean contig length - Average contig length - -N50 - Contig length at which 50% of the assembly is contained in contigs of this size or greater. - -99% - Number of contigs accounting for 99% of the observed assembly. 
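-
-To make the N50 and 99% definitions above concrete, here is a small worked
-example. It is not part of the tool; it uses the common convention that N50
-is reached when the running sum first covers at least half the total, whereas
-the code above uses a strict greater-than at the boundary::
-
-    # Toy assembly: contigs of 400, 300, 200 and 100 bases (total 1000).
-    lengths = sorted([400, 300, 200, 100], reverse=True)
-    total = sum(lengths)
-
-    running = 0
-    for n50 in lengths:           # N50: walk down the sorted lengths
-        running += n50            # until half the assembly is covered
-        if 2 * running >= total:
-            break                 # 400 + 300 = 700 >= 500, so N50 = 300
-
-    running = 0
-    for rank, length in enumerate(lengths):
-        running += length         # 99%: contigs needed to reach 990 bases
-        if running >= 0.99 * total:
-            break
-    n99 = rank + 1                # here all 4 contigs are needed
-
-    print "N50 = %d, 99%% = %d" % (n50, n99)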
- - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/cov_model.py --- a/tools/ilmn_pacbio/cov_model.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,238 +0,0 @@ -#!/usr/bin/env python -from optparse import OptionParser, SUPPRESS_HELP -import os, random, quake - -############################################################ -# cov_model.py -# -# Given a file of kmer counts, reports the cutoff to use -# to separate trusted/untrusted kmers. -############################################################ - -############################################################ -# main -############################################################ -def main(): - usage = 'usage: %prog [options] ' - parser = OptionParser(usage) - parser.add_option('--int', dest='count_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]') - parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff [default: %default]') - parser.add_option('--no_sample', dest='no_sample', action='store_true', default=False, help='Do not sample kmer coverages into kmers.txt because its already done [default: %default]') - # help='Model kmer coverage as a function of GC content of kmers [default: %default]' - parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP) - (options, args) = parser.parse_args() - - if len(args) != 1: - parser.error('Must provide kmers counts file') - else: - ctsf = args[0] - - if options.count_kmers: - model_cutoff(ctsf, options.ratio) - print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip() - - else: - if options.model_gc: - model_q_gc_cutoffs(ctsf, 25000, options.ratio) - else: - model_q_cutoff(ctsf, 50000, options.ratio, options.no_sample) - print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip() - - -############################################################ -# model_cutoff -# -# Make a histogram of kmers to give to R to learn the cutoff -############################################################ -def model_cutoff(ctsf, ratio): - # make kmer histogram - cov_max = 0 - for line in open(ctsf): - cov = int(line.split()[1]) - if cov > cov_max: - cov_max = cov - - kmer_hist = [0]*cov_max - for line in open(ctsf): - cov = int(line.split()[1]) - kmer_hist[cov-1] += 1 - - cov_out = open('kmers.hist', 'w') - for cov in range(0,cov_max): - if kmer_hist[cov]: - print >> cov_out, '%d\t%d' % (cov+1,kmer_hist[cov]) - cov_out.close() - - os.system('R --slave --args %d < %s/cov_model.r 2> r.log' % (ratio,quake.quake_dir)) - - -############################################################ -# model_q_cutoff -# -# Sample kmers to give to R to learn the cutoff -# 'div100' is necessary when the number of kmers is too -# large for random.sample, so we only consider every 100th -# kmer. 
-############################################################ -def model_q_cutoff(ctsf, sample, ratio, no_sample=False): - if not no_sample: - # count number of kmer coverages - num_covs = 0 - for line in open(ctsf): - num_covs += 1 - - # choose random kmer coverages - div100 = False - if sample >= num_covs: - rand_covs = range(num_covs) - else: - if num_covs > 1000000000: - div100 = True - rand_covs = random.sample(xrange(num_covs/100), sample) - else: - rand_covs = random.sample(xrange(num_covs), sample) - rand_covs.sort() - - # print to file - out = open('kmers.txt', 'w') - kmer_i = 0 - rand_i = 0 - for line in open(ctsf): - if div100: - if kmer_i % 100 == 0 and kmer_i/100 == rand_covs[rand_i]: - print >> out, line.split()[1] - rand_i += 1 - if rand_i >= sample: - break - else: - if kmer_i == rand_covs[rand_i]: - print >> out, line.split()[1] - rand_i += 1 - if rand_i >= sample: - break - kmer_i += 1 - out.close() - - os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r.log' % (ratio,quake.quake_dir)) - - -############################################################ -# model_q_gc_cutoffs -# -# Sample kmers to give to R to learn the cutoff for each -# GC value -############################################################ -def model_q_gc_cutoffs(ctsf, sample, ratio): - # count number of kmer coverages at each at - k = len(open(ctsf).readline().split()[0]) - num_covs_at = [0]*(k+1) - for line in open(ctsf): - kmer = line.split()[0] - num_covs_at[count_at(kmer)] += 1 - - # for each AT bin - at_cutoffs = [] - for at in range(1,k): - # sample covs - if sample >= num_covs_at[at]: - rand_covs = range(num_covs_at[at]) - else: - rand_covs = random.sample(xrange(num_covs_at[at]), sample) - rand_covs.sort() - - # print to file - out = open('kmers.txt', 'w') - kmer_i = 0 - rand_i = 0 - for line in open(ctsf): - (kmer,cov) = line.split() - if count_at(kmer) == at: - if kmer_i == rand_covs[rand_i]: - print >> out, cov - rand_i += 1 - if rand_i >= sample: - break - kmer_i += 1 - out.close() - - os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at)) - - at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) - if at in [1,k-1]: # setting extremes to next closests - at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) - - os.system('mv kmers.txt kmers.at%d.txt' % at) - os.system('mv cutoff.txt cutoff.at%d.txt' % at) - - out = open('cutoffs.gc.txt','w') - print >> out, '\n'.join(at_cutoffs) - out.close() - - -############################################################ -# model_q_gc_cutoffs_bigmem -# -# Sample kmers to give to R to learn the cutoff for each -# GC value -############################################################ -def model_q_gc_cutoffs_bigmem(ctsf, sample, ratio): - # input coverages - k = 0 - for line in open(ctsf): - (kmer,cov) = line.split() - if k == 0: - k = len(kmer) - at_covs = ['']*(k+1) - else: - at = count_at(kmer) - if at_covs[at]: - at_covs[at].append(cov) - else: - at_covs[at] = [cov] - - for at in range(1,k): - print '%d %d' % (at,len(at_covs[at])) - - # for each AT bin - at_cutoffs = [] - for at in range(1,k): - # sample covs - if sample >= len(at_covs[at]): - rand_covs = at_covs[at] - else: - rand_covs = random.sample(at_covs[at], sample) - - # print to file - out = open('kmers.txt', 'w') - for rc in rand_covs: - print >> out, rc - out.close() - - os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at)) - - at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) - if at in 
[1,k-1]: # setting extremes to next closests - at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) - - os.system('mv kmers.txt kmers.at%d.txt' % at) - os.system('mv cutoff.txt cutoff.at%d.txt' % at) - - out = open('cutoffs.gc.txt','w') - print >> out, '\n'.join(at_cutoffs) - out.close() - - -############################################################ -# count_at -# -# Count A's and T's in the given sequence -############################################################ -def count_at(seq): - return len([nt for nt in seq if nt in ['A','T']]) - - -############################################################ -# __main__ -############################################################ -if __name__ == '__main__': - main() diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/quake.py --- a/tools/ilmn_pacbio/quake.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,136 +0,0 @@ -#!/usr/bin/env python -from optparse import OptionParser, SUPPRESS_HELP -import os, random, sys -import cov_model - -############################################################ -# quake.py -# -# Launch pipeline to correct errors in Illumina sequencing -# reads. -############################################################ - -#r_dir = '/nfshomes/dakelley/research/error_correction/bin' -quake_dir = os.path.abspath(os.path.dirname(sys.argv[0])) - -############################################################ -# main -############################################################ -def main(): - usage = 'usage: %prog [options]' - parser = OptionParser(usage) - parser.add_option('-r', dest='readsf', help='Fastq file of reads') - parser.add_option('-f', dest='reads_listf', help='File containing fastq file names, one per line or two per line for paired end reads.') - parser.add_option('-k', dest='k', type='int', help='Size of k-mers to correct') - parser.add_option('-p', dest='proc', type='int', default=4, help='Number of processes [default: %default]') - parser.add_option('-q', dest='quality_scale', type='int', default=-1, help='Quality value ascii scale, generally 64 or 33. If not specified, it will guess.') - parser.add_option('--no_count', dest='no_count', action='store_true', default=False, help='Kmers are already counted and in expected file [reads file].qcts or [reads file].cts [default: %default]') - parser.add_option('--no_cut', dest='no_cut', action='store_true', default=False, help='Coverage model is optimized and cutoff was printed to expected file cutoff.txt [default: %default]') - parser.add_option('--int', dest='counted_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]') - parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff. Generally set between 10-1000 with lower numbers suggesting a lower threshold. [default: %default]') - # help='Model kmer coverage as a function of GC content of kmers [default: %default]' - parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP) - parser.add_option('--headers', action='store_true', default=False, help='Output original read headers (i.e. 
pass --headers to correct)' ) - (options, args) = parser.parse_args() - - if not options.readsf and not options.reads_listf: - parser.error('Must provide fastq file of reads with -r or file with list of fastq files of reads with -f') - if not options.k: - parser.error('Must provide k-mer size with -k') - if options.quality_scale == -1: - options.quality_scale = guess_quality_scale(options.readsf, options.reads_listf) - - if options.counted_kmers: - cts_suf = 'cts' - else: - cts_suf = 'qcts' - if options.readsf: - ctsf = '%s.%s' % (os.path.splitext( os.path.split(options.readsf)[1] )[0], cts_suf) - reads_str = '-r %s' % options.readsf - else: - ctsf = '%s.%s' % (os.path.split(options.reads_listf)[1], cts_suf) - reads_str = '-f %s' % options.reads_listf - - if not options.no_count and not options.no_cut: - count_kmers(options.readsf, options.reads_listf, options.k, ctsf, options.quality_scale) - - if not options.no_cut: - # model coverage - if options.counted_kmers: - cov_model.model_cutoff(ctsf, options.ratio) - else: - if options.model_gc: - cov_model.model_q_gc_cutoffs(ctsf, 10000, options.ratio) - else: - cov_model.model_q_cutoff(ctsf, 25000, options.ratio) - - - if options.model_gc: - # run correct C++ code - os.system('%s/correct %s -k %d -m %s -a cutoffs.gc.txt -p %d -q %d' % (quake_dir,reads_str, options.k, ctsf, options.proc, options.quality_scale)) - - else: - cutoff = open('cutoff.txt').readline().rstrip() - - # run correct C++ code - headers = '--headers' if options.headers else '' - os.system('%s/correct %s %s -k %d -m %s -c %s -p %d -q %d' % (quake_dir,headers, reads_str, options.k, ctsf, cutoff, options.proc, options.quality_scale)) - - -################################################################################ -# guess_quality_scale -# Guess at ascii scale of quality values by examining -# a bunch of reads and looking for quality values < 64, -# in which case we set it to 33. 
-################################################################################ -def guess_quality_scale(readsf, reads_listf): - reads_to_check = 1000 - if not readsf: - readsf = open(reads_listf).readline().split()[0] - - fqf = open(readsf) - reads_checked = 0 - header = fqf.readline() - while header and reads_checked < reads_to_check: - seq = fqf.readline() - mid = fqf.readline() - qual = fqf.readline().rstrip() - reads_checked += 1 - for q in qual: - if ord(q) < 64: - print 'Guessing quality values are on ascii 33 scale' - return 33 - header = fqf.readline() - - print 'Guessing quality values are on ascii 64 scale' - return 64 - - - -############################################################ -# count_kmers -# -# Count kmers in the reads file using AMOS count-kmers or -# count-qmers -############################################################ -def count_kmers(readsf, reads_listf, k, ctsf, quality_scale): - # find files - fq_files = [] - if readsf: - fq_files.append(readsf) - else: - for line in open(reads_listf): - for fqf in line.split(): - fq_files.append(fqf) - - if ctsf[-4:] == 'qcts': - os.system('cat %s | %s/count-qmers -k %d -q %d > %s' % (' '.join(fq_files), quake_dir, k, quality_scale, ctsf)) - else: - os.system('cat %s | %s/count-kmers -k %d > %s' % (' '.join(fq_files), quake_dir, k, ctsf)) - - -############################################################ -# __main__ -############################################################ -if __name__ == '__main__': - main() diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/quake.xml --- a/tools/ilmn_pacbio/quake.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ - - Quality-aware error correction - - quake_wrapper.py --default_cutoff=10 --headers -k $k -f $fofnfile -p 12 > $output1 - - - - - - - -${input1.file_name} - - - - - - - -**What it does** - -Applies the Quake_ algorithm for quality-aware correction of -substitution error in short reads. - -Kelley DR, Schatz MC, Salzberg SL. -"Quake: quality-aware detection and correction of sequencing errors." -*Genome Biol.* 2010;11(11):R116. - -.. _Quake: http://www.cbcb.umd.edu/software/quake - -**Parameter list** - -k - k-mer size for detecting spurious k-mers versus true k-mers from - the genome. Recommendations for choosing a value of k can be found - here_. - -.. _here: http://www.cbcb.umd.edu/software/quake/faq.html - -**Output** - -A FASTQ file of corrected and trimmed reads. - - diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/quake_pe.xml --- a/tools/ilmn_pacbio/quake_pe.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ - - Quality-aware error correction for paired-end reads - - quake_wrapper.py --default_cutoff=$cutoff --headers -k $k -f $fofnfile -p 12 --output=$output1,$output2 - - - - - - - - - ${input1.file_name} ${input2.file_name} - - - - - - - - -**What it does** - -Applies the Quake_ algorithm for quality-aware correction of -substitution error in short reads. This form of the tool is customized -for correcting paired-end reads. - -Kelley DR, Schatz MC, Salzberg SL. -"Quake: quality-aware detection and correction of sequencing errors." -*Genome Biol.* 2010;11(11):R116. - -.. _Quake: http://www.cbcb.umd.edu/software/quake - -**Parameter list** - -K-mer size - k-mer size for detecting spurious k-mers versus true k-mers from - the genome. Recommendations for choosing a value of k can be found - here_. 
- -Default coverage cutoff - If the appropriate coverage cutoff cannot be found, then Quake can be - forced to proceed anyway with the supplied cutoff. In this case, - the optimal cutoff can be estimated by examining - the k-mer coverage histogram by eye. - -.. _here: http://www.cbcb.umd.edu/software/quake/faq.html - -**Output** - -A FASTQ file of corrected and trimmed reads. - - diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/quake_wrapper.py --- a/tools/ilmn_pacbio/quake_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,132 +0,0 @@ -#!/usr/bin/python -# -# Copyright (c) 2011, Pacific Biosciences of California, Inc. -# -# All rights reserved. -# -#Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -# * Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. -# -#THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY -#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
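The wrapper below implements the "Default coverage cutoff" behavior described above: when Quake fails while determining a cutoff, the supplied value is written to cutoff.txt and Quake is re-run with counting and cutoff optimization disabled. A minimal sketch of that retry pattern in modern Python (the function name and arguments are illustrative, not the wrapper's actual interface)::

    import subprocess

    def run_with_default_cutoff(quake_args, default_cutoff):
        # First attempt: let Quake determine the coverage cutoff itself.
        result = subprocess.run(['quake.py'] + quake_args,
                                capture_output=True, text=True)
        if result.returncode == 0:
            return result
        # If cutoff determination failed, force the supplied cutoff and
        # re-run, skipping k-mer counting and cutoff optimization.
        if 'cutoff.txt' in result.stdout + result.stderr and default_cutoff > 0:
            with open('cutoff.txt', 'w') as out:
                out.write('%d\n' % default_cutoff)
            return subprocess.run(['quake.py', '--no_count', '--no_cut'] + quake_args,
                                  capture_output=True, text=True)
        return result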
-# -import sys -import os -import subprocess - -QUAKE_EXE = os.path.join( os.path.dirname(os.path.abspath(sys.argv[0])), 'quake.py' ) -cmdLine = sys.argv -cmdLine.pop(0) - -# -# horribly not robust, but it was a pain to rewrite everything with -# optparse -# -j = -1 -cut = 0 -for i,arg in enumerate(cmdLine): - if '--default_cutoff' in arg: - j = i - cut = int(arg.split('=')[1]) -if j>=0: - cmdLine = cmdLine[:j] + cmdLine[j+1:] - -j = -1 -output='' -for i,arg in enumerate(cmdLine): - if '--output' in arg: - j = i - output = arg.split('=')[1] -if j>=0: - cmdLine = cmdLine[:j] + cmdLine[j+1:] - -def backticks( cmd, merge_stderr=True ): - """ - Simulates the perl backticks (``) command with error-handling support - Returns ( command output as sequence of strings, error code, error message ) - """ - if merge_stderr: - _stderr = subprocess.STDOUT - else: - _stderr = subprocess.PIPE - - p = subprocess.Popen( cmd, shell=True, stdin=subprocess.PIPE, - stdout=subprocess.PIPE, stderr=_stderr, - close_fds=True ) - - out = [ l[:-1] for l in p.stdout.readlines() ] - - p.stdout.close() - if not merge_stderr: - p.stderr.close() - - # need to allow process to terminate - p.wait() - - errCode = p.returncode and p.returncode or 0 - if p.returncode>0: - errorMessage = os.linesep.join(out) - output = [] - else: - errorMessage = '' - output = out - - return output, errCode, errorMessage - -def to_stdout(): - def toCorFastq(f): - stem, ext = os.path.splitext( os.path.basename(f) ) - dir = os.path.dirname(f) - corFastq = os.path.join(dir,'%s.cor%s' % (stem,ext) ) - if not os.path.exists(corFastq): - print >>sys.stderr, "Can't find path %s" % corFastq - sys.exit(1) - return corFastq - if '-r' in cmdLine: - fastqFile = cmdLine[ cmdLine.index('-r')+1 ] - corFastq = toCorFastq(fastqFile) - infile = open( corFastq, 'r' ) - for line in infile: - sys.stdout.write( line ) - infile.close() - else: - fofnFile = cmdLine[ cmdLine.index('-f')+1 ] - infile = open(fofnFile,'r') - for line in infile: - line = line.strip() - if len(line)>0: - fastqFiles = line.split() - break - infile.close() - outs = output.split(',') - for o,f in zip(outs,fastqFiles): - cf = toCorFastq(f) - os.system( 'cp %s %s' % ( cf, o ) ) - -def run(): - cmd = '%s %s' % ( QUAKE_EXE, " ".join(cmdLine) ) - output, errCode, errMsg = backticks( cmd ) - - if errCode==0: - to_stdout() - else: - # if Quake exits with an error in cutoff determination we - # can force correction if requested - if 'cutoff.txt' in errMsg and cut>0: - outfile = open( 'cutoff.txt', 'w' ) - print >>outfile, str(cut) - outfile.close() - cmd = '%s --no_count --no_cut %s' % ( QUAKE_EXE, " ".join(cmdLine) ) - output, errCode, errMsg = backticks( cmd ) - if errCode==0: - to_stdout() - else: - print >>sys.stderr, errMsg - sys.exit(1) - -if __name__=='__main__': run() diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/smrtpipe.py --- a/tools/ilmn_pacbio/smrtpipe.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -#!/usr/bin/env python -# EASY-INSTALL-SCRIPT: 'pbpy==0.1','smrtpipe.py' -__requires__ = 'pbpy==0.1' -import pkg_resources -pkg_resources.run_script('pbpy==0.1', 'smrtpipe.py') diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/smrtpipe_filter.xml --- a/tools/ilmn_pacbio/smrtpipe_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,67 +0,0 @@ - - Produce filtered reads from a set of PacBio primary analysis outputs. 
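The backticks() helper in the wrapper above shells out, optionally merges stderr into stdout, and returns (output lines, error code, error message). A rough modern equivalent of that contract, shown only as a sketch rather than the shipped code::

    import subprocess

    def backticks(cmd, merge_stderr=True):
        # Returns (output lines, error code, error message), like the
        # helper above, using subprocess.run instead of Popen.
        stderr = subprocess.STDOUT if merge_stderr else subprocess.PIPE
        p = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
                           stderr=stderr, text=True)
        lines = p.stdout.splitlines()
        if p.returncode > 0:
            return [], p.returncode, '\n'.join(lines)
        return lines, 0, ''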
- - smrtpipe_galaxy.py --output=data/filtered_subreads.fasta --galaxy_output=${outfile} ${iniFile} - - - - - - - - - - - - - - - - - - - - - -[input] -#if $source.input_source=="history": -#for $l in open($source.input1.file_name,'r'): -$l -#end for -#else -#for $p in $source.inputFiles -${p.path} -#end for -#end if - -[S_Filter] -filters=MinRL=${minimum_readlength},MinReadScore=${minimum_readscore} - - - - - - - -**What it does** - -Filters PacBio bas.h5 files and produces a FASTA file of filtered subreads. - -In PacBio SMRT sequencing, the template format is a SMRTbell: a circular -molecule with adapters at two locations in the circle. The subreads are the -portions of the read between adapters. - -**Parameter list** - -Minimum readlength - Only keep reads from ZMWs that produced this many bases or more. - -Minimum read quality - Only keep reads with overall quality scores of this value or more. The read quality score is a *de novo* prediction of the accuracy of the read. - -**Output** - -FASTA file of filtered reads. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/smrtpipe_galaxy.py --- a/tools/ilmn_pacbio/smrtpipe_galaxy.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,265 +0,0 @@ -#!/usr/bin/python -import sys -import os -import subprocess -import optparse as op -import xml.etree.cElementTree as et - -TRACE=False -# -# Turn on tracing to dump out __input__.xml and __settings__.xml somewhere -# -#TRACE=True -#TRACE_PATH='/home/UNIXHOME/jsorenson' - -class SmrtpipeGalaxy: - """Wrapper for running smrtpipe under galaxy""" - def __init__( self, argv ): - self.__parseOptions( argv ) - - def __parseOptions( self, argv ): - usage = 'Usage: %prog [--help] [options] smrtpipe.ini' - parser = op.OptionParser( usage=usage, description=SmrtpipeGalaxy.__doc__ ) - parser.add_option( "--output", - help="Designate a file generated by smrtpipe as the expected output for galaxy" ) - parser.add_option( "--nproc", type="int", - help="Number of processes to use (-D NPROC)" ) - parser.add_option( "--galaxy_output", - help="File name provided by galaxy where output should be placed" ) - parser.add_option( "--dry_run", action="store_true", - help="Create auxiliary XML files and exit" ) - parser.add_option( "--dat_extension", - help="Soft link .dat files to have this extension (some pipelines require certain extensions)" ) - - parser.set_defaults( output=None, dry_run=False, galaxy_output=None, - dat_extension=None, nproc=0 ) - self.options, self.args = parser.parse_args( argv ) - - if len(self.args)!=2: - parser.error( 'Expected 1 argument' ) - - self.configFile = self.args[1] - - def __parseConfig( self ): - infile = open( self.configFile, 'r' ) - section = None - sections = [] - for line in infile: - l = line.strip() - if len(l)==0 or line.startswith('#'): - continue - if l.startswith('[') and l.endswith(']'): - section = section_factory( l[1:-1] ) - sections.append(section) - continue - if section is None: - continue - if '=' in l: - section.addParameterLine(l) - else: - section.addLine(l) - infile.close() - return sections - - def transferOutput( self ): - if not self.options.output or not self.options.galaxy_output: - return True, '' - if not os.path.exists(self.options.output): - return False, "Can't find file %s (job error?)" % self.options.output - os.system( 'cp %s %s' % (self.options.output, self.options.galaxy_output )) - return True, '' - - def run( self ): - if not os.path.exists( self.configFile ): - print >>sys.stderr, "Can't find config file %s" 
% self.configFile - return 1 - - sections = self.__parseConfig() - - if len(sections)==0: - print >>sys.stderr, "No sections found in %s" % self.configFile - return 1 - if sections[0].name != 'input': - print >>sys.stderr, "No [input] section found in %s" % self.configFile - return 1 - - INPUT_FILE = '__input__.xml' - SETTINGS_FILE = '__settings__.xml' - - sections[0].softLinkDats( self.options.dat_extension ) - inputXml = sections[0].makeXmlElement() - write_xml_to_file( INPUT_FILE, inputXml ) - if TRACE: - write_xml_to_file( os.path.join(TRACE_PATH,INPUT_FILE), inputXml ) - - settings = et.Element( 'smrtpipeSettings' ) - for s in sections[1:]: - s.makeXmlElement( settings ) - - write_xml_to_file( SETTINGS_FILE, settings ) - if TRACE: - write_xml_to_file( os.path.join(TRACE_PATH,SETTINGS_FILE), settings ) - - nproc = '-D NPROC=%d' % self.options.nproc if self.options.nproc>0 else '' - cmd = 'smrtpipe.py %s --params=%s xml:%s > smrtpipe.err 2>&1' % \ - ( nproc, SETTINGS_FILE, INPUT_FILE ) - - if self.options.dry_run: - print 'Command to run:' - print cmd - return 0 - - out, errCode, errMsg = backticks( cmd ) - if errCode!=0: - print >>sys.stderr, "error while running: %s" % cmd - print >>sys.stderr, errMsg - if os.path.exists('log/smrtpipe.log'): - print >>sys.stderr, 'Log:' - infile = open('log/smrtpipe.log','r') - for line in infile: sys.stderr.write(line) - infile.close() - return errCode - - success, errMsg = self.transferOutput() - if not success: - print >>sys.stderr, errMsg - return 1 - - return 0 - -def write_xml_to_file( fileName, root ): - outfile = open( fileName, 'w' ) - outfile.write( '\n' ) - outfile.write( et.tostring(root) + '\n' ) - outfile.close() - -def section_factory( name ): - if name=='input': - return InputSection(name) - else: - return Section(name) - -class Section: - def __init__( self, name ): - self._name = name - self._lines = [] - self._vars = {} - - @property - def name(self): - return self._name - - def addLine( self, line ): - self._lines.append(line) - - def addParameterLine( self, line ): - self.addLine(line) - i = line.find( '=' ) - key = line[:i].strip() - value = line[i+1:].strip() - self._vars[key] = value - - def makeXmlElement( self, settings ): - if self._name=='global': - root = et.SubElement( settings, "protocol", {'name':'generic'} ) - else: - root = et.SubElement( settings, "module", {'name':self._name} ) - for k,v in self._vars.iteritems(): - param = et.SubElement( root, 'param', {'name':k} ) - val = et.SubElement( param, 'value' ) - val.text = v - return None - - def __str__( self ): - "for debugging" - buffer = [ 'S { name=' ] - buffer.append(self._name) - buffer.append('; lines=%s' % ','.join(self._lines) ) - for k,v in self._vars.iteritems(): - buffer.append('; %s=%s' % (k,v) ) - buffer.append(' }') - return ''.join(buffer) - -class InputSection( Section ): - def __init__( self, name ): - Section.__init__(self,name) - - def softLinkDats( self, newExtension ): - if not newExtension: - return - newLines = [] - for l in self._lines: - if ':' in l: - protocol = l[:l.find(':')+1] - file = l[l.find(':')+1:] - else: - protocol = '' - file = l - if os.path.exists(file) and file.endswith('.dat'): - newFile = '%s.%s' % ( file, newExtension ) - if not os.path.exists(newFile): - os.system( 'ln -s %s %s' % ( file, newFile ) ) - newLines.append(protocol+newFile) - else: - newLines.append(l) - self._lines = newLines - - def makeXmlElement( self, parent=None ): - root = et.Element( "pacbioAnalysisInputs" ) - data = et.SubElement( root, 'dataReferences' )
- iRef = 0 - for l in self._lines: - def add(x,iRef): - if len(x)==0: return iRef - node = et.SubElement( data, 'url' ) - if ':' in x: - node.attrib[ 'ref' ] = x - else: - node.attrib[ 'ref' ] = 'run:0000000-%04d' % iRef - node2 = et.SubElement( node, 'location' ) - node2.text = x - return iRef+1 - if l.endswith('fofn') and os.path.exists(l): - infile = open(l,'r') - for j,line in enumerate(infile): iRef=add(line.strip(),iRef) - infile.close() - else: - iRef=add(l,iRef) - return root - -def backticks( cmd, merge_stderr=True ): - """ - Simulates the perl backticks (``) command with error-handling support - Returns ( command output as sequence of strings, error code, error message ) - """ - if merge_stderr: - _stderr = subprocess.STDOUT - else: - _stderr = subprocess.PIPE - - p = subprocess.Popen( cmd, shell=True, stdin=subprocess.PIPE, - stdout=subprocess.PIPE, stderr=_stderr, - close_fds=True ) - - out = [ l[:-1] for l in p.stdout.readlines() ] - - p.stdout.close() - if not merge_stderr: - p.stderr.close() - - # need to allow process to terminate - p.wait() - - errCode = p.returncode and p.returncode or 0 - if p.returncode>0: - errorMessage = os.linesep.join(out) - output = [] - else: - errorMessage = '' - output = out - - return output, errCode, errorMessage - -if __name__=='__main__': - app = SmrtpipeGalaxy( sys.argv ) - sys.exit( app.run() ) diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/smrtpipe_hybrid.xml --- a/tools/ilmn_pacbio/smrtpipe_hybrid.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ - - Assemble contigs from a set of contigs and PacBio reads. - - smrtpipe_galaxy.py --nproc=24 --dat_extension=fasta --output=data/scaffold.fasta --galaxy_output=${outfile} ${iniFile} - - - - - - - - - -[input] -assembled_contigs:${contigs} -file:${reads} - -[HybridAssembly] -instrumentModel=RS -cleanup=False -untangler=pacbio -#set $schedule2 = $schedule.replace('X',';') -paramSchedule=${schedule2} -dontFillin=False -longReadsAsStrobe=True -exactQueryIds=True -rm4Opts=-minMatch 7 -minFrac 0.1 -minPctIdentity 65 -bestn 10 -noSplitSubreads -numberProcesses=16 -cluster=False -minRepeatLength=100000 - - - - - - - -**What it does** - -The AHA assembly algorithm is an AMOS_-based pipeline -for finishing bacterial-sized -genomes using draft contigs and PacBio reads. - -.. _AMOS: http://sourceforge.net/apps/mediawiki/amos - -**Parameter list** - -Parameter schedule - The parameter schedule is a semicolon-delimited list of triples. Each triple represents an iteration of hybrid assembly (alignment/scaffolding/gap-filling). The three parameters for each iteration are the Z-score, number of reads required to define a link, and the minimum length of subreads used in links. - -**Output** - -FASTA file containing scaffolded and gap-filled contigs resulting from the -hybrid assembly.
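The Galaxy form passes the schedule with "X" in place of ";" (which is shell-unsafe), and the #set line in the config above converts it back. A small sketch of parsing such a schedule into (Z-score, reads per link, minimum subread length) triples; the separator between the elements of a triple is an assumption here, not taken from the tool::

    def parse_schedule(schedule):
        # Assumes ';'-separated triples with ','-separated elements,
        # e.g. "6,3,75;6,2,75" -> [(6.0, 3, 75), (6.0, 2, 75)].
        triples = []
        for item in schedule.split(';'):
            z_score, reads_per_link, min_subread_len = item.split(',')
            triples.append((float(z_score), int(reads_per_link),
                            int(min_subread_len)))
        return triples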
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/soap_denovo.xml --- a/tools/ilmn_pacbio/soap_denovo.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ - - Short-read de novo assembly - - - SOAPdenovo-127mer all -s ${soap_config} -o assembly -K ${k} -p 24 -d -D -R - - - - - - - - - - - - - - - - - - - - max_rd_len=105 -[LIB] -#if $inputs.read_type == "single" -q=${inputs.input1.file_name} -#else -avg_ins=${inputs.d} -asm_flags=3 -reverse_seq=0 -q1=${inputs.input1.file_name} -q2=${inputs.input2.file_name} -#end if - - - - - - - -**What it does** - -Runs SOAPdenovo_ to generate a genome assembly -using single-fragment or paired-end short reads. - -Li R, Zhu H, Ruan J, Qian W, Fang X, Shi Z, Li Y, Li S, Shan G, Kristiansen K, Li S, Yang H, Wang J, Wang J. -"De novo assembly of human genomes with massively parallel short read sequencing." -*Genome Res.* 2010 Feb;20(2):265-72. -.. _SOAPdenovo: http://soap.genomics.org.cn/soapdenovo.html - -**Parameter list** - -k - k-mer size for constructing the de Bruijn graph. The appropriate size of k is genome and data set dependent, but a good starting choice might be 75% of the read length. - -Insert size - For paired-end libraries, the expected insert size. - -**Output** - -assembly - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/indels/indel_analysis.py --- a/tools/indels/indel_analysis.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,227 +0,0 @@ -#!/usr/bin/env python - -""" -Given an input SAM file, provides analysis of the indels. - -usage: %prog [options] - -i, --input=i: The SAM file to analyze - -t, --threshold=t: The deletion frequency threshold - -I, --out_ins=I: The interval output file showing insertions - -D, --out_del=D: The interval output file showing deletions -""" - -import re, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -def add_to_mis_matches( mis_matches, pos, bases ): - """ - Adds the bases and counts to the mis_matches dict - """ - for j, base in enumerate( bases ): - try: - mis_matches[ pos + j ][ base ] += 1 - except KeyError: - try: - mis_matches[ pos + j ][ base ] = 1 - except KeyError: - mis_matches[ pos + j ] = { base: 1 } - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - # prep output files - out_ins = open( options.out_ins, 'wb' ) - out_del = open( options.out_del, 'wb' ) - # patterns - pat = re.compile( '^((?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M)$|((?P<match_width>\d+)M)$' ) - pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) - # for tracking occurrences at each pos of ref - mis_matches = {} - indels = {} - multi_indel_lines = 0 - # go through all lines in input file - for i,line in enumerate( open( options.input, 'rb' ) ): - if line.strip() and not line.startswith( '#' ) and not line.startswith( '@' ) : - split_line = line.split( '\t' ) - chrom = split_line[2].strip() - pos = int( split_line[3].strip() ) - cigar = split_line[5].strip() - bases = split_line[9].strip() - # if not an indel or match, exit - if chrom == '*': - continue - # find matches like 3M2D7M or 7M3I10M - match = {} - m = pat.match( cigar ) - # unprocessable CIGAR - if not m: - m = pat_multi.match( cigar ) - # skip this line if no match - if not m: - continue - # account for multiple indels or operations we don't
process - else: - multi_indel_lines += 1 - # get matching parts for the indel or full match if matching - else: - if not mis_matches.has_key( chrom ): - mis_matches[ chrom ] = {} - indels[ chrom ] = { 'D': {}, 'I': {} } - parts = m.groupdict() - if parts[ 'match_width' ] or ( parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ] ): - match = parts - # see if matches meet filter requirements - if match: - # match/mismatch - if parts[ 'match_width' ]: - add_to_mis_matches( mis_matches[ chrom ], pos, bases ) - # indel - else: - # pieces of CIGAR string - left = int( match[ 'lmatch' ] ) - middle = int( match[ 'ins_del_width' ] ) - right = int( match[ 'rmatch' ] ) - left_bases = bases[ : left ] - if match[ 'ins_del' ] == 'I': - middle_bases = bases[ left : left + middle ] - else: - middle_bases = '' - right_bases = bases[ -right : ] - start = pos + left - # add data to ref_pos dict for match/mismatch bases on left and on right - add_to_mis_matches( mis_matches[ chrom ], pos, left_bases ) - if match[ 'ins_del' ] == 'I': - add_to_mis_matches( mis_matches[ chrom ], start, right_bases ) - else: - add_to_mis_matches( mis_matches[ chrom ], start + middle, right_bases ) - # for insertions, count instances of particular inserted bases - if match[ 'ins_del' ] == 'I': - if indels[ chrom ][ 'I' ].has_key( start ): - try: - indels[ chrom ][ 'I' ][ start ][ middle_bases ] += 1 - except KeyError: - indels[ chrom ][ 'I' ][ start ][ middle_bases ] = 1 - else: - indels[ chrom ][ 'I' ][ start ] = { middle_bases: 1 } - # for deletions, count number of deletions bases - else: - if indels[ chrom ][ 'D' ].has_key( start ): - try: - indels[ chrom ][ 'D' ][ start ][ middle ] += 1 - except KeyError: - indels[ chrom ][ 'D' ][ start ][ middle ] = 1 - else: - indels[ chrom ][ 'D' ][ start ] = { middle: 1 } - # compute deletion frequencies and insertion frequencies for checking against threshold - freqs = {} - ins_freqs = {} - chroms = mis_matches.keys() - chroms.sort() - for chrom in chroms: - freqs[ chrom ] = {} - ins_freqs[ chrom ] = {} - poses = mis_matches[ chrom ].keys() - poses.extend( indels[ chrom ][ 'D' ].keys() ) - poses.extend( indels[ chrom ][ 'I' ].keys() ) - poses = list( set( poses ) ) - for pos in poses: - # all reads touching this particular position - freqs[ chrom ][ pos ] = {} - sum_counts = 0.0 - sum_counts_end = 0.0 - # get basic counts (match/mismatch) - try: - sum_counts += float( sum( mis_matches[ chrom ][ pos ].values() ) ) - except KeyError: - pass - try: - sum_counts_end += float( sum( mis_matches[ chrom ][ pos + 1 ].values() ) ) - except KeyError: - pass - # add deletions also touching this position - try: - sum_counts += float( sum( indels[ chrom ][ 'D' ][ pos ].values() ) ) - except KeyError: - pass - try: - sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) ) - except KeyError: - pass - freqs[ chrom ][ pos ][ 'total' ] = sum_counts - # calculate actual frequencies - # deletions - # frequencies for deletions - try: - for d in indels[ chrom ][ 'D' ][ pos ].keys(): - freqs[ chrom ][ pos ][ d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts - except KeyError: - pass - # frequencies for matches/mismatches - try: - for base in mis_matches[ chrom ][ pos ].keys(): - try: - prop = float( mis_matches[ chrom ][ pos ][ base ] ) / sum_counts - freqs[ chrom ][ pos ][ base ] = prop - except ZeroDivisionError: - freqs[ chrom ][ pos ][ base ] = 0.0 - except KeyError: - pass - # insertions - try: - for bases in indels[ chrom ][ 'I' ][ pos ].keys(): - prop_start = 
indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts ) - try: - prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts_end ) - except ZeroDivisionError: - prop_end = 0.0 - try: - ins_freqs[ chrom ][ pos ][ bases ] = [ prop_start, prop_end ] - except KeyError: - ins_freqs[ chrom ][ pos ] = { bases: [ prop_start, prop_end ] } - except KeyError, e: - pass - # output to files if meet threshold requirement - threshold = float( options.threshold ) - #out_del.write( '#Chrom\tStart\tEnd\t#Del\t#Reads\t%TotReads\n' ) - #out_ins.write( '#Chrom\tStart\tEnd\tInsBases\t#Reads\t%TotReadsAtStart\t%ReadsAtEnd\n' ) - for chrom in chroms: - # deletions file - poses = indels[ chrom ][ 'D' ].keys() - poses.sort() - for pos in poses: - start = pos - dels = indels[ chrom ][ 'D' ][ start ].keys() - dels.sort() - for d in dels: - end = start + d - prop = freqs[ chrom ][ start ][ d ] - if prop > threshold : - out_del.write( '%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) ) - # insertions file - poses = indels[ chrom ][ 'I' ].keys() - poses.sort() - for pos in poses: - start = pos - end = pos + 1 - ins_bases = indels[ chrom ][ 'I' ][ start ].keys() - ins_bases.sort() - for bases in ins_bases: - prop_start = ins_freqs[ chrom ][ start ][ bases ][0] - prop_end = ins_freqs[ chrom ][ start ][ bases ][1] - if prop_start > threshold or prop_end > threshold: - out_ins.write( '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\n' % ( chrom, start, end, bases, indels[ chrom ][ 'I' ][ start ][ bases ], 100.0 * prop_start, 100.0 * prop_end ) ) - # close out files - out_del.close() - out_ins.close() - # if skipped lines because of more than one indel, output message - if multi_indel_lines > 0: - sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/indels/indel_analysis.xml --- a/tools/indels/indel_analysis.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,167 +0,0 @@ - - - - indel_analysis.py - --input=$input1 - --threshold=$threshold - --out_ins=$out_ins - --out_del=$out_del - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Given an input SAM file, this tool provides analysis of the indels. It filters out matches that do not meet the frequency threshold. The way this frequency of occurrence is calculated is different for deletions and insertions. The CIGAR string's "M" can indicate an exact match or a mismatch.
For SAM containing the following bits of information (assuming the reference "ACTGCTCGAT"):: - - CHROM POS CIGAR SEQ - ref 3 2M1I3M TACTTC - ref 1 2M1D3M ACGCT - ref 4 4M2I3M GTTCAAGAT - ref 2 2M2D3M CTCCG - ref 1 3M1D4M AACCTGG - ref 6 3M1I2M TTCAAT - ref 5 3M1I3M CTCTGTT - ref 7 4M CTAT - ref 5 5M CGCTA - ref 3 2M1D2M TGCC - -The following totals would be calculated (this is an intermediate step and not output):: - - ------------------------------------------------------------------------------------------------------- - POS BASE NUMREADS DELPROPCALC DELPROP INSPROPSTARTCALC INSSTARTPROP INSPROPENDCALC INSENDPROP - ------------------------------------------------------------------------------------------------------- - 1 A 2 2/2 1.00 --- --- --- --- - 2 A 1 1/3 0.33 --- --- --- --- - C 2 2/3 0.67 --- --- --- --- - 3 C 1 1/5 0.20 --- --- --- --- - T 3 3/5 0.60 --- --- --- --- - - 1 1/5 0.20 --- --- --- --- - 4 A 1 1/6 0.17 --- --- --- --- - G 3 3/6 0.50 --- --- --- --- - - 1 1/6 0.17 --- --- --- --- - -- 1 1/6 0.17 --- --- --- --- - 5 C 4 4/7 0.57 --- --- --- --- - T 2 2/7 0.29 --- --- --- --- - - 1 1/7 0.14 --- --- --- --- - +C 1 --- --- 1/7 0.14 1/9 0.11 - 6 C 2 2/9 0.22 --- --- --- --- - G 1 1/9 0.11 --- --- --- --- - T 6 6/9 0.67 --- --- --- --- - 7 C 7 7/9 0.78 --- --- --- --- - G 1 1/9 0.11 --- --- --- --- - T 1 1/9 0.11 --- --- --- --- - 8 C 1 1/7 0.14 --- --- --- --- - G 4 4/7 0.57 --- --- --- --- - T 2 2/7 0.29 --- --- --- --- - +T 1 --- --- 1/8 0.13 1/6 0.17 - +AA 1 --- --- 1/8 0.13 1/6 0.17 - 9 A 4 4/5 0.80 --- --- --- --- - T 1 1/5 0.20 --- --- --- --- - +A 1 --- --- 1/6 0.17 1/5 0.20 - 10 T 4 4/4 1.00 --- --- --- --- - -The general idea for calculating these is that we want to find out the proportion of times a particular event occurred at a position among all reads that touch that base in some way. First, the basic total number of reads at a given position is the number of reads with each particular base plus the number of reads with a deletion at that given position (including the bases that are "mismatches"). Note that deletions of two bases and one base would be counted completely separately. Insertions are not counted in this total. For position 4 above, the reference base is G, and there are 3 occurrences of it along with one mismatching base, A. Also, there is a 1-base deletion and another 2-base deletion. So there are a total of 6 matches/mismatches/deletions, and the proportions for each base are 1/6 = 0.17 (A) and 3/6 = 0.50 (G), and for each deletion it is 1/6 = 0.17. - -Insertions are slightly more complicated. We actually want to get the frequency of occurrence for both the associated start and end positions, since an insertion appears between those two bases. Each insertion is regarded individually, and the total number of occurrences of that insertion is divided by the sum of the number of its occurrences and the basic total for either the start or end. So for the insertions at position 8, there are a total of 7 matches/mismatches/deletions at position 8, and two insertions that each occur once, so each has an INSSTARTPROP of 1/8 = 0.13. For the end position there are 5 matches/mismatches/deletions, so the INSENDPROP is 1/6 = 0.17 for both insertions (T and AA). - -These proportions (DELPROP and either INSSTARTPROP or INSENDPROP) need to be greater than the threshold frequency specified by the user in order for that base, deletion or insertion to be included in the output.
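The arithmetic above can be restated compactly: a deletion's (or base's) proportion is its count divided by the basic total at that position, while an insertion's proportion adds its own count to the denominator, because insertions are excluded from the basic total. A small sketch with hypothetical helper names, not the tool's actual functions::

    def basic_total(base_counts, del_counts):
        # Matches/mismatches plus deletions at one position; insertions excluded.
        return sum(base_counts.values()) + sum(del_counts.values())

    def deletion_prop(count, base_counts, del_counts):
        return count / float(basic_total(base_counts, del_counts))

    def insertion_prop(ins_count, base_counts, del_counts):
        # The insertion's own count joins the denominator.
        total = basic_total(base_counts, del_counts)
        return ins_count / float(ins_count + total)

    # Position 8 of the table above: 7 matches/mismatches, no deletions,
    # one "+T" insertion -> 1 / (1 + 7) = 0.125, shown as 0.13.
    assert abs(insertion_prop(1, {'C': 1, 'G': 4, 'T': 2}, {}) - 0.125) < 1e-12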
- - -**Output format** - -The output varies for deletions and insertions, although for both, the first three columns are chromosome, start position, and end position. - -Columns in the deletions file:: - - Column Description - ----------------------------- --------------------------------------------------------------------------------------------------- - 1 Chrom Chromosome - 2 Start Starting position - 3 End Ending position - 4 Coverage Number of reads containing this exact deletion - 5 Frequency Percentage Frequency of this exact deletion (a 2-base and a 1-base deletion at the same position are counted separately, for instance), as percentage (%) - -Columns in the insertions file:: - - Column Description - ------------------------ ----------------------------------------------------------------------------------------------------------------- - 1 Chrom Chromosome - 2 Start Starting position - 3 End Ending position (always Start + 1 for insertions) - 4 Inserted Base(s) The exact base(s) inserted at Start position - 5 Coverage Number of reads containing this exact insertion - 6 Freq. Perc. at Start Frequency of this exact insertion given Start position ("GG" and "G" are considered distinct), as percentage (%) - 7 Freq. Perc. at End Frequency of this exact insertion given End position ("GG" and "G" are considered distinct), as percentage (%) - -Before using this tool, you may want to use the Filter SAM for indels tool to filter out indels on bases with insufficient quality scores, but this is not required. - - ------ - -**Example** - -If you set the threshold to 0.0 and have the following SAM file:: - - r327 16 chrM 11 37 8M1D10M * 0 0 CTTACCAGATAGTCATCA -+<2;?@BA@?-,.+4=4 XT:A:U NM:i:1 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:41^C35 - r457 0 chr1 14 37 14M * 0 0 ACCTGACAGATATC =/DF;?@1A@?-,.
XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r501 16 chrM 6 23 7M1I13M * 0 0 TCTGTGCCTACCAGACATTCA +=$2;?@BA@?-,.+4=4=4A XT:A:U NM:i:3 X0:i:1 X1:i:1 XM:i:2 XO:i:1 XG:i:1 MD:Z:28C36G9 XA:Z:chrM,+134263658,14M1I61M,4; - r1288 16 chrM 8 37 11M1I7M * 0 0 TCACTTACCTGTACACACA /*F2;?@%A@?-,.+4=4= XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T0T1A69 - r1902 0 chr1 4 37 7M2D18M * 0 0 AGTCTCTTACCTGACGGTTATGA <2;?@BA@?-,.+4=4=4AA663 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r2204 16 chrM 9 0 19M * 0 0 CTGGTACCTGACAGGTATC 2;?@BA@?-,.+4=4=4AA XT:A:R NM:i:1 X0:i:2 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0T75 XA:Z:chrM,-564927,76M,1; - r2314 16 chrM 6 37 10M2D8M * 0 0 TCACTCTTACGTCTGA <2;?@BA@?-,.+4=4 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:25A5^CA45 - r3001 0 chrM 13 37 3M1D5M2I7M * 0 0 TACAGTCACCCTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r3218 0 chr1 13 37 8M1D7M * 0 0 TACAGTCACTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r4767 16 chr2 3 37 15M2I7M * 0 0 CAGACTCTCTTACCAAAGACAGAC <2;?@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T1A4T65 - r5333 0 chrM 5 37 17M1D8M * 0 0 GTCTCTCATACCAGACAACGGCAT FB3$@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:45C10^C0C5C13 - r6690 16 chrM 7 23 20M * 0 0 CTCTCTTACCAGACAGACAT 2;?@BA/(@?-,.+4=4=4A XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 XA:Z:chrM,-568532,76M,1; - r7211 0 chrM 7 37 24M * 0 0 CGACAGAGACAAAATAACATTTAA //<2;?@BA@?-,.+4=442;;6: XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:2 XO:i:1 XG:i:1 MD:Z:73G0G0 - r9922 16 chrM 4 0 7M3I9M * 0 0 CCAGACATTTGAAATCAGG F/D4=44^D++26632;;6 XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r9987 16 chrM 4 0 9M1I18M * 0 0 AGGTTCTCATTACCTGACACTCATCTTG G/AD6"/+4=4426632;;6:<2;?@BA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r10145 16 chr1 16 0 5M2D7M * 0 0 CACATTGTTGTA G//+4=44=4AA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r10324 16 chrM 15 0 6M1D5M * 0 0 CCGTTCTACTTG A@??8.G//+4= XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r12331 16 chrM 17 0 4M2I6M * 0 0 AGTCGAATACGTG 632;;6:<2;?@B XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r12914 16 chr2 24 0 4M3I3M * 0 0 ACTACCCCAA G//+4=42,. XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - -The following will be produced (deletions file followed by insertions file):: - - chr1 11 13 1 100.00 - chr1 21 22 1 25.00 - chr1 21 23 1 25.00 - chrM 16 18 1 9.09 - chrM 19 20 1 8.33 - chrM 21 22 1 9.09 - chrM 22 23 1 9.09 - - chr2 18 19 AA 1 50.00 50.00 - chr2 28 29 CCC 1 50.00 50.00 - chrM 11 12 TTT 1 9.09 9.09 - chrM 13 14 C 1 9.09 9.09 - chrM 13 14 T 1 9.09 9.09 - chrM 19 20 T 1 7.69 8.33 - chrM 21 22 GA 1 8.33 8.33 - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/indels/indel_sam2interval.py --- a/tools/indels/indel_sam2interval.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,161 +0,0 @@ -#!/usr/bin/env python - -""" -Allows user to filter out non-indels from SAM. 
- -usage: %prog [options] - -i, --input=i: The input SAM file - -u, --include_base=u: Whether or not to include the base for insertions - -c, --collapse=c: Whether to collapse multiple occurrences of a location with counts shown - -o, --int_out=o: The interval output file for the converted SAM file - -b, --bed_ins_out=b: The bed output file with insertions only for the converted SAM file - -d, --bed_del_out=d: The bed output file with deletions only for the converted SAM file -""" - -import re, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -def numeric_sort( text1, text2 ): - """ - For two items containing space-separated text, compares equivalent pieces - numerically if both numeric or as text otherwise - """ - pieces1 = text1.split() - pieces2 = text2.split() - if len( pieces1 ) == 0: - return 1 - if len( pieces2 ) == 0: - return -1 - for i, pc1 in enumerate( pieces1 ): - if i == len( pieces2 ): - return 1 - if not pieces2[i].isdigit(): - if pc1.isdigit(): - return -1 - else: - if pc1 > pieces2[i]: - return 1 - elif pc1 < pieces2[i]: - return -1 - else: - if not pc1.isdigit(): - return 1 - else: - if int( pc1 ) > int( pieces2[i] ): - return 1 - elif int( pc1 ) < int( pieces2[i] ): - return -1 - if i < len( pieces2 ) - 1: - return -1 - return 0 - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - - # open up output files - output = open( options.int_out, 'wb' ) - if options.bed_ins_out != 'None': - output_bed_ins = open( options.bed_ins_out, 'wb' ) - else: - output_bed_ins = None - if options.bed_del_out != 'None': - output_bed_del = open( options.bed_del_out, 'wb' ) - else: - output_bed_del = None - - # the pattern to match, assuming just one indel per cigar string - pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' ) - pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) - - # go through all lines in input file - out_data = {} - multi_indel_lines = 0 - for line in open( options.input, 'rb' ): - if line and not line.startswith( '#' ) and not line.startswith( '@' ) : - split_line = line.split( '\t' ) - if len( split_line ) < 12: - continue - # grab relevant pieces - cigar = split_line[5].strip() - pos = int( split_line[3] ) - chr = split_line[2] - base_string = split_line[9] - # parse cigar string - m = pat_indel.match( cigar ) - if not m: - m = pat_multi.match( cigar ) - # skip this line if no match - if not m: - continue - # account for multiple indels or operations we don't process - else: - multi_indel_lines += 1 - continue - else: - match = m.groupdict() - left = int( match[ 'lmatch' ] ) - middle = int( match[ 'ins_del_width' ] ) - middle_type = match[ 'ins_del' ] - bases = base_string[ left : left + middle ] - # calculate start and end positions, and output to insertion or deletion file - start = left + pos - if middle_type == 'D': - end = start + middle - data = [ chr, start, end, 'D' ] - if options.include_base == "true": - data.append( '-' ) - else: - end = start + 1 - data = [ chr, start, end, 'I' ] - if options.include_base == "true": - data.append( bases ) - location = '\t'.join( [ '%s' % d for d in data ] ) - try: - out_data[ location ] += 1 - except KeyError: - out_data[ location ] = 1 - # output to interval file - # get all locations and sort - locations = out_data.keys() - locations.sort( numeric_sort ) - last_line = '' - # output each location, either with
counts or each occurrence - for loc in locations: - sp_loc = loc.split( '\t' ) - cur_line = '\t'.join( sp_loc[:3] ) - if options.collapse == 'true': - output.write( '%s\t%s\n' % ( loc, out_data[ loc ] ) ) - if output_bed_del and sp_loc[3] == 'D': - output_bed_del.write( '%s\n' % cur_line ) - if output_bed_ins and sp_loc[3] == 'I' and last_line != cur_line: - output_bed_ins.write( '%s\n' % cur_line ) - last_line = cur_line - else: - for i in range( out_data[ loc ] ): - output.write( '%s\n' % loc ) - if output_bed_del or output_bed_ins: - if output_bed_del and sp_loc[3] == 'D': - output_bed_del.write( '%s\n' % cur_line ) - if output_bed_ins and sp_loc[3] == 'I': - output_bed_ins.write( '%s\n' % cur_line ) - - # cleanup, close files - if output_bed_ins: - output_bed_ins.close() - if output_bed_del: - output_bed_del.close() - output.close() - - # if skipped lines because of more than one indel, output message - if multi_indel_lines > 0: - sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/indels/indel_sam2interval.xml --- a/tools/indels/indel_sam2interval.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ - - from SAM - - indel_sam2interval.py - --input=$input1 - --include_base=$include_base - --collapse=$collapse - --int_out=$output1 - #if $ins_out.include_ins_out == "true" - --bed_ins_out=$output2 - #else - --bed_ins_out="None" - #end if - #if $del_out.include_del_out == "true" - --bed_del_out=$output3 - #else - --bed_del_out="None" - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - ins_out[ "include_ins_out" ] == "true" - - - del_out[ "include_del_out" ] == "true" - - - - - - - - - - - - - - - - -**What it does** - -Given a SAM file containing indels, converts these to an interval file with a column indicating whether it is an insertion or a deletion, and then also can create a BED file for each type (one for insertions, one for deletions). The interval file can be combined with other like files to create a table useful for analysis with the Indel Analysis Table tool. The BED files can be useful for visualizing the reads. - ------ - -**Example** - -Suppose you have the following mapping results:: - - r327 16 chrM 11 37 8M1D10M * 0 0 CTTACCAGATAGTCATCA -+<2;?@BA@?-,.+4=4 XT:A:U NM:i:1 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:41^C35 - r457 0 chr1 14 37 14M * 0 0 ACCTGACAGATATC =/DF;?@1A@?-,. 
XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r501 16 chrM 6 23 7M1I13M * 0 0 TCTGTGCCTACCAGACATTCA +=$2;?@BA@?-,.+4=4=4A XT:A:U NM:i:3 X0:i:1 X1:i:1 XM:i:2 XO:i:1 XG:i:1 MD:Z:28C36G9 XA:Z:chrM,+134263658,14M1I61M,4; - r1288 16 chrM 8 37 11M1I7M * 0 0 TCACTTACCTGTACACACA /*F2;?@%A@?-,.+4=4= XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T0T1A69 - r1902 0 chr1 4 37 7M2D18M * 0 0 AGTCTCTTACCTGACGGTTATGA <2;?@BA@?-,.+4=4=4AA663 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r2204 16 chrM 9 0 19M * 0 0 CTGGTACCTGACAGGTATC 2;?@BA@?-,.+4=4=4AA XT:A:R NM:i:1 X0:i:2 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0T75 XA:Z:chrM,-564927,76M,1; - r2314 16 chrM 6 37 10M2D8M * 0 0 TCACTCTTACGTCTGA <2;?@BA@?-,.+4=4 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:25A5^CA45 - r3001 0 chrM 13 37 3M1D5M2I7M * 0 0 TACAGTCACCCTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r3218 0 chr1 13 37 8M1D7M * 0 0 TACAGTCACTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r4767 16 chr2 3 37 15M2I7M * 0 0 CAGACTCTCTTACCAAAGACAGAC <2;?@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T1A4T65 - r5333 0 chrM 5 37 17M1D8M * 0 0 GTCTCTCATACCAGACAACGGCAT FB3$@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:45C10^C0C5C13 - r6690 16 chrM 7 23 20M * 0 0 CTCTCTTACCAGACAGACAT 2;?@BA/(@?-,.+4=4=4A XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 XA:Z:chrM,-568532,76M,1; - r7211 0 chrM 7 37 24M * 0 0 CGACAGAGACAAAATAACATTTAA //<2;?@BA@?-,.+4=442;;6: XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:2 XO:i:1 XG:i:1 MD:Z:73G0G0 - r7899 69 * 0 0 * * 0 0 CTGCGTGTTGGTGTCTACTGGGGT #%#'##$#$##&%#%$$$%#%#'# - r9192 133 * 0 0 * * 0 0 GTGCGTCGGGGAGGGTGCTGTCGG ######%#$%#$$###($###&&% - r9922 16 chrM 4 0 7M3I9M * 0 0 CCAGACATTTGAAATCAGG F/D4=44^D++26632;;6 XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r9987 16 chrM 4 0 9M1I18M * 0 0 AGGTTCTCATTACCTGACACTCATCTTG G/AD6"/+4=4426632;;6:<2;?@BA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r10145 16 chr1 16 0 5M2D7M * 0 0 CACATTGTTGTA G//+4=44=4AA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r10324 16 chrM 15 0 6M1D5M * 0 0 CCGTTCTACTTG A@??8.G//+4= XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r12331 16 chrM 17 0 4M2I6M * 0 0 AGTCGAATACGTG 632;;6:<2;?@B XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r12914 16 chr2 24 0 4M3I3M * 0 0 ACTACCCCAA G//+4=42,. XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r13452 16 chrM 13 0 3M1D11M * 0 0 TACGTCACTCATCA IIIABCCCICCCCI XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - - -The following three files will be produced (Interval, Insertions BED and Deletions BED):: - - chr1 11 13 D - 1 - chr1 21 22 D - 1 - chr1 21 23 D - 1 - chr2 18 19 I AA 1 - chr2 28 29 I CCC 1 - chrM 11 12 I TTT 1 - chrM 13 14 I C 1 - chrM 13 14 I T 1 - chrM 16 17 D - 1 - chrM 16 18 D - 1 - chrM 19 20 D - 1 - chrM 19 20 I T 1 - chrM 21 22 D - 1 - chrM 21 22 I GA 1 - chrM 22 23 D - 1 - - chr2 18 19 - chr2 28 29 - chrM 11 12 - chrM 13 14 - chrM 13 14 - chrM 19 20 - chrM 21 22 - - chr1 11 13 - chr1 21 22 - chr1 21 23 - chrM 16 17 - chrM 16 18 - chrM 19 20 - chrM 21 22 - chrM 22 23 - -For more information on SAM, please consult the `SAM format description`__. - -.. 
__: http://www.ncbi.nlm.nih.gov/pubmed/19505943 - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/indels/indel_table.py --- a/tools/indels/indel_table.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,113 +0,0 @@ -#!/usr/bin/env python - -""" -Combines several interval files containing indels with counts. All input files need to have the same number of columns. - -usage: %prog [options] [input3 sum3[ input4 sum4[ input5 sum5[...]]]] - -1, --input1=1: The first input file - -s, --sum1=s: Whether or not to include the totals from first file in overall total - -2, --input2=2: The second input file - -S, --sum2=S: Whether or not to include the totals from second file in overall total - -o, --output=o: The interval output file for the combined files -""" - -import re, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -def numeric_sort( text1, text2 ): - """ - For two items containing space-separated text, compares equivalent pieces - numerically if both numeric or as text otherwise - """ - pieces1 = text1.split() - pieces2 = text2.split() - if len( pieces1 ) == 0: - return 1 - if len( pieces2 ) == 0: - return -1 - for i, pc1 in enumerate( pieces1 ): - if i == len( pieces2 ): - return 1 - if not pieces2[i].isdigit(): - if pc1.isdigit(): - return -1 - else: - if pc1 > pieces2[i]: - return 1 - elif pc1 < pieces2[i]: - return -1 - else: - if not pc1.isdigit(): - return 1 - else: - if int( pc1 ) > int( pieces2[i] ): - return 1 - elif int( pc1 ) < int( pieces2[i] ): - return -1 - if i < len( pieces2 ) - 1: - return -1 - return 0 - -def __main__(): - # Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - inputs = [ options.input1, options.input2 ] - includes = [ options.sum1, options.sum2 ] - inputs.extend( [ a for i, a in enumerate( args ) if i % 2 == 0 ] ) - includes.extend( [ a for i, a in enumerate( args ) if i % 2 == 1 ] ) - num_cols = 0 - counts = {} - # read in data from all files and get total counts - try: - for i, input in enumerate( inputs ): - for line in open( input, 'rb' ): - sp_line = line.strip().split( '\t' ) - # set num_cols on first pass - if num_cols == 0: - if len( sp_line ) < 4: - raise Exception, 'There need to be at least 4 columns in the file: Chrom, Start, End, and Count' - num_cols = len( sp_line ) - # deal with differing number of columns - elif len( sp_line ) != num_cols: - raise Exception, 'All of the files need to have the same number of columns (current %s != %s of first line)' % ( len( sp_line ), num_cols ) - # get actual counts for each indel - indel = '\t'.join( sp_line[:-1] ) - try: - count = int( sp_line[-1] ) - except ValueError, e: - raise Exception, 'The last column of each file must be numeric, with the count of the number of instances of that indel: %s' % str( e ) - # total across all included files - if includes[i] == "true": - try: - counts[ indel ]['tot'] += count - except ( IndexError, KeyError ): - counts[ indel ] = { 'tot': count } - # counts for ith file - counts[ indel ][i] = count - except Exception, e: - stop_err( 'Failed to read all input files:\n%s' % str( e ) ) - # output combined results to table file - try: - output = open( options.output, 'wb' ) - count_keys = counts.keys() - count_keys.sort( numeric_sort ) - for indel in count_keys: - count_out = [ str( counts[ indel ][ 'tot' ] ) ] - for i in range( len( inputs ) ): - try: - count_out.append( 
str( counts[ indel ][i] ) ) - except KeyError: - count_out.append( '0' ) - output.write( '%s\t%s\n' % ( indel, '\t'.join( count_out ) ) ) - output.close() - except Exception, e: - stop_err( 'Failed to output data: %s' % str( e ) ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/indels/indel_table.xml --- a/tools/indels/indel_table.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,122 +0,0 @@ - - for combining indel interval data - - indel_table.py - --input1=$input1 - --sum1=$sum1 - --input2=$input2 - --sum2=$sum2 - --output=$output1 - #for $i in $inputs - ${i.input} - ${i.sum} - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Creates a table allowing for analysis and comparison of indel data. Combines any number of interval files that have been produced by the tool that converts indel SAM data to interval format. Includes overall total counts for all or some files. The tool has the option to not include a given file's counts in the total column. This could be useful for combined data if the counts for certain indels might be included more than once. - -The exact columns of the output will depend on the columns of the input. Here is the detailed specification of the output columns:: - - Column Description - ------------------------------- ---------------------------------------------------------------------------------- - 1 ... m "Indel" All the "indel" columns, which contain the info that will be checked for equality - m + 1 Total Occurrences Total number of occurrences of this indel across all (included) files - m + 2 Occurrences for File 1 Number of occurrences of this indel for first file - m + 3 Occurrences for File 2 Number of occurrences of this indel for second file - [m + ...] [...] [Number of occurrences of this indel for ... file] - -The most likely columns would be from the output of the Convert SAM to Interval/BED tool, so: Chromosome, Start position, End position, I/D (Insertion/Deletion), -/<base(s)> (Deletion/Inserted base(s)), Total Occurrences (across files), Occurrences for File 1, Occurrences for File 2, etc. See below for an example. 
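One implementation note before the example: rows are ordered with the cmp-style numeric_sort comparator shown in the script above, which compares space-separated pieces numerically when both are digits and as text otherwise. Python 3 removed cmp-style sorting, so an equivalent key-based sketch (empty-string edge cases differ slightly; this is not the shipped code) is::

    def numeric_key(text):
        # Digit pieces compare numerically and sort before text pieces,
        # mirroring the numeric_sort comparator used by these tools.
        return [(0, int(p), '') if p.isdigit() else (1, 0, p)
                for p in text.split()]

    # locations.sort(key=numeric_key) keeps "chrM 99 ..." ahead of "chrM 100 ..."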
- - ------ - -**Example** - -Suppose you have the following 4 files:: - - chrM 300 301 D - 6 - chrM 303 304 D - 19 - chrM 359 360 D - 1 - chrM 410 411 D - 1 - chrM 435 436 D - 1 - - chrM 410 411 D - 1 - chrM 714 715 D - 1 - chrM 995 997 D - 1 - chrM 1168 1169 I A 1 - chrM 1296 1297 D - 1 - - chrM 300 301 D - 8 - chrM 525 526 D - 1 - chrM 958 959 D - 1 - chrM 995 996 D - 3 - chrM 1168 1169 I C 1 - chrM 1296 1297 D - 1 - - chrM 303 304 D - 22 - chrM 410 411 D - 1 - chrM 435 436 D - 1 - chrM 714 715 D - 1 - chrM 753 754 I A 1 - chrM 1168 1169 I A 1 - -and the fifth file:: - - chrM 303 304 D - 22 - chrM 410 411 D - 2 - chrM 435 436 D - 1 - chrM 714 715 D - 2 - chrM 753 754 I A 1 - chrM 995 997 D - 1 - chrM 1168 1169 I A 2 - chrM 1296 1297 D - 1 - -The following will be produced if you include the first four files in the sum, but not the fifth:: - - chrM 300 301 D - 14 6 0 8 0 0 - chrM 303 304 D - 41 19 0 0 22 22 - chrM 359 360 D - 1 1 0 0 0 0 - chrM 410 411 D - 3 1 1 0 1 2 - chrM 435 436 D - 2 1 0 0 1 2 - chrM 525 526 D - 1 0 0 1 0 0 - chrM 714 715 D - 2 0 1 0 1 2 - chrM 753 754 I A 1 0 0 0 1 1 - chrM 958 959 D - 1 0 0 1 0 0 - chrM 995 996 D - 3 0 0 3 0 0 - chrM 995 997 D - 1 0 1 0 0 1 - chrM 1168 1169 I A 2 0 1 0 1 2 - chrM 1168 1169 I C 1 0 0 1 0 0 - chrM 1296 1297 D - 2 0 1 1 0 1 - -The first numeric column is the total of the next four columns, but does not include the fifth. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/indels/sam_indel_filter.py --- a/tools/indels/sam_indel_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,91 +0,0 @@ -#!/usr/bin/env python - -""" -Allows user to filter out non-indels from SAM. - -usage: %prog [options] - -i, --input=i: Input SAM file to be filtered - -q, --quality_threshold=q: Minimum quality value for adjacent bases - -a, --adjacent_bases=a: Number of adjacent bases on each side to check qualities - -o, --output=o: Filtered output SAM file -""" - -import re, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - # prep output file - output = open( options.output, 'wb' ) - # patterns - pat = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' ) - pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) - try: - qual_thresh = int( options.quality_threshold ) - if qual_thresh < 0 or qual_thresh > 93: - raise ValueError - except ValueError: - stop_err( 'Your quality threshold should be an integer between 0 and 93, inclusive.' ) - try: - adj_bases = int( options.adjacent_bases ) - if adj_bases < 1: - raise ValueError - except ValueError: - stop_err( 'The number of adjacent bases should be an integer of at least 1.'
) - # record lines skipped because of more than one indel - multi_indel_lines = 0 - # go through all lines in input file - for i,line in enumerate(open( options.input, 'rb' )): - if line and not line.startswith( '#' ) and not line.startswith( '@' ) : - split_line = line.split( '\t' ) - cigar = split_line[5].strip() - # find matches like 3M2D7M or 7M3I10M - match = {} - m = pat.match( cigar ) - # if unprocessable CIGAR - if not m: - m = pat_multi.match( cigar ) - # skip this line if no match - if not m: - continue - # account for multiple indels or operations we don't process - else: - multi_indel_lines += 1 - # otherwise get matching parts - else: - match = m.groupdict() - # process for indels - if match: - left = int( match[ 'lmatch' ] ) - right = int( match[ 'rmatch' ] ) - if match[ 'ins_del' ] == 'I': - middle = int( match[ 'ins_del_width' ] ) - else: - middle = 0 - # if there are enough adjacent bases to check, then do so - if left >= adj_bases and right >= adj_bases: - quals = split_line[10] - eligible_quals = quals[ left - adj_bases : left + middle + adj_bases ] - qual_thresh_met = True - for q in eligible_quals: - if ord( q ) - 33 < qual_thresh: - qual_thresh_met = False - break - # if filter reqs met, output line - if qual_thresh_met: - output.write( line ) - # close out file - output.close() - # if skipped lines because of more than one indel, output message - if multi_indel_lines > 0: - sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/indels/sam_indel_filter.xml --- a/tools/indels/sam_indel_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ - - for SAM - - sam_indel_filter.py - --input=$input1 - --quality_threshold=$quality_threshold - --adjacent_bases=$adjacent_bases - --output=$out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Allows extracting indels from SAM produced by BWA. Currently it can handle SAM with alignments that have only one insertion or one deletion, and will skip that alignment if it encounters one with more than one indel. It matches CIGAR strings (column 6 in the SAM file) like 5M3I5M or 4M2D10M, so there must be a match or mismatch of sufficient length on either side of the indel. 
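The per-alignment test described above can be distilled into a few lines. A sketch of the same logic as the script (assumes Sanger-encoded qualities, ASCII offset 33)::

    import re

    # exactly one indel flanked by match/mismatch runs, e.g. 5M3I5M or 4M2D10M
    pat = re.compile(r'^(?P<lmatch>\d+)M(?P<width>\d+)(?P<op>[ID])(?P<rmatch>\d+)M$')

    def keep_alignment(cigar, quals, adj_bases, qual_thresh):
        m = pat.match(cigar)
        if not m:
            return False  # no indel, or more than one indel
        left, right = int(m.group('lmatch')), int(m.group('rmatch'))
        # insertions consume read bases and so widen the quality window
        middle = int(m.group('width')) if m.group('op') == 'I' else 0
        if left < adj_bases or right < adj_bases:
            return False
        window = quals[left - adj_bases : left + middle + adj_bases]
        return all(ord(q) - 33 >= qual_thresh for q in window)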
- ------ - -**Example** - -Suppose you have the following:: - - r770 89 ref 116 37 17M1I5M = 72131356 0 CACACTGTGACAGACAGCGCAGC 00/02!!0//1200210AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r770 181 ref 116 0 24M = 72131356 0 TTGGTGCGCGCGGTTGAGGGTTGG $$(#%%#$%#%####$%%##$### - r1945 177 ref 41710908 0 23M 190342418 181247988 0 AGAGAGAGAGAGAGAGAGAGAGA SQQWZYURVYWX]]YXTSY]]ZM XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r3671 117 ref 190342418 0 24M = 190342418 0 CTGGCGTTCTCGGCGTGGATGGGT #####$$##$#%#%%###%$#$## - r3671 153 ref 190342418 37 16M1I6M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0///////00/!!0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r3824 117 ref 80324999 0 24M = 80324999 0 TCCAGTCGCGTTGTTAGGTTCGGA #$#$$$#####%##%%###**#+/ - r3824 153 ref 80324999 37 8M1I14M = 80324999 0 TTTAGCCCGAAATGCCTAGAGCA 4;6//11!"11100110////00 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r4795 81 ref 26739130 0 23M 57401793 57401793 0 TGGCATTCCTGTAGGCAGAGAGG AZWWZS]!"QNXZ]VQ]]]/2]] XT:A:R CM:i:2 SM:i:0 AM:i:0 X0:i:3 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:23 - r4795 161 ref 57401793 37 23M 26739130 26739130 0 GATCACCCAGGTGATGTAACTCC ]WV]]]]WW]]]]]]]]]]PU]] XT:A:U CM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r4800 16 ref 241 255 15M1D8M = 0 0 CGTGGCCGGCGGGCCGAAGGCAT IIIIIIIIIICCCCIII?IIIII XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r5377 170 ref 59090793 37 23M 26739130 26739130 0 TATCAATAAGGTGATGTAACTCG ]WV]ABAWW]]]]]P]P//GU]] XT:A:U CM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r5612 151 ref 190342418 37 19M1I3M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0/4//7//00/BC0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - - -To select only alignments with indels, you need to determine the minimum quality you want the adjacent bases to have, as well as the number of adjacent bases to check. If you set the quality threshold to 47 and the number of bases to check to 2, you will get the following output:: - - r770 89 ref 116 37 17M1I5M = 72131356 0 CACACTGTGACAGACAGCGCAGC 00/02!!0//1200210AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r4800 16 ref 241 255 15M1D8M = 0 0 CGTGGCCGGCGGGCCGAAGGCAT IIIIIIIIIICCCCIII?IIIII XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r5612 151 ref 190342418 37 19M1I3M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0/4//7//00/BC0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - - -For more information on SAM, please consult the `SAM format description`__. - -.. 
__: http://www.ncbi.nlm.nih.gov/pubmed/19505943 - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/genebed_maf_to_fasta.xml --- a/tools/maf/genebed_maf_to_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ - - given a set of coding exon intervals - - #if $maf_source_type.maf_source == "user" #interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_file --mafIndex=$maf_source_type.maf_file.metadata.maf_index --interval_file=$input1 --output_file=$out_file1 --mafSourceType=$maf_source_type.maf_source --geneBED --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR} - #else #interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_identifier --interval_file=$input1 --output_file=$out_file1 --mafSourceType=$maf_source_type.maf_source --geneBED --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR} - #end if# --overwrite_with_gaps=$overwrite_with_gaps - - - - - value.metadata.columns >= 12 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - in aligning species - - - - - - - - - - - - - - - - -**What it does** - -The coding sequence of genes are usually composed of several coding exons. Each of these coding exons is an individual genomic region, which when concatenated with each other constitutes the coding sequence. A single genomic region can be covered by multiple alignment blocks. In many cases it is desirable to stitch these alignment blocks together. This tool accepts a list of gene-based intervals, in the Gene BED format. For every interval it performs the following: - - * finds all MAF blocks that overlap the coding regions; - * sorts MAF blocks by alignment score; - * stitches blocks together and resolves overlaps based on alignment score; - * outputs alignments in FASTA format. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/interval2maf.py --- a/tools/maf/interval2maf.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ -#!/usr/bin/env python - -""" -Reads a list of intervals and a maf. Produces a new maf containing the -blocks or parts of blocks in the original that overlapped the intervals. - -If a MAF file, not UID, is provided the MAF file is indexed before being processed. - -NOTE: If two intervals overlap the same block it will be written twice. 
- -usage: %prog maf_file [options] - -d, --dbkey=d: Database key, ie hg17 - -c, --chromCol=c: Column of Chr - -s, --startCol=s: Column of Start - -e, --endCol=e: Column of End - -S, --strandCol=S: Column of Strand - -t, --mafType=t: Type of MAF source to use - -m, --mafFile=m: Path of source MAF file, if not using cached version - -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version - -i, --interval_file=i: Input interval file - -o, --output_file=o: Output MAF file - -p, --species=p: Species to include in output - -P, --split_blocks_by_species=P: Split blocks by species - -r, --remove_all_gap_columns=r: Remove all Gap columns - -l, --indexLocation=l: Override default maf_index.loc file - -z, --mafIndexFile=z: Directory of local maf index file ( maf_index.loc or maf_pairwise.loc ) -""" - -#Dan Blankenberg -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse -import bx.align.maf -import bx.intervals.io -from galaxy.tools.util import maf_utilities -import sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - index = index_filename = None - mincols = 0 - - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - - if options.dbkey: dbkey = options.dbkey - else: dbkey = None - if dbkey in [None, "?"]: - maf_utilities.tool_fail( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." ) - - species = maf_utilities.parse_species_option( options.species ) - - if options.chromCol: chromCol = int( options.chromCol ) - 1 - else: - maf_utilities.tool_fail( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." ) - - if options.startCol: startCol = int( options.startCol ) - 1 - else: - maf_utilities.tool_fail( "Start column not set, click the pencil icon in the history item to set the metadata attributes." ) - - if options.endCol: endCol = int( options.endCol ) - 1 - else: - maf_utilities.tool_fail( "End column not set, click the pencil icon in the history item to set the metadata attributes." ) - - if options.strandCol: strandCol = int( options.strandCol ) - 1 - else: - strandCol = -1 - - if options.interval_file: interval_file = options.interval_file - else: - maf_utilities.tool_fail( "Input interval file has not been specified." ) - - if options.output_file: output_file = options.output_file - else: - maf_utilities.tool_fail( "Output file has not been specified." ) - - split_blocks_by_species = remove_all_gap_columns = False - if options.split_blocks_by_species and options.split_blocks_by_species == 'split_blocks_by_species': - split_blocks_by_species = True - if options.remove_all_gap_columns and options.remove_all_gap_columns == 'remove_all_gap_columns': - remove_all_gap_columns = True - else: - remove_all_gap_columns = True - #Finish parsing command line - - #Open indexed access to MAFs - if options.mafType: - if options.indexLocation: - index = maf_utilities.maf_index_by_uid( options.mafType, options.indexLocation ) - else: - index = maf_utilities.maf_index_by_uid( options.mafType, options.mafIndexFile ) - if index is None: - maf_utilities.tool_fail( "The MAF source specified (%s) appears to be invalid." 
% ( options.mafType ) ) - elif options.mafFile: - index, index_filename = maf_utilities.open_or_build_maf_index( options.mafFile, options.mafIndex, species = [dbkey] ) - if index is None: - maf_utilities.tool_fail( "Your MAF file appears to be malformed." ) - else: - maf_utilities.tool_fail( "Desired source MAF type has not been specified." ) - - #Create MAF writer - out = bx.align.maf.Writer( open(output_file, "w") ) - - #Iterate over input regions - num_blocks = 0 - num_regions = None - for num_regions, region in enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chromCol, start_col = startCol, end_col = endCol, strand_col = strandCol, fix_strand = True, return_header = False, return_comments = False ) ): - src = maf_utilities.src_merge( dbkey, region.chrom ) - for block in index.get_as_iterator( src, region.start, region.end ): - if split_blocks_by_species: - blocks = [ new_block for new_block in maf_utilities.iter_blocks_split_by_species( block ) if maf_utilities.component_overlaps_region( new_block.get_component_by_src_start( dbkey ), region ) ] - else: - blocks = [ block ] - for block in blocks: - block = maf_utilities.chop_block_by_region( block, src, region ) - if block is not None: - if species is not None: - block = block.limit_to_species( species ) - block = maf_utilities.orient_block_by_region( block, src, region ) - if remove_all_gap_columns: - block.remove_all_gap_columns() - out.write( block ) - num_blocks += 1 - - #Close output MAF - out.close() - - #remove index file if created during run - maf_utilities.remove_temp_index_file( index_filename ) - - if num_blocks: - print "%i MAF blocks extracted for %i regions." % ( num_blocks, ( num_regions + 1 ) ) - elif num_regions is not None: - print "No MAF blocks could be extracted for %i regions." % ( num_regions + 1 ) - else: - print "No valid regions have been provided." 
- -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/interval2maf.xml --- a/tools/maf/interval2maf.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,294 +0,0 @@ - - given a set of genomic intervals - - #if $maf_source_type.maf_source == "user" #interval2maf.py --dbkey=${input1.dbkey} --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafFile=$maf_source_type.mafFile --mafIndex=$maf_source_type.mafFile.metadata.maf_index --interval_file=$input1 --output_file=$out_file1 --mafIndexFile=${GALAXY_DATA_INDEX_DIR}/maf_index.loc --species=$maf_source_type.species - #else #interval2maf.py --dbkey=${input1.dbkey} --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafType=$maf_source_type.mafType --interval_file=$input1 --output_file=$out_file1 --mafIndexFile=${GALAXY_DATA_INDEX_DIR}/maf_index.loc --species=$maf_source_type.species - #end if# --split_blocks_by_species=$split_blocks_by_species_selector.split_blocks_by_species - #if $split_blocks_by_species_selector.split_blocks_by_species == "split_blocks_by_species"# - --remove_all_gap_columns=$split_blocks_by_species_selector.remove_all_gap_columns - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool takes genomic coordinates, superimposes them on multiple alignments (in MAF format) stored on the Galaxy site or from your history, and excises alignment blocks corresponding to each set of coordinates. Alignment blocks that extend past START and/or END positions of an interval are trimmed. Note that a single genomic interval may correspond to two or more alignment blocks. - ------ - -**Example** - -Here a single interval is superimposed on three MAF blocks. Blocks 1 and 3 are trimmed because they extend beyond boundaries of the interval: - -.. image:: ./static/images/maf_icons/interval2maf.png - -------- - -**Split blocks by species** - -This option examines each MAF block for multiple occurrences of a species in a single block. When this occurs, a block is split into multiple blocks where every combination of one sequence per species per block is represented. - -The interface for this option has two inputs: - - * **MAF file to split**. Choose multiple alignments from history to be split by species. - * **Collapse empty alignment columns**. Should alignment columns containing only gaps in the new blocks be removed. 
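Conceptually, the split is a Cartesian product over each species' rows within a block; the tool itself delegates to maf_utilities.iter_blocks_split_by_species. A minimal sketch of the idea (illustrative names only)::

    from itertools import product

    # rows_by_species: dict mapping a species name to the list of its
    # alignment rows in one block. Each yielded combination is the row
    # set of one new block, one row per species.
    def split_block(rows_by_species):
        names = sorted(rows_by_species)
        for combo in product(*(rows_by_species[n] for n in names)):
            yield dict(zip(names, combo))

In Example 1 below, species1 contributes 4 rows, species2 contributes 3, and species3 contributes 1, giving the 4 x 3 x 1 = 12 blocks shown.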
- - - -**Example 1**: **Collapse empty alignment columns is Yes**: - -For the following alignment:: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - -the tool will create **a single** history item containing 12 alignment blocks (notice that no columns contain only gaps):: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT-GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 
ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT-GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC--GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC-GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC-GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGCAG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC---AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC---AG - - - -**Example 2**: **Collapse empty alignment columns is No**: - -For the following alignment:: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s 
species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - -the tool will create **a single** history item containing 12 alignment blocks (notice that some columns contain only gaps):: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 
68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/interval2maf_pairwise.xml --- a/tools/maf/interval2maf_pairwise.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ - - given a set of genomic intervals - interval2maf.py --dbkey=${input1.dbkey} --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafType=$mafType --interval_file=$input1 --output_file=$out_file1 --indexLocation=${GALAXY_DATA_INDEX_DIR}/maf_pairwise.loc - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool takes genomic coordinates, superimposes them on pairwise alignments (in MAF format) stored on the Galaxy site, and excises alignment blocks corresponding to each set of coordinates. Alignment blocks that extend past START and/or END positions of an interval are trimmed. Note that a single genomic interval may correspond to two or more alignment blocks. - ------ - -**Example** - -Here a single interval is superimposed on three MAF blocks. Blocks 1 and 3 are trimmed because they extend beyond boundaries of the interval: - -.. image:: ./static/images/maf_icons/interval2maf.png - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. 
<http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/interval_maf_to_merged_fasta.py --- a/tools/maf/interval_maf_to_merged_fasta.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,196 +0,0 @@ -#!/usr/bin/env python - -""" -Reads an interval or gene BED and a MAF Source. -Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates. - -Alignment blocks are layered on top of each other based upon score. - -usage: %prog maf_file [options] - -d, --dbkey=d: Database key, ie hg17 - -c, --chromCol=c: Column of Chr - -s, --startCol=s: Column of Start - -e, --endCol=e: Column of End - -S, --strandCol=S: Column of Strand - -G, --geneBED: Input is a Gene BED file, process and join exons as one region - -t, --mafSourceType=t: Type of MAF source to use - -m, --mafSource=m: Path of source MAF file, if not using cached version - -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version - -i, --interval_file=i: Input interval file - -o, --output_file=o: Output FASTA file - -p, --species=p: Species to include in output - -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species. - -z, --mafIndexFileDir=z: Directory of local maf_index.loc file - -usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR -""" - -#Dan Blankenberg -from galaxy import eggs -from galaxy.tools.util import maf_utilities -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse -import bx.intervals.io -import sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def __main__(): - - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - mincols = 0 - strand_col = -1 - - if options.dbkey: - primary_species = options.dbkey - else: - primary_species = None - if primary_species in [None, "?", "None"]: - stop_err( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." ) - - include_primary = True - secondary_species = maf_utilities.parse_species_option( options.species ) - if secondary_species: - species = list( secondary_species ) # make copy of species list - if primary_species in secondary_species: - secondary_species.remove( primary_species ) - else: - include_primary = False - else: - species = None - - if options.interval_file: - interval_file = options.interval_file - else: - stop_err( "Input interval file has not been specified." ) - - if options.output_file: - output_file = options.output_file - else: - stop_err( "Output file has not been specified." ) - - if not options.geneBED: - if options.chromCol: - chr_col = int( options.chromCol ) - 1 - else: - stop_err( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." ) - - if options.startCol: - start_col = int( options.startCol ) - 1 - else: - stop_err( "Start column not set, click the pencil icon in the history item to set the metadata attributes." ) - - if options.endCol: - end_col = int( options.endCol ) - 1 - else: - stop_err( "End column not set, click the pencil icon in the history item to set the metadata attributes." 
) - - if options.strandCol: - strand_col = int( options.strandCol ) - 1 - - mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir - - overwrite_with_gaps = True - if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false': - overwrite_with_gaps = False - - #Finish parsing command line - - #get index for mafs based on type - index = index_filename = None - #using specified uid for locally cached - if options.mafSourceType.lower() in ["cached"]: - index = maf_utilities.maf_index_by_uid( options.mafSource, mafIndexFile ) - if index is None: - stop_err( "The MAF source specified (%s) appears to be invalid." % ( options.mafSource ) ) - elif options.mafSourceType.lower() in ["user"]: - #index maf for use here, need to remove index_file when finished - index, index_filename = maf_utilities.open_or_build_maf_index( options.mafSource, options.mafIndex, species = [primary_species] ) - if index is None: - stop_err( "Your MAF file appears to be malformed." ) - else: - stop_err( "Invalid MAF source type specified." ) - - #open output file - output = open( output_file, "w" ) - - if options.geneBED: - region_enumerator = maf_utilities.line_enumerator( open( interval_file, "r" ).readlines() ) - else: - region_enumerator = enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, strand_col = strand_col, fix_strand = True, return_header = False, return_comments = False ) ) - - #Step through intervals - regions_extracted = 0 - line_count = 0 - for line_count, line in region_enumerator: - try: - if options.geneBED: #Process as Gene BED - try: - starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line ) - #create spliced alignment object - alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps ) - primary_name = secondary_name = fields[3] - alignment_strand = fields[5] - except Exception, e: - print "Error loading exon positions from input line %i: %s" % ( line_count, e ) - continue - else: #Process as standard intervals - try: - #create spliced alignment object - alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps ) - primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end ) - secondary_name = "" - alignment_strand = line.strand - except Exception, e: - print "Error loading region positions from input line %i: %s" % ( line_count, e ) - continue - - #Write alignment to output file - #Output primary species first, if requested - if include_primary: - output.write( ">%s.%s\n" %( primary_species, primary_name ) ) - if alignment_strand == "-": - output.write( alignment.get_sequence_reverse_complement( primary_species ) ) - else: - output.write( alignment.get_sequence( primary_species ) ) - output.write( "\n" ) - #Output all remaining species - for spec in secondary_species or alignment.get_species_names( skip = primary_species ): - if secondary_name: - output.write( ">%s.%s\n" % ( spec, secondary_name ) ) - else: - output.write( ">%s\n" % ( spec ) ) - if alignment_strand == "-": - output.write( alignment.get_sequence_reverse_complement( spec ) ) - else: - output.write( alignment.get_sequence( spec ) ) - output.write( "\n" ) - - output.write( "\n" ) - - regions_extracted += 
1 - - except Exception, e: - print "Unexpected error from input line %i: %s" % ( line_count, e ) - continue - - #close output file - output.close() - - #remove index file if created during run - maf_utilities.remove_temp_index_file( index_filename ) - - #Print message about success for user - if regions_extracted > 0: - print "%i regions were processed successfully." % ( regions_extracted ) - else: - print "No regions were processed successfully." - if line_count > 0 and options.geneBED: - print "This tool requires your input file to conform to the 12 column BED standard." - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/interval_maf_to_merged_fasta.xml --- a/tools/maf/interval_maf_to_merged_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,114 +0,0 @@ - - given a set of genomic intervals - - #if $maf_source_type.maf_source == "user" #interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_file --mafIndex=$maf_source_type.maf_file.metadata.maf_index --interval_file=$input1 --output_file=$out_file1 --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafSourceType=$maf_source_type.maf_source --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR} - #else #interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_identifier --interval_file=$input1 --output_file=$out_file1 --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafSourceType=$maf_source_type.maf_source --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR} - #end if# --overwrite_with_gaps=$overwrite_with_gaps - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -A single genomic region can be covered by multiple alignment blocks. In many cases it is desirable to stitch these alignment blocks together. This tool accepts a list of genomic intervals. For every interval it performs the following: - - * finds all MAF blocks that overlap the interval; - * sorts MAF blocks by alignment score; - * stitches blocks together and resolves overlaps based on alignment score; - * outputs alignments in FASTA format. - ------- - -**Example** - -Here three MAF blocks overlapping a single interval are stitched together. Space between blocks 2 and 3 is filled with gaps: - -.. image:: ./static/images/maf_icons/stitchMaf.png - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_by_block_number.py --- a/tools/maf/maf_by_block_number.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -""" -Reads a list of block numbers and a maf. Produces a new maf containing the -blocks specified by number. 
-""" - -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from galaxy.tools.util import maf_utilities -import bx.align.maf - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - input_block_filename = sys.argv[1].strip() - input_maf_filename = sys.argv[2].strip() - output_filename1 = sys.argv[3].strip() - block_col = int( sys.argv[4].strip() ) - 1 - if block_col < 0: - print >> sys.stderr, "Invalid column specified" - sys.exit(0) - species = maf_utilities.parse_species_option( sys.argv[5].strip() ) - - maf_writer = bx.align.maf.Writer( open( output_filename1, 'w' ) ) - #we want to maintain order of block file and write blocks as many times as they are listed - failed_lines = [] - for ctr, line in enumerate( open( input_block_filename, 'r' ) ): - try: - block_wanted = int( line.split( "\t" )[block_col].strip() ) - except: - failed_lines.append( str( ctr ) ) - continue - try: - for count, block in enumerate( bx.align.maf.Reader( open( input_maf_filename, 'r' ) ) ): - if count == block_wanted: - if species: - block = block.limit_to_species( species ) - maf_writer.write( block ) - break - except: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() - if len( failed_lines ) > 0: print "Failed to extract from %i lines (%s)." % ( len( failed_lines ), ",".join( failed_lines ) ) -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_by_block_number.xml --- a/tools/maf/maf_by_block_number.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ - - given a set of block numbers and a MAF file - maf_by_block_number.py $input1 $input2 $out_file1 $block_col $species - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool takes a list of block numbers, one per line, and extracts the corresponding MAF blocks from the provided file. Block numbers start at 0. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. 
<http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_filter.py --- a/tools/maf/maf_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ -#Dan Blankenberg -#Filters a MAF file according to the provided code file, which is generated in maf_filter.xml -#Also allows filtering by number of columns in a block, and limiting output species -import sys, os, shutil -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.align.maf -from galaxy.tools.util import maf_utilities - -def main(): - #Read command line arguments - try: - script_file = sys.argv.pop( 1 ) - maf_file = sys.argv.pop( 1 ) - out_file = sys.argv.pop( 1 ) - additional_files_path = sys.argv.pop( 1 ) - species = maf_utilities.parse_species_option( sys.argv.pop( 1 ) ) - min_size = int( sys.argv.pop( 1 ) ) - max_size = int( sys.argv.pop( 1 ) ) - if max_size < 1: max_size = sys.maxint - min_species_per_block = int( sys.argv.pop( 1 ) ) - exclude_incomplete_blocks = int( sys.argv.pop( 1 ) ) - if species: - num_species = len( species ) - else: - num_species = len( sys.argv.pop( 1 ).split( ',') ) - except: - print >>sys.stderr, "One or more arguments are missing.\nUsage: maf_filter.py maf_filter_file input_maf output_maf path_to_save_debug species_to_keep" - sys.exit() - - #Open input and output MAF files - try: - maf_reader = bx.align.maf.Reader( open( maf_file,'r' ) ) - maf_writer = bx.align.maf.Writer( open( out_file,'w' ) ) - except: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() - - #Save script file for debugging/verification info later - os.mkdir( additional_files_path ) - shutil.copy( script_file, os.path.join( additional_files_path, 'debug.txt' ) ) - - #Loop through blocks, running filter on each - #'maf_block' and 'ret_val' are used/shared in the provided code file - #'ret_val' should be set to True if the block is to be kept - i = -1 - blocks_kept = 0 - for i, maf_block in enumerate( maf_reader ): - if min_size <= maf_block.text_size <= max_size: - local = {'maf_block':maf_block, 'ret_val':False} - execfile( script_file, {}, local ) - if local['ret_val']: - #Species limiting must be done after filters as filters could be run on non-requested output species - if species: - maf_block = maf_block.limit_to_species( species ) - if len( maf_block.components ) >= min_species_per_block and ( not exclude_incomplete_blocks or len( maf_block.components ) >= num_species ): - maf_writer.write( maf_block ) - blocks_kept += 1 - maf_writer.close() - maf_reader.close() - if i < 0: print "Your file contains no valid maf_blocks." - else: print 'Kept %s of %s blocks (%.2f%%).' 
% ( blocks_kept, i + 1, float( blocks_kept ) / float( i + 1 ) * 100.0 ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_filter.xml --- a/tools/maf/maf_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,202 +0,0 @@ - - by specified attributes - maf_filter.py $maf_filter_file $input1 $out_file1 $out_file1.files_path $species $min_size $max_size $min_species_per_block $exclude_incomplete_blocks ${input1.metadata.species} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#set $is_isnot_valid = {"==":"==", "!=":"!=", "in":"in", "not in":"not in"} -def maf_block_pass_filter( maf_block ): -#for $maf_filter in $maf_filters: -#if $len( $maf_filter['species1_attributes']['filter_condition'] ) == 0: -#continue -#end if - primary_component = maf_block.get_component_by_src_start( """$maf_filter['species1'].value.encode( 'string_escape' )""".decode( 'string_escape' ) ) - if primary_component is not None: -#if $maf_filter['species1_attributes']['species1_attribute_type'] == 'attribute_chr': - if primary_component.src.split( "." )[-1] $is_isnot_valid.get( $maf_filter['species1_attributes']['species1_is_isnot'].value.strip(), 'is in' ) """$maf_filter['species1_attributes']['species1_attribute'].value.encode( 'string_escape' )""".decode( 'string_escape' ).split( "," ): -#else - if primary_component.strand $is_isnot_valid.get( $maf_filter['species1_attributes']['species1_is_isnot'].value.strip(), '==' ) """$maf_filter['species1_attributes']['species1_attribute'].value.encode( 'string_escape' )""".decode( 'string_escape' ): -#end if -#for $filter_condition in $maf_filter['species1_attributes']['filter_condition']: - secondary_component = maf_block.get_component_by_src_start( """$filter_condition['species2'].value.encode( 'string_escape' )""".decode( 'string_escape' ) ) -#if $filter_condition['species2_attributes']['species2_attribute_type'] == 'attribute_chr': - if secondary_component is not None: - if not ( secondary_component.src.split( "." )[-1] $is_isnot_valid.get( $filter_condition['species2_attributes']['species2_is_isnot'].value.strip(), 'is in' ) """$filter_condition['species2_attributes']['species2_attribute'].value.encode( 'string_escape' )""".decode( 'string_escape' ).split( "," ) ): - return False -#else: - if secondary_component is not None: - if not ( secondary_component.strand $is_isnot_valid.get( $filter_condition['species2_attributes']['species2_is_isnot'].value.strip(), '==' ) """$filter_condition['species2_attributes']['species2_attribute'].value.encode( 'string_escape' )""".decode( 'string_escape' ) ): - return False -#end if -#end for -#end for - return True -ret_val = maf_block_pass_filter( maf_block ) - - - - - - - -This tool allows you to build complex filters to be applied to each alignment block of a MAF file. You can define restraints on species based upon chromosome and strand. You can specify comma separated lists of chromosomes where appropriate. - -.. class:: infomark - -For example, this tool is useful to restrict a set of alignments to only those blocks which contain alignments between chromosomes that are considered homologous. - ------ - -.. class:: warningmark - -If a species is not found in a particular block, all filters on that species are ignored. 
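In rough terms (a simplified, single-species flattening of the generated two-species conditions; all names are illustrative), each block is tested like this::

    # block: dict mapping a species name to its (chromosome, strand).
    # Each filter names a species, an optional set of allowed chromosomes,
    # and an optional required strand; filters on absent species are
    # ignored, per the warning above.
    def block_passes(block, filters):
        for f in filters:
            comp = block.get(f['species'])
            if comp is None:
                continue
            chrom, strand = comp
            if f.get('chroms') is not None and chrom not in f['chroms']:
                return False
            if f.get('strand') is not None and strand != f['strand']:
                return False
        return True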
- ------ - -This tool allows the user to remove any undesired species from a MAF file. If no species are specified then all species will be kept. If species are specified, columns which contain only gaps are removed. The options for this are: - - * **Exclude blocks which have missing species** - suppose you want to restrict an 8-way alignment to human, mouse, and rat. The tool will first remove all other species. Next, if this option is set to **YES** the tool WILL NOT return MAF blocks, which do not include human, mouse, or rat. This means that all alignment blocks returned by the tool will have exactly three sequences in this example. - - * **Exclude blocks which have only one species** - if this option is set to **YES** all single sequence alignment blocks WILL NOT be returned. - ------ - -You can also provide a size range and limit your output to the MAF blocks which fall within the specified range. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_limit_size.py --- a/tools/maf/maf_limit_size.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -""" -Removes blocks that fall outside of specified size range. -""" - -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.align.maf - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - - input_maf_filename = sys.argv[1].strip() - output_filename1 = sys.argv[2].strip() - min_size = int( sys.argv[3].strip() ) - max_size = int( sys.argv[4].strip() ) - if max_size < 1: max_size = sys.maxint - maf_writer = bx.align.maf.Writer( open( output_filename1, 'w' ) ) - try: - maf_reader = bx.align.maf.Reader( open( input_maf_filename, 'r' ) ) - except: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() - - blocks_kept = 0 - i = 0 - for i, m in enumerate( maf_reader ): - if min_size <= m.text_size <= max_size: - maf_writer.write( m ) - blocks_kept += 1 - print 'Kept %s of %s blocks (%.2f%%).' % ( blocks_kept, i + 1, float( blocks_kept ) / float( i + 1 ) * 100.0 ) - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_limit_size.xml --- a/tools/maf/maf_limit_size.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ - - by Size - maf_limit_size.py $input1 $out_file1 $min_size $max_size - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool takes a MAF file and a size range and extracts the MAF blocks which fall within the specified range. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_limit_to_species.py --- a/tools/maf/maf_limit_to_species.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -#!/usr/bin/env python - -""" -Read a maf file and write out a new maf with only blocks having the -required species, after dropping any other species and removing -columns containing only gaps. 
- -usage: %prog species,species2,... input_maf output_maf allow_partial min_species_per_block -""" -#Dan Blankenberg -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.align.maf -from galaxy.tools.util import maf_utilities -import sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - - species = maf_utilities.parse_species_option( sys.argv[1] ) - if species: - spec_len = len( species ) - else: - spec_len = 0 - try: - maf_reader = bx.align.maf.Reader( open( sys.argv[2],'r' ) ) - maf_writer = bx.align.maf.Writer( open( sys.argv[3],'w' ) ) - except: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() - allow_partial = False - if int( sys.argv[4] ): allow_partial = True - min_species_per_block = int( sys.argv[5] ) - - maf_blocks_kept = 0 - for m in maf_reader: - if species: - m = m.limit_to_species( species ) - m.remove_all_gap_columns() - spec_in_block_len = len( maf_utilities.get_species_in_block( m ) ) - if ( not species or allow_partial or spec_in_block_len == spec_len ) and spec_in_block_len > min_species_per_block: - maf_writer.write( m ) - maf_blocks_kept += 1 - - maf_reader.close() - maf_writer.close() - - if species: - print "Restricted to species: %s." % ", ".join( species ) - print "%i MAF blocks have been kept." % maf_blocks_kept - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_limit_to_species.xml --- a/tools/maf/maf_limit_to_species.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ - - by Species - maf_limit_to_species.py $species $input1 $out_file1 $allow_partial $min_species - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows the user to remove any undesired species from a MAF file. Columns which contain only gaps are removed. The options for this tool are: - - * **Exclude blocks which have missing species** - suppose you want to restrict an 8-way alignment to human, mouse, and rat. The tool will first remove all other species. Next, if this option is set to **YES** the tool WILL NOT return MAF blocks, which do not include human, mouse, or rat. This means that all alignment blocks returned by the tool will have exactly three sequences in this example. - - * **Exclude blocks which have only one species** - if this option is set to **YES** all single sequence alignment blocks WILL NOT be returned. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_reverse_complement.py --- a/tools/maf/maf_reverse_complement.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -#!/usr/bin/env python - -""" -Reads a MAF file. Produces a MAF file containing -the reverse complement for each block in the source file. 
- -usage: %prog input_maf_file output_maf_file -""" -#Dan Blankenberg -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.align.maf -from galaxy.tools.util import maf_utilities -import sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - #Parse Command Line - input_file = sys.argv.pop( 1 ) - output_file = sys.argv.pop( 1 ) - species = maf_utilities.parse_species_option( sys.argv.pop( 1 ) ) - - try: - maf_writer = bx.align.maf.Writer( open( output_file, 'w' ) ) - except: - print >>sys.stderr, "Unable to open output file" - sys.exit() - try: - count = 0 - for maf in bx.align.maf.Reader( open( input_file ) ): - maf = maf.reverse_complement() - if species: - maf = maf.limit_to_species( species ) - maf_writer.write( maf ) - count += 1 - except: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() - print "%i blocks were reverse complemented." % count - maf_writer.close() - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_reverse_complement.xml --- a/tools/maf/maf_reverse_complement.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ - - a MAF file - maf_reverse_complement.py $input1 $out_file1 $species - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool takes a MAF file and creates a new MAF file, where each block has been reverse complemented. - -**Example** - -This MAF Block:: - - a score=8157.000000 - s hg17.chr7 127471526 58 + 158628139 AATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG - s panTro1.chr6 129885407 58 + 161576975 AATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG - s mm5.chr6 28904928 54 + 149721531 AA----CGTTTCATTGATTGCTCATCATTTAAAAAAAGAAATTCCTCAGTGGAAGAGG - -becomes:: - - a score=8157.000000 - s hg17.chr7 31156555 58 - 158628139 CCTCTTCCACTATAGACCTCCTTAAACAAAATAATGAAAAATGAATAAACCACAAATT - s panTro1.chr6 31691510 58 - 161576975 CCTCTTCCACTATAGACCTCCTTAAACAAAATAATGAAAAACGAATAAACCACAAATT - s mm5.chr6 120816549 54 - 149721531 CCTCTTCCACTGAGGAATTTCTTTTTTTAAATGATGAGCAATCAATGAAACG----TT - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. 
<http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_split_by_species.py --- a/tools/maf/maf_split_by_species.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -#!/usr/bin/env python - -""" -Read a maf and split blocks by unique species combinations -""" -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.align import maf -from galaxy.tools.util import maf_utilities -from galaxy.util import string_as_bool - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - try: - maf_reader = maf.Reader( open( sys.argv[1] ) ) - except Exception, e: - maf_utilities.tool_fail( "Error opening MAF: %s" % e ) - try: - out = maf.Writer( open( sys.argv[2], "w" ) ) - except Exception, e: - maf_utilities.tool_fail( "Error opening file for output: %s" % e ) - try: - collapse_columns = string_as_bool( sys.argv[3] ) - except Exception, e: - maf_utilities.tool_fail( "Error determining collapse columns value: %s" % e ) - - start_count = 0 - end_count = 0 - for start_count, start_block in enumerate( maf_reader ): - for block in maf_utilities.iter_blocks_split_by_species( start_block ): - if collapse_columns: - block.remove_all_gap_columns() - out.write( block ) - end_count += 1 - out.close() - - if end_count: - print "%i alignment blocks created from %i original blocks." % ( end_count, start_count + 1 ) - else: - print "No alignment blocks were created." - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_split_by_species.xml --- a/tools/maf/maf_split_by_species.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,223 +0,0 @@ - - by Species - maf_split_by_species.py $input1 $out_file1 $collapse_columns - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool examines each MAF block for multiple occurrences of a species in a single block. When this occurs, a block is split into multiple blocks where every combination of one sequence per species per block is represented. - -The interface for this tool has two inputs: - - * **MAF file to split**. Choose multiple alignments from history to be split by species. - * **Collapse empty alignment columns**. Should alignment columns containing only gaps in the new blocks be removed?
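A rough sketch of the split-and-collapse loop at the heart of this tool, for running the same operation outside Galaxy (the file names are placeholders, and bx-python plus Galaxy's maf_utilities module are assumed to be importable)::

    import bx.align.maf
    from galaxy.tools.util import maf_utilities

    reader = bx.align.maf.Reader( open( "input.maf" ) )
    writer = bx.align.maf.Writer( open( "split.maf", "w" ) )
    for block in reader:
        # one output block per unique combination of one sequence per species
        for split_block in maf_utilities.iter_blocks_split_by_species( block ):
            # collapse gap-only columns, as when "Collapse empty alignment columns" is Yes
            split_block.remove_all_gap_columns()
            writer.write( split_block )
    writer.close()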
- ------ - -**Example 1**: **Collapse empty alignment columns is Yes**: - -For the following alignment:: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - -the tool will create **a single** history item containing 12 alignment blocks (notice that no columns contain only gaps):: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT-GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 
ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT-GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC--GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC-GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC-GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGCAG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC---AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC---AG - ------ - -**Example 2**: **Collapse empty alignment columns is No**: - -For the following alignment:: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s 
species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - -the tool will create **a single** history item containing 12 alignment blocks (notice that some columns contain only gaps):: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 
68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - -------- - -.. class:: infomark - -**About formats** - -**MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes. - - - The .maf format is line-oriented. Each multiple alignment ends with a blank line. - - Each sequence in an alignment is on a single line. - - Lines starting with # are considered to be comments. - - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment. - - Some MAF files may contain two optional line types: - - - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line; - - An "e" line containing information about the size of the gap between the alignments that span the current block. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_stats.py --- a/tools/maf/maf_stats.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -""" -Reads a list of intervals and a maf. Outputs a new set of intervals with statistics appended. 
-""" - -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.intervals.io -from bx.bitset import BitSet -from galaxy.tools.util import maf_utilities - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - maf_source_type = sys.argv.pop( 1 ) - input_maf_filename = sys.argv[1].strip() - input_interval_filename = sys.argv[2].strip() - output_filename = sys.argv[3].strip() - dbkey = sys.argv[4].strip() - try: - chr_col = int( sys.argv[5].strip() ) - 1 - start_col = int( sys.argv[6].strip() ) - 1 - end_col = int( sys.argv[7].strip() ) - 1 - except: - print >>sys.stderr, "You appear to be missing metadata. You can specify your metadata by clicking on the pencil icon associated with your interval file." - sys.exit() - summary = sys.argv[8].strip() - if summary.lower() == "true": summary = True - else: summary = False - - mafIndexFile = "%s/maf_index.loc" % sys.argv[9] - try: - maf_index_filename = sys.argv[10].strip() - except: - maf_index_filename = None - index = index_filename = None - if maf_source_type == "user": - #index maf for use here - index, index_filename = maf_utilities.open_or_build_maf_index( input_maf_filename, maf_index_filename, species = [dbkey] ) - if index is None: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() - elif maf_source_type == "cached": - #access existing indexes - index = maf_utilities.maf_index_by_uid( input_maf_filename, mafIndexFile ) - if index is None: - print >> sys.stderr, "The MAF source specified (%s) appears to be invalid." % ( input_maf_filename ) - sys.exit() - else: - print >>sys.stdout, 'Invalid source type specified: %s' % maf_source_type - sys.exit() - - out = open(output_filename, 'w') - - num_region = None - species_summary = {} - total_length = 0 - #loop through interval file - for num_region, region in enumerate( bx.intervals.io.NiceReaderWrapper( open( input_interval_filename, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ) ): - src = "%s.%s" % ( dbkey, region.chrom ) - region_length = region.end - region.start - total_length += region_length - coverage = { dbkey: BitSet( region_length ) } - - - for block in index.get_as_iterator( src, region.start, region.end ): - for spec in maf_utilities.get_species_in_block( block ): - if spec not in coverage: coverage[spec] = BitSet( region_length ) - for block in maf_utilities.iter_blocks_split_by_species( block ): - if maf_utilities.component_overlaps_region( block.get_component_by_src( src ), region ): - #need to chop and orient the block - block = maf_utilities.orient_block_by_region( maf_utilities.chop_block_by_region( block, src, region ), src, region, force_strand = '+' ) - start_offset, alignment = maf_utilities.reduce_block_by_primary_genome( block, dbkey, region.chrom, region.start ) - for i in range( len( alignment[dbkey] ) ): - for spec, text in alignment.items(): - if text[i] != '-': - coverage[spec].set( start_offset + i ) - if summary: - #record summary - for key in coverage.keys(): - if key not in species_summary: species_summary[key] = 0 - species_summary[key] = species_summary[key] + coverage[key].count_range() - else: - #print coverage for interval - coverage_sum = coverage[dbkey].count_range() - out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), dbkey, coverage_sum, region_length - coverage_sum ) ) - keys = coverage.keys() - keys.remove( dbkey ) - keys.sort() - for key in keys: - coverage_sum = 
coverage[key].count_range() - out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), key, coverage_sum, region_length - coverage_sum ) ) - if summary: - out.write( "#species\tnucleotides\tcoverage\n" ) - for spec in species_summary: - out.write( "%s\t%s\t%.4f\n" % ( spec, species_summary[spec], float( species_summary[spec] ) / total_length ) ) - out.close() - if num_region is not None: - print "%i regions were processed with a total length of %i." % ( num_region + 1, total_length ) - maf_utilities.remove_temp_index_file( index_filename ) - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_stats.xml --- a/tools/maf/maf_stats.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,120 +0,0 @@ - - Alignment coverage information - - maf_stats.py - #if $maf_source_type.maf_source == "user": - $maf_source_type.maf_source $input2 $input1 $out_file1 $dbkey ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} $summary - #else: - $maf_source_type.maf_source $maf_source_type.mafType $input1 $out_file1 $dbkey ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} $summary - #end if - ${GALAXY_DATA_INDEX_DIR} - #if $maf_source_type.maf_source == "user": - $input2.metadata.maf_index - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - numpy - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool takes a MAF file and an interval file and reports coverage information for each interval, by species. -If a column does not exist in the reference genome, it is not included in the output. - -Consider the interval: "chrX 1000 1100 myInterval" - Let's suppose we want to do stats on three-way alignments for H, M, and R. The results look like this: - - chrX 1000 1100 myInterval H XXX YYY - - chrX 1000 1100 myInterval M XXX YYY - - chrX 1000 1100 myInterval R XXX YYY - - - where XXX and YYY are: - - XXX = number of nucleotides - - YYY = number of gaps - ----- - -Alternatively, you can request only summary information for a set of intervals: - - ======== =========== ======== - #species nucleotides coverage - ======== =========== ======== - hg18 30639 0.2372 - rheMac2 7524 0.0582 - panTro2 30390 0.2353 - ======== =========== ======== - - where **coverage** is the number of nucleotides divided by the total length of the provided intervals. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_thread_for_species.py --- a/tools/maf/maf_thread_for_species.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -#!/usr/bin/env python - -""" -Read a maf file and write out a new maf with only blocks having all of -the passed in species, after dropping any other species and removing columns -containing only gaps. This will attempt to fuse together any blocks -which are adjacent after the unwanted species have been dropped.
- -usage: %prog input_maf output_maf species1,species2 -""" -#Dan Blankenberg -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.align.maf - -from bx.align.tools.thread import * -from bx.align.tools.fuse import * - -def main(): - input_file = sys.argv.pop( 1 ) - output_file = sys.argv.pop( 1 ) - species = sys.argv.pop( 1 ).split( ',' ) - - try: - maf_reader = bx.align.maf.Reader( open( input_file ) ) - except: - print >> sys.stderr, "Unable to open source MAF file" - sys.exit() - try: - maf_writer = FusingAlignmentWriter( bx.align.maf.Writer( open( output_file, 'w' ) ) ) - except: - print >> sys.stderr, "Unable to open output file" - sys.exit() - try: - for m in maf_reader: - new_components = m.components - if species != ['None']: - new_components = get_components_for_species( m, species ) - if new_components: - remove_all_gap_columns( new_components ) - m.components = new_components - m.score = 0.0 - maf_writer.write( m ) - except Exception, e: - print >> sys.stderr, "Error stepping through MAF file: %s" % e - sys.exit() - maf_reader.close() - maf_writer.close() - - print "Restricted to species: %s." % ", ".join( species ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_thread_for_species.xml --- a/tools/maf/maf_thread_for_species.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ - - by Species - maf_thread_for_species.py $input1 $out_file1 $species - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows the user to merge blocks from a MAF file that are adjacent in each specified species. Columns which contain only gaps are removed. Species which are not desired are removed from the output.
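A minimal sketch of the same thread-and-fuse pipeline described above (file and species names are placeholders; the helper names come from the same bx.align.tools star imports the script above uses)::

    import bx.align.maf
    from bx.align.tools.thread import *
    from bx.align.tools.fuse import *

    reader = bx.align.maf.Reader( open( "input.maf" ) )
    # the fusing writer merges blocks that become adjacent once unwanted species are dropped
    writer = FusingAlignmentWriter( bx.align.maf.Writer( open( "threaded.maf", "w" ) ) )
    for block in reader:
        components = get_components_for_species( block, [ "hg17", "panTro1" ] )
        if components:
            remove_all_gap_columns( components )
            block.components = components
            block.score = 0.0  # the original score no longer applies to the filtered block
            writer.write( block )
    reader.close()
    writer.close()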
- -**Example** - -Specifying the desired species as hg17 and panTro1 with this MAF file:: - - ##maf version=1 - a score=60426.000000 - s hg17.chr7 127471195 331 + 158628139 gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCC-------------------------------AAATACT-GCCACTGATGTCCTG-----ATGGAGGTA-------TGAA-------------------AACATCCACTAA - s panTro1.chr6 129885076 331 + 161576975 gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCC-------------------------------AAATACT-GCCACTGATGTCCTG-----ATGGAGGTA-------TGAA-------------------AACATCCACTAA - s mm5.chr6 28904571 357 + 149721531 CTCCACTCTCGTTTGCTGTT----------------CTGTCACCATGGAAACAAA-CGAGGGTGGTCCAGTTACTATCTTGACTGCAGCTGGCAGTCAGTT-GCCACT-----CAGGAATAAGGCTATGCCATT-GATCCACTGAACCGTGATCTGGAAACCTGGCTGTTGTTT-------CAAGCCTTGGGGCCAGTTTGCGGTGTTACTCATGA--CTCTAAGATCGTGTGCTTG----CTGCAGGAAGAGACAGCAAGGGGGTTACATTTAAAAAGCCCCCAGTTTAGCTATAGGCAGGCCAACAGGTGTAAAAATACTCACTAGTAATGGGCTGAACTCATGGAGGTAGCATTAGTGAGACACTGTAACTGTTTTTTTAAAAATCACTAA - s rn3.chr4 56178191 282 + 187371129 CTTCACTCTCATTTGCTGTT----------------CTGTCACTATGGAGACAAACACAGGCTAGCCCAGTTACTATCTTGATCACAGCAGCT-GTCAGCTAGCTGCCACTCACAGGAATAAGGCCATACCATT-GATCCACTGAACCTTGATCTAGGAATTTGGC----------------------TGGGGCCAGTTTGCGGTGTCACTCATGA--CTCTAAGATTGTGTGTTTG----CTCCAGGAAGAGACGGCAAGAGGATTACCTTTAAAAGGTTC---------------------------------GGAGTCTAGCTGTAGACAGCCCA-----ATG--GGTA-------TAAC-------------------AATACTCACTAA - - a score=8157.000000 - s hg17.chr7 127471526 58 + 158628139 AATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG - s panTro1.chr6 129885407 58 + 161576975 AATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG - s mm5.chr6 28904928 54 + 149721531 AA----CGTTTCATTGATTGCTCATCATTTAAAAAAAGAAATTCCTCAGTGGAAGAGG - -results in:: - - ##maf version=1 - a score=0.0 - s hg17.chr7 127471195 389 + 158628139 gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGGTATGAAAACATCCACTAAAATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG - s panTro1.chr6 129885076 389 + 161576975 gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGGTATGAAAACATCCACTAAAATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. 
<http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_bed.py --- a/tools/maf/maf_to_bed.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,94 +0,0 @@ -#!/usr/bin/env python - -""" -Read a maf and output intervals for specified list of species. -""" -import sys, os, tempfile -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.align import maf - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - - input_filename = sys.argv[1] - output_filename = sys.argv[2] - #where to store files that become additional output - database_tmp_dir = sys.argv[5] - - species = sys.argv[3].split(',') - partial = sys.argv[4] - out_files = {} - primary_spec = None - - if "None" in species: - species = {} - try: - for i, m in enumerate( maf.Reader( open( input_filename, 'r' ) ) ): - for c in m.components: - spec,chrom = maf.src_split( c.src ) - if not spec or not chrom: - spec = chrom = c.src - species[spec] = "" - species = species.keys() - except: - print >>sys.stderr, "Invalid MAF file specified" - return - - if "?" in species: - print >>sys.stderr, "Invalid dbkey specified" - return - - - for i in range( 0, len( species ) ): - spec = species[i] - if i == 0: - out_files[spec] = open( output_filename, 'w' ) - primary_spec = spec - else: - out_files[spec] = tempfile.NamedTemporaryFile( mode = 'w', dir = database_tmp_dir, suffix = '.maf_to_bed' ) - filename = out_files[spec].name - out_files[spec].close() - out_files[spec] = open( filename, 'w' ) - num_species = len( species ) - - print "Restricted to species:", ",".join( species ) - - file_in = open( input_filename, 'r' ) - maf_reader = maf.Reader( file_in ) - - block_num = -1 - - for i, m in enumerate( maf_reader ): - block_num += 1 - if "None" not in species: - m = m.limit_to_species( species ) - l = m.components - if len(l) < num_species and partial == "partial_disallowed": continue - for c in l: - spec,chrom = maf.src_split( c.src ) - if not spec or not chrom: - spec = chrom = c.src - if spec not in out_files.keys(): - out_files[spec] = tempfile.NamedTemporaryFile( mode='w', dir = database_tmp_dir, suffix = '.maf_to_bed' ) - filename = out_files[spec].name - out_files[spec].close() - out_files[spec] = open( filename, 'w' ) - - if c.strand == "-": - out_files[spec].write( chrom + "\t" + str( c.src_size - c.end ) + "\t" + str( c.src_size - c.start ) + "\t" + spec + "_" + str( block_num ) + "\t" + "0\t" + c.strand + "\n" ) - else: - out_files[spec].write( chrom + "\t" + str( c.start ) + "\t" + str( c.end ) + "\t" + spec + "_" + str( block_num ) + "\t" + "0\t" + c.strand + "\n" ) - - file_in.close() - for file_out in out_files.keys(): - out_files[file_out].close() - - for spec in out_files.keys(): - if spec != primary_spec: - print "#FILE\t" + spec + "\t" + os.path.join( database_tmp_dir, os.path.split( out_files[spec].name )[1] ) - else: - print "#FILE1\t" + spec + "\t" + out_files[spec].name - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_bed.xml --- a/tools/maf/maf_to_bed.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,136 +0,0 @@ - - Converts a MAF formatted file to the BED format - maf_to_bed.py $input1 $out_file1 $species $complete_blocks $__new_file_path__ - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts every MAF block to an interval line (in BED format; scroll down for description of MAF 
and BED formats) describing the position of that alignment block within a corresponding genome. - -The interface for this tool contains two pages (steps): - - * **Step 1 of 2**. Choose multiple alignments from history to be converted to BED format. - * **Step 2 of 2**. Choose species from the alignment to be included in the output and specify how to deal with alignment blocks that lack one or more species: - - * **Choose species** - the tool reads the alignment provided during Step 1 and generates a list of species contained within that alignment. Using checkboxes you can specify taxa to be included in the output (only the reference genome, shown in **bold**, is selected by default). If you select more than one species, then more than one history item will be created. - * **Choose to include/exclude blocks with missing species** - if an alignment block does not contain any one of the species you selected within the **Choose species** menu and this option is set to **exclude blocks with missing species**, then coordinates of such a block **will not** be included in the output (see **Example 2** below). - - ------ - -**Example 1**: **Include only reference genome** (hg18 in this case) and **include blocks with missing species**: - -For the following alignment:: - - ##maf version=1 - a score=68686.000000 - s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - a score=10289.000000 - s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - -the tool will create **a single** history item containing the following (**note** that field 4 is added to the output and is numbered iteratively: hg18_0, hg18_1 etc.):: - - chr20 56827368 56827443 hg18_0 0 + - chr20 56827443 56827480 hg18_1 0 + - ------ - -**Example 2**: **Include hg18 and mm8** and **exclude blocks with missing species**: - -For the following alignment:: - - ##maf version=1 - a score=68686.000000 - s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - a score=10289.000000 - s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - -the tool will create **two** history items (one for hg18 and one for mm8) containing the
following (**note** that both history items contain only one line describing the first alignment block. The second MAF block is not included in the output because it does not contain mm8): - -History item **1** (for hg18):: - - chr20 56827368 56827443 hg18_0 0 + - -History item **2** (for mm8):: - - chr2 173910832 173910893 mm8_0 0 + - -------- - -.. class:: infomark - -**About formats** - -**MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes. - - - The .maf format is line-oriented. Each multiple alignment ends with a blank line. - - Each sequence in an alignment is on a single line. - - Lines starting with # are considered to be comments. - - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment. - - Some MAF files may contain two optional line types: - - - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line; - - An "e" line containing information about the size of the gap between the alignments that span the current block. - -**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and a number of additional optional ones: - -The first three BED fields (required) are:: - - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) - 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). - -Additional (optional) fields are:: - - 4. name - The name of the BED line. - 5. score - A score between 0 and 1000. - 6. strand - Defines the strand - either '+' or '-'. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_bed_code.py --- a/tools/maf/maf_to_bed_code.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.align import maf -from galaxy import datatypes, config, jobs -from shutil import move - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - output_data = out_data.items()[0][1] - history = output_data.history - if history == None: - print "unknown history!" 
- return - new_stdout = "" - split_stdout = stdout.split("\n") - basic_name = output_data.name - output_data_list = [] - for line in split_stdout: - if line.startswith("#FILE1"): - fields = line.split("\t") - dbkey = fields[1] - filepath = fields[2] - output_data.dbkey = dbkey - output_data.name = basic_name + " (" + dbkey + ")" - app.model.context.add( output_data ) - app.model.context.flush() - output_data_list.append(output_data) - elif line.startswith("#FILE"): - fields = line.split("\t") - dbkey = fields[1] - filepath = fields[2] - newdata = app.model.HistoryDatasetAssociation( create_dataset = True, sa_session = app.model.context ) - newdata.set_size() - newdata.extension = "bed" - newdata.name = basic_name + " (" + dbkey + ")" - app.model.context.add( newdata ) - app.model.context.flush() - history.add_dataset( newdata ) - app.security_agent.copy_dataset_permissions( output_data.dataset, newdata.dataset ) - app.model.context.add( history ) - app.model.context.flush() - try: - move(filepath,newdata.file_name) - newdata.info = newdata.name - newdata.state = newdata.states.OK - except: - newdata.info = "The requested file is missing from the system." - newdata.state = newdata.states.ERROR - newdata.dbkey = dbkey - newdata.init_meta() - newdata.set_meta() - newdata.set_peek() - app.model.context.flush() - output_data_list.append(newdata) - else: - new_stdout = new_stdout + line - for data in output_data_list: - if data.state == data.states.OK: - data.info = new_stdout - app.model.context.add( data ) - app.model.context.flush() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_fasta.xml --- a/tools/maf/maf_to_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,199 +0,0 @@ - - Converts a MAF formatted file to FASTA format - - #if $fasta_target_type.fasta_type == "multiple" #maf_to_fasta_multiple_sets.py $input1 $out_file1 $fasta_target_type.species $fasta_target_type.complete_blocks - #else #maf_to_fasta_concat.py $fasta_target_type.species $input1 $out_file1 - #end if# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Types of MAF to FASTA conversion** - - * **Multiple Blocks** converts a single MAF block to a single FASTA block. For example, if you have 6 MAF blocks, they will be converted to 6 FASTA blocks. - * **One Sequence per Species** converts MAF blocks to a single aggregated FASTA block. For example, if you have 6 MAF blocks, they will be converted and concatenated into a single FASTA block. - -------- - -**What it does** - -This tool converts MAF blocks to FASTA format and concatenates them into a single FASTA block or outputs multiple FASTA blocks separated by empty lines. - -The interface for this tool contains two pages (steps): - - * **Step 1 of 2**. Choose multiple alignments from history to be converted to FASTA format. - * **Step 2 of 2**. Choose the type of output as well as the species from the alignment to be included in the output. - - Multiple Block output has additional options: - - * **Choose species** - the tool reads the alignment provided during Step 1 and generates a list of species contained within that alignment. Using checkboxes you can specify taxa to be included in the output (all species are selected by default). 
- * **Choose to include/exclude blocks with missing species** - if an alignment block does not contain any one of the species you selected within **Choose species** menu and this option is set to **exclude blocks with missing species**, then such a block **will not** be included in the output (see **Example 2** below). For example, if you want to extract human, mouse, and rat from a series of alignments and one of the blocks does not contain mouse sequence, then this block will not be converted to FASTA and will not be returned. - - ------ - -**Example 1**: - -In the concatenated approach, the following alignment:: - - ##maf version=1 - a score=68686.000000 - s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - a score=10289.000000 - s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - -will be converted to (**note** that because mm8 (mouse) and canFam2 (dog) are absent from the second block, they are replaced with gaps after concatenation):: - - >canFam2 - CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C------------------------------------- - >hg18 - GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - >mm8 - AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------------------------------------------- - >panTro2 - GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - >rheMac2 - GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - ------- - -**Example 2a**: Multiple Block Approach **Include all species** and **include blocks with missing species**: - -The following alignment:: - - ##maf version=1 - a score=68686.000000 - s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - a score=10289.000000 - s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - -will be converted to:: - - >hg18.chr20(+):56827368-56827443|hg18_0 - GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - 
>panTro2.chr20(+):56528685-56528760|panTro2_0 - GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - >rheMac2.chr10(-):89144112-89144181|rheMac2_0 - GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - >mm8.chr2(+):173910832-173910893|mm8_0 - AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - >canFam2.chr24(+):46551822-46551889|canFam2_0 - CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - >hg18.chr20(+):56827443-56827480|hg18_1 - ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - >panTro2.chr20(+):56528760-56528797|panTro2_1 - ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - >rheMac2.chr10(-):89144181-89144218|rheMac2_1 - ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - ------ - -**Example 2b**: Multiple Block Approach **Include hg18 and mm8** and **exclude blocks with missing species**: - -The following alignment:: - - ##maf version=1 - a score=68686.000000 - s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - a score=10289.000000 - s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - -will be converted to (**note** that the second MAF block, which does not have mm8, is not included in the output):: - - >hg18.chr20(+):56827368-56827443|hg18_0 - GACAGGGTGCATCTGGGAGGGCCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC - >mm8.chr2(+):173910832-173910893|mm8_0 - AGAAGGATCCACCT---------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------ - ------- - -.. class:: infomark - -**About formats** - - **MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes. - - - The .maf format is line-oriented. Each multiple alignment ends with a blank line. - - Each sequence in an alignment is on a single line. - - Lines starting with # are considered to be comments. - - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment. - - Some MAF files may contain two optional line types: - - - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line; - - An "e" line containing information about the size of the gap between the alignments that span the current block. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. 
<http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_fasta_concat.py --- a/tools/maf/maf_to_fasta_concat.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -#!/usr/bin/env python - -""" -Read a maf and output a single block fasta file, concatenating blocks - -usage: %prog species1,species2 maf_file out_file -""" -#Dan Blankenberg -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.align import maf -from galaxy.tools.util import maf_utilities - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - try: - species = maf_utilities.parse_species_option( sys.argv[1] ) - except Exception, e: - maf_utilities.tool_fail( "Error determining species value: %s" % e ) - try: - input_filename = sys.argv[2] - except Exception, e: - maf_utilities.tool_fail( "Error reading MAF filename: %s" % e ) - try: - file_out = open( sys.argv[3], 'w' ) - except Exception, e: - maf_utilities.tool_fail( "Error opening file for output: %s" % e ) - - if species: - print "Restricted to species: %s" % ', '.join( species ) - else: - print "Not restricted to species." - - if not species: - try: - species = maf_utilities.get_species_in_maf( input_filename ) - except Exception, e: - maf_utilities.tool_fail( "Error determining species in input MAF: %s" % e ) - - for spec in species: - file_out.write( ">" + spec + "\n" ) - try: - for start_block in maf.Reader( open( input_filename, 'r' ) ): - for block in maf_utilities.iter_blocks_split_by_species( start_block ): - block.remove_all_gap_columns() #remove extra gaps - component = block.get_component_by_src_start( spec ) #blocks only have one occurrence of a particular species, so this is safe - if component: - file_out.write( component.text ) - else: - file_out.write( "-" * block.text_size ) - except Exception, e: - maf_utilities.tool_fail( "Your MAF file appears to be malformed: %s" % e ) - file_out.write( "\n" ) - file_out.close() - - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_fasta_multiple_sets.py --- a/tools/maf/maf_to_fasta_multiple_sets.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ -#!/usr/bin/env python - -""" -Read a maf and output a multiple block fasta file. -""" -#Dan Blankenberg -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.align import maf -from galaxy.tools.util import maf_utilities - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - try: - maf_reader = maf.Reader( open( sys.argv[1] ) ) - except Exception, e: - maf_utilities.tool_fail( "Error opening input MAF: %s" % e ) - try: - file_out = open( sys.argv[2], 'w' ) - except Exception, e: - maf_utilities.tool_fail( "Error opening file for output: %s" % e ) - try: - species = maf_utilities.parse_species_option( sys.argv[3] ) - if species: - num_species = len( species ) - else: - num_species = 0 - except Exception, e: - maf_utilities.tool_fail( "Error determining species value: %s" % e ) - try: - partial = sys.argv[4] - except Exception, e: - maf_utilities.tool_fail( "Error determining keep partial value: %s" % e ) - - if species: - print "Restricted to species: %s" % ', '.join( species ) - else: - print "Not restricted to species."
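- # main loop: limit each block to the chosen species, skip blocks missing a requested species when partial output is disallowed, and write one FASTA record per component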
- - for block_num, block in enumerate( maf_reader ): - if species: - block = block.limit_to_species( species ) - if len( maf_utilities.get_species_in_block( block ) ) < num_species and partial == "partial_disallowed": continue - spec_counts = {} - for component in block.components: - spec, chrom = maf_utilities.src_split( component.src ) - if spec not in spec_counts: - spec_counts[ spec ] = 0 - else: - spec_counts[ spec ] += 1 - file_out.write( "%s\n" % maf_utilities.get_fasta_header( component, { 'block_index' : block_num, 'species' : spec, 'sequence_index' : spec_counts[ spec ] }, suffix = "%s_%i_%i" % ( spec, block_num, spec_counts[ spec ] ) ) ) - file_out.write( "%s\n" % component.text ) - file_out.write( "\n" ) - file_out.close() - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_interval.py --- a/tools/maf/maf_to_interval.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ -#!/usr/bin/env python - -""" -Read a maf and output intervals for specified list of species. -""" -import sys, os -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.align import maf -from galaxy.tools.util import maf_utilities - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - input_filename = sys.argv[1] - output_filename = sys.argv[2] - output_id = sys.argv[3] - #where to store files that become additional output - database_tmp_dir = sys.argv[4] - primary_spec = sys.argv[5] - species = sys.argv[6].split( ',' ) - all_species = sys.argv[7].split( ',' ) - partial = sys.argv[8] - keep_gaps = sys.argv[9] - out_files = {} - - if "None" in species: - species = [] - - if primary_spec not in species: - species.append( primary_spec ) - if primary_spec not in all_species: - all_species.append( primary_spec ) - - all_species.sort() - for spec in species: - if spec == primary_spec: - out_files[ spec ] = open( output_filename, 'wb+' ) - else: - out_files[ spec ] = open( os.path.join( database_tmp_dir, 'primary_%s_%s_visible_interval_%s' % ( output_id, spec, spec ) ), 'wb+' ) - out_files[ spec ].write( '#chrom\tstart\tend\tstrand\tscore\tname\t%s\n' % ( '\t'.join( all_species ) ) ) - num_species = len( all_species ) - - file_in = open( input_filename, 'r' ) - maf_reader = maf.Reader( file_in ) - - for i, m in enumerate( maf_reader ): - for j, block in enumerate( maf_utilities.iter_blocks_split_by_species( m ) ): - if len( block.components ) < num_species and partial == "partial_disallowed": continue - sequences = {} - for c in block.components: - spec, chrom = maf_utilities.src_split( c.src ) - if keep_gaps == 'remove_gaps': - sequences[ spec ] = c.text.replace( '-', '' ) - else: - sequences[ spec ] = c.text - sequences = '\t'.join( [ sequences.get( spec, '' ) for spec in all_species ] ) - for spec in species: - c = block.get_component_by_src_start( spec ) - if c is not None: - spec2, chrom = maf_utilities.src_split( c.src ) - assert spec2 == spec, Exception( 'Species name inconsistency found in component: %s != %s' % ( spec, spec2 ) ) - out_files[ spec ].write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( chrom, c.forward_strand_start, c.forward_strand_end, c.strand, m.score, "%s_%s_%s" % ( spec, i, j ), sequences ) ) - file_in.close() - for file_out in out_files.values(): - file_out.close() - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_interval.xml --- a/tools/maf/maf_to_interval.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00
1970 +0000 @@ -1,133 +0,0 @@ - - Converts a MAF formatted file to the Interval format - maf_to_interval.py $input1 $out_file1 $out_file1.id $__new_file_path__ $input1.dbkey $species $input1.metadata.species $complete_blocks $remove_gaps - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts every MAF block to a set of genomic intervals describing the position of that alignment block within a corresponding genome. Sequences from aligning species are also included in the output. - -The interface for this tool contains several options: - - * **MAF file to convert**. Choose multiple alignments from history to be converted to the interval format. - * **Choose species**. Choose additional species from the alignment to be included in the output. - * **Exclude blocks which have a species missing**. If an alignment block does not contain any one of the species found in the alignment set and this option is set to **exclude blocks with missing species**, then coordinates of such a block **will not** be included in the output (see **Example 2** below). - * **Remove Gap characters from sequences**. Gaps can be removed from sequences before they are output. - - ------ - -**Example 1**: **Include only reference genome** (hg18 in this case) and **include blocks with missing species**: - -For the following alignment:: - - ##maf version=1 - a score=68686.000000 - s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - a score=10289.000000 - s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - -the tool will create **a single** history item containing the following (**note** the name field is numbered iteratively: hg18_0_0, hg18_1_0 etc.
where the first number is the block number and the second number is the iteration through the block (if a species appears twice in a block, that interval will be repeated) and sequences for each species are included in the order specified in the header: the field is left empty when no sequence is available for that species):: - - #chrom start end strand score name canFam2 hg18 mm8 panTro2 rheMac2 - chr20 56827368 56827443 + 68686.0 hg18_0_0 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - chr20 56827443 56827480 + 10289.0 hg18_1_0 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - - ------ - -**Example 2**: **Include hg18 and mm8** and **exclude blocks with missing species**: - -For the following alignment:: - - ##maf version=1 - a score=68686.000000 - s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - a score=10289.000000 - s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - -the tool will create **two** history items (one for hg18 and one for mm8) containing the following (**note** that both history items contain only one line describing the first alignment block. The second MAF block is not included in the output because it does not contain mm8): - -History item **1** (for hg18):: - - #chrom start end strand score name canFam2 hg18 mm8 panTro2 rheMac2 - chr20 56827368 56827443 + 68686.0 hg18_0_0 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - - -History item **2** (for mm8):: - - #chrom start end strand score name canFam2 hg18 mm8 panTro2 rheMac2 - chr2 173910832 173910893 + 68686.0 mm8_0_0 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - - -------- - -.. class:: infomark - -**About formats** - -**MAF format** multiple alignment format file. 
This format stores multiple alignments at the DNA level between entire genomes. - - - The .maf format is line-oriented. Each multiple alignment ends with a blank line. - - Each sequence in an alignment is on a single line. - - Lines starting with # are considered to be comments. - - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment. - - Some MAF files may contain two optional line types: - - - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line; - - An "e" line containing information about the size of the gap between the alignments that span the current block. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/vcf_to_maf_customtrack.py --- a/tools/maf/vcf_to_maf_customtrack.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,151 +0,0 @@ -#Dan Blankenberg -from optparse import OptionParser -import sys -import galaxy_utils.sequence.vcf - -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.align.maf - -UNKNOWN_NUCLEOTIDE = '*' - -class PopulationVCFParser( object ): - def __init__( self, reader, name ): - self.reader = reader - self.name = name - self.counter = 0 - def next( self ): - rval = [] - vc = self.reader.next() - for i, allele in enumerate( vc.alt ): - rval.append( ( '%s_%i.%i' % ( self.name, i + 1, self.counter + 1 ), allele ) ) - self.counter += 1 - return ( vc, rval ) - def __iter__( self ): - while True: - yield self.next() - -class SampleVCFParser( object ): - def __init__( self, reader ): - self.reader = reader - self.counter = 0 - def next( self ): - rval = [] - vc = self.reader.next() - alleles = [ vc.ref ] + vc.alt - - if 'GT' in vc.format: - gt_index = vc.format.index( 'GT' ) - for sample_name, sample_value in zip( vc.sample_names, vc.sample_values ): - gt_indexes = [] - for i in sample_value[ gt_index ].replace( '|', '/' ).replace( '\\', '/' ).split( '/' ): #Do we need to consider phase here? 
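- # non-numeric genotype calls (e.g. '.') are appended as None and skipped below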
- try: - gt_indexes.append( int( i ) ) - except: - gt_indexes.append( None ) - for i, allele_i in enumerate( gt_indexes ): - if allele_i is not None: - rval.append( ( '%s_%i.%i' % ( sample_name, i + 1, self.counter + 1 ), alleles[ allele_i ] ) ) - self.counter += 1 - return ( vc, rval ) - def __iter__( self ): - while True: - yield self.next() - -def main(): - usage = "usage: %prog [options] output_file dbkey inputfile pop_name" - parser = OptionParser( usage=usage ) - parser.add_option( "-p", "--population", action="store_true", dest="population", default=False, help="Create MAF on a per population basis") - parser.add_option( "-s", "--sample", action="store_true", dest="sample", default=False, help="Create MAF on a per sample basis") - parser.add_option( "-n", "--name", dest="name", default='Unknown Custom Track', help="Name for Custom Track") - parser.add_option( "-g", "--galaxy", action="store_true", dest="galaxy", default=False, help="Tool is being executed by Galaxy (adds extra error messaging).") - - - ( options, args ) = parser.parse_args() - - if len ( args ) < 3: - if options.galaxy: - print >>sys.stderr, "It appears that you forgot to specify an input VCF file, click 'Add new VCF...' to add at least one input.\n" - parser.error( "Need to specify an output file, a dbkey and at least one input file" ) - - if not ( options.population ^ options.sample ): - parser.error( 'You must specify either a per population conversion or a per sample conversion, but not both' ) - - out = open( args.pop(0), 'wb' ) - out.write( 'track name="%s" visibility=pack\n' % options.name.replace( "\"", "'" ) ) - - maf_writer = bx.align.maf.Writer( out ) - - dbkey = args.pop(0) - - vcf_files = [] - if options.population: - i = 0 - while args: - filename = args.pop( 0 ) - pop_name = args.pop( 0 ).replace( ' ', '_' ) - if not pop_name: - pop_name = 'population_%i' % ( i + 1 ) - vcf_files.append( PopulationVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ), pop_name ) ) - i += 1 - else: - while args: - filename = args.pop( 0 ) - vcf_files.append( SampleVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ) ) ) - - non_spec_skipped = 0 - for vcf_file in vcf_files: - for vc, variants in vcf_file: - num_ins = 0 - num_dels = 0 - for variant_name, variant_text in variants: - if 'D' in variant_text: - num_dels = max( num_dels, int( variant_text[1:] ) ) - elif 'I' in variant_text: - num_ins = max( num_ins, len( variant_text ) - 1 ) - - alignment = bx.align.maf.Alignment() - ref_text = vc.ref + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) ) - start_pos = vc.pos - 1 - if num_dels and start_pos: - ref_text = UNKNOWN_NUCLEOTIDE + ref_text - start_pos -= 1 - alignment.add_component( bx.align.maf.Component( src='%s.%s%s' % ( - dbkey, ("chr" if not vc.chrom.startswith("chr") else ""), vc.chrom ), - start = start_pos, size = len( ref_text.replace( '-', '' ) ), - strand = '+', src_size = start_pos + len( ref_text ), - text = ref_text ) ) - for variant_name, variant_text in variants: - #FIXME: - ## skip non-spec. compliant data, see: http://1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3 for format spec - ## this check is due to data having indels not represented in the published format spec, - ## e.g.
1000 genomes pilot 1 indel data: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/indels/CEU.SRP000031.2010_03.indels.sites.vcf.gz - if variant_text and variant_text[0] in [ '-', '+' ]: - non_spec_skipped += 1 - continue - - #do we need a left padding unknown nucleotide (do we have deletions)? - if num_dels and start_pos: - var_text = UNKNOWN_NUCLEOTIDE - else: - var_text = '' - if 'D' in variant_text: - cur_num_del = int( variant_text[1:] ) - pre_del = min( len( vc.ref ), cur_num_del ) - post_del = cur_num_del - pre_del - var_text = var_text + '-' * pre_del + '-' * num_ins + '-' * post_del - var_text = var_text + UNKNOWN_NUCLEOTIDE * ( len( ref_text ) - len( var_text ) ) - elif 'I' in variant_text: - cur_num_ins = len( variant_text ) - 1 - var_text = var_text + vc.ref + variant_text[1:] + '-' * ( num_ins - cur_num_ins ) + UNKNOWN_NUCLEOTIDE * max( 0, ( num_dels - 1 ) ) - else: - var_text = var_text + variant_text + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) ) - alignment.add_component( bx.align.maf.Component( src=variant_name, start = 0, size = len( var_text.replace( '-', '' ) ), strand = '+', src_size = len( var_text.replace( '-', '' ) ), text = var_text ) ) - maf_writer.write( alignment ) - - maf_writer.close() - - if non_spec_skipped: - print 'Skipped %i non-specification compliant indels.' % non_spec_skipped - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/vcf_to_maf_customtrack.xml --- a/tools/maf/vcf_to_maf_customtrack.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,127 +0,0 @@ - - for display at UCSC - vcf_to_maf_customtrack.py '$out_file1' - #if $vcf_source_type.vcf_file - '${vcf_source_type.vcf_file[0].vcf_input.dbkey}' - #else - '?' - #end if - ${vcf_source_type.vcf_source} -n '$track_name' - #for $vcf_repeat in $vcf_source_type.vcf_file - '${vcf_repeat.vcf_input}' - #if $vcf_source_type.vcf_source == '-p' - '${vcf_repeat.population_name}' - #end if - #end for - -g - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts a Variant Call Format (VCF) file into a Multiple Alignment Format (MAF) custom track file suitable for display at genome browsers. - -This file should be used for display purposes only (e.g. as a UCSC Custom Track). Performing an analysis using the output created by this tool as input is not recommended; the source VCF file should be used when performing an analysis. - -*Unknown nucleotides* are represented as '*' as required to allow the display to draw properly; these include e.g. reference bases which appear before a deletion and are not available without querying the original reference sequence.
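For orientation, the command line built for the example below would look as follows (population mode; file names are hypothetical, options and argument order as in main() above)::

    python vcf_to_maf_customtrack.py -p -n 'Galaxy Custom Track' out_track.maf hg18 input.vcf CHB+JPT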
- -**Example** - -Starting with a VCF:: - - ##fileformat=VCFv3.3 - ##fileDate=20090805 - ##source=myImputationProgramV3.1 - ##reference=1000GenomesPilot-NCBI36 - ##phasing=partial - ##INFO=NS,1,Integer,"Number of Samples With Data" - ##INFO=DP,1,Integer,"Total Depth" - ##INFO=AF,-1,Float,"Allele Frequency" - ##INFO=AA,1,String,"Ancestral Allele" - ##INFO=DB,0,Flag,"dbSNP membership, build 129" - ##INFO=H2,0,Flag,"HapMap2 membership" - ##FILTER=q10,"Quality below 10" - ##FILTER=s50,"Less than 50% of samples have data" - ##FORMAT=GT,1,String,"Genotype" - ##FORMAT=GQ,1,Integer,"Genotype Quality" - ##FORMAT=DP,1,Integer,"Read Depth" - ##FORMAT=HQ,2,Integer,"Haplotype Quality" - #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 - 20 14370 rs6054257 G A 29 0 NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:-1,-1 - 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:-1,-1 - 20 1110696 rs6040355 A G,T 67 0 NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:-1,-1 - 20 1230237 . T . 47 0 NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2:-1,-1 - 20 1234567 microsat1 G D4,IGA 50 0 NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 - - - - -Under the following conditions: **VCF Source type:** *Per Population (file)*, **Name for this population:** *CHB+JPT* -Results in the following MAF custom track:: - - track name="Galaxy Custom Track" visibility=pack - ##maf version=1 - a score=0 - s hg18.chr20 14369 1 + 14370 G - s CHB+JPT_1.1 0 1 + 1 A - - a score=0 - s hg18.chr20 17329 1 + 17330 T - s CHB+JPT_1.2 0 1 + 1 A - - a score=0 - s hg18.chr20 1110695 1 + 1110696 A - s CHB+JPT_1.3 0 1 + 1 G - s CHB+JPT_2.3 0 1 + 1 T - - a score=0 - s hg18.chr20 1230236 1 + 1230237 T - s CHB+JPT_1.4 0 1 + 1 . 
- - a score=0 - s hg18.chr20 1234565 5 + 1234572 *G--*** - s CHB+JPT_1.5 0 1 + 1 *------ - s CHB+JPT_2.5 0 7 + 7 *GGA*** - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/meme/._meme.xml Binary file tools/meme/._meme.xml has changed diff -r c2a356708570 -r 33c067c3ae34 tools/meme/fimo.xml --- a/tools/meme/fimo.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,230 +0,0 @@ - - - Find Individual Motif Occurrences - fimo_wrapper.py 'fimo --o "${$html_outfile.files_path}" --verbosity "1" - - #if str( $options_type.options_type_selector ) == 'advanced': - --max-seq-length "${options_type.max_seq_length}" - --max-stored-scores "${options_type.max_stored_scores }" - --motif-pseudo "${options_type.motif_pseudo}" - ${options_type.norc} - --output-pthresh "${options_type.output_pthresh}" - - - #for $motif in $options_type.motifs: - --motif "${motif.motif}" - #end for - - #if str( $options_type.bgfile_type.bgfile_type_selector ) == 'motif-file': - --bgfile "motif-file" - #elif str( $options_type.bgfile_type.bgfile_type_selector ) == 'motif-file': - --bgfile "${options_type.bgfile_type.bgfile}" - #end if - - #if str( $options_type.qvalue_type.qvalue_type_selector ) == 'no-qvalue': - --no-qvalue - #else: - --output-qthresh "${options_type.qvalue_type.output_qthresh}" - #end if - #end if - - "${input_motifs}" - - #if str( $fasta_type.fasta_type_selector ) == 'history': - "${fasta_type.input_database}" - #else: - "${ filter( lambda x: str( x[0] ) == str( $fasta_type.input_database ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][3] }" - #end if - - ' - - '${html_outfile.files_path}' - - '${html_outfile}' - - '${interval_outfile}' - - '${txt_outfile}' - - '${xml_outfile}' - - '${gff_outfile}' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - value == True - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**WARNING: This tool is only available for non-commercial use. Use for educational, research and non-profit purposes is permitted. Before using, be sure to review, agree, and comply with the license.** - -.. class:: infomark - -**To cite FIMO:** -`Grant CE, Bailey TL, Noble WS. FIMO: scanning for occurrences of a given motif. Bioinformatics. 2011 Apr 1;27(7):1017-8. <http://www.ncbi.nlm.nih.gov/pubmed/21330290>`_ - - -For detailed information on FIMO, click here_. To view the license_. - -.. _here: http://meme.nbcr.net/meme/fimo-intro.html -.. _license: http://meme.nbcr.net/meme/COPYRIGHT.html - - - diff -r c2a356708570 -r 33c067c3ae34 tools/meme/fimo_wrapper.py --- a/tools/meme/fimo_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg - -""" -Read text output from FIMO and create an interval file. 
-""" -import sys, tempfile, subprocess, shutil, os -from galaxy_utils.sequence.transform import DNA_reverse_complement - -buffsize = 1048576 - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - assert len( sys.argv ) == 8, "Wrong number of arguments" - sys.argv.pop(0) - fimo_cmd = sys.argv.pop(0) - html_path = sys.argv.pop(0) - html_out = sys.argv.pop(0) - interval_out = sys.argv.pop(0) - txt_out = sys.argv.pop(0) - xml_out = sys.argv.pop(0) - gff_out = sys.argv.pop(0) - - #run fimo - try: - tmp_stderr = tempfile.NamedTemporaryFile() - #tmp_stderr = open( tmp_filename, 'wb' ) - proc = subprocess.Popen( args=fimo_cmd, shell=True, stderr=tmp_stderr ) - returncode = proc.wait() - #tmp_stderr.close() - # get stderr, allowing for case where it's very large - #tmp_stderr = open( tmp, 'rb' ) - tmp_stderr.seek(0) - stderr = '' - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - - if returncode != 0: - raise Exception, stderr - except Exception, e: - raise Exception, 'Error running FIMO:\n' + str( e ) - - shutil.move( os.path.join( html_path, 'fimo.txt' ), txt_out ) - shutil.move( os.path.join( html_path, 'fimo.gff' ), gff_out ) - shutil.move( os.path.join( html_path, 'fimo.xml' ), xml_out ) - shutil.move( os.path.join( html_path, 'fimo.html' ), html_out ) - - out_file = open( interval_out, 'wb' ) - out_file.write( "#%s\n" % "\t".join( ( "chr", "start", "end", "pattern name", "score", "strand", "matched sequence", "p-value", "q-value" ) ) ) - for line in open( txt_out ): - if line.startswith( '#' ): continue - fields = line.rstrip( "\n\r" ).split( "\t" ) - start, end = int( fields[2] ), int( fields[3] ) - sequence = fields[7] - if start > end: - start, end = end, start #flip start and end, and set strand - strand = "-" - sequence = DNA_reverse_complement( sequence ) #we want sequences relative to strand; FIMO always provides + stranded sequence - else: - strand = "+" - start -= 1 #make 0-based start position - out_file.write( "%s\n" % "\t".join( [ fields[1], str( start ), str( end ), fields[0], fields[4], strand, sequence, fields[5], fields[6] ] ) ) - out_file.close() - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/meme/meme.xml --- a/tools/meme/meme.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,349 +0,0 @@ - - meme - - Multiple Em for Motif Elicitation - meme "$input1" -o "${html_outfile.files_path}" - -nostatus - - ##-p 8 ##number of processors - - #if str( $options_type.options_type_selector ) == 'advanced': - -sf "${ str( $options_type.sf ).replace( ' ', '_' ) }" - -${options_type.alphabet_type.alphabet_type_selector} - -mod "${options_type.mod_type.mod_type_selector}" - -nmotifs "${options_type.nmotifs}" - -wnsites "${options_type.wnsites}" - -maxsize "${options_type.maxsize}" - - #if $options_type.evt < float('inf'): - -evt "${options_type.evt}" - #end if - - #if str( $options_type.mod_type.mod_type_selector ) != 'oops': - #if str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == 'nsites': - -nsites "${options_type.mod_type.motif_occurrence_type.nsites}" - #elif str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == 'min_max_sites': - -minsites "${options_type.mod_type.motif_occurrence_type.minsites}" -maxsites "${options_type.mod_type.motif_occurrence_type.maxsites}" - #end if - #end if - - #if str( 
$options_type.motif_width_type.motif_width_type_selector ) == 'exact': - -w "${options_type.motif_width_type.width}" - #else - -minw "${options_type.motif_width_type.minw}" -maxw "${options_type.motif_width_type.maxw}" - #end if - - #if str( $options_type.motif_trim_type.motif_trim_type_selector ) == 'nomatrim': - -nomatrim - #else - -wg "${options_type.motif_trim_type.wg}" -ws "${options_type.motif_trim_type.ws}" ${options_type.motif_trim_type.noendgaps} - #end if - - #if str( $options_type.bfile ) != 'None': - -bfile "${options_type.bfile}" - #end if - - #if str( $options_type.pspfile ) != 'None': - -psp "${options_type.pspfile}" - #end if - - #if str( $options_type.alphabet_type.alphabet_type_selector ) == "dna": - ${options_type.alphabet_type.revcomp} ${options_type.alphabet_type.pal} - #end if - - -maxiter "${options_type.maxiter}" -distance "${options_type.distance}" - - -prior "${options_type.alphabet_type.prior_type.prior_type_selector}" - #if str( $options_type.alphabet_type.prior_type.prior_type_selector ) != 'addone': - -b "${options_type.alphabet_type.prior_type.prior_b}" - #if str( $options_type.alphabet_type.prior_type.plib ) != 'None': - -plib "${options_type.alphabet_type.prior_type.plib}" - #end if - #end if - - #if str( $options_type.alphabet_type.spmap_type.spmap_type_selector ) == 'cons': - -cons "${options_type.alphabet_type.spmap_type.cons}" - #else - -spmap "${options_type.alphabet_type.spmap_type.spmap_type_selector}" - -spfuzz "${options_type.alphabet_type.spmap_type.spfuzz}" - #end if - - #if str( $options_type.branching_type.branching_type_selector ) == 'x_branch': - -x_branch -bfactor "${options_type.branching_type.bfactor}" -heapsize "${options_type.branching_type.heapsize}" - #end if - - ##-maxsize "1000000" ##remove hardcoded maxsize? should increase number of processors instead - - #end if - - 2>&1 || echo "Error running MEME." - - - && mv ${html_outfile.files_path}/meme.html ${html_outfile} - - && mv ${html_outfile.files_path}/meme.txt ${txt_outfile} - - && mv ${html_outfile.files_path}/meme.xml ${xml_outfile} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - value == True - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**WARNING: This tool is only available for non-commercial use. Use for educational, research and non-profit purposes is permitted. Before using, be sure to review, agree, and comply with the license.** - -If you want to specify sequence weights, you must include them at the top of your input FASTA file. - -.. class:: infomark - -**To cite MEME:** -Timothy L. Bailey and Charles Elkan, "Fitting a mixture model by expectation maximization to discover motifs in biopolymers", Proceedings of the Second International Conference on Intelligent Systems for Molecular Biology, pp. 28-36, AAAI Press, Menlo Park, California, 1994. - - -For detailed information on MEME, click here_. To view the license_. - -.. _here: http://meme.nbcr.net/meme/meme-intro.html -.. 
_license: http://meme.nbcr.net/meme/COPYRIGHT.html - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/blat_coverage_report.py --- a/tools/metag_tools/blat_coverage_report.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,107 +0,0 @@ -#!/usr/bin/env python - -import os, sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def reverse_complement(s): - complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":"."} - reversed_s = [] - for i in s: - reversed_s.append(complement_dna[i]) - reversed_s.reverse() - return "".join(reversed_s) - -def __main__(): - nuc_index = {'a':0,'t':1,'c':2,'g':3} - diff_hash = {} # key = (chrom, index) - infile = sys.argv[1] - outfile = sys.argv[2] - invalid_lines = 0 - invalid_chars = 0 - data_id = '' - data_seq = '' - - for i, line in enumerate( open( infile ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - fields = line.split() - if len(fields) != 23: # standard number of pslx columns - invalid_lines += 1 - continue - if not fields[0].isdigit(): - invalid_lines += 1 - continue - read_id = fields[9] - chrom = fields[13] - try: - block_count = int(fields[17]) - except: - invalid_lines += 1 - continue - block_size = fields[18].split(',') - read_start = fields[19].split(',') - chrom_start = fields[20].split(',') - read_seq = fields[21].split(',') - chrom_seq = fields[22].split(',') - - for j in range(block_count): - try: - this_block_size = int(block_size[j]) - this_read_start = int(read_start[j]) - this_chrom_start = int(chrom_start[j]) - except: - invalid_lines += 1 - break - this_read_seq = read_seq[j] - this_chrom_seq = chrom_seq[j] - - if not this_read_seq.isalpha(): - continue - if not this_chrom_seq.isalpha(): - continue - - # brute force to check coverage - for k in range(this_block_size): - cur_index = this_chrom_start+k - sub_a = this_read_seq[k:(k+1)].lower() - sub_b = this_chrom_seq[k:(k+1)].lower() - if not diff_hash.has_key((chrom, cur_index)): - try: - diff_hash[(chrom, cur_index)] = [0,0,0,0,sub_b.upper()] # a, t, c, g, ref. nuc. - except Exception, e: - stop_err( str( e ) ) - if sub_a in ['a','t','c','g']: - diff_hash[(chrom, cur_index)][nuc_index[(sub_a)]] += 1 - else: - invalid_chars += 1 - - outputfh = open(outfile, 'w') - outputfh.write( "##title\tlocation\tref.\tcov.\tA\tT\tC\tG\n" ) - keys = diff_hash.keys() - keys.sort() - for i in keys: - (chrom, location) = i - sum = diff_hash[ (i) ][ 0 ] + diff_hash[ ( i ) ][ 1 ] + diff_hash[ ( i ) ][ 2 ] + diff_hash[ ( i ) ][ 3 ] # did not include N's - if sum == 0: - continue - ratio_A = diff_hash[ ( i ) ][ 0 ] * 100.0 / sum - ratio_T = diff_hash[ ( i ) ][ 1 ] * 100.0 / sum - ratio_C = diff_hash[ ( i ) ][ 2 ] * 100.0 / sum - ratio_G = diff_hash[ ( i ) ][ 3 ] * 100.0 / sum - (title_head, title_tail) = os.path.split(chrom) - result = "%s\t%s\t%s\t%d\tA(%0.0f)\tT(%0.0f)\tC(%0.0f)\tG(%0.0f)\n" % ( title_tail, location, diff_hash[(i)][4], sum, ratio_A, ratio_T, ratio_C, ratio_G ) - outputfh.write(result) - outputfh.close() - - if invalid_lines: - print 'Skipped %d invalid lines. ' % ( invalid_lines ) - if invalid_chars: - print 'Skipped %d invalid characters in the alignment.
' % (invalid_chars) - -if __name__ == '__main__': __main__() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/blat_coverage_report.xml --- a/tools/metag_tools/blat_coverage_report.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ - - the percentage of reads supporting each nucleotide at each location - blat_coverage_report.py $input1 $output1 - - - - - - - - - - - - - - -.. class:: warningmark - -**IMPORTANT**. Only works for BLAT **standard** or **pslx** output formats (hint: to output pslx format, add **-out=pslx** in the command). - ------ - -**What it does** - - The tool will generate a table of 8 columns as follows: - -- 1st column: chromosome id. - -- 2nd column: chromosome location. - -- 3rd column: the nucleotide from reference genome at the chromosome location (2nd column). - -- 4th column: total coverage of the reads (number of reads that were mapped to the chromosome location). - -- 5th column: percentage of reads that support nucleotide **A** at this location. - -- 6th column: percentage of reads that support nucleotide **T** at this location. - -- 7th column: percentage of reads that support nucleotide **C** at this location. - -- 8th column: percentage of reads that support nucleotide **G** at this location. - - ------ - -**Example** - -- The BLAT pslx results look like the following (tab separated with sequence at the end):: - - 30 0 0 0 0 0 0 0 + seq0 30 0 30 chr 4639675 4549207 4549237 1 30, 0, 4549207, cggacagcgccgccaccaacaaagccacca, cggacagcgccgccaccaacaaagccacca, - 30 0 0 0 0 0 0 0 + seq1 30 0 30 chr 4639675 614777 614807 1 30, 0, 614777, aaaacaccggatgctccggcgctggcagat, aaaacaccggatgctccggcgctggcagat, - 28 1 0 0 0 0 0 0 + seq2 30 0 29 chr 4639675 3289283 3289312 1 29, 0, 3289283, tttgcttttagtacaccggattcagaacc, tttgctttcagtacaccggattcagaacc, - 30 0 0 0 0 0 0 0 + seq4 30 0 30 chr 4639675 2665584 2665614 1 30, 0, 2665584, cacgctacgtgcgcccccgcccagaaggcg, cacgctacgtgcgcccccgcccagaaggcg, - - The 14th column is the chromosome id, and the 16th and 17th columns show the chromosome start and end locations to which the reads were mapped. - -- The report shows the overall coverage of reads on each chromosome location (partial result):: - - +-------+----------+------+------+--------+------+--------+------+ - | title | location | ref. | cov. | A | T | C | G | - +-------+----------+------+------+--------+------+--------+------+ - | chr | 614777 | A | 1 | A(100) | T(0) | C(0) | G(0) | - | chr | 614778 | A | 1 | A(100) | T(0) | C(0) | G(0) | - | chr | 614779 | A | 1 | A(100) | T(0) | C(0) | G(0) | - +-------+----------+------+------+--------+------+--------+------+ - ------ - -**Reference** - - **BLAT**: Kent, W James, BLAT--the BLAST-like alignment tool. (2002) Genome Research:12(4) 656-664.
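The per-location percentages in the table above are plain vote fractions over the covered reads; a minimal standalone sketch of the same arithmetic (made-up counts; N's excluded, as in the script above)::

    counts = { 'A': 1, 'T': 0, 'C': 0, 'G': 0 }  # reads observed at one location
    total = counts['A'] + counts['T'] + counts['C'] + counts['G']
    if total:
        ratios = dict( ( nuc, 100.0 * n / total ) for nuc, n in counts.items() )
        # -> A(100) T(0) C(0) G(0), matching the 614777 row above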
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/blat_mapping.py --- a/tools/metag_tools/blat_mapping.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,81 +0,0 @@ -#!/usr/bin/env python - -import os, sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def reverse_complement(s): - complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":"."} - reversed_s = [] - for i in s: - reversed_s.append(complement_dna[i]) - reversed_s.reverse() - return "".join(reversed_s) - -def __main__(): - nuc_index = {'a':0,'t':1,'c':2,'g':3,'n':4} - coverage = {} # key = (chrom, index) - invalid_lines = 0 - invalid_chrom = 0 - infile = sys.argv[1] - outfile = sys.argv[2] - - for i, line in enumerate( open( infile ) ): - line = line.rstrip('\r\n') - if not line or line.startswith('#'): - continue - fields = line.split() - if len(fields) < 21: # standard number of pslx columns - invalid_lines += 1 - continue - if not fields[0].isdigit(): - invalid_lines += 1 - continue - chrom = fields[13] - if not chrom.startswith( 'chr' ): - invalid_lines += 1 - invalid_chrom += 1 - continue - try: - block_count = int(fields[17]) - except: - invalid_lines += 1 - continue - block_size = fields[18].split(',') - chrom_start = fields[20].split(',') - - for j in range( block_count ): - try: - this_block_size = int(block_size[j]) - this_chrom_start = int(chrom_start[j]) - except: - invalid_lines += 1 - break - # brute force coverage - for k in range( this_block_size ): - cur_index = this_chrom_start + k - if coverage.has_key( ( chrom, cur_index ) ): - coverage[(chrom, cur_index)] += 1 - else: - coverage[(chrom, cur_index)] = 1 - - # generate an index file - outputfh = open(outfile, 'w') - keys = coverage.keys() - keys.sort() - previous_chrom = '' - for i in keys: - (chrom, location) = i - sum = coverage[(i)] - if chrom != previous_chrom: - outputfh.write( 'variableStep chrom=%s\n' % ( chrom ) ) - previous_chrom = chrom - outputfh.write( "%s\t%s\n" % ( location, sum ) ) - outputfh.close() - - if invalid_lines: - invalid_msg = "Skipped %d invalid lines" % invalid_lines - if invalid_chrom: - invalid_msg += ", including %d lines whose chrom id does not begin with 'chr' (required to map correctly to the UCSC Genome Browser). " % invalid_chrom - print invalid_msg - -if __name__ == '__main__': __main__() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/blat_mapping.xml --- a/tools/metag_tools/blat_mapping.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - in wiggle format - blat_mapping.py $input1 $output1 - - - - - - - - - - - - - - -.. class:: warningmark - - To generate acceptable files, please use the alignment program **BLAT** with the option **-out=pslx**. - -.. class:: warningmark - - Please edit the database information by clicking on the pencil icon next to your dataset. Select the corresponding genome build. - ------ - -**What it does** - - This tool takes **BLAT pslx** output and returns a wig-like file showing the number of reads (coverage) mapped at each chromosome location. Use the **Graph/Display Data --> Build custom track** tool to show the coverage mapping in the UCSC Genome Browser. - ------ - -**Example** - - Showing read coverage on human chromosome 22 (partial result) in a UCSC Genome Browser Custom Track: - - ..
image:: ./static/images/blat_mapping_example.png :width: 600 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/blat_wrapper.py --- a/tools/metag_tools/blat_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ -#!/usr/bin/env python - -import os, sys, tempfile - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def check_nib_file( dbkey, GALAXY_DATA_INDEX_DIR ): - nib_file = "%s/alignseq.loc" % GALAXY_DATA_INDEX_DIR - nib_path = '' - nibs = {} - for i, line in enumerate( file( nib_file ) ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( "#" ): - fields = line.split( '\t' ) - if len( fields ) < 3: - continue - if fields[0] == 'seq': - nibs[( fields[1] )] = fields[2] - if nibs.has_key( dbkey ): - nib_path = nibs[( dbkey )] - return nib_path - -def check_twobit_file( dbkey, GALAXY_DATA_INDEX_DIR ): - twobit_file = "%s/twobit.loc" % GALAXY_DATA_INDEX_DIR - twobit_path = '' - twobits = {} - for i, line in enumerate( file( twobit_file ) ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( "#" ): - fields = line.split( '\t' ) - if len( fields ) < 2: - continue - twobits[( fields[0] )] = fields[1] - if twobits.has_key( dbkey ): - twobit_path = twobits[( dbkey )] - return twobit_path - -def __main__(): - # I/O - source_format = sys.argv[1] # 0: dbkey; 1: upload file - target_file = sys.argv[2] - query_file = sys.argv[3] - output_file = sys.argv[4] - min_iden = sys.argv[5] - tile_size = sys.argv[6] - one_off = sys.argv[7] - - try: - float(min_iden) - except: - stop_err('Invalid value for minimal identity.') - - try: - test = int(tile_size) - assert test >= 6 and test <= 18 - except: - stop_err('Invalid value for tile size. DNA word size must be between 6 and 18.') - - try: - test = int(one_off) - assert test >= 0 and test <= int(tile_size) - except: - stop_err('Invalid value for mismatch numbers in the word') - - GALAXY_DATA_INDEX_DIR = sys.argv[8] - - all_files = [] - if source_format == '0': - # check target genome - dbkey = target_file - nib_path = check_nib_file( dbkey, GALAXY_DATA_INDEX_DIR ) - twobit_path = check_twobit_file( dbkey, GALAXY_DATA_INDEX_DIR ) - if not os.path.exists( nib_path ) and not os.path.exists( twobit_path ): - stop_err("No sequences are available for %s, request them by reporting this error."
% dbkey) - - # check the query file, see whether all of them are legitimate sequence - if nib_path and os.path.isdir( nib_path ): - compress_files = os.listdir(nib_path) - target_path = nib_path - elif twobit_path: - compress_files = [twobit_path] - target_path = "" - else: - stop_err("Requested genome build has no available sequence.") - - for file in compress_files: - file = "%s/%s" % ( target_path, file ) - file = os.path.normpath(file) - all_files.append(file) - else: - all_files = [target_file] - - for detail_file_path in all_files: - output_tempfile = tempfile.NamedTemporaryFile().name - command = "blat %s %s %s -oneOff=%s -tileSize=%s -minIdentity=%s -mask=lower -noHead -out=pslx 2>&1" % ( detail_file_path, query_file, output_tempfile, one_off, tile_size, min_iden ) - os.system( command ) - os.system( 'cat %s >> %s' % ( output_tempfile, output_file ) ) - os.remove( output_tempfile ) - -if __name__ == '__main__': __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/blat_wrapper.xml --- a/tools/metag_tools/blat_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,99 +0,0 @@ - - compare sequencing reads against UCSC genome builds - - #if $source.source_select=="database" #blat_wrapper.py 0 $source.dbkey $input_query $output1 $iden $tile_size $one_off - #else #blat_wrapper.py 1 $source.input_target $input_query $output1 $iden $tile_size $one_off - #end if# ${GALAXY_DATA_INDEX_DIR} - - - - - - - - - - - - - - - - - - - - - - - - blat - - - - - - - - - - - - - - -.. class:: warningmark - -Using a smaller word size (*Minimal Size of Exact Match*) will increase the computational time. - -.. class:: warningmark - -Using a larger mismatch number (*Number of Mismatch in the Word*) will increase the computational time. - ------ - -**What it does** - -This tool currently uses the **BLAT** alignment program. Your short reads file is searched against a genome build or another uploaded file. - ------ - -**Example** - -- Input a multiple fasta file:: - - >seq1 - TGGTAATGGTGGTTTTTTTTTTTTTTTTTTATTTTT - -- Use the default settings: - - - alignment identity must be higher than or equal to 90%. - - - minimal size of exact match to trigger an alignment is 11. - - - allow 0 mismatches in the above exact match size. - -- Search against ce2 (C. elegans March 2004), partial result:: - - 25 1 0 0 0 0 0 0 + seq1 36 10 36 chrI 15080483 9704438 9704464 1 26, 10, 9704438, ggttttttttttttttttttattttt, ggtttttttttttttttttttttttt, - 27 0 0 0 0 0 1 32 + seq1 36 9 36 chrI 15080483 1302536 1302595 2 21,6, 9,30, 1302536,1302589, tggtttttttttttttttttt,attttt, tggtttttttttttttttttt,attttt, - ------ - -**Parameters** - -- *Minimal Identity* (**-minIdentity**) : In percent, the minimum sequence identity between the query and target alignment. Default is 90. - -- *Minimal Size of Exact Match* (**-tileSize**) : The size of a match that will trigger an alignment. Default is 11. Usually between 8 and 12. Must be between 6 and 18. - -- *Number of Mismatch in the Word* (**-oneOff**) : The number of mismatches allowed in the word (tile size) and still triggers an alignment. Default is 0. - ------ - -**Reference** - - **BLAT**: Kent, W James, BLAT--the BLAST-like alignment tool. (2002) Genome Research:12(4) 656-664. 
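For reference, the BLAT command assembled by blat_wrapper.py above has the following shape (file names hypothetical; flags and defaults exactly as in the wrapper and the parameter list above)::

    blat hg18.2bit reads.fa out.pslx -oneOff=0 -tileSize=11 -minIdentity=90 -mask=lower -noHead -out=pslx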
- - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/convert_SOLiD_color2nuc.py --- a/tools/metag_tools/convert_SOLiD_color2nuc.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ -#!/usr/bin/env python -""" -convert SOLiD color-space data to nucleotide sequence -example: T011213122200221123032111221021210131332222101 - TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT -""" - -import sys, os - -def stop_err(msg): - - sys.stderr.write(msg) - sys.stderr.write('\n') - sys.exit() - -def color2base(color_seq): - - first_nuc = ['A','C','G','T'] - code_matrix = {} - code_matrix['0'] = ['A','C','G','T'] - code_matrix['1'] = ['C','A','T','G'] - code_matrix['2'] = ['G','T','A','C'] - code_matrix['3'] = ['T','G','C','A'] - - overlap_nuc = '' - nuc_seq = '' - - seq_prefix = prefix = color_seq[0].upper() - color_seq = color_seq[1:] - - if not (seq_prefix in first_nuc): - stop_err('The leading nucleotide is invalid. Must be one of the four nucleotides: A, T, C, G.\nThe file contains a %s' %seq_prefix ) - - for code in color_seq: - - if not (code in ['0','1','2','3']): - stop_err('Expect digits (0, 1, 2, 3) in the color-coding data. The file contains characters outside this set.\nThe file contains a %s' %code) - - second_nuc = code_matrix[code] - overlap_nuc = second_nuc[first_nuc.index(prefix)] - nuc_seq += overlap_nuc - prefix = overlap_nuc - - return seq_prefix, nuc_seq - -def __main__(): - - infilename = sys.argv[1] - keep_prefix = sys.argv[2].lower() - outfilename = sys.argv[3] - - outfile = open(outfilename,'w') - - prefix = '' - color_seq = '' - for i, line in enumerate(file(infilename)): - line = line.rstrip('\r\n') - - if not line: continue - if line.startswith("#"): continue - - if line.startswith(">"): - - if color_seq: - prefix, nuc_seq = color2base(color_seq) - - if keep_prefix == 'yes': - nuc_seq = prefix + nuc_seq - - outfile.write(title+'\n') - outfile.write(nuc_seq+'\n') - - title = line - color_seq = '' - else: - color_seq += line - - if color_seq: - prefix, nuc_seq = color2base(color_seq) - - if keep_prefix == 'yes': - nuc_seq = prefix + nuc_seq - - outfile.write(title+'\n') - outfile.write(nuc_seq+'\n') - - outfile.close() - -if __name__=='__main__': __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/convert_SOLiD_color2nuc.xml --- a/tools/metag_tools/convert_SOLiD_color2nuc.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - to Nucleotides -convert_SOLiD_color2nuc.py $input1 $input2 $output1 - - - - - - - - - - - - - - -.. class:: warningmark - -The tool was designed for color space files generated by an ABI SOLiD sequencer. The file format must be fasta-like: the title starts with a ">" character, and each color space sequence starts with a leading nucleotide. - ------ - -**What it does** - -This tool converts a color space sequence to nucleotides. The leading character must be a nucleotide: A, C, G, or T.
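The decoding is a single walk along the read, each digit selecting the next base from the previous one; a minimal standalone sketch using the same transition matrix as color2base above, reproducing the docstring example::

    first_nuc = ['A', 'C', 'G', 'T']
    code_matrix = { '0': ['A', 'C', 'G', 'T'], '1': ['C', 'A', 'T', 'G'],
                    '2': ['G', 'T', 'A', 'C'], '3': ['T', 'G', 'C', 'A'] }
    prev, nuc_seq = 'T', ''  # leading nucleotide of the read T011213...
    for code in '011213122200221123032111221021210131332222101':
        prev = code_matrix[code][first_nuc.index(prev)]
        nuc_seq += prev
    # 'T' + nuc_seq == 'TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT'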
- ------ - -**Example** - -- If the color space file looks like this:: - - >seq1 - A013 - >seq2 - T011213122200221123032111221021210131332222101 - -- If you would like to **keep** the leading nucleotide:: - - >seq1 - AACG - >seq2 - TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT - -- If you **do not want to keep** the leading nucleotide (the length of the nucleotide sequence will be one less than the color-space sequence):: - - >seq1 - ACG - >seq2 - TGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT - ------ - -**ABI SOLiD Color Coding Alignment matrix** - - Each di-nucleotide is represented by a single digit: 0 to 3. The matrix is symmetric, thus the leading nucleotide is necessary to determine the sequence (otherwise there are four possibilities). - - - .. image:: ./static/images/dualcolorcode.png - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/fastqsolexa_to_fasta_qual.py --- a/tools/metag_tools/fastqsolexa_to_fasta_qual.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,109 +0,0 @@ -#!/usr/bin/env python - -""" -convert fastqsolexa file to separate sequence and quality files. - -assume each sequence and quality score are contained in one line -the order should be: -1st line: @title_of_seq -2nd line: nucleotides -3rd line: +title_of_qualityscore (might be skipped) -4th line: quality scores -(in three forms: a. digits, b. ASCII codes, the first char as the coding base, c. ASCII codes without the first char.) - -Usage: -%python fastqsolexa_to_fasta_qual.py -""" - -import sys, os -from math import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s" % msg ) - sys.exit() - -def __main__(): - infile_name = sys.argv[1] - outfile_seq = open( sys.argv[2], 'w' ) - outfile_score = open( sys.argv[3], 'w' ) - datatype = sys.argv[4] - seq_title_startswith = '' - qual_title_startswith = '' - default_coding_value = 64 - fastq_block_lines = 0 - - for i, line in enumerate( file( infile_name ) ): - line = line.rstrip() - if not line or line.startswith( '#' ): - continue - fastq_block_lines = ( fastq_block_lines + 1 ) % 4 - line_startswith = line[0:1] - if fastq_block_lines == 1: - # first line is @title_of_seq - if not seq_title_startswith: - seq_title_startswith = line_startswith - if line_startswith != seq_title_startswith: - outfile_seq.close() - outfile_score.close() - stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) ) - read_title = line[1:] - outfile_seq.write( '>%s\n' % line[1:] ) - elif fastq_block_lines == 2: - # second line is nucleotides - read_length = len( line ) - outfile_seq.write( '%s\n' % line ) - elif fastq_block_lines == 3: - # third line is +title_of_qualityscore ( might be skipped ) - if not qual_title_startswith: - qual_title_startswith = line_startswith - if line_startswith != qual_title_startswith: - outfile_seq.close() - outfile_score.close() - stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) ) - quality_title = line[1:] - if quality_title and read_title != quality_title: - outfile_seq.close() - outfile_score.close() - stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differs from score title "%s".' % ( i + 1, read_title, quality_title ) ) - if not quality_title: - outfile_score.write( '>%s\n' % read_title ) - else: - outfile_score.write( '>%s\n' % line[1:] ) - else: - # fourth line is quality scores - qual = '' - fastq_integer = True - # peek: ascii or digits?
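- # a first token that parses as an int means digit-encoded scores; otherwise the line is treated as ASCII-encoded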
- val = line.split()[0] - try: - check = int( val ) - fastq_integer = True - except: - fastq_integer = False - - if fastq_integer: - # digits - qual = line - else: - # ascii - quality_score_length = len( line ) - if quality_score_length == read_length + 1: - # first char is qual_score_startswith - qual_score_startswith = ord( line[0:1] ) - line = line[1:] - elif quality_score_length == read_length: - qual_score_startswith = default_coding_value - else: - stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) ) - for j, char in enumerate( line ): - score = ord( char ) - qual_score_startswith # 64 - qual = "%s%s " % ( qual, str( score ) ) - outfile_score.write( '%s\n' % qual ) - - outfile_seq.close() - outfile_score.close() - -if __name__ == "__main__": __main__() - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/fastqsolexa_to_fasta_qual.xml --- a/tools/metag_tools/fastqsolexa_to_fasta_qual.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,91 +0,0 @@ - - extracts sequences and quality scores from FASTQSOLEXA data - fastqsolexa_to_fasta_qual.py $input1 $output1 $output2 $input1.extension - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -IMPORTANT: This tool currently only supports data where the quality scores are integers or ASCII quality scores with base 64. - ------ - -**What it does** - -This tool extracts sequences and quality scores from FASTQ data ( Solexa variant ), producing a FASTA dataset and a QUAL dataset. - ------ - -**Example1** - -- Converting the following Solexa fastq data:: - - @seq1 - GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT - +seq1 - hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh - @seq2 - GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG - +seq2 - hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO - -- will extract the following sequences:: - - >seq1 - GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT - >seq2 - GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG - -- and quality scores:: - - >seq1 - 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 16 23 0 40 40 40 40 40 40 - >seq2 - 40 40 40 40 40 40 40 40 40 40 40 40 40 40 25 40 40 33 40 40 40 40 23 40 1 40 6 40 19 9 10 7 3 40 15 - -**Example2** - -- Converting the following Solexa fastq data:: - - @HANNIBAL_1_FC302VTAAXX:2:1:228:167 - GAATTGATCAGGACATAGGACAACTGTAGGCACCAT - +HANNIBAL_1_FC302VTAAXX:2:1:228:167 - 40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 - @HANNIBAL_1_FC302VTAAXX:2:1:156:340 - GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG - +HANNIBAL_1_FC302VTAAXX:2:1:156:340 - 40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 - -- will extract the following sequences:: - - >HANNIBAL_1_FC302VTAAXX:2:1:228:167 - GAATTGATCAGGACATAGGACAACTGTAGGCACCAT - >HANNIBAL_1_FC302VTAAXX:2:1:156:340 - GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG - -- and quality scores:: - - >HANNIBAL_1_FC302VTAAXX:2:1:228:167 - 40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 - >HANNIBAL_1_FC302VTAAXX:2:1:156:340 - 40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/mapping_to_ucsc.py --- a/tools/metag_tools/mapping_to_ucsc.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,204 +0,0 @@ -#!/usr/bin/env python - -from 
galaxy import eggs -import sys, tempfile, os - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - - out_fname = sys.argv[1] - in_fname = sys.argv[2] - chr_col = int(sys.argv[3])-1 - coord_col = int(sys.argv[4])-1 - track_type = sys.argv[5] - if track_type == 'coverage' or track_type == 'both': - coverage_col = int(sys.argv[6])-1 - cname = sys.argv[7] - cdescription = sys.argv[8] - ccolor = sys.argv[9].replace('-',',') - cvisibility = sys.argv[10] - if track_type == 'snp' or track_type == 'both': - if track_type == 'both': - j = 5 - else: - j = 0 - #sname = sys.argv[7+j] - sdescription = sys.argv[6+j] - svisibility = sys.argv[7+j] - #ref_col = int(sys.argv[10+j])-1 - read_col = int(sys.argv[8+j])-1 - - - # Sort the input file based on chromosome (alphabetically) and start co-ordinates (numerically) - sorted_infile = tempfile.NamedTemporaryFile() - try: - os.system("sort -k %d,%d -k %dn -o %s %s" %(chr_col+1,chr_col+1,coord_col+1,sorted_infile.name,in_fname)) - except Exception, exc: - stop_err( 'Initialization error -> %s' %str(exc) ) - - #generate chr list - sorted_infile.seek(0) - chr_vals = [] - for line in file( sorted_infile.name ): - line = line.strip() - if not(line): - continue - try: - fields = line.split('\t') - chr = fields[chr_col] - if chr not in chr_vals: - chr_vals.append(chr) - except: - pass - if not(chr_vals): - stop_err("Skipped all lines as invalid.") - - if track_type == 'coverage' or track_type == 'both': - if track_type == 'coverage': - fout = open( out_fname, "w" ) - else: - fout = tempfile.NamedTemporaryFile() - fout.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ - % ( cname, cdescription, ccolor, cvisibility )) - if track_type == 'snp' or track_type == 'both': - fout_a = tempfile.NamedTemporaryFile() - fout_t = tempfile.NamedTemporaryFile() - fout_g = tempfile.NamedTemporaryFile() - fout_c = tempfile.NamedTemporaryFile() - fout_ref = tempfile.NamedTemporaryFile() - - fout_a.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ - % ( "Track A", sdescription, '255,0,0', svisibility )) - fout_t.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ - % ( "Track T", sdescription, '0,255,0', svisibility )) - fout_g.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ - % ( "Track G", sdescription, '0,0,255', svisibility )) - fout_c.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ - % ( "Track C", sdescription, '255,0,255', svisibility )) - - - sorted_infile.seek(0) - for line in file( sorted_infile.name ): - line = line.strip() - if not(line): - continue - try: - fields = line.split('\t') - chr = fields[chr_col] - start = int(fields[coord_col]) - assert start > 0 - except: - continue - try: - ind = chr_vals.index(chr) #encountered chr for the 1st time - del chr_vals[ind] - prev_start = '' - header = "variableStep chrom=%s\n" %(chr) - if track_type == 'coverage' or track_type == 'both': - coverage = int(fields[coverage_col]) - line1 = "%s\t%s\n" %(start,coverage) - fout.write("%s%s" %(header,line1)) - if track_type == 'snp' or track_type == 'both': - a = t = g = c = 0 - fout_a.write("%s" %(header)) - fout_t.write("%s" %(header)) - fout_g.write("%s" %(header)) - fout_c.write("%s" %(header)) - try: - #ref_nt = fields[ref_col].capitalize() - read_nt = fields[read_col].capitalize() - try: - nt_ind = ['A','T','G','C'].index(read_nt) - if nt_ind
== 0: - a+=1 - elif nt_ind == 1: - t+=1 - elif nt_ind == 2: - g+=1 - else: - c+=1 - except ValueError: - pass - except: - pass - prev_start = start - except ValueError: - if start != prev_start: - if track_type == 'coverage' or track_type == 'both': - coverage = int(fields[coverage_col]) - fout.write("%s\t%s\n" %(start,coverage)) - if track_type == 'snp' or track_type == 'both': - if a: - fout_a.write("%s\t%s\n" %(prev_start,a)) - if t: - fout_t.write("%s\t%s\n" %(prev_start,t)) - if g: - fout_g.write("%s\t%s\n" %(prev_start,g)) - if c: - fout_c.write("%s\t%s\n" %(prev_start,c)) - a = t = g = c = 0 - try: - #ref_nt = fields[ref_col].capitalize() - read_nt = fields[read_col].capitalize() - try: - nt_ind = ['A','T','G','C'].index(read_nt) - if nt_ind == 0: - a+=1 - elif nt_ind == 1: - t+=1 - elif nt_ind == 2: - g+=1 - else: - c+=1 - except ValueError: - pass - except: - pass - prev_start = start - else: - if track_type == 'snp' or track_type == 'both': - try: - #ref_nt = fields[ref_col].capitalize() - read_nt = fields[read_col].capitalize() - try: - nt_ind = ['A','T','G','C'].index(read_nt) - if nt_ind == 0: - a+=1 - elif nt_ind == 1: - t+=1 - elif nt_ind == 2: - g+=1 - else: - c+=1 - except ValueError: - pass - except: - pass - - if track_type == 'snp' or track_type == 'both': - if a: - fout_a.write("%s\t%s\n" %(prev_start,a)) - if t: - fout_t.write("%s\t%s\n" %(prev_start,t)) - if g: - fout_g.write("%s\t%s\n" %(prev_start,g)) - if c: - fout_c.write("%s\t%s\n" %(prev_start,c)) - - fout_a.seek(0) - fout_g.seek(0) - fout_t.seek(0) - fout_c.seek(0) - - if track_type == 'snp': - os.system("cat %s %s %s %s >> %s" %(fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname)) - elif track_type == 'both': - fout.seek(0) - os.system("cat %s %s %s %s %s | cat > %s" %(fout.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname)) -if __name__ == "__main__": - main() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/mapping_to_ucsc.xml --- a/tools/metag_tools/mapping_to_ucsc.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,202 +0,0 @@ - - as UCSC custom track - - mapping_to_ucsc.py - $out_file1 - $input - $chr_col - $coord_col - $track.track_type - #if $track.track_type == "coverage" or $track.track_type == "both" - $track.coverage_col - "${track.cname}" - "${track.cdescription}" - "${track.ccolor}" - "${track.cvisibility}" - #end if - #if $track.track_type == "snp" or $track.track_type == "both" - "${track.sdescription}" - "${track.svisibility}" - $track.col2 - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -This tool turns mapping data generated by short read mappers into a format that can be displayed in the UCSC genome browser as a custom track. - ------ - -.. class:: warningmark - -**Note** - -This tool requires the mapping data to contain at least the following information: - -chromosome, genome coordinate, read nucleotide (if option to display is SNPs), read coverage (if option to display is Read coverage). 
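mapping_to_ucsc.py above first sorts the input by chromosome (alphabetically) and coordinate (numerically) before aggregating; the equivalent shell step, assuming the chromosome and coordinate columns are 1 and 2::

    sort -k 1,1 -k 2n -o mapping_sorted.txt mapping.txt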
- ------ - -**Example** - -For the following Mapping data:: - - #chr g_start read_id read_coord g_nt read_nt qual read_coverage - chrM 1 1:29:1672:1127/1 11 G G 40 134 - chrM 1 1:32:93:933/1 4 G A 40 134 - chrM 1 1:34:116:2032/1 11 G A 40 134 - chrM 1 1:39:207:964/1 1 G G 40 134 - chrM 2 1:3:359:848/1 1 G C 40 234 - chrM 2 1:40:1435:1013/1 1 G G 40 234 - chrM 3 1:40:730:972/1 9 G G 40 334 - chrM 4 1:42:1712:921/2 31 G T 35 434 - chrM 4 1:44:1649:493/1 4 G G 40 434 - -running this tool to display both SNPs and Read coverage will return the following tracks, containing aggregated data per genome co-ordinate:: - - track type=wiggle_0 name="Coverage Track" description="User Supplied Track (from Galaxy)" color=0,0,0 visibility=1 - variableStep chrom=chrM - 1 134 - 2 234 - 3 334 - 4 434 - track type=wiggle_0 name="Track A" description="User Supplied SNP Track (from Galaxy)" color=255,0,0 visibility=1 - variableStep chrom=chrM - 1 2 - track type=wiggle_0 name="Track T" description="User Supplied SNP Track (from Galaxy)" color=0,255,0 visibility=1 - variableStep chrom=chrM - 4 1 - track type=wiggle_0 name="Track G" description="User Supplied SNP Track (from Galaxy)" color=0,0,255 visibility=1 - variableStep chrom=chrM - 1 2 - 2 1 - 3 1 - 4 1 - track type=wiggle_0 name="Track C" description="User Supplied SNP Track (from Galaxy)" color=255,0,255 visibility=1 - variableStep chrom=chrM - 2 1 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/megablast_wrapper.py --- a/tools/metag_tools/megablast_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ -#!/usr/bin/env python -""" -run megablast for metagenomics data - -usage: %prog [options] - -d, --db_build=d: The database to use - -i, --input=i: Input FASTQ candidate file - -w, --word_size=w: Size of best perfect match - -c, --identity_cutoff=c: Report hits at or above this identity - -e, --eval_cutoff=e: Expectation value cutoff - -f, --filter_query=f: Filter out low complexity regions - -x, --index_dir=x: Data index directory - -o, --output=o: Output file - -usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file -""" - -import os, subprocess, sys, tempfile -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - query_filename = options.input.strip() - output_filename = options.output.strip() - mega_word_size = options.word_size # -W - mega_iden_cutoff = options.identity_cutoff # -p - mega_evalue_cutoff = options.eval_cutoff # -e - mega_temp_output = tempfile.NamedTemporaryFile().name - GALAXY_DATA_INDEX_DIR = options.index_dir - DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR - - # megablast parameters - try: - int( mega_word_size ) - except: - stop_err( 'Invalid value for word size' ) - try: - float( mega_iden_cutoff ) - except: - stop_err( 'Invalid value for identity cut-off' ) - try: - float( mega_evalue_cutoff ) - except: - stop_err( 'Invalid value for Expectation value' ) - - if not os.path.exists( os.path.split( options.db_build )[0] ): - stop_err( 'Cannot locate the target database directory. Please check your location file.' 
) - - # arguments for megablast - megablast_command = "megablast -d %s -i %s -o %s -m 8 -a 8 -W %s -p %s -e %s -F %s > /dev/null" \ - % ( options.db_build, query_filename, mega_temp_output, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, options.filter_query ) - - print megablast_command - - tmp = tempfile.NamedTemporaryFile().name - try: - tmp_stderr = open( tmp, 'wb' ) - proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() ) - returncode = proc.wait() - tmp_stderr.close() - # get stderr, allowing for case where it's very large - tmp_stderr = open( tmp, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stderr.close() - if returncode != 0: - raise Exception, stderr - if os.path.exists( tmp ): - os.unlink( tmp ) - except Exception, e: - if os.path.exists( mega_temp_output ): - os.unlink( mega_temp_output ) - if os.path.exists( tmp ): - os.unlink( tmp ) - stop_err( 'Error indexing reference sequence. ' + str( e ) ) - - output = open( output_filename, 'w' ) - invalid_lines = 0 - for i, line in enumerate( file( mega_temp_output ) ): - line = line.rstrip( '\r\n' ) - fields = line.split() - try: - # get gi and length of that gi seq - gi, gi_len = fields[1].split( '_' ) - # convert the last column (causing problem in filter tool) to float - fields[-1] = float( fields[-1] ) - new_line = "%s\t%s\t%s\t%s\t%0.1f" % ( fields[0], gi, gi_len, '\t'.join( fields[2:-1] ), fields[-1] ) - except: - new_line = line - invalid_lines += 1 - output.write( "%s\n" % new_line ) - output.close() - - if os.path.exists( mega_temp_output ): - os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of - - if invalid_lines: - print "Unable to parse %d lines. Keep the default format." % invalid_lines - - # megablast generates a file called error.log, if empty, delete it, if not, show the contents - if os.path.exists( './error.log' ): - for i, line in enumerate( file( './error.log' ) ): - line = line.rstrip( '\r\n' ) - print line - os.remove( './error.log' ) - -if __name__ == "__main__" : __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/megablast_wrapper.xml --- a/tools/metag_tools/megablast_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ - - compare short reads against htgs, nt, and wgs databases - - megablast_wrapper.py - --db_build="${ filter( lambda x: str( x[0] ) == str( $source_select ), $__app__.tool_data_tables[ 'blastdb' ].get_fields() )[0][-1] }" - --input=$input_query - --word_size=$word_size - --identity_cutoff=$iden_cutoff - --eval_cutoff=$evalue_cutoff - --filter_query=$filter_query - --index_dir=${GALAXY_DATA_INDEX_DIR} - --output=$output1 - - - - - - - - - - - - - - - - - - - - - - megablast - - - - - - - - - - - - - - - -.. class:: warningmark - -**Note**. Database searches may take substantial amount of time. For large input datasets it is advisable to allow overnight processing. - ------ - -**What it does** - -This tool runs **megablast** (for information about megablast, please see the reference below) a high performance nucleotide local aligner developed by Webb Miller and colleagues. - ------ - -**Output format** - -Output of this tool contains 13 columns delimited by Tabs: - -1. Id of your sequence -2. GI of the database hit -3. Length of the database hit -4. % identity -5. Alignment length -6. # mismatches -7. 
# gaps -8. Start position in your sequence -9. End position in your sequence -10. Start position in database hit -11. End position in database hit -12. E-value -13. Bit score - -------- - -**Reference** - -Zhang et al. A Greedy Algorithm for Aligning DNA Sequences. 2000. JCB: 203-214. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/megablast_xml_parser.py --- a/tools/metag_tools/megablast_xml_parser.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -#!/usr/bin/env python - -import sys, os, re - -if sys.version_info[:2] >= ( 2, 5 ): - import xml.etree.cElementTree as ElementTree -else: - from galaxy import eggs - import pkg_resources; pkg_resources.require( "elementtree" ) - from elementtree import ElementTree - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - source = sys.argv[1] - hspTags = [ - "Hsp_bit-score", - "Hsp_evalue", - "Hsp_query-from", - "Hsp_query-to", - "Hsp_hit-from", - "Hsp_hit-to", - "Hsp_query-frame", - "Hsp_hit-frame", - "Hsp_identity", - "Hsp_align-len", - "Hsp_qseq", - "Hsp_hseq", - "Hsp_midline" - ] - hspData = [] - - # get an iterable - try: - context = ElementTree.iterparse( source, events=( "start", "end" ) ) - except: - stop_err( "Invalid data format." ) - # turn it into an iterator - context = iter( context ) - # get the root element - try: - event, root = context.next() - except: - stop_err( "Invalid data format." ) - - outfile = open( sys.argv[2], 'w' ) - try: - for event, elem in context: - # for every <Iteration> tag - if event == "end" and elem.tag == "Iteration": - query = elem.findtext( "Iteration_query-def" ) - qLen = elem.findtext( "Iteration_query-len" ) - # for every <Hit> within <Iteration_hits> - for hit in elem.findall( "Iteration_hits/Hit" ): - subject = hit.findtext( "Hit_id" ) - if re.search( '^gi', subject ): - subject = subject.split('|')[1] - sLen = hit.findtext( "Hit_len" ) - # for every <Hsp> within <Hit_hsps> - for hsp in hit.findall( "Hit_hsps/Hsp" ): - outfile.write( "%s\t%s\t%s\t%s" % ( query, qLen, subject, sLen ) ) - for tag in hspTags: - outfile.write("\t%s" %(hsp.findtext( tag ))) - #hspData.append( hsp.findtext( tag ) ) - #hspData = [] - outfile.write('\n') - # prevent ElementTree from growing a large data structure - root.clear() - elem.clear() - except: - outfile.close() - stop_err( "The input data is malformed, or there is more than one dataset in the input file. Error: %s" % sys.exc_info()[1] ) - - outfile.close() - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/megablast_xml_parser.xml --- a/tools/metag_tools/megablast_xml_parser.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ - - -megablast_xml_parser.py $input1 $output1 - - - - - - - - - - - - - - -**What it does** - -This tool processes the XML output of any NCBI blast tool (if you run your own blast jobs, the XML output can be generated with the **-m 7** option). - ------ - -**Output fields** - -This tool returns tab-delimited output with the following fields:: - - Description Example - ----------------------------------------- ----------------- - - 1. Name of the query sequence Seq1 - 2. Length of the query sequence 30 - 3. Name of target sequence gnl|BL_ORD_ID|0 - 4. Length of target sequence 5528445 - 5. Alignment bit score 59.96 - 6. E-value 8.38112e-11 - 7. Start of alignment within query 1 - 8. End of alignment within query 30 - 9. Start of alignment within target 5436010 - 10. End of alignment within target 5436039 - 11. 
Query frame 1 - 12. Target frame 1 - 13. Number of identical bases within 29 - the alignment - 14. Alignment length 30 - 15. Aligned portion (sequence) of query CGGACAGCGCCGCCACCAACAAAGCCACCA - 16. Aligned portion (sequence) of target CGGACAGCGCCGCCACCAACAAAGCCATCA - 17. Midline indicating positions of ||||||||||||||||||||||||||| || - matches within the alignment - ------- - -.. class:: infomark - -Note that this form of output does not contain an alignment identity value. However, it can be computed by dividing the number of identical bases within the alignment (Field 13) by the alignment length (Field 14) using the *Text Manipulation->Compute* tool - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/rmap_wrapper.py --- a/tools/metag_tools/rmap_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,88 +0,0 @@ -#!/usr/bin/env python - -import os, sys, tempfile - -assert sys.version_info[:2] >= (2, 4) - -def stop_err( msg ): - - sys.stderr.write( "%s\n" % msg ) - sys.exit() - - -def __main__(): - - # I/O - target_path = sys.argv[1] - infile = sys.argv[2] - read_len = sys.argv[3] # -w - align_len = sys.argv[4] # -h - mismatch = sys.argv[5] # -m - output_file = sys.argv[6] - - # first guess the read length - guess_read_len = 0 - seq = '' - for i, line in enumerate(open(infile)): - line = line.rstrip('\r\n') - if line.startswith('>'): - if seq: - guess_read_len = len(seq) - break - else: - seq += line - - try: - test = int(read_len) - if test == 0: - read_len = str(guess_read_len) - else: - assert test >= 20 and test <= 64 - except: - stop_err('Invalid value for read length. Must be between 20 and 64.') - - try: - int(align_len) - except: - stop_err('Invalid value for minimal length of a hit.') - - try: - int(mismatch) - #assert test >= 0 and test <= int(0.1*int(read_len)) - except: - stop_err('Invalid value for mismatch numbers in an alignment.') - - all_files = [] - if os.path.isdir(target_path): - - # check target genome - fa_files = os.listdir(target_path) - - for file in fa_files: - file = "%s/%s" % ( target_path, file ) - file = os.path.normpath(file) - all_files.append(file) - else: - stop_err("No sequences for %s are available for search, please report this error." %(target_path)) - - for detail_file_path in all_files: - output_tempfile = tempfile.NamedTemporaryFile().name - command = "rmap -h %s -w %s -m %s -c %s %s -o %s 2>&1" % ( align_len, read_len, mismatch, detail_file_path, infile, output_tempfile ) - #print command - try: - os.system( command ) - except Exception, e: - stop_err( str( e ) ) - - try: - os.system( 'cat %s >> %s' % ( output_tempfile, output_file ) ) - except Exception, e: - stop_err( str( e ) ) - - try: - os.remove( output_tempfile ) - except: - pass - - -if __name__ == '__main__': __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/rmap_wrapper.xml --- a/tools/metag_tools/rmap_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ - - for Solexa Short Reads Alignment - - #if $trim.choice=="No": #rmap_wrapper.py $database $input_seq 0 $align_len $mismatch $output1 - #else: #rmap_wrapper.py $database $input_seq $trim.read_len $align_len $mismatch $output1 - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rmap - - - - -.. class:: warningmark - - RMAP was developed for **Solexa** reads. - -.. class:: infomark - -**TIP**. 
The tool will guess the length of the reads, however, if you select to trim the reads, the *Reads length* must be between 20 and 64. Reads with lengths longer than the specified value will be trimmed at the 3'end. - ------ - -**What it does** - -This tool runs **rmap** (for more information, please see the reference below), mapping Solexa reads onto a genome build. - ------ - -**Parameters** - -- *Minimal Length of a Hit* (**-h**) : this is the seed length or the minimal exact match length -- *Number of Mismatches Allowed* (**-m**) : the maximal number of mismatches allowed in an alignment -- *Read Length* (**-w**) : maximal length of the reads; reads longer than the threshold will be truncated at 3' end. - ------ - -**Reference** - - **RMAP** is developed by Dr. Andrew D Smith and Dr. Zhenyu Xuan at the Cold Spring Harbor Laboratory. Please see http://rulai.cshl.edu/rmap/ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/rmapq_wrapper.py --- a/tools/metag_tools/rmapq_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,100 +0,0 @@ -#!/usr/bin/env python - -import os, sys, tempfile - -assert sys.version_info[:2] >= (2.4) - -def stop_err( msg ): - - sys.stderr.write( "%s\n" % msg ) - sys.exit() - - -def __main__(): - - # I/O - target_path = sys.argv[1] - infile = sys.argv[2] - scorefile = sys.argv[3] - high_score = sys.argv[4] # -q - high_len = sys.argv[5] # -M - read_len = sys.argv[6] # -w - align_len = sys.argv[7] # -h - mismatch = sys.argv[8] # -m - output_file = sys.argv[9] - - try: - float(high_score) - except: - stop_err('Invalid value for minimal quality score.') - - try: - int(high_len) - except: - stop_err('Invalid value for minimal high quality bases.') - - # first guess the read length - guess_read_len = 0 - seq = '' - for i, line in enumerate(open(infile)): - line = line.rstrip('\r\n') - if line.startswith('>'): - if seq: - guess_read_len = len(seq) - break - else: - seq += line - - try: - test = int(read_len) - if test == 0: - read_len = str(guess_read_len) - else: - assert test >= 20 and test <= 64 - except: - stop_err('Invalid value for read length. Must be between 20 and 64.') - - - try: - int(align_len) - except: - stop_err('Invalid value for minimal length of a hit.') - - try: - int(mismatch) - except: - stop_err('Invalid value for mismatch numbers in an alignment.') - - all_files = [] - if os.path.isdir(target_path): - # check target genome - fa_files = os.listdir(target_path) - - for file in fa_files: - file = "%s/%s" % ( target_path, file ) - file = os.path.normpath(file) - all_files.append(file) - else: - stop_err("No sequences for %s are available for search, please report this error." 
%(target_path)) - - for detail_file_path in all_files: - output_tempfile = tempfile.NamedTemporaryFile().name - command = "rmapq -q %s -M %s -h %s -w %s -m %s -Q %s -c %s %s -o %s 2>&1" % ( high_score, high_len, align_len, read_len, mismatch, scorefile, detail_file_path, infile, output_tempfile ) - #print command - try: - os.system( command ) - except Exception, e: - stop_err( str( e ) ) - - try: - assert os.system( 'cat %s >> %s' % ( output_tempfile, output_file ) ) == 0 - except Exception, e: - stop_err( str( e ) ) - - try: - os.remove( output_tempfile ) - except: - pass - - -if __name__ == '__main__': __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/rmapq_wrapper.xml --- a/tools/metag_tools/rmapq_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,92 +0,0 @@ - - for Solexa Short Reads Alignment with Quality Scores - - #if $trim.choice=="No": #rmapq_wrapper.py $database $input_seq $input_score $high_score $high_len 0 $align_len $mismatch $output1 - #else: #rmapq_wrapper.py $database $input_seq $input_score $high_score $high_len $trim.read_len $align_len $mismatch $output1 - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rmapq - - - - -.. class:: warningmark - - RMAPQ was developed for **Solexa** reads. - -.. class:: infomark - -**TIP**. The tool will guess the length of the reads, however, if you select to trim the reads, the *Maximal Length of the Reads* must be between 20 and 64. Reads with lengths longer than the specified value will be trimmed at the 3'end. - ------ - -**What it does** - -This tool runs **rmapq** (for more information, please see the reference below), searching against a genome build with sequence qualities. - ------ - -**Parameters** - -- *Minimal High-quality Bases* (**-M**): the minimal length of the high quality score bases -- *Minimum Score for High-quality Base* (**-q**) : the minimal quality score -- *Minimal Length of a Hit* (**-h**) : the minimal length of an exact match or seed -- *Number of Mismatches Allowed* (**-m**) : the maximal number of mismatches allowed in an alignment -- *Read Length* (**-w**) : maximal length of the reads; reads longer than the threshold will be truncated at 3' end. - ------ - -**Reference** - - **RMAP** is developed by Dr. Andrew D Smith and Dr. Zhenyu Xuan at the Cold Spring Harbor Laboratory. Please see http://rulai.cshl.edu/rmap/ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/short_reads_figure_high_quality_length.py --- a/tools/metag_tools/short_reads_figure_high_quality_length.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,165 +0,0 @@ -#!/usr/bin/env python - -import os, sys, math, tempfile, zipfile, re -from rpy import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def unzip( filename ): - zip_file = zipfile.ZipFile( filename, 'r' ) - tmpfilename = tempfile.NamedTemporaryFile().name - for name in zip_file.namelist(): - file( tmpfilename, 'a' ).write( zip_file.read( name ) ) - zip_file.close() - return tmpfilename - -def __main__(): - infile_score_name = sys.argv[1].strip() - outfile_R_name = sys.argv[2].strip() - - try: - score_threshold = int( sys.argv[3].strip() ) - except: - stop_err( 'Threshold for quality score must be numerical.' 
) - - infile_is_zipped = False - if zipfile.is_zipfile( infile_score_name ): - infile_is_zipped = True - infile_name = unzip( infile_score_name ) - else: - infile_name = infile_score_name - - # detect whether it's tabular or fasta format - seq_method = None - data_type = None - for i, line in enumerate( file( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if data_type == None: - if line.startswith( '>' ): - data_type = 'fasta' - continue - elif len( line.split( '\t' ) ) > 0: - fields = line.split() - for score in fields: - try: - int( score ) - data_type = 'tabular' - seq_method = 'solexa' - break - except: - break - elif data_type == 'fasta': - fields = line.split() - for score in fields: - try: - int( score ) - seq_method = '454' - break - except: - break - if i == 100: - break - - if data_type is None: - stop_err( 'This tool can only use fasta data or tabular data.' ) - if seq_method is None: - stop_err( 'Invalid data for fasta format.') - - cont_high_quality = [] - invalid_lines = 0 - invalid_scores = 0 - if seq_method == 'solexa': - for i, line in enumerate( open( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - locs = line.split( '\t' ) - for j, base in enumerate( locs ): - nuc_errors = base.split() - try: - nuc_errors[0] = int( nuc_errors[0] ) - nuc_errors[1] = int( nuc_errors[1] ) - nuc_errors[2] = int( nuc_errors[2] ) - nuc_errors[3] = int( nuc_errors[3] ) - big = max( nuc_errors ) - except: - invalid_scores += 1 - big = 0 - if j == 0: - cont_high_quality.append(1) - else: - if big >= score_threshold: - cont_high_quality[ len( cont_high_quality ) - 1 ] += 1 - else: - cont_high_quality.append(1) - else: # seq_method == '454' - tmp_score = '' - for i, line in enumerate( open( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if line.startswith( '>' ): - if len( tmp_score ) > 0: - locs = tmp_score.split() - for j, base in enumerate( locs ): - try: - base = int( base ) - except: - invalid_scores += 1 - base = 0 - if j == 0: - cont_high_quality.append(1) - else: - if base >= score_threshold: - cont_high_quality[ len( cont_high_quality ) - 1 ] += 1 - else: - cont_high_quality.append(1) - tmp_score = '' - else: - tmp_score = "%s %s" % ( tmp_score, line ) - if len( tmp_score ) > 0: - locs = tmp_score.split() - for j, base in enumerate( locs ): - try: - base = int( base ) - except: - invalid_scores += 1 - base = 0 - if j == 0: - cont_high_quality.append(1) - else: - if base >= score_threshold: - cont_high_quality[ len( cont_high_quality ) - 1 ] += 1 - else: - cont_high_quality.append(1) - - # generate pdf figures - cont_high_quality = array ( cont_high_quality ) - outfile_R_pdf = outfile_R_name - r.pdf( outfile_R_pdf ) - title = "Histogram of continuous high quality scores" - xlim_range = [ 1, max( cont_high_quality ) ] - nclass = max( cont_high_quality ) - if nclass > 100: - nclass = 100 - r.hist( cont_high_quality, probability=True, xlab="Continuous High Quality Score length (bp)", ylab="Frequency (%)", xlim=xlim_range, main=title, nclass=nclass) - r.dev_off() - - if infile_is_zipped and os.path.exists( infile_name ): - # Need to delete temporary file created when we unzipped the infile archive - os.remove( infile_name ) - - if invalid_lines > 0: - print 'Skipped %d invalid lines. ' % invalid_lines - if invalid_scores > 0: - print 'Skipped %d invalid scores. 
' % invalid_scores - - r.quit( save="no" ) - -if __name__=="__main__":__main__() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/short_reads_figure_high_quality_length.xml --- a/tools/metag_tools/short_reads_figure_high_quality_length.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ - - of high quality score reads - -short_reads_figure_high_quality_length.py $input1 $output1 $input2 - - - - - - - - - - - - rpy - - - - - - - - - - - - - - - - -.. class:: warningmark - -To use this tool, your dataset needs to be in the *Quality Score* format. Click the pencil icon next to your dataset to set the datatype to *Quality Score* (see below for examples). - ------ - -**What it does** - -This tool takes Quality Files generated by Roche (454), Illumina (Solexa), or ABI SOLiD machines and builds a histogram of lengths of high quality reads. - ------ - -**Examples of Quality Data** - -Roche (454) or ABI SOLiD data:: - - >seq1 - 23 33 34 25 28 28 28 32 23 34 27 4 28 28 31 21 28 - -Illumina (Solexa) data:: - - -40 -40 40 -40 -40 -40 -40 40 - ------ - -**Note** - -- Quality score data:: - - >seq1 - 23 33 34 25 28 28 28 32 23 34 27 4 28 28 31 21 28 - -- If the threshold is set to 20: - - - a low quality score of 4 in the middle separates two segments of lengths 11 and 5. - - - The histogram will be built based on the numbers (11, 5). - -- For Illumina (Solexa) data, only the maximum of the four values will be used. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/short_reads_figure_score.py --- a/tools/metag_tools/short_reads_figure_score.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,248 +0,0 @@ -#!/usr/bin/env python -""" -boxplot: -- box: first quartile and third quartile -- line inside the box: median -- outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile - IQR = third quartile - first quartile -- The smallest/largest value that is not an outlier is connected to the box by a horizontal line. 
-""" - -import os, sys, math, tempfile, re -from rpy import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def merge_to_20_datapoints( score ): - number_of_points = 20 - read_length = len( score ) - step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) ) - scores = [] - point = 1 - point_sum = 0 - step_average = 0 - score_points = 0 - - for i in xrange( 1, read_length ): - if i < ( point * step ): - point_sum += int( score[i] ) - step_average += 1 - else: - point_avg = point_sum * 1.0 / step_average - scores.append( point_avg ) - point += 1 - point_sum = 0 - step_average = 0 - if step_average > 0: - point_avg = point_sum * 1.0 / step_average - scores.append( point_avg ) - if len( scores ) > number_of_points: - last_avg = 0 - for j in xrange( number_of_points - 1, len( scores ) ): - last_avg += scores[j] - last_avg = last_avg / ( len(scores) - number_of_points + 1 ) - else: - last_avg = scores[-1] - score_points = [] - for k in range( number_of_points - 1 ): - score_points.append( scores[k] ) - score_points.append( last_avg ) - return score_points - -def __main__(): - - invalid_lines = 0 - - infile_score_name = sys.argv[1].strip() - outfile_R_name = sys.argv[2].strip() - - infile_name = infile_score_name - - # Determine tabular or fasta format within the first 100 lines - seq_method = None - data_type = None - for i, line in enumerate( file( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if data_type == None: - if line.startswith( '>' ): - data_type = 'fasta' - continue - elif len( line.split( '\t' ) ) > 0: - fields = line.split() - for score in fields: - try: - int( score ) - data_type = 'tabular' - seq_method = 'solexa' - break - except: - break - elif data_type == 'fasta': - fields = line.split() - for score in fields: - try: - int( score ) - seq_method = '454' - break - except: - break - if i == 100: - break - - if data_type is None: - stop_err( 'This tool can only use fasta data or tabular data.' 
) - if seq_method is None: - stop_err( 'Invalid data for fasta format.') - - # Determine fixed length or variable length within the first 100 lines - read_length = 0 - variable_length = False - if seq_method == 'solexa': - for i, line in enumerate( file( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - scores = line.split('\t') - if read_length == 0: - read_length = len( scores ) - if read_length != len( scores ): - variable_length = True - break - if i == 100: - break - elif seq_method == '454': - score = '' - for i, line in enumerate( file( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if line.startswith( '>' ): - if len( score ) > 0: - score = score.split() - if read_length == 0: - read_length = len( score ) - if read_length != len( score ): - variable_length = True - break - score = '' - else: - score = score + ' ' + line - if i == 100: - break - - if variable_length: - number_of_points = 20 - else: - number_of_points = read_length - read_length_threshold = 100 # minimal read length for 454 file - score_points = [] - score_matrix = [] - invalid_scores = 0 - - if seq_method == 'solexa': - for i, line in enumerate( open( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - tmp_array = [] - scores = line.split( '\t' ) - for bases in scores: - nuc_errors = bases.split() - try: - nuc_errors[0] = int( nuc_errors[0] ) - nuc_errors[1] = int( nuc_errors[1] ) - nuc_errors[2] = int( nuc_errors[2] ) - nuc_errors[3] = int( nuc_errors[3] ) - big = max( nuc_errors ) - except: - #print 'Invalid numbers in the file. Skipped.' - invalid_scores += 1 - big = 0 - tmp_array.append( big ) - score_points.append( tmp_array ) - elif seq_method == '454': - # skip the last fasta sequence - score = '' - for i, line in enumerate( open( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if line.startswith( '>' ): - if len( score ) > 0: - score = ['0'] + score.split() - read_length = len( score ) - tmp_array = [] - if not variable_length: - score.pop(0) - score_points.append( score ) - tmp_array = score - elif read_length > read_length_threshold: - score_points_tmp = merge_to_20_datapoints( score ) - score_points.append( score_points_tmp ) - tmp_array = score_points_tmp - score = '' - else: - score = "%s %s" % ( score, line ) - if len( score ) > 0: - score = ['0'] + score.split() - read_length = len( score ) - if not variable_length: - score.pop(0) - score_points.append( score ) - elif read_length > read_length_threshold: - score_points_tmp = merge_to_20_datapoints( score ) - score_points.append( score_points_tmp ) - tmp_array = score_points_tmp - - # reverse the matrix, for R - for i in range( number_of_points - 1 ): - tmp_array = [] - for j in range( len( score_points ) ): - try: - tmp_array.append( int( score_points[j][i] ) ) - except: - invalid_lines += 1 - score_matrix.append( tmp_array ) - - # generate pdf figures - #outfile_R_pdf = outfile_R_name - #r.pdf( outfile_R_pdf ) - outfile_R_png = outfile_R_name - r.bitmap( outfile_R_png ) - - title = "boxplot of quality scores" - empty_score_matrix_columns = 0 - for i, subset in enumerate( score_matrix ): - if not subset: - empty_score_matrix_columns += 1 - score_matrix[i] = [0] - - if not variable_length: - r.boxplot( score_matrix, xlab="location in read length", main=title ) - else: - r.boxplot( score_matrix, xlab="position within read (% of total length)", 
xaxt="n", main=title ) - x_old_range = [] - x_new_range = [] - step = read_length_threshold / number_of_points - for i in xrange( 0, read_length_threshold, step ): - x_old_range.append( ( i / step ) ) - x_new_range.append( i ) - r.axis( 1, x_old_range, x_new_range ) - r.dev_off() - - if invalid_scores > 0: - print 'Skipped %d invalid scores. ' % invalid_scores - if invalid_lines > 0: - print 'Skipped %d invalid lines. ' % invalid_lines - if empty_score_matrix_columns > 0: - print '%d missing scores in score_matrix. ' % empty_score_matrix_columns - - r.quit(save = "no") - -if __name__=="__main__":__main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/short_reads_figure_score.xml --- a/tools/metag_tools/short_reads_figure_score.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ - - - -short_reads_figure_score.py $input1 $output1 - - - - - - - - - - - - rpy - - - - - - - - - - - - - -.. class:: warningmark - -To use this tool, your dataset needs to be in the *Quality Score* format. Click the pencil icon next to your dataset to set the datatype to *Quality Score* (see below for examples). - ------ - -**What it does** - -This tool takes Quality Files generated by Roche (454), Illumina (Solexa), or ABI SOLiD machines and builds a graph showing score distribution like the one below. Such graph allows you to perform initial evaluation of data quality in a single pass. - ------ - -**Examples of Quality Data** - -Roche (454) or ABI SOLiD data:: - - >seq1 - 23 33 34 25 28 28 28 32 23 34 27 4 28 28 31 21 28 - -Illumina (Solexa) data:: - - -40 -40 40 -40 -40 -40 -40 40 - ------ - -**Output example** - -Quality scores are summarized as boxplot (Roche 454 FLX data): - -.. image:: ./static/images/short_reads_boxplot.png - -where the **X-axis** is coordinate along the read and the **Y-axis** is quality score adjusted to comply with the Phred score metric. Units on the X-axis depend on whether your data comes from Roche (454) or Illumina (Solexa) and ABI SOLiD machines: - - - For Roche (454) X-axis (shown above) indicates **relative** position (in %) within reads as this technology produces reads of different lengths; - - For Illumina (Solexa) and ABI SOLiD X-axis shows **absolute** position in nucleotides within reads. 
- -Every box on the plot shows the following values:: - - o <---- Outliers - o - -+- <---- Upper Extreme Value that is no more - | than box length away from the box - | - +--+--+ <---- Upper Quartile - | | - +-----+ <---- Median - | | - +--+--+ <---- Lower Quartile - | - | - -+- <---- Lower Extreme Value that is no more - than box length away from the box - o <---- Outlier - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/short_reads_trim_seq.py --- a/tools/metag_tools/short_reads_trim_seq.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,234 +0,0 @@ -#!/usr/bin/env python -""" -trim reads based on the quality scores -input: read file and quality score file -output: trimmed read file -""" - -import os, sys, math, tempfile, re - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def append_to_outfile( outfile_name, seq_title, segments ): - segments = segments.split( ',' ) - if len( segments ) > 1: - outfile = open( outfile_name, 'a' ) - for i in range( len( segments ) ): - outfile.write( "%s_%d\n%s\n" % ( seq_title, i, segments[i] ) ) - outfile.close() - elif segments[0]: - outfile = open( outfile_name, 'a' ) - outfile.write( "%s\n%s\n" % ( seq_title, segments[0] ) ) - outfile.close() - -def trim_seq( seq, score, arg, trim_score, threshold ): - seq_method = '454' - trim_pos = 0 - # trim after a certain position - if arg.isdigit(): - keep_homopolymers = False - trim_pos = int( arg ) - if trim_pos > 0 and trim_pos < len( seq ): - seq = seq[0:trim_pos] - else: - keep_homopolymers = arg=='yes' - - new_trim_seq = '' - max_segment = 0 - - for i in range( len( seq ) ): - if i >= len( score ): - score.append(-1) - if int( score[i] ) >= trim_score: - pass_nuc = seq[ i:( i + 1 ) ] - else: - if keep_homopolymers and ( (i == 0 ) or ( seq[ i:( i + 1 ) ].lower() == seq[ ( i - 1 ):i ].lower() ) ): - pass_nuc = seq[ i:( i + 1 ) ] - else: - pass_nuc = ' ' - new_trim_seq = '%s%s' % ( new_trim_seq, pass_nuc ) - # find the max substrings - segments = new_trim_seq.split() - max_segment = '' - len_max_segment = 0 - if threshold == 0: - for seg in segments: - if len_max_segment < len( seg ): - max_segment = '%s,' % seg - len_max_segment = len( seg ) - elif len_max_segment == len( seg ): - max_segment = '%s%s,' % ( max_segment, seg ) - else: - for seg in segments: - if len( seg ) >= threshold: - max_segment = '%s%s,' % ( max_segment, seg ) - return max_segment[ 0:-1 ] - -def __main__(): - - try: - threshold_trim = int( sys.argv[1].strip() ) - except: - stop_err( "Minimal quality score must be numeric." ) - try: - threshold_report = int( sys.argv[2].strip() ) - except: - stop_err( "Minimal length of trimmed reads must be numeric." 
) - outfile_seq_name = sys.argv[3].strip() - infile_seq_name = sys.argv[4].strip() - infile_score_name = sys.argv[5].strip() - arg = sys.argv[6].strip() - - seq_infile_name = infile_seq_name - score_infile_name = infile_score_name - - - # Determine quailty score format: tabular or fasta format within the first 100 lines - seq_method = None - data_type = None - for i, line in enumerate( file( score_infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if data_type == None: - if line.startswith( '>' ): - data_type = 'fasta' - continue - elif len( line.split( '\t' ) ) > 0: - fields = line.split() - for score in fields: - try: - int( score ) - data_type = 'tabular' - seq_method = 'solexa' - break - except: - break - elif data_type == 'fasta': - fields = line.split() - for score in fields: - try: - int( score ) - seq_method = '454' - break - except: - break - if i == 100: - break - - if data_type is None: - stop_err( 'This tool can only use fasta data or tabular data.' ) - if seq_method is None: - stop_err( 'Invalid data for fasta format.') - - if os.path.exists( seq_infile_name ) and os.path.exists( score_infile_name ): - seq = None - score = None - score_found = False - - score_file = open( score_infile_name, 'r' ) - - for i, line in enumerate( open( seq_infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if line.startswith( '>' ): - if seq: - scores = [] - if data_type == 'fasta': - score = None - score_found = False - score_line = 'start' - while not score_found and score_line: - score_line = score_file.readline().rstrip( '\r\n' ) - if not score_line or score_line.startswith( '#' ): - continue - if score_line.startswith( '>' ): - if score: - scores = score.split() - score_found = True - score = None - else: - for val in score_line.split(): - try: - int( val ) - except: - score_file.close() - stop_err( "Non-numerical value '%s' in score file." % val ) - if not score: - score = score_line - else: - score = '%s %s' % ( score, score_line ) - elif data_type == 'tabular': - score = score_file.readline().rstrip('\r\n') - loc = score.split( '\t' ) - for base in loc: - nuc_error = base.split() - try: - nuc_error[0] = int( nuc_error[0] ) - nuc_error[1] = int( nuc_error[1] ) - nuc_error[2] = int( nuc_error[2] ) - nuc_error[3] = int( nuc_error[3] ) - big = max( nuc_error ) - except: - score_file.close() - stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) ) - scores.append( big ) - if scores: - new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report ) - append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments ) - - seq_title = line - seq = None - else: - if not seq: - seq = line - else: - seq = "%s%s" % ( seq, line ) - if seq: - scores = [] - if data_type == 'fasta': - score = None - while score_line: - score_line = score_file.readline().rstrip( '\r\n' ) - if not score_line or score_line.startswith( '#' ) or score_line.startswith( '>' ): - continue - for val in score_line.split(): - try: - int( val ) - except: - score_file.close() - stop_err( "Non-numerical value '%s' in score file." 
% val ) - if not score: - score = score_line - else: - score = "%s %s" % ( score, score_line ) - if score: - scores = score.split() - elif data_type == 'tabular': - score = score_file.readline().rstrip('\r\n') - loc = score.split( '\t' ) - for base in loc: - nuc_error = base.split() - try: - nuc_error[0] = int( nuc_error[0] ) - nuc_error[1] = int( nuc_error[1] ) - nuc_error[2] = int( nuc_error[2] ) - nuc_error[3] = int( nuc_error[3] ) - big = max( nuc_error ) - except: - score_file.close() - stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) ) - scores.append( big ) - if scores: - new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report ) - append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments ) - score_file.close() - else: - stop_err( "Cannot locate sequence file '%s' or score file '%s'." % ( seq_infile_name, score_infile_name ) ) - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/short_reads_trim_seq.xml --- a/tools/metag_tools/short_reads_trim_seq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,93 +0,0 @@ - - - - - short_reads_trim_seq.py $trim $length $output1 $input1 $input2 $sequencing_method_choice.input3 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -To use this tool, your dataset needs to be in the *Quality Score* format. Click the pencil icon next to your dataset to set the datatype to *Quality Score* (see below for examples). - ------ - -**What it does** - -This tool finds high quality segments within sequencing reads generated by Roche (454), Illumina (Solexa), or ABI SOLiD machines. - ------ - -**Example** - - -Suppose this is your sequencing read:: - - 5'---------*-------------*------**----3' - -where **dashes** (-) are HIGH quality bases (above 20) and **asterisks** (*) are LOW quality bases (below 20). If the **Minimal length of contiguous segment** is set to **5** (of course, only for the purposes of this example), the tool will return:: - - 5'--------- - ------------- - ------- - -You can see that the tool simply splits the read on low quality bases and then returns all segments that are at least 5 bases long. **Note** that the output of this tool will likely contain a higher number of shorter sequences compared to the original input. 
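The splitting logic just described can be sketched in a few lines. This is an illustrative re-implementation under assumed names, not the tool's own trim_seq function (which additionally supports trimming at a fixed position and keeping low-quality homopolymer bases)::

    # Sketch: mask low-quality bases with spaces, split on them, filter by length.
    def split_on_low_quality(seq, scores, trim_score, min_len):
        masked = ''.join(b if q >= trim_score else ' '
                         for b, q in zip(seq, scores))
        segments = masked.split()
        if min_len == 0:                   # keep only the longest segment(s)
            longest = max([len(s) for s in segments] or [0])
            return [s for s in segments if len(s) == longest]
        return [s for s in segments if len(s) >= min_len]

    read = 'AAAAAAAAA' + 'C' + 'GGGGGGGGGGGGG' + 'T' + 'CCCCCC'
    quals = [30] * 9 + [4] + [30] * 13 + [4] + [30] * 6
    print(split_on_low_quality(read, quals, 20, 5))   # three segments survive
    print(split_on_low_quality(read, quals, 20, 0))   # only the longest segment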
If we set the **Minimal length of contiguous segment** to **0**, the tool will only return the single longest segment:: - - ------------- - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/shrimp_color_wrapper.py --- a/tools/metag_tools/shrimp_color_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,116 +0,0 @@ -#!/usr/bin/env python - -""" -SHRiMP wrapper : Color space -""" - -import os, sys, tempfile, os.path, re - -assert sys.version_info[:2] >= (2.4) - -def stop_err( msg ): - - sys.stderr.write( "%s\n" % msg ) - sys.exit() - - -def __main__(): - - # SHRiMP path - shrimp = 'rmapper-cs' - - # I/O - input_target_file = sys.argv[1] # fasta - input_query_file = sys.argv[2] - shrimp_outfile = sys.argv[3] # shrimp output - - # SHRiMP parameters - spaced_seed = '1111001111' - seed_matches_per_window = '2' - seed_hit_taboo_length = '4' - seed_generation_taboo_length = '0' - seed_window_length = '115.0' - max_hits_per_read = '100' - max_read_length = '1000' - kmer = '-1' - sw_match_value = '100' - sw_mismatch_value = '-150' - sw_gap_open_ref = '-400' - sw_gap_open_query = '-400' - sw_gap_ext_ref = '-70' - sw_gap_ext_query = '-70' - sw_crossover_penalty = '-140' - sw_full_hit_threshold = '68.0' - sw_vector_hit_threshold = '60.0' - - # TODO: put the threshold on each of these parameters - if len(sys.argv) > 4: - - try: - if sys.argv[4].isdigit(): - spaced_seed = sys.argv[4] - else: - stop_err('Error in assigning parameter: Spaced seed.') - except: - stop_err('Spaced seed must be a combination of 1s and 0s.') - - seed_matches_per_window = sys.argv[5] - seed_hit_taboo_length = sys.argv[6] - seed_generation_taboo_length = sys.argv[7] - seed_window_length = sys.argv[8] - max_hits_per_read = sys.argv[9] - max_read_length = sys.argv[10] - kmer = sys.argv[11] - sw_match_value = sys.argv[12] - sw_mismatch_value = sys.argv[13] - sw_gap_open_ref = sys.argv[14] - sw_gap_open_query = sys.argv[15] - sw_gap_ext_ref = sys.argv[16] - sw_gap_ext_query = sys.argv[17] - sw_crossover_penalty = sys.argv[18] - sw_full_hit_threshold = sys.argv[19] - sw_vector_hit_threshold = sys.argv[20] - - # temp file for shrimp log file - shrimp_log = tempfile.NamedTemporaryFile().name - - # SHRiMP command - command = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-x', sw_crossover_penalty, '-h', sw_full_hit_threshold, '-v', sw_vector_hit_threshold, input_query_file, input_target_file, '>', shrimp_outfile, '2>', shrimp_log]) - - try: - os.system(command) - except Exception, e: - if os.path.exists(query_fasta): os.remove(query_fasta) - if os.path.exists(query_qual): os.remove(query_qual) - stop_err(str(e)) - - # check SHRiMP output: count number of lines - num_hits = 0 - if shrimp_outfile: - for i, line in enumerate(file(shrimp_outfile)): - line = line.rstrip('\r\n') - if not line or line.startswith('#'): continue - try: - fields = line.split() - num_hits += 1 - except Exception, e: - stop_err(str(e)) - - if num_hits == 0: # no hits generated - err_msg = '' - if shrimp_log: - for i, line in enumerate(file(shrimp_log)): - if line.startswith('error'): # deal with memory error: - err_msg += line # error: realloc failed: Cannot allocate memory - if re.search('Reads 
Matched', line): # deal with zero hits - if int(line[8:].split()[2]) == 0: - err_msg = 'Zero hits found.\n' - stop_err('SHRiMP Failed due to:\n' + err_msg) - - - # remove temp. files - if os.path.exists(shrimp_log): os.remove(shrimp_log) - - -if __name__ == '__main__': __main__() - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/shrimp_color_wrapper.xml --- a/tools/metag_tools/shrimp_color_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,181 +0,0 @@ - - reads mapping against reference sequence - - #if $param.skip_or_full=="skip" #shrimp_color_wrapper.py $input_target $input_query $output1 - #else #shrimp_color_wrapper.py $input_target $input_query $output1 $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_crossover_penalty $param.sw_full_hit_threshold $param.sw_vector_hit_threshold - #end if# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rmapper-cs - - - - - - - - - - - -.. class:: warningmark - -To use this tool your dataset needs to be in the *csfasta* (as ABI SOLiD color-space sequences) format. Click pencil icon next to your dataset to set the datatype to *csfasta*. - - ------ - -**What it does** - -SHRiMP (SHort Read Mapping Package) is a software package for aligning genomic reads against a target genome. - - ------ - -**Input formats** - -A multiple color-space file, for example:: - - >2_263_779_F3 - T132032030200202202003211302222202230022110222 - - ------ - -**Outputs** - -The tool returns the default SHRiMP output:: - - - 1 2 3 4 5 6 7 8 9 10 - -------------------------------------------------------------------------------------------------------------------- - >2_263_779_F3 Streptococcus_suis + 814344 814388 1 45 45 3660 8x19x3x2x6x4x3 - -where:: - - 1. (>2_263_779_F3) - Read id - 2. (Streptococcus_suis) - Reference sequence id - 3. (+) - Strand of the read - 4. (814344) - Start position of the alignment in the reference - 5. (814388) - End position of the alignment in the reference - 6. (1) - Start position of the alignment in the read - 7. (45) - End position of the alignment in the read - 8. (45) - Length of the read - 9. (3660) - Score - 10. (8x19x3x2x6x4x3) - Edit string - - ------ - -**SHRiMP parameter list** - -The commonly used parameters with default value setting:: - - -s Spaced Seed (default: 111111011111) - The spaced seed is a single contiguous string of 0's and 1's. - 0's represent wildcards, or positions which will always be - considered as matching, whereas 1's dictate positions that - must match. A string of all 1's will result in a simple kmer scan. - -n Seed Matches per Window (default: 2) - The number of seed matches per window dictates how many seeds - must match within some window length of the genome before that - region is considered for Smith-Waterman alignment. A lower - value will increase sensitivity while drastically increasing - running time. Higher values will have the opposite effect. - -t Seed Hit Taboo Length (default: 4) - The seed taboo length specifies how many target genome bases - or colours must exist prior to a previous seed match in order - to count another seed match as a hit. 
- -9 Seed Generation Taboo Length (default: 0) - - -w Seed Window Length (default: 115.00%) - This parameter specifies the genomic span in bases (or colours) - in which *seed_matches_per_window* must exist before the read - is given consideration by the Smith-Waterman alignment machinery. - -o Maximum Hits per Read (default: 100) - This parameter specifies how many hits to remember for each read. - If more hits are encountered, ones with lower scores are dropped - to make room. - -r Maximum Read Length (default: 1000) - This parameter specifies the maximum length of reads that will - be encountered in the dataset. If larger reads than the default - are used, an appropriate value must be passed to *rmapper*. - -d Kmer Std. Deviation Limit (default: -1 [None]) - This option permits pruning read kmers, which occur with - frequencies greater than *kmer_std_dev_limit* standard - deviations above the average. This can shorten running - time at the cost of some sensitivity. - *Note*: A negative value disables this option. - -m S-W Match Value (default: 100) - The value applied to matches during the Smith-Waterman score calculation. - -i S-W Mismatch Value (default: -150) - The value applied to mismatches during the Smith-Waterman - score calculation. - -g S-W Gap Open Penalty (Reference) (default: -400) - The value applied to gap opens along the reference sequence - during the Smith-Waterman score calculation. - *Note*: For backward compatibility, if -g is set - and -q is not set, the gap open penalty for the query will - be set to the same value as specified for the reference. - -q S-W Gap Open Penalty (Query) (default: -400) - The value applied to gap opens along the query sequence during - the Smith-Waterman score calculation. - -e S-W Gap Extend Penalty (Reference) (default: -70) - The value applied to gap extends during the Smith-Waterman score calculation. - *Note*: For backward compatibility, if -e is set - and -f is not set, the gap extend penalty for the query will - be set to the same value as specified for the reference. - -f S-W Gap Extend Penalty (Query) (default: -70) - The value applied to gap extends during the Smith-Waterman score calculation. - -x S-W Crossover Penalty (default: -140) - -h S-W Full Hit Threshold (default: 68.00%) - In letter-space, this parameter determines the threshold - score for both vectored and full Smith-Waterman alignments. - Any values less than this quantity will be thrown away. - *Note* This option differs slightly in meaning between letter-space and color-space. - -v S-W Vector Hit Threshold (default: 60.00%) - - ------ - -**Reference** - - **SHRiMP**: Stephen M. Rumble, Michael Brudno, Phil Lacroute, Vladimir Yanovsky, Marc Fiume, Adrian Dalca. shrimp at cs dot toronto dot edu. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/shrimp_wrapper.py --- a/tools/metag_tools/shrimp_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,624 +0,0 @@ -#!/usr/bin/env python - -""" -TODO -1. decrease memory usage -2. multi-fasta fastq file, ex. 454 -3. split reads into small chunks? - -SHRiMP wrapper - -Inputs: -1. reference seq -2. reads - -Outputs: -1. table of 8 columns: - chrom ref_loc read_id read_loc ref_nuc read_nuc quality coverage -2. 
SHRiMP output - -Parameters: - -s Spaced Seed (default: 111111011111) - -n Seed Matches per Window (default: 2) - -t Seed Hit Taboo Length (default: 4) - -9 Seed Generation Taboo Length (default: 0) - -w Seed Window Length (default: 115.00%) - -o Maximum Hits per Read (default: 100) - -r Maximum Read Length (default: 1000) - -d Kmer Std. Deviation Limit (default: -1 [None]) - - -m S-W Match Value (default: 100) - -i S-W Mismatch Value (default: -150) - -g S-W Gap Open Penalty (Reference) (default: -400) - -q S-W Gap Open Penalty (Query) (default: -400) - -e S-W Gap Extend Penalty (Reference) (default: -70) - -f S-W Gap Extend Penalty (Query) (default: -70) - -h S-W Hit Threshold (default: 68.00%) - -Command: -%rmapper -s spaced_seed -n seed_matches_per_window -t seed_hit_taboo_length -9 seed_generation_taboo_length -w seed_window_length -o max_hits_per_read -r max_read_length -d kmer -m sw_match_value -i sw_mismatch_value -g sw_gap_open_ref -q sw_gap_open_query -e sw_gap_ext_ref -f sw_gap_ext_query -h sw_hit_threshold > 2> - -SHRiMP output: ->7:2:1147:982/1 chr3 + 36586562 36586595 2 35 36 2900 3G16G13 ->7:2:1147:982/1 chr3 + 95338194 95338225 4 35 36 2700 9T7C14 ->7:2:587:93/1 chr3 + 14913541 14913577 1 35 36 2960 19--16 - -""" - -import os, sys, tempfile, os.path, re - -assert sys.version_info[:2] >= (2.4) - -def stop_err( msg ): - - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def reverse_complement(s): - - complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":".", "-":"-"} - reversed_s = [] - for i in s: - reversed_s.append(complement_dna[i]) - reversed_s.reverse() - return "".join(reversed_s) - -def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read, insertion_size): - - invalid_editstring_char = 0 - - all_score_file = score_files.split(',') - - if len(all_score_file) != hit_per_read: stop_err('One or more query files is missing. 
Please check your dataset.') - - temp_table_name = tempfile.NamedTemporaryFile().name - temp_table = open(temp_table_name, 'w') - - outfile = open(table_outfile,'w') - - # reference seq: not a single fasta seq - refseq = {} - chrom_cov = {} - seq = '' - - for i, line in enumerate(file(ref_file)): - line = line.rstrip() - if not line or line.startswith('#'): continue - - if line.startswith('>'): - if seq: - if refseq.has_key(title): - pass - else: - refseq[title] = seq - chrom_cov[title] = {} - seq = '' - title = line[1:] - else: - seq += line - if seq: - if not refseq.has_key(title): - refseq[title] = seq - chrom_cov[title] = {} - - # find hits : one end and/or the other - hits = {} - for i, line in enumerate(file(result_file)): - line = line.rstrip() - if not line or line.startswith('#'): continue - - #FORMAT: readname contigname strand contigstart contigend readstart readend readlength score editstring - fields = line.split('\t') - readname = fields[0][1:] - chrom = fields[1] - strand = fields[2] - chrom_start = int(fields[3]) - 1 - chrom_end = int(fields[4]) - read_start = fields[5] - read_end = fields[6] - read_len = fields[7] - score = fields[8] - editstring = fields[9] - - if hit_per_read == 1: - endindex = '1' - else: - readname, endindex = readname.split('/') - - if hits.has_key(readname): - if hits[readname].has_key(endindex): - hits[readname][endindex].append([strand, editstring, chrom_start, chrom_end, read_start, chrom]) - else: - hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]] - else: - hits[readname] = {} - hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]] - - # find score : one end and the other end - hits_score = {} - readname = '' - score = '' - for num_score_file in range(len(all_score_file)): - score_file = all_score_file[num_score_file] - for i, line in enumerate(file(score_file)): - line = line.rstrip() - if not line or line.startswith('#'): continue - - if line.startswith('>'): - if score: - if hits.has_key(readname): - if len(hits[readname]) == hit_per_read: - if hits_score.has_key(readname): - if hits_score[readname].has_key(endindex): - pass - else: - hits_score[readname][endindex] = score - else: - hits_score[readname] = {} - hits_score[readname][endindex] = score - score = '' - if hit_per_read == 1: - readname = line[1:] - endindex = '1' - else: - readname, endindex = line[1:].split('/') - else: - score = line - - if score: # the last one - if hits.has_key(readname): - if len(hits[readname]) == hit_per_read: - if hits_score.has_key(readname): - if hits_score[readname].has_key(endindex): - pass - else: - hits_score[readname][endindex] = score - else: - hits_score[readname] = {} - hits_score[readname][endindex] = score - - # call to all mappings - for readkey in hits.keys(): - if len(hits[readkey]) != hit_per_read: continue - - matches = [] - match_count = 0 - - if hit_per_read == 1: - if len(hits[readkey]['1']) == 1: - matches = [ hits[readkey]['1'] ] - match_count = 1 - else: - end1_data = hits[readkey]['1'] - end2_data = hits[readkey]['2'] - - for i, end1_hit in enumerate(end1_data): - crin_strand = {'+': False, '-': False} - crin_insertSize = {'+': False, '-': False} - - crin_strand[end1_hit[0]] = True - crin_insertSize[end1_hit[0]] = int(end1_hit[2]) - - for j, end2_hit in enumerate(end2_data): - crin_strand[end2_hit[0]] = True - crin_insertSize[end2_hit[0]] = int(end2_hit[2]) - - if end1_hit[-1] != end2_hit[-1] : continue - - if crin_strand['+'] and crin_strand['-']: - if 
(crin_insertSize['-'] - crin_insertSize['+']) <= insertion_size: - matches.append([end1_hit, end2_hit]) - match_count += 1 - - if match_count == 1: - - for x, end_data in enumerate(matches[0]): - - end_strand, end_editstring, end_chr_start, end_chr_end, end_read_start, end_chrom = end_data - end_read_start = int(end_read_start) - 1 - - if end_strand == '-': - refsegment = reverse_complement(refseq[end_chrom][end_chr_start:end_chr_end]) - else: - refsegment = refseq[end_chrom][end_chr_start:end_chr_end] - - match_len = 0 - editindex = 0 - gap_read = 0 - - while editindex < len(end_editstring): - - editchr = end_editstring[editindex] - chrA = '' - chrB = '' - locIndex = [] - - if editchr.isdigit(): - editcode = '' - - while editchr.isdigit() and editindex < len(end_editstring): - editcode += editchr - editindex += 1 - if editindex < len(end_editstring): editchr = end_editstring[editindex] - - for baseIndex in range(int(editcode)): - chrA += refsegment[match_len+baseIndex] - chrB = chrA - - match_len += int(editcode) - - elif editchr == 'x': - # crossover: inserted between the appropriate two bases - # Two sequencing errors: 4x15x6 (25 matches with 2 crossovers) - # Treated as errors in the reads; Do nothing. - editindex += 1 - - elif editchr.isalpha(): - editcode = editchr - editindex += 1 - chrA = refsegment[match_len] - chrB = editcode - match_len += len(editcode) - - elif editchr == '-': - editcode = editchr - editindex += 1 - chrA = refsegment[match_len] - chrB = editcode - match_len += len(editcode) - gap_read += 1 - - elif editchr == '(': - editcode = '' - - while editchr != ')' and editindex < len(end_editstring): - if editindex < len(end_editstring): editchr = end_editstring[editindex] - editcode += editchr - editindex += 1 - - editcode = editcode[1:-1] - chrA = '-'*len(editcode) - chrB = editcode - - else: - invalid_editstring_char += 1 - - if end_strand == '-': - - chrA = reverse_complement(chrA) - chrB = reverse_complement(chrB) - - pos_line = '' - rev_line = '' - - for mappingIndex in range(len(chrA)): - # reference - chrAx = chrA[mappingIndex] - # read - chrBx = chrB[mappingIndex] - - if chrAx and chrBx and chrBx.upper() != 'N': - - if end_strand == '+': - - chrom_loc = end_chr_start+match_len-len(chrA)+mappingIndex - read_loc = end_read_start+match_len-len(chrA)+mappingIndex-gap_read - - if chrAx == '-': chrom_loc -= 1 - - if chrBx == '-': - scoreBx = '-1' - else: - scoreBx = hits_score[readkey][str(x+1)].split()[read_loc] - - # 1-based on chrom_loc and read_loc - pos_line = pos_line + '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) + '\n' - - else: - - chrom_loc = end_chr_end-match_len+mappingIndex - read_loc = end_read_start+match_len-1-mappingIndex-gap_read - - if chrAx == '-': chrom_loc -= 1 - - if chrBx == '-': - scoreBx = '-1' - else: - scoreBx = hits_score[readkey][str(x+1)].split()[read_loc] - - # 1-based on chrom_loc and read_loc - rev_line = '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) +'\n' + rev_line - - if chrom_cov.has_key(end_chrom): - - if chrom_cov[end_chrom].has_key(chrom_loc): - chrom_cov[end_chrom][chrom_loc] += 1 - else: - chrom_cov[end_chrom][chrom_loc] = 1 - - else: - - chrom_cov[end_chrom] = {} - chrom_cov[end_chrom][chrom_loc] = 1 - - if pos_line: temp_table.write('%s\n' %(pos_line.rstrip('\r\n'))) - if rev_line: temp_table.write('%s\n' %(rev_line.rstrip('\r\n'))) - - temp_table.close() - - # chrom-wide coverage - for i, line in 
enumerate(open(temp_table_name)): - - line = line.rstrip() - if not line or line.startswith('#'): continue - - fields = line.split() - chrom = fields[0] - eachBp = int(fields[1]) - readname = fields[2] - - if hit_per_read == 1: - fields[2] = readname.split('/')[0] - - if chrom_cov[chrom].has_key(eachBp): - outfile.write('%s\t%d\n' %('\t'.join(fields), chrom_cov[chrom][eachBp])) - else: - outfile.write('%s\t%d\n' %('\t'.join(fields), 0)) - - outfile.close() - - if os.path.exists(temp_table_name): os.remove(temp_table_name) - - if invalid_editstring_char: - print 'Skip ', invalid_editstring_char, ' invalid characters in editstrings' - - return True - -def convert_fastqsolexa_to_fasta_qual(infile_name, query_fasta, query_qual): - - outfile_seq = open( query_fasta, 'w' ) - outfile_score = open( query_qual, 'w' ) - - seq_title_startswith = '' - qual_title_startswith = '' - - default_coding_value = 64 # Solexa ascii-code - fastq_block_lines = 0 - - for i, line in enumerate( file( infile_name ) ): - line = line.rstrip() - if not line or line.startswith( '#' ): continue - - fastq_block_lines = ( fastq_block_lines + 1 ) % 4 - line_startswith = line[0:1] - - if fastq_block_lines == 1: - # first line is @title_of_seq - if not seq_title_startswith: - seq_title_startswith = line_startswith - - if line_startswith != seq_title_startswith: - outfile_seq.close() - outfile_score.close() - stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) ) - - read_title = line[1:] - outfile_seq.write( '>%s\n' % line[1:] ) - - elif fastq_block_lines == 2: - # second line is nucleotides - read_length = len( line ) - outfile_seq.write( '%s\n' % line ) - - elif fastq_block_lines == 3: - # third line is +title_of_qualityscore ( might be skipped ) - if not qual_title_startswith: - qual_title_startswith = line_startswith - - if line_startswith != qual_title_startswith: - outfile_seq.close() - outfile_score.close() - stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) ) - - quality_title = line[1:] - if quality_title and read_title != quality_title: - outfile_seq.close() - outfile_score.close() - stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) ) - - if not quality_title: - outfile_score.write( '>%s\n' % read_title ) - else: - outfile_score.write( '>%s\n' % line[1:] ) - - else: - # fourth line is quality scores - qual = '' - fastq_integer = True - # peek: ascii or digits? - val = line.split()[0] - try: - check = int( val ) - fastq_integer = True - except: - fastq_integer = False - - if fastq_integer: - # digits - qual = line - else: - # ascii - quality_score_length = len( line ) - if quality_score_length == read_length + 1: - # first char is qual_score_startswith - qual_score_startswith = ord( line[0:1] ) - line = line[1:] - elif quality_score_length == read_length: - qual_score_startswith = default_coding_value - else: - stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' 
% ( i + 1, quality_score_length, read_length ) ) - - for j, char in enumerate( line ): - score = ord( char ) - qual_score_startswith # 64 - qual = "%s%s " % ( qual, str( score ) ) - - outfile_score.write( '%s\n' % qual ) - - outfile_seq.close() - outfile_score.close() - - return True - -def __main__(): - - # SHRiMP path - shrimp = 'rmapper-ls' - - # I/O - input_target_file = sys.argv[1] # fasta - shrimp_outfile = sys.argv[2] # shrimp output - table_outfile = sys.argv[3] # table output - single_or_paired = sys.argv[4].split(',') - - insertion_size = 600 - - if len(single_or_paired) == 1: # single or paired - type_of_reads = 'single' - hit_per_read = 1 - input_query = single_or_paired[0] - query_fasta = tempfile.NamedTemporaryFile().name - query_qual = tempfile.NamedTemporaryFile().name - - else: # paired-end - type_of_reads = 'paired' - hit_per_read = 2 - input_query_end1 = single_or_paired[0] - input_query_end2 = single_or_paired[1] - insertion_size = int(single_or_paired[2]) - query_fasta_end1 = tempfile.NamedTemporaryFile().name - query_fasta_end2 = tempfile.NamedTemporaryFile().name - query_qual_end1 = tempfile.NamedTemporaryFile().name - query_qual_end2 = tempfile.NamedTemporaryFile().name - - # SHRiMP parameters: total = 15, default values - spaced_seed = '111111011111' - seed_matches_per_window = '2' - seed_hit_taboo_length = '4' - seed_generation_taboo_length = '0' - seed_window_length = '115.0' - max_hits_per_read = '100' - max_read_length = '1000' - kmer = '-1' - sw_match_value = '100' - sw_mismatch_value = '-150' - sw_gap_open_ref = '-400' - sw_gap_open_query = '-400' - sw_gap_ext_ref = '-70' - sw_gap_ext_query = '-70' - sw_hit_threshold = '68.0' - - # TODO: put the threshold on each of these parameters - if len(sys.argv) > 5: - - try: - if sys.argv[5].isdigit(): - spaced_seed = sys.argv[5] - else: - stop_err('Error in assigning parameter: Spaced seed.') - except: - stop_err('Spaced seed must be a combination of 1s and 0s.') - - seed_matches_per_window = sys.argv[6] - seed_hit_taboo_length = sys.argv[7] - seed_generation_taboo_length = sys.argv[8] - seed_window_length = sys.argv[9] - max_hits_per_read = sys.argv[10] - max_read_length = sys.argv[11] - kmer = sys.argv[12] - sw_match_value = sys.argv[13] - sw_mismatch_value = sys.argv[14] - sw_gap_open_ref = sys.argv[15] - sw_gap_open_query = sys.argv[16] - sw_gap_ext_ref = sys.argv[17] - sw_gap_ext_query = sys.argv[18] - sw_hit_threshold = sys.argv[19] - - # temp file for shrimp log file - shrimp_log = tempfile.NamedTemporaryFile().name - - # convert fastq to fasta and quality score files - if type_of_reads == 'single': - return_value = convert_fastqsolexa_to_fasta_qual(input_query, query_fasta, query_qual) - else: - return_value = convert_fastqsolexa_to_fasta_qual(input_query_end1, query_fasta_end1, query_qual_end1) - return_value = convert_fastqsolexa_to_fasta_qual(input_query_end2, query_fasta_end2, query_qual_end2) - - # SHRiMP command - if type_of_reads == 'single': - command = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target_file, '>', shrimp_outfile, '2>', shrimp_log]) - - try: - os.system(command) - except Exception, e: - if os.path.exists(query_fasta): 
os.remove(query_fasta) - if os.path.exists(query_qual): os.remove(query_qual) - stop_err(str(e)) - - else: # paired - command_end1 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target_file, '>', shrimp_outfile, '2>', shrimp_log]) - command_end2 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target_file, '>>', shrimp_outfile, '2>>', shrimp_log]) - - try: - os.system(command_end1) - os.system(command_end2) - except Exception, e: - if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1) - if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2) - if os.path.exists(query_qual_end1): os.remove(query_qual_end1) - if os.path.exists(query_qual_end2): os.remove(query_qual_end2) - stop_err(str(e)) - - # check SHRiMP output: count number of lines - num_hits = 0 - if shrimp_outfile: - for i, line in enumerate(file(shrimp_outfile)): - line = line.rstrip('\r\n') - if not line or line.startswith('#'): continue - try: - fields = line.split() - num_hits += 1 - except Exception, e: - stop_err(str(e)) - - if num_hits == 0: # no hits generated - err_msg = '' - if shrimp_log: - for i, line in enumerate(file(shrimp_log)): - if line.startswith('error'): # deal with memory error: - err_msg += line # error: realloc failed: Cannot allocate memory - if re.search('Reads Matched', line): # deal with zero hits - if int(line[8:].split()[2]) == 0: - err_msg = 'Zero hits found.\n' - stop_err('SHRiMP Failed due to:\n' + err_msg) - - # convert to table - if type_of_reads == 'single': - return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual, table_outfile, hit_per_read, insertion_size) - else: - return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual_end1+','+query_qual_end2, table_outfile, hit_per_read, insertion_size) - - # remove temp. 
files - if type_of_reads == 'single': - if os.path.exists(query_fasta): os.remove(query_fasta) - if os.path.exists(query_qual): os.remove(query_qual) - else: - if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1) - if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2) - if os.path.exists(query_qual_end1): os.remove(query_qual_end1) - if os.path.exists(query_qual_end2): os.remove(query_qual_end2) - - if os.path.exists(shrimp_log): os.remove(shrimp_log) - - -if __name__ == '__main__': __main__() - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/shrimp_wrapper.xml --- a/tools/metag_tools/shrimp_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,279 +0,0 @@ - - reads mapping against reference sequence - - #if ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="skip") #shrimp_wrapper.py $input_target $output1 $output2 $input_query - #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="skip") #shrimp_wrapper.py $input_target $output1 $output2 $type_of_reads.input1,$type_of_reads.input2,$type_of_reads.insertion_size - #elif ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="full") #shrimp_wrapper.py $input_target $output1 $output2 $input_query $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold - #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="full") #shrimp_wrapper.py $input_target $output1 $output2 $type_of_reads.input1,$type_of_reads.input2,$type_of_reads.insertion_size $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold - #end if# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rmapper-ls - - - - - - - - - - - - - -.. class:: warningmark - -IMPORTANT: This tool currently only supports data where the quality scores are integers or ASCII quality scores with base 64. Click pencil icon next to your dataset to set datatype to *fastqsolexa*. - - ------ - -**What it does** - -SHRiMP (SHort Read Mapping Package) is a software package for aligning genomic reads against a target genome. - -This wrapper post-processes the default SHRiMP/rmapper-ls output and generates a table with all information from reads and reference for the mapping. The tool takes single- or paired-end reads. For single-end reads, only uniquely mapped alignment is considered. In paired-end reads, only pairs that meet the following criteria will be used to generate the table: 1). the ends fall within the insertion size; 2). the ends are mapped at the opposite directions. If there are still multiple mappings after applying the criteria, this paired-end read will be discarded. - - ------ - -**Input formats** - -A multiple-fastq file, for example:: - - @seq1 - TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTT - +seq1 - hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh - - ------ - -**Outputs** - -The tool gives two outputs. 
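Both outputs are described below. To make the post-processing step concrete first: a minimal sketch (a hypothetical helper, not part of the deleted shrimp_wrapper.py above, which does this inline in generate_sub_table()) of how one rmapper-ls hit line is split into the named fields the wrapper consumes; the field layout follows the FORMAT comment in that function::

  def parse_rmapper_hit(line):
      # FORMAT: readname contigname strand contigstart contigend
      #         readstart readend readlength score editstring
      fields = line.rstrip().split('\t')
      return {
          'readname':    fields[0].lstrip('>'),   # reads are reported as '>name'
          'chrom':       fields[1],
          'strand':      fields[2],
          'chrom_start': int(fields[3]) - 1,      # 0-based, as in the wrapper
          'chrom_end':   int(fields[4]),
          'read_start':  fields[5],
          'read_end':    fields[6],
          'read_len':    fields[7],
          'score':       fields[8],
          'editstring':  fields[9],
      }

  # e.g. parse_rmapper_hit('>seq1\tchrM\t+\t3644\t3679\t1\t36\t36\t3600\t36')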
-
-**Table output**
-
-Table output contains 8 columns::
-
-     1      2     3  4  5  6   7  8
-  ----------------------------------------------------
-  chrM  14711  seq1  0  T  A  40  1
-  chrM  14712  seq1  1  T  T  40  1
-
-where::
-
-  1. (chrM)  - Reference sequence id
-  2. (14711) - Position of the mapping in the reference
-  3. (seq1)  - Read id
-  4. (0)     - Position of the mapping in the read
-  5. (T)     - Nucleotide in the reference
-  6. (A)     - Nucleotide in the read
-  7. (40)    - Quality score for the nucleotide at that position of the read
-  8. (1)     - The number of times this position is covered by reads
-
-
-**SHRiMP output**
-
-This is the default output from SHRiMP/rmapper-ls::
-
-     1     2  3     4     5  6   7   8     9  10
-  -------------------------------------------------------------------
-  seq1  chrM  +  3644  3679  1  36  36  3600  36
-
-where::
-
-  1.  (seq1) - Read id
-  2.  (chrM) - Reference sequence id
-  3.  (+)    - Strand of the read
-  4.  (3644) - Start position of the alignment in the reference
-  5.  (3679) - End position of the alignment in the reference
-  6.  (1)    - Start position of the alignment in the read
-  7.  (36)   - End position of the alignment in the read
-  8.  (36)   - Length of the read
-  9.  (3600) - Score
-  10. (36)   - Edit string
-
-
------
-
-**SHRiMP parameter list**
-
-The commonly used parameters and their default values::
-
-  -s    Spaced Seed                          (default: 111111011111)
-        The spaced seed is a single contiguous string of 0's and 1's.
-        0's represent wildcards, or positions which will always be
-        considered as matching, whereas 1's dictate positions that
-        must match. A string of all 1's will result in a simple kmer scan.
-  -n    Seed Matches per Window              (default: 2)
-        The number of seed matches per window dictates how many seeds
-        must match within some window length of the genome before that
-        region is considered for Smith-Waterman alignment. A lower
-        value will increase sensitivity while drastically increasing
-        running time. Higher values will have the opposite effect.
-  -t    Seed Hit Taboo Length                (default: 4)
-        The seed taboo length specifies how many target genome bases
-        or colors must exist prior to a previous seed match in order
-        to count another seed match as a hit.
-  -9    Seed Generation Taboo Length         (default: 0)
-
-  -w    Seed Window Length                   (default: 115.00%)
-        This parameter specifies the genomic span in bases (or colours)
-        in which *seed_matches_per_window* must exist before the read
-        is given consideration by the Smith-Waterman alignment machinery.
-  -o    Maximum Hits per Read                (default: 100)
-        This parameter specifies how many hits to remember for each read.
-        If more hits are encountered, ones with lower scores are dropped
-        to make room.
-  -r    Maximum Read Length                  (default: 1000)
-        This parameter specifies the maximum length of reads that will
-        be encountered in the dataset. If larger reads than the default
-        are used, an appropriate value must be passed to *rmapper*.
-  -d    Kmer Std. Deviation Limit            (default: -1 [None])
-        This option permits pruning read kmers which occur with
-        frequencies greater than *kmer_std_dev_limit* standard
-        deviations above the average. This can shorten running
-        time at the cost of some sensitivity.
-        *Note*: A negative value disables this option.
-  -m    S-W Match Value                      (default: 100)
-        The value applied to matches during the Smith-Waterman score calculation.
-  -i    S-W Mismatch Value                   (default: -150)
-        The value applied to mismatches during the Smith-Waterman
-        score calculation.
-  -g    S-W Gap Open Penalty (Reference)     (default: -400)
-        The value applied to gap opens along the reference sequence
-        during the Smith-Waterman score calculation.
-        *Note*: For backward compatibility, if -g is set
-        and -q is not set, the gap open penalty for the query will
-        be set to the same value as specified for the reference.
-  -q    S-W Gap Open Penalty (Query)         (default: -400)
-        The value applied to gap opens along the query sequence during
-        the Smith-Waterman score calculation.
-  -e    S-W Gap Extend Penalty (Reference)   (default: -70)
-        The value applied to gap extends during the Smith-Waterman score calculation.
-        *Note*: For backward compatibility, if -e is set
-        and -f is not set, the gap extend penalty for the query will
-        be set to the same value as specified for the reference.
-  -f    S-W Gap Extend Penalty (Query)       (default: -70)
-        The value applied to gap extends during the Smith-Waterman score calculation.
-  -h    S-W Hit Threshold                    (default: 68.00%)
-        In letter-space, this parameter determines the threshold
-        score for both vectored and full Smith-Waterman alignments.
-        Any values less than this quantity will be thrown away.
-        *Note*: This option differs slightly in meaning between letter-space and color-space.
-
-
------
-
-**Reference**
-
- **SHRiMP**: Stephen M. Rumble, Michael Brudno, Phil Lacroute, Vladimir Yanovsky, Marc Fiume, Adrian Dalca. shrimp at cs dot toronto dot edu.
-
-
-
diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/split_paired_reads.py
--- a/tools/metag_tools/split_paired_reads.py Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,54 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Split fixed length paired end reads
-"""
-
-import os, sys
-
-if __name__ == '__main__':
-
-    infile = sys.argv[1]
-    outfile_end1 = open(sys.argv[2], 'w')
-    outfile_end2 = open(sys.argv[3], 'w')
-
-    i = 0
-
-    for line in file( infile ):
-        line = line.rstrip()
-
-        if not line:
-            continue
-
-        end1 = ''
-        end2 = ''
-
-        line_index = i % 4
-
-        if line_index == 0:
-            end1 = line + '/1'
-            end2 = line + '/2'
-
-        elif line_index == 1:
-            seq_len = len(line)/2
-            end1 = line[0:seq_len]
-            end2 = line[seq_len:]
-
-        elif line_index == 2:
-            end1 = line + '/1'
-            end2 = line + '/2'
-
-        else:
-            qual_len = len(line)/2
-            end1 = line[0:qual_len]
-            end2 = line[qual_len:]
-
-        outfile_end1.write('%s\n' %(end1))
-        outfile_end2.write('%s\n' %(end2))
-
-        i += 1
-
-    if i % 4 != 0 :
-        sys.stderr.write("WARNING: Number of lines in the input file was not divisible by 4.\nCheck consistency of the input fastq file.\n")
-    outfile_end1.close()
-    outfile_end2.close()
\ No newline at end of file
diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/split_paired_reads.xml
--- a/tools/metag_tools/split_paired_reads.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,57 +0,0 @@
-
-
-
-    split_paired_reads.py $input $output1 $output2
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-Splits a single fastq dataset representing a paired-end run into two datasets (one for each end). This tool works only for datasets where both ends have **the same** length.
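To make the splitting rule concrete, here is a condensed sketch of the per-record logic from split_paired_reads.py above (same behavior, written as a standalone function; the function name is ours, not the tool's)::

  def split_record(title, seq, qual_title, qual):
      # Title lines get '/1' and '/2' suffixes; the sequence and
      # quality lines are simply cut in half, which is why both
      # ends must have the same length.
      half = len(seq) // 2
      end1 = (title + '/1', seq[:half], qual_title + '/1', qual[:half])
      end2 = (title + '/2', seq[half:], qual_title + '/2', qual[half:])
      return end1, end2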
- ------ - -**Input formats** - -A multiple-fastq file, for example:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758 - GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA - +HWI-EAS91_1_30788AAXX:7:21:1542:1758 - hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR - - ------ - -**Outputs** - -One end:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 - GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC - +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 - hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh - -The other end:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 - GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA - +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 - hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR - - - diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/cca.py --- a/tools/multivariate_stats/cca.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,159 +0,0 @@ -#!/usr/bin/env python - -from galaxy import eggs -import sys, string -from rpy import * -import numpy - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -infile = sys.argv[1] -x_cols = sys.argv[2].split(',') -y_cols = sys.argv[3].split(',') - -x_scale = x_center = "FALSE" -if sys.argv[4] == 'both': - x_scale = x_center = "TRUE" -elif sys.argv[4] == 'center': - x_center = "TRUE" -elif sys.argv[4] == 'scale': - x_scale = "TRUE" - -y_scale = y_center = "FALSE" -if sys.argv[5] == 'both': - y_scale = y_center = "TRUE" -elif sys.argv[5] == 'center': - y_center = "TRUE" -elif sys.argv[5] == 'scale': - y_scale = "TRUE" - -std_scores = "FALSE" -if sys.argv[6] == "yes": - std_scores = "TRUE" - -outfile = sys.argv[7] -outfile2 = sys.argv[8] - -fout = open(outfile,'w') -elems = [] -for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - -if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." 
) - -x_vals = [] - -for k,col in enumerate(x_cols): - x_cols[k] = int(col)-1 - x_vals.append([]) - -y_vals = [] - -for k,col in enumerate(y_cols): - y_cols[k] = int(col)-1 - y_vals.append([]) - -skipped = 0 -for ind,line in enumerate( file( infile )): - if line and not line.startswith( '#' ): - try: - fields = line.strip().split("\t") - valid_line = True - for col in x_cols+y_cols: - try: - assert float(fields[col]) - except: - skipped += 1 - valid_line = False - break - if valid_line: - for k,col in enumerate(x_cols): - try: - xval = float(fields[col]) - except: - xval = NaN# - x_vals[k].append(xval) - for k,col in enumerate(y_cols): - try: - yval = float(fields[col]) - except: - yval = NaN# - y_vals[k].append(yval) - except: - skipped += 1 - -x_vals1 = numpy.asarray(x_vals).transpose() -y_vals1 = numpy.asarray(y_vals).transpose() - -x_dat= r.list(array(x_vals1)) -y_dat= r.list(array(y_vals1)) - -try: - r.suppressWarnings(r.library("yacca")) -except: - stop_err("Missing R library yacca.") - -set_default_mode(NO_CONVERSION) -try: - xcolnames = ["c%d" %(el+1) for el in x_cols] - ycolnames = ["c%d" %(el+1) for el in y_cols] - cc = r.cca(x=x_dat, y=y_dat, xlab=xcolnames, ylab=ycolnames, xcenter=r(x_center), ycenter=r(y_center), xscale=r(x_scale), yscale=r(y_scale), standardize_scores=r(std_scores)) - ftest = r.F_test_cca(cc) -except RException, rex: - stop_err("Encountered error while performing CCA on the input data: %s" %(rex)) - -set_default_mode(BASIC_CONVERSION) -summary = r.summary(cc) - -ncomps = len(summary['corr']) -comps = summary['corr'].keys() -corr = summary['corr'].values() -xlab = summary['xlab'] -ylab = summary['ylab'] - -for i in range(ncomps): - corr[comps.index('CV %s' %(i+1))] = summary['corr'].values()[i] - -ftest=ftest.as_py() -print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -print >>fout, "#Correlation\t%s" %("\t".join(["%.4g" % el for el in corr])) -print >>fout, "#F-statistic\t%s" %("\t".join(["%.4g" % el for el in ftest['statistic']])) -print >>fout, "#p-value\t%s" %("\t".join(["%.4g" % el for el in ftest['p.value']])) - -print >>fout, "#X-Coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for i,val in enumerate(summary['xcoef']): - print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#Y-Coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for i,val in enumerate(summary['ycoef']): - print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#X-Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for i,val in enumerate(summary['xstructcorr']): - print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#Y-Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for i,val in enumerate(summary['ystructcorr']): - print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#X-CrossLoadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for i,val in enumerate(summary['xcrosscorr']): - print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#Y-CrossLoadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for i,val in enumerate(summary['ycrosscorr']): - print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val])) - -r.pdf( outfile2, 8, 8 ) -#r.plot(cc) -for i in range(ncomps): - r.helio_plot(cc, cv = i+1, main = r.paste("Explained Variance for CV",i+1), type = 
"variance") -r.dev_off() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/cca.xml --- a/tools/multivariate_stats/cca.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,95 +0,0 @@ - - - - cca.py - $input1 - $x_cols - $y_cols - $x_scale - $y_scale - $std_scores - $out_file1 - $out_file2 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rpy - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters* - ------ - -.. class:: infomark - -**What it does** - -This tool uses functions from 'yacca' library from R statistical package to perform Canonical Correlation Analysis (CCA) on the input data. It outputs two files, one containing the summary statistics of the performed CCA, and the other containing helioplots, which display structural loadings of X and Y variables on different canonical components. - -*Carter T. Butts (2009). yacca: Yet Another Canonical Correlation Analysis Package. R package version 1.1.* - ------ - -.. class:: warningmark - -**Note** - -- This tool currently treats all predictor and response variables as continuous numeric variables. Running the tool on categorical variables might result in incorrect results. - -- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. - -- The summary statistics in the output are described below: - - - correlation: Canonical correlation between the canonical variates (i.e. transformed variables) - - F-statistic: F-value obtained from F Test for Canonical Correlations Using Rao's Approximation - - p-value: denotes significance of canonical correlations - - Coefficients: represent the coefficients of X and Y variables on each canonical variate - - Loadings: represent the correlations between the original variables in each set and their respective canonical variates - - CrossLoadings: represent the correlations between the original variables in each set and the opposite canonical variates - - - diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/kcca.py --- a/tools/multivariate_stats/kcca.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,146 +0,0 @@ -#!/usr/bin/env python - -""" -Run kernel CCA using kcca() from R 'kernlab' package - -usage: %prog [options] - -i, --input=i: Input file - -o, --output1=o: Summary output - -x, --x_cols=x: X-Variable columns - -y, --y_cols=y: Y-Variable columns - -k, --kernel=k: Kernel function - -f, --features=f: Number of canonical components to return - -s, --sigma=s: sigma - -d, --degree=d: degree - -l, --scale=l: scale - -t, --offset=t: offset - -r, --order=r: order - -usage: %prog input output1 x_cols y_cols kernel features sigma(or_None) degree(or_None) scale(or_None) offset(or_None) order(or_None) -""" - -from galaxy import eggs -import sys, string -from rpy import * -import numpy -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -#Parse Command Line -options, args = doc_optparse.parse( __doc__ ) -#{'options= kernel': 'rbfdot', 'var_cols': '1,2,3,4', 'degree': 'None', 'output2': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_260.dat', 'output1': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_259.dat', 'scale': 'None', 'offset': 'None', 'input': 
'/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_256.dat', 'sigma': '1.0', 'order': 'None'} - -infile = options.input -x_cols = options.x_cols.split(',') -y_cols = options.y_cols.split(',') -kernel = options.kernel -outfile = options.output1 -ncomps = int(options.features) -fout = open(outfile,'w') - -if ncomps < 1: - print "You chose to return '0' canonical components. Please try rerunning the tool with number of components = 1 or more." - sys.exit() -elems = [] -for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - -if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." ) - -x_vals = [] -for k,col in enumerate(x_cols): - x_cols[k] = int(col)-1 - x_vals.append([]) -y_vals = [] -for k,col in enumerate(y_cols): - y_cols[k] = int(col)-1 - y_vals.append([]) -NA = 'NA' -skipped = 0 -for ind,line in enumerate( file( infile )): - if line and not line.startswith( '#' ): - try: - fields = line.strip().split("\t") - valid_line = True - for col in x_cols+y_cols: - try: - assert float(fields[col]) - except: - skipped += 1 - valid_line = False - break - if valid_line: - for k,col in enumerate(x_cols): - try: - xval = float(fields[col]) - except: - xval = NaN# - x_vals[k].append(xval) - for k,col in enumerate(y_cols): - try: - yval = float(fields[col]) - except: - yval = NaN# - y_vals[k].append(yval) - except: - skipped += 1 - -x_vals1 = numpy.asarray(x_vals).transpose() -y_vals1 = numpy.asarray(y_vals).transpose() - -x_dat= r.list(array(x_vals1)) -y_dat= r.list(array(y_vals1)) - -try: - r.suppressWarnings(r.library('kernlab')) -except: - stop_err('Missing R library kernlab') - -set_default_mode(NO_CONVERSION) -if kernel=="rbfdot" or kernel=="anovadot": - pars = r.list(sigma=float(options.sigma)) -elif kernel=="polydot": - pars = r.list(degree=float(options.degree),scale=float(options.scale),offset=float(options.offset)) -elif kernel=="tanhdot": - pars = r.list(scale=float(options.scale),offset=float(options.offset)) -elif kernel=="besseldot": - pars = r.list(degree=float(options.degree),sigma=float(options.sigma),order=float(options.order)) -elif kernel=="anovadot": - pars = r.list(degree=float(options.degree),sigma=float(options.sigma)) -else: - pars = rlist() - -try: - kcc = r.kcca(x=x_dat, y=y_dat, kernel=kernel, kpar=pars, ncomps=ncomps) -except RException, rex: - stop_err("Encountered error while performing kCCA on the input data: %s" %(rex)) - -set_default_mode(BASIC_CONVERSION) -kcor = r.kcor(kcc) -if ncomps == 1: - kcor = [kcor] -xcoef = r.xcoef(kcc) -ycoef = r.ycoef(kcc) - -print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) - -print >>fout, "#Correlation\t%s" %("\t".join(["%.4g" % el for el in kcor])) - -print >>fout, "#Estimated X-coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for obs,val in enumerate(xcoef): - print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#Estimated Y-coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for obs,val in enumerate(ycoef): - print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in val])) diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/kcca.xml --- a/tools/multivariate_stats/kcca.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,150 
+0,0 @@ - - - - kcca.py - --input=$input1 - --output1=$out_file1 - --x_cols=$x_cols - --y_cols=$y_cols - --kernel=$kernelChoice.kernel - --features=$features - #if $kernelChoice.kernel == "rbfdot" or $kernelChoice.kernel == "anovadot": - --sigma=$kernelChoice.sigma - --degree="None" - --scale="None" - --offset="None" - --order="None" - #elif $kernelChoice.kernel == "polydot": - --sigma="None" - --degree=$kernelChoice.degree - --scale=$kernelChoice.scale - --offset=$kernelChoice.offset - --order="None" - #elif $kernelChoice.kernel == "tanhdot": - --sigma="None" - --degree="None" - --scale=$kernelChoice.scale - --offset=$kernelChoice.offset - --order="None" - #elif $kernelChoice.kernel == "besseldot": - --sigma=$kernelChoice.sigma - --degree=$kernelChoice.degree - --scale="None" - --offset="None" - --order=$kernelChoice.order - #elif $kernelChoice.kernel == "anovadot": - --sigma=$kernelChoice.sigma - --degree=$kernelChoice.degree - --scale="None" - --offset="None" - --order="None" - #else: - --sigma="None" - --degree="None" - --scale="None" - --offset="None" - --order="None" - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rpy - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters* - ------ - -.. class:: infomark - -**What it does** - -This tool uses functions from 'kernlab' library from R statistical package to perform Kernel Canonical Correlation Analysis (kCCA) on the input data. - -*Alexandros Karatzoglou, Alex Smola, Kurt Hornik, Achim Zeileis (2004). kernlab - An S4 Package for Kernel Methods in R. Journal of Statistical Software 11(9), 1-20. URL http://www.jstatsoft.org/v11/i09/* - ------ - -.. class:: warningmark - -**Note** - -This tool currently treats all variables as continuous numeric variables. Running the tool on categorical variables might result in incorrect results. Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. 
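Two details of the deleted kcca.py above are worth noting when reading its kernel dispatch: the `elif kernel=="anovadot"` branch is unreachable (the first condition, `kernel=="rbfdot" or kernel=="anovadot"`, already captures it, so anovadot effectively runs with sigma only — the same dead branch exists in the Cheetah block of kcca.xml), and the final fallback calls `rlist()` where `r.list()` was presumably intended (kpca.py gets this right). The dispatch itself reduces to a small table; a condensed sketch (names are ours, mirroring the #if blocks above)::

  # Which kpar fields each kernel family uses; all other parameters
  # are passed through as the string "None" and ignored.
  KERNEL_PARS = {
      'rbfdot':    ('sigma',),
      'anovadot':  ('sigma', 'degree'),   # per the (unreachable) anovadot
                                          # branch; as wired, sigma only
      'polydot':   ('degree', 'scale', 'offset'),
      'tanhdot':   ('scale', 'offset'),
      'besseldot': ('degree', 'sigma', 'order'),
  }

  def kpar_for(kernel, options):
      # Build the kernel-parameter dict handed to kernlab's kcca()/kpca().
      return dict((name, float(getattr(options, name)))
                  for name in KERNEL_PARS.get(kernel, ()))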
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/kpca.py --- a/tools/multivariate_stats/kpca.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,134 +0,0 @@ -#!/usr/bin/env python - -""" -Run kernel PCA using kpca() from R 'kernlab' package - -usage: %prog [options] - -i, --input=i: Input file - -o, --output1=o: Summary output - -p, --output2=p: Figures output - -c, --var_cols=c: Variable columns - -k, --kernel=k: Kernel function - -f, --features=f: Number of principal components to return - -s, --sigma=s: sigma - -d, --degree=d: degree - -l, --scale=l: scale - -t, --offset=t: offset - -r, --order=r: order - -usage: %prog input output1 output2 var_cols kernel features sigma(or_None) degree(or_None) scale(or_None) offset(or_None) order(or_None) -""" - -from galaxy import eggs -import sys, string -from rpy import * -import numpy -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -#Parse Command Line -options, args = doc_optparse.parse( __doc__ ) -#{'options= kernel': 'rbfdot', 'var_cols': '1,2,3,4', 'degree': 'None', 'output2': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_260.dat', 'output1': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_259.dat', 'scale': 'None', 'offset': 'None', 'input': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_256.dat', 'sigma': '1.0', 'order': 'None'} - -infile = options.input -x_cols = options.var_cols.split(',') -kernel = options.kernel -outfile = options.output1 -outfile2 = options.output2 -ncomps = int(options.features) -fout = open(outfile,'w') - -elems = [] -for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - -if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." 
) - -x_vals = [] - -for k,col in enumerate(x_cols): - x_cols[k] = int(col)-1 - x_vals.append([]) - -NA = 'NA' -skipped = 0 -for ind,line in enumerate( file( infile )): - if line and not line.startswith( '#' ): - try: - fields = line.strip().split("\t") - for k,col in enumerate(x_cols): - try: - xval = float(fields[col]) - except: - #xval = r('NA') - xval = NaN# - x_vals[k].append(xval) - except: - skipped += 1 - -x_vals1 = numpy.asarray(x_vals).transpose() -dat= r.list(array(x_vals1)) - -try: - r.suppressWarnings(r.library('kernlab')) -except: - stop_err('Missing R library kernlab') - -set_default_mode(NO_CONVERSION) -if kernel=="rbfdot" or kernel=="anovadot": - pars = r.list(sigma=float(options.sigma)) -elif kernel=="polydot": - pars = r.list(degree=float(options.degree),scale=float(options.scale),offset=float(options.offset)) -elif kernel=="tanhdot": - pars = r.list(scale=float(options.scale),offset=float(options.offset)) -elif kernel=="besseldot": - pars = r.list(degree=float(options.degree),sigma=float(options.sigma),order=float(options.order)) -elif kernel=="anovadot": - pars = r.list(degree=float(options.degree),sigma=float(options.sigma)) -else: - pars = r.list() - -try: - kpc = r.kpca(x=r.na_exclude(dat), kernel=kernel, kpar=pars, features=ncomps) -except RException, rex: - stop_err("Encountered error while performing kPCA on the input data: %s" %(rex)) -set_default_mode(BASIC_CONVERSION) - -eig = r.eig(kpc) -pcv = r.pcv(kpc) -rotated = r.rotated(kpc) - -comps = eig.keys() -eigv = eig.values() -for i in range(ncomps): - eigv[comps.index('Comp.%s' %(i+1))] = eig.values()[i] - -print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) - -print >>fout, "#Eigenvalue\t%s" %("\t".join(["%.4g" % el for el in eig.values()])) - -print >>fout, "#Principal component vectors\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for obs,val in enumerate(pcv): - print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#Rotated values\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for obs,val in enumerate(rotated): - print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in val])) - -r.pdf( outfile2, 8, 8 ) -if ncomps != 1: - r.pairs(rotated,labels=r.list(range(1,ncomps+1)),main="Scatterplot of rotated values") -else: - r.plot(rotated, ylab='Comp.1', main="Scatterplot of rotated values") -r.dev_off() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/kpca.xml --- a/tools/multivariate_stats/kpca.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,140 +0,0 @@ - - - - kpca.py - --input=$input1 - --output1=$out_file1 - --output2=$out_file2 - --var_cols=$var_cols - --kernel=$kernelChoice.kernel - --features=$features - #if $kernelChoice.kernel == "rbfdot" or $kernelChoice.kernel == "anovadot": - --sigma=$kernelChoice.sigma - --degree="None" - --scale="None" - --offset="None" - --order="None" - #elif $kernelChoice.kernel == "polydot": - --sigma="None" - --degree=$kernelChoice.degree - --scale=$kernelChoice.scale - --offset=$kernelChoice.offset - --order="None" - #elif $kernelChoice.kernel == "tanhdot": - --sigma="None" - --degree="None" - --scale=$kernelChoice.scale - --offset=$kernelChoice.offset - --order="None" - #elif $kernelChoice.kernel == "besseldot": - --sigma=$kernelChoice.sigma - --degree=$kernelChoice.degree - --scale="None" - --offset="None" - --order=$kernelChoice.order - #elif $kernelChoice.kernel == "anovadot": - 
--sigma=$kernelChoice.sigma - --degree=$kernelChoice.degree - --scale="None" - --offset="None" - --order="None" - #else: - --sigma="None" - --degree="None" - --scale="None" - --offset="None" - --order="None" - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rpy - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters* - ------ - -.. class:: infomark - -**What it does** - -This tool uses functions from 'kernlab' library from R statistical package to perform Kernel Principal Component Analysis (kPCA) on the input data. It outputs two files, one containing the summary statistics of the performed kPCA, and the other containing a scatterplot matrix of rotated values reported by kPCA. - -*Alexandros Karatzoglou, Alex Smola, Kurt Hornik, Achim Zeileis (2004). kernlab - An S4 Package for Kernel Methods in R. Journal of Statistical Software 11(9), 1-20. URL http://www.jstatsoft.org/v11/i09/* - ------ - -.. class:: warningmark - -**Note** - -This tool currently treats all variables as continuous numeric variables. Running the tool on categorical variables might result in incorrect results. Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/pca.py --- a/tools/multivariate_stats/pca.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,130 +0,0 @@ -#!/usr/bin/env python - -from galaxy import eggs -import sys, string -from rpy import * -import numpy - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -infile = sys.argv[1] -x_cols = sys.argv[2].split(',') -method = sys.argv[3] -outfile = sys.argv[4] -outfile2 = sys.argv[5] - -if method == 'svd': - scale = center = "FALSE" - if sys.argv[6] == 'both': - scale = center = "TRUE" - elif sys.argv[6] == 'center': - center = "TRUE" - elif sys.argv[6] == 'scale': - scale = "TRUE" - -fout = open(outfile,'w') -elems = [] -for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - -if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." 
) - -x_vals = [] - -for k,col in enumerate(x_cols): - x_cols[k] = int(col)-1 - x_vals.append([]) - -NA = 'NA' -skipped = 0 -for ind,line in enumerate( file( infile )): - if line and not line.startswith( '#' ): - try: - fields = line.strip().split("\t") - valid_line = True - for k,col in enumerate(x_cols): - try: - xval = float(fields[col]) - except: - skipped += 1 - valid_line = False - break - if valid_line: - for k,col in enumerate(x_cols): - xval = float(fields[col]) - x_vals[k].append(xval) - except: - skipped += 1 - -x_vals1 = numpy.asarray(x_vals).transpose() -dat= r.list(array(x_vals1)) - -set_default_mode(NO_CONVERSION) -try: - if method == "cor": - pc = r.princomp(r.na_exclude(dat), cor = r("TRUE")) - elif method == "cov": - pc = r.princomp(r.na_exclude(dat), cor = r("FALSE")) - elif method=="svd": - pc = r.prcomp(r.na_exclude(dat), center = r(center), scale = r(scale)) -except RException, rex: - stop_err("Encountered error while performing PCA on the input data: %s" %(rex)) - -set_default_mode(BASIC_CONVERSION) -summary = r.summary(pc, loadings="TRUE") -ncomps = len(summary['sdev']) - -if type(summary['sdev']) == type({}): - comps_unsorted = summary['sdev'].keys() - comps=[] - sd = summary['sdev'].values() - for i in range(ncomps): - sd[i] = summary['sdev'].values()[comps_unsorted.index('Comp.%s' %(i+1))] - comps.append('Comp.%s' %(i+1)) -elif type(summary['sdev']) == type([]): - comps=[] - for i in range(ncomps): - comps.append('Comp.%s' %(i+1)) - sd = summary['sdev'] - -print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -print >>fout, "#Std. deviation\t%s" %("\t".join(["%.4g" % el for el in sd])) -total_var = 0 -vars = [] -for s in sd: - var = s*s - total_var += var - vars.append(var) -for i,var in enumerate(vars): - vars[i] = vars[i]/total_var - -print >>fout, "#Proportion of variance explained\t%s" %("\t".join(["%.4g" % el for el in vars])) - -print >>fout, "#Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -xcolnames = ["c%d" %(el+1) for el in x_cols] -if 'loadings' in summary: #in case of princomp - loadings = 'loadings' -elif 'rotation' in summary: #in case of prcomp - loadings = 'rotation' -for i,val in enumerate(summary[loadings]): - print >>fout, "%s\t%s" %(xcolnames[i], "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#Scores\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -if 'scores' in summary: #in case of princomp - scores = 'scores' -elif 'x' in summary: #in case of prcomp - scores = 'x' -for obs,sc in enumerate(summary[scores]): - print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in sc])) - -r.pdf( outfile2, 8, 8 ) -r.biplot(pc) -r.dev_off() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/pca.xml --- a/tools/multivariate_stats/pca.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,100 +0,0 @@ - - - - pca.py - $input1 - $var_cols - $methodChoice.method - $out_file1 - $out_file2 - #if $methodChoice.method == "svd": - $methodChoice.scale - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rpy - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters* - ------ - -.. 
class:: infomark - -**What it does** - -This tool performs Principal Component Analysis on the given numeric input data using functions from R statistical package - 'princomp' function (for Eigenvector based solution) and 'prcomp' function (for Singular value decomposition based solution). It outputs two files, one containing the summary statistics of PCA, and the other containing biplots of the observations and principal components. - -*R Development Core Team (2009). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL http://www.R-project.org.* - ------ - -.. class:: warningmark - -**Note** - -- This tool currently treats all variables as continuous numeric variables. Running the tool on categorical variables might result in incorrect results. Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. - -- The summary statistics in the output are described below: - - - Std. deviation: Standard deviations of the principal components - - Loadings: a list of eigen-vectors/variable loadings - - Scores: Scores of the input data on the principal components - - - diff -r c2a356708570 -r 33c067c3ae34 tools/mutation/visualize.py --- a/tools/mutation/visualize.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,391 +0,0 @@ -#!/usr/bin/env python - -''' -Mutation Visualizer tool -''' - -from __future__ import division - -import sys, csv, os, math -import optparse - -from galaxy import eggs -import pkg_resources -pkg_resources.require( "SVGFig" ) -import svgfig as svg - - -SVGPan = """ -/** - * SVGPan library 1.2 - * ==================== - * - * Given an unique existing element with id "viewport", including the - * the library into any SVG adds the following capabilities: - * - * - Mouse panning - * - Mouse zooming (using the wheel) - * - Object dargging - * - * Known issues: - * - * - Zooming (while panning) on Safari has still some issues - * - * Releases: - * - * 1.2, Sat Mar 20 08:42:50 GMT 2010, Zeng Xiaohui - * Fixed a bug with browser mouse handler interaction - * - * 1.1, Wed Feb 3 17:39:33 GMT 2010, Zeng Xiaohui - * Updated the zoom code to support the mouse wheel on Safari/Chrome - * - * 1.0, Andrea Leofreddi - * First release - * - * This code is licensed under the following BSD license: - * - * Copyright 2009-2010 Andrea Leofreddi (a.leofreddi@itcharm.com). All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, are - * permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this list of - * conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, this list - * of conditions and the following disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY Andrea Leofreddi ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL Andrea Leofreddi OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those of the - * authors and should not be interpreted as representing official policies, either expressed - * or implied, of Andrea Leofreddi. - */ - -var root = document.documentElement; - -var state = 'none', stateTarget, stateOrigin, stateTf; - -setupHandlers(root); - -/** - * Register handlers - */ -function setupHandlers(root){ - setAttributes(root, { - "onmouseup" : "add(evt)", - "onmousedown" : "handleMouseDown(evt)", - "onmousemove" : "handleMouseMove(evt)", - "onmouseup" : "handleMouseUp(evt)", - //"onmouseout" : "handleMouseUp(evt)", // Decomment this to stop the pan functionality when dragging out of the SVG element - }); - - if(navigator.userAgent.toLowerCase().indexOf('webkit') >= 0) - window.addEventListener('mousewheel', handleMouseWheel, false); // Chrome/Safari - else - window.addEventListener('DOMMouseScroll', handleMouseWheel, false); // Others -} - -/** - * Instance an SVGPoint object with given event coordinates. - */ -function getEventPoint(evt) { - var p = root.createSVGPoint(); - - p.x = evt.clientX; - p.y = evt.clientY; - - return p; -} - -/** - * Sets the current transform matrix of an element. - */ -function setCTM(element, matrix) { - var s = "matrix(" + matrix.a + "," + matrix.b + "," + matrix.c + "," + matrix.d + "," + matrix.e + "," + matrix.f + ")"; - - element.setAttribute("transform", s); -} - -/** - * Dumps a matrix to a string (useful for debug). - */ -function dumpMatrix(matrix) { - var s = "[ " + matrix.a + ", " + matrix.c + ", " + matrix.e + "\\n " + matrix.b + ", " + matrix.d + ", " + matrix.f + "\\n 0, 0, 1 ]"; - - return s; -} - -/** - * Sets attributes of an element. - */ -function setAttributes(element, attributes){ - for (i in attributes) - element.setAttributeNS(null, i, attributes[i]); -} - -/** - * Handle mouse move event. - */ -function handleMouseWheel(evt) { - if(evt.preventDefault) - evt.preventDefault(); - - evt.returnValue = false; - - var svgDoc = evt.target.ownerDocument; - - var delta; - - if(evt.wheelDelta) - delta = evt.wheelDelta / 3600; // Chrome/Safari - else - delta = evt.detail / -90; // Mozilla - - var z = 1 + delta; // Zoom factor: 0.9/1.1 - - var g = svgDoc.getElementById("viewport"); - - var p = getEventPoint(evt); - - p = p.matrixTransform(g.getCTM().inverse()); - - // Compute new scale matrix in current mouse position - var k = root.createSVGMatrix().translate(p.x, p.y).scale(z).translate(-p.x, -p.y); - - setCTM(g, g.getCTM().multiply(k)); - - stateTf = stateTf.multiply(k.inverse()); -} - -/** - * Handle mouse move event. 
- */ -function handleMouseMove(evt) { - if(evt.preventDefault) - evt.preventDefault(); - - evt.returnValue = false; - - var svgDoc = evt.target.ownerDocument; - - var g = svgDoc.getElementById("viewport"); - - if(state == 'pan') { - // Pan mode - var p = getEventPoint(evt).matrixTransform(stateTf); - - setCTM(g, stateTf.inverse().translate(p.x - stateOrigin.x, p.y - stateOrigin.y)); - } else if(state == 'move') { - // Move mode - var p = getEventPoint(evt).matrixTransform(g.getCTM().inverse()); - - setCTM(stateTarget, root.createSVGMatrix().translate(p.x - stateOrigin.x, p.y - stateOrigin.y).multiply(g.getCTM().inverse()).multiply(stateTarget.getCTM())); - - stateOrigin = p; - } -} - -/** - * Handle click event. - */ -function handleMouseDown(evt) { - if(evt.preventDefault) - evt.preventDefault(); - - evt.returnValue = false; - - var svgDoc = evt.target.ownerDocument; - - var g = svgDoc.getElementById("viewport"); - - if(evt.target.tagName == "svg") { - // Pan mode - state = 'pan'; - - stateTf = g.getCTM().inverse(); - - stateOrigin = getEventPoint(evt).matrixTransform(stateTf); - } - /*else { - // Move mode - state = 'move'; - - stateTarget = evt.target; - - stateTf = g.getCTM().inverse(); - - stateOrigin = getEventPoint(evt).matrixTransform(stateTf); - }*/ -} -/** - * Handle mouse button release event. - */ -function handleMouseUp(evt) { - if(evt.preventDefault) - evt.preventDefault(); - - evt.returnValue = false; - - var svgDoc = evt.target.ownerDocument; - - if(state == 'pan' || state == 'move') { - // Quit pan mode - state = ''; - } -} -""" - -COLS_PER_SAMPLE = 7 -HEADER_COLS = 4 - -HEIGHT = 6 -WIDTH = 12 -BAR_WIDTH = 1.5 -GAP = 2 - - -colors = {'A':'blue', 'C':'green', 'G':'orange', 'T':'red'} -bases = ['A', 'C', 'G', 'T' ] - -def stop_error(message): - print >> sys.stderr, message - sys.exit(1) - -def validate_bases(n_a, n_c, n_g, n_t, total): - if n_a > total: - return 'A' - elif n_c > total: - return 'C' - elif n_g > total: - return 'G' - elif n_t > total: - return 'T' - return None - -def main(opts, args): - s = svg.SVG('g', id='viewport') - - # display legend - for i, b in enumerate( bases ): - bt = svg.SVG("tspan", b, style="font-family:Verdana;font-size:20%") - s.append(svg.SVG("text", bt, x=12+(i*10), y=3, stroke="none", fill="black")) - s.append(svg.SVG("rect", x=14+(i*10), y=0, width=4, height=3, - stroke="none", fill=colors[b], fill_opacity=0.5)) - - reader = open(opts.input_file, 'U') - - samples = [] - for i in range(int(len(args)/3)): - index = i*3 - samples.append(dict(name=args[index], - a_col=args[index+1], - totals_col=args[index+2])) - - if opts.zoom == 'interactive': - y = 35 - else: - y = 25 - for i, sample in enumerate(samples): - x = 23+(i*(WIDTH+GAP)) - t = svg.SVG("text", svg.SVG("tspan", sample['name'], style="font-family:Verdana;font-size:25%"), - x=x, y=y, transform="rotate(-90 %i,%i)" % (x, y), stroke="none", fill="black") - s.append(t) - - count=1 - for line in reader: - row = line.split('\t') - highlighted_position = False - show_pos = True - position = row[int(opts.position_col)-1] - ref = row[int(opts.ref_col)-1].strip().upper() - # validate - if ref not in bases: - stop_error( "The reference column (col%s) contains invalid character '%s' at row %i of the dataset." 
% ( opts.ref_col, ref, count ) ) - # display positions - if opts.zoom == 'interactive': - textx = 0 - else: - textx = 7 - bt = svg.SVG("tspan", str(position), style="font-family:Verdana;font-size:25%") - s.append(svg.SVG("text", bt, x=textx, y=34+(count*(HEIGHT+GAP)), stroke="none", fill="black")) - s.append(svg.SVG("rect", x=0, y=30+(count*(HEIGHT+GAP)), width=14, height=HEIGHT, - stroke='none', fill=colors[ref.upper()], fill_opacity=0.2)) - - for sample_index, sample in enumerate(samples): - n_a = int(row[int(sample['a_col'])-1]) - n_c = int(row[int(sample['a_col'])+1-1]) - n_g = int(row[int(sample['a_col'])+2-1]) - n_t = int(row[int(sample['a_col'])+3-1]) - total = int(row[int(sample['totals_col'])-1]) - # validate - base_error = validate_bases(n_a, n_c, n_g, n_t, total) - if base_error: - stop_error("For sample %i (%s), the number of base %s reads is more than the coverage on row %i." % (sample_index+1, - sample['name'], - base_error, - count)) - - if total: - x = 16+(sample_index*(WIDTH+GAP)) - y = 30+(count*(HEIGHT+GAP)) - width = WIDTH - height = HEIGHT - if count%2: - s.append(svg.SVG("rect", x=x, y=y, width=width, height=height, - stroke='none', fill='grey', fill_opacity=0.25)) - else: - s.append(svg.SVG("rect", x=x, y=y, width=width, height=height, - stroke='none', fill='grey', fill_opacity=0.25)) - - for base, value in enumerate([n_a, n_c, n_g, n_t]): - width = int(math.ceil(value / total * WIDTH)) - s.append(svg.SVG("rect", x=x, y=y, width=width, height=BAR_WIDTH, - stroke='none', fill=colors[bases[base]], fill_opacity=0.6)) - y = y + BAR_WIDTH - - count=count+1 - - if opts.zoom == 'interactive': - canv = svg.canvas(s) - canv.save(opts.output_file) - import fileinput - flag = False - for line in fileinput.input(opts.output_file, inplace=1): - if line.startswith('' - flag = True - continue - else: - if flag: - print '' % SVGPan - flag = False - print line, - else: - zoom = int(opts.zoom) - w = "%ipx" % (x*(10+zoom)) - h = "%ipx" % (y*(2+zoom)) - canv = svg.canvas(s, width=w, height=h, viewBox="0 0 %i %i" %(x+100, y+100)) - canv.save(opts.output_file) - -if __name__ == '__main__': - parser = optparse.OptionParser() - parser.add_option('-i', '--input-file', dest='input_file', action='store') - parser.add_option('-o', '--output-file', dest='output_file', action='store') - parser.add_option('-z', '--zoom', dest='zoom', action='store', default='1') - parser.add_option('-p', '--position_col', dest='position_col', action='store', default='c0') - parser.add_option('-r', '--ref_col', dest='ref_col', action='store', default='c1') - (opts, args) = parser.parse_args() - main(opts, args) - sys.exit(1) - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/mutation/visualize.xml --- a/tools/mutation/visualize.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ - - - - visualize.py - --input-file=$input1 - --output-file=$out_file1 - --zoom=$zoom_value - --position_col=$position_col - --ref_col=$ref_col - #for $f in $sample_chooser: - "${f.name}" - ${f.a_col} - ${f.totals_col} - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows you to visualize mutations described in a tabular input file. It generates an SVG image which can be viewed in any web browser. - -You will need to specify the position and reference columns in the input file. 
Then click on 'Add new Sample' to add each sample from the input file that you would like to visualize. For each sample you select, specify the column for base 'A' and the totals column, and enter a name.
-This tool assumes the columns specifying bases A, C, G, T are placed consecutively and in that order in the input file.
-
-Interactivity: If the interactive zoom option is selected, the resulting image can be zoomed in or out with the mouse scroll wheel and panned by dragging the image with the left mouse button.
-
------
-
-**General Example**
-
-Given the input file::
-
-  chrM 72 73 G 26394 4 49 0 26447 26398 1 23389 3 45 0 23437 23392 1
-  chrM 149 150 T 11 50422 2 96 50531 50435 1 4 45417 1 65 45487 45422 1
-
-To visualize the two samples in the input file, the following parameters are selected before running the tool::
-
-  Position column: 2
-  Reference Base column: 4
-  Sample 1 Label: gm blood
-  Sample 1 Base A column: 5
-  Sample 1 Totals column: 9
-  Sample 2 Label: gm cheek
-  Sample 2 Base A column: 12
-  Sample 2 Totals column: 16
-
-Visualization output:
-
-.. image:: ./static/images/mutation_visualization_example.png
-   :width: 150
-
-Here the left-most column represents the position, and its background color is the reference base color. Each column to its right describes one sample.
-In the output above, the blue bar is the longest, which means that base A is the most frequent base at position 72 in both samples.
-
-
-
diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/blastxml_to_tabular.py
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,254 +0,0 @@
-#!/usr/bin/env python
-"""Convert a BLAST XML file to 12 column tabular output
-
-Takes three command line options: input BLAST XML filename, output tabular
-BLAST filename, and output format (std for standard 12 columns, or ext for
-the extended 24 columns offered in the BLAST+ wrappers).
-
-The 12 output columns are 'qseqid sseqid pident length mismatch gapopen qstart
-qend sstart send evalue bitscore' (or 'std' at the BLAST+ command line), which
-mean:
-
-====== ========= ============================================
-Column NCBI name Description
------- --------- --------------------------------------------
-     1 qseqid    Query Seq-id (ID of your sequence)
-     2 sseqid    Subject Seq-id (ID of the database hit)
-     3 pident    Percentage of identical matches
-     4 length    Alignment length
-     5 mismatch  Number of mismatches
-     6 gapopen   Number of gap openings
-     7 qstart    Start of alignment in query
-     8 qend      End of alignment in query
-     9 sstart    Start of alignment in subject (database hit)
-    10 send      End of alignment in subject (database hit)
-    11 evalue    Expectation value (E-value)
-    12 bitscore  Bit score
-====== ========= ============================================
-
-The additional columns offered in the Galaxy BLAST+ wrappers are:
-
-====== ============= ===========================================
-Column NCBI name     Description
------- ------------- -------------------------------------------
-    13 sallseqid     All subject Seq-id(s), separated by a ';'
-    14 score         Raw score
-    15 nident        Number of identical matches
-    16 positive      Number of positive-scoring matches
-    17 gaps          Total number of gaps
-    18 ppos          Percentage of positive-scoring matches
-    19 qframe        Query frame
-    20 sframe        Subject frame
-    21 qseq          Aligned part of query sequence
-    22 sseq          Aligned part of subject sequence
-    23 qlen          Query sequence length
-    24 slen          Subject sequence length
-====== ============= ===========================================
-
-Most of these fields are given explicitly in the XML file; some, like
-the percentage identity and the number of gap openings, must be calculated.
-
-Be aware that the sequence in the extended tabular output or XML direct from
-BLAST+ may or may not use XXXX masking on regions of low complexity. This
-can throw off the calculation of percentage identity and gap openings.
-[In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard,
-with these numbers changing depending on whether or not the low complexity
-filter is used.]
-
-This script attempts to produce identical output to what BLAST+ would have
-done. However, check this with "diff -b ..." since BLAST+ sometimes includes
-an extra space character (probably a bug).
-"""
-import sys
-import re
-
-if sys.version_info[:2] >= ( 2, 5 ):
-    import xml.etree.cElementTree as ElementTree
-else:
-    from galaxy import eggs
-    import pkg_resources; pkg_resources.require( "elementtree" )
-    from elementtree import ElementTree
-
-def stop_err( msg ):
-    sys.stderr.write("%s\n" % msg)
-    sys.exit(1)
-
-#Parse the command line
-try:
-    in_file, out_file, out_fmt = sys.argv[1:]
-except:
-    stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)")
-
-if out_fmt == "std":
-    extended = False
-elif out_fmt == "x22":
-    stop_err("Format argument x22 has been replaced with ext (extended 24 columns)")
-elif out_fmt == "ext":
-    extended = True
-else:
-    stop_err("Format argument should be std (12 column) or ext (extended 24 columns)")
-
-
-# get an iterable
-try:
-    context = ElementTree.iterparse(in_file, events=("start", "end"))
-except:
-    stop_err("Invalid data format.")
-# turn it into an iterator
-context = iter(context)
-# get the root element
-try:
-    event, root = context.next()
-except:
-    stop_err( "Invalid data format."
) - - -re_default_query_id = re.compile("^Query_\d+$") -assert re_default_query_id.match("Query_101") -assert not re_default_query_id.match("Query_101a") -assert not re_default_query_id.match("MyQuery_101") -re_default_subject_id = re.compile("^Subject_\d+$") -assert re_default_subject_id.match("Subject_1") -assert not re_default_subject_id.match("Subject_") -assert not re_default_subject_id.match("Subject_12a") -assert not re_default_subject_id.match("TheSubject_1") - - -outfile = open(out_file, 'w') -blast_program = None -for event, elem in context: - if event == "end" and elem.tag == "BlastOutput_program": - blast_program = elem.text - # for every tag - if event == "end" and elem.tag == "Iteration": - #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA - # sp|Q9BS26|ERP44_HUMAN - # Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 - # 406 - # - # - #Or, from BLAST 2.2.24+ run online - # Query_1 - # Sample - # 516 - # ... - qseqid = elem.findtext("Iteration_query-ID") - if re_default_query_id.match(qseqid): - #Place holder ID, take the first word of the query definition - qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] - qlen = int(elem.findtext("Iteration_query-len")) - - # for every within - for hit in elem.findall("Iteration_hits/Hit"): - #Expecting either this, - # gi|3024260|sp|P56514.1|OPSD_BUFBU - # RecName: Full=Rhodopsin - # P56514 - #or, - # Subject_1 - # gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus] - # Subject_1 - # - #apparently depending on the parse_deflines switch - sseqid = hit.findtext("Hit_id").split(None,1)[0] - hit_def = sseqid + " " + hit.findtext("Hit_def") - if re_default_subject_id.match(sseqid) \ - and sseqid == hit.findtext("Hit_accession"): - #Place holder ID, take the first word of the subject definition - hit_def = hit.findtext("Hit_def") - sseqid = hit_def.split(None,1)[0] - # for every within - for hsp in hit.findall("Hit_hsps/Hsp"): - nident = hsp.findtext("Hsp_identity") - length = hsp.findtext("Hsp_align-len") - pident = "%0.2f" % (100*float(nident)/float(length)) - - q_seq = hsp.findtext("Hsp_qseq") - h_seq = hsp.findtext("Hsp_hseq") - m_seq = hsp.findtext("Hsp_midline") - assert len(q_seq) == len(h_seq) == len(m_seq) == int(length) - gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \ - len(h_seq.replace('-', ' ').split())-1) - - mismatch = m_seq.count(' ') + m_seq.count('+') \ - - q_seq.count('-') - h_seq.count('-') - #TODO - Remove this alternative mismatch calculation and test - #once satisifed there are no problems - expected_mismatch = len(q_seq) \ - - sum(1 for q,h in zip(q_seq, h_seq) \ - if q == h or q == "-" or h == "-") - xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X") - if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx): - stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \ - % (qseqid, sseqid, expected_mismatch - q_seq.count("X"), - int(mismatch), expected_mismatch)) - - #TODO - Remove this alternative identity calculation and test - #once satisifed there are no problems - expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h) - if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")): - stop_err("%s vs %s identities, expected %i <= %i <= %i" \ - % (qseqid, sseqid, expected_identity, int(nident), - expected_identity + q_seq.count("X"))) - - - evalue = hsp.findtext("Hsp_evalue") - if evalue == "0": - evalue = "0.0" - else: - evalue = "%0.0e" % float(evalue) - - bitscore = 
float(hsp.findtext("Hsp_bit-score")) - if bitscore < 100: - #Seems to show one decimal place for lower scores - bitscore = "%0.1f" % bitscore - else: - #Note BLAST does not round to nearest int, it truncates - bitscore = "%i" % bitscore - - values = [qseqid, - sseqid, - pident, - length, #hsp.findtext("Hsp_align-len") - str(mismatch), - gapopen, - hsp.findtext("Hsp_query-from"), #qstart, - hsp.findtext("Hsp_query-to"), #qend, - hsp.findtext("Hsp_hit-from"), #sstart, - hsp.findtext("Hsp_hit-to"), #send, - evalue, #hsp.findtext("Hsp_evalue") in scientific notation - bitscore, #hsp.findtext("Hsp_bit-score") rounded - ] - - if extended: - sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">")) - #print hit_def, "-->", sallseqid - positive = hsp.findtext("Hsp_positive") - ppos = "%0.2f" % (100*float(positive)/float(length)) - qframe = hsp.findtext("Hsp_query-frame") - sframe = hsp.findtext("Hsp_hit-frame") - if blast_program == "blastp": - #Probably a bug in BLASTP that they use 0 or 1 depending on format - if qframe == "0": qframe = "1" - if sframe == "0": sframe = "1" - slen = int(hit.findtext("Hit_len")) - values.extend([sallseqid, - hsp.findtext("Hsp_score"), #score, - nident, - positive, - hsp.findtext("Hsp_gaps"), #gaps, - ppos, - qframe, - sframe, - #NOTE - for blastp, XML shows original seq, tabular uses XXX masking - q_seq, - h_seq, - str(qlen), - str(slen), - ]) - #print "\t".join(values) - outfile.write("\t".join(values) + "\n") - # prevents ElementTree from growing large datastructure - root.clear() - elem.clear() -outfile.close() diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/blastxml_to_tabular.xml --- a/tools/ncbi_blast_plus/blastxml_to_tabular.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,127 +0,0 @@ - - Convert BLAST XML output to tabular - - blastxml_to_tabular.py $blastxml_file $tabular_file $out_format - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -NCBI BLAST+ (and the older NCBI 'legacy' BLAST) can output in a range of -formats including tabular and a more detailed XML format. A complex workflow -may need both the XML and the tabular output - but running BLAST twice is -slow and wasteful. - -This tool takes the BLAST XML output and by default converts it into the -standard 12 column tabular equivalent: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 22 column tabular -BLAST output. 
-
-====== ============= ===========================================
-Column NCBI name     Description
------- ------------- -------------------------------------------
-    13 sallseqid     All subject Seq-id(s), separated by a ';'
-    14 score         Raw score
-    15 nident        Number of identical matches
-    16 positive      Number of positive-scoring matches
-    17 gaps          Total number of gaps
-    18 ppos          Percentage of positive-scoring matches
-    19 qframe        Query frame
-    20 sframe        Subject frame
-    21 qseq          Aligned part of query sequence
-    22 sseq          Aligned part of subject sequence
-    23 qlen          Query sequence length
-    24 slen          Subject sequence length
-====== ============= ===========================================
-
-Beware that the XML file (and thus the conversion) and the tabular output
-direct from BLAST+ may differ in the presence of XXXX masking on regions of
-low complexity (columns 21 and 22), and thus also in calculated figures like
-the percentage identity (column 3).
-
-
-
diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/hide_stderr.py
--- a/tools/ncbi_blast_plus/hide_stderr.py	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,49 +0,0 @@
-#!/usr/bin/env python
-"""A simple script to redirect stderr to stdout when the return code is zero.
-
-See https://bitbucket.org/galaxy/galaxy-central/issue/325/
-
-Currently Galaxy ignores the return code from command line tools (even if it
-is non-zero, which by convention indicates an error) and treats any output on
-stderr as an error (even though by convention stderr is used for errors or
-warnings).
-
-This script runs the given command line, capturing all stdout and stderr in
-memory, and gets the return code. For a zero return code, any stderr (which
-should be warnings only) is added to the stdout. That way Galaxy believes
-everything is fine. For a non-zero return code, we output stdout as is, and
-any stderr, plus the return code to ensure there is some output on stderr.
-That way Galaxy treats this as an error.
-
-Once issue 325 is fixed, this script will not be needed.
-"""
-import sys
-import subprocess
-
-#Avoid using shell=True when we call subprocess to ensure that if the Python
-#script is killed, so too is the BLAST process.
-try:
-    words = []
-    for w in sys.argv[1:]:
-        if " " in w:
-            words.append('"%s"' % w)
-        else:
-            words.append(w)
-    cmd = " ".join(words)
-    child = subprocess.Popen(sys.argv[1:],
-                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-except Exception, err:
-    sys.stderr.write("Error invoking command:\n%s\n\n%s\n" % (cmd, err))
-    sys.exit(1)
-#Use .communicate as we can get deadlocks with .wait()
-stdout, stderr = child.communicate()
-return_code = child.returncode
-
-if return_code:
-    sys.stdout.write(stdout)
-    sys.stderr.write(stderr)
-    sys.stderr.write("Return error code %i from command:\n" % return_code)
-    sys.stderr.write("%s\n" % cmd)
-else:
-    sys.stdout.write(stdout)
-    sys.stdout.write(stderr)
diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml
--- a/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,209 +0,0 @@
-
- Search nucleotide database with nucleotide query sequence(s)
- blastn -version
- hide_stderr.py
-## The command is a Cheetah template which allows some Python based syntax.
-## Lines starting hash hash are comments.
Galaxy will turn newlines into spaces -blastn --query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#else: - -subject "$db_opts.subject" -#end if --task $blast_type --evalue $evalue_cutoff --out $output1 -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 -#if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query -$adv_opts.strand -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if -$adv_opts.ungapped -$adv_opts.parse_deflines -## End of advanced options: -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - blastn - - - -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ - -**What it does** - -Search a *nucleotide database* using a *nucleotide query*, -using the NCBI BLAST+ blastn command line tool. -Algorithms include blastn, megablast, and discontiguous megablast. - ------ - -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. 
- -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). - -------- - -**References** - -Zhang et al. A Greedy Algorithm for Aligning DNA Sequences. 2000. JCB: 203-214. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,276 +0,0 @@ - - Search protein database with protein query sequence(s) - blastp -version - hide_stderr.py -## The command is a Cheetah template which allows some Python based syntax. -## Lines starting hash hash are comments. Galaxy will turn newlines into spaces -blastp --query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#else: - -subject "$db_opts.subject" -#end if --task $blast_type --evalue $evalue_cutoff --out $output1 -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 -#if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query --matrix $adv_opts.matrix -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if -##Ungapped disabled for now - see comments below -##$adv_opts.ungapped -$adv_opts.parse_deflines -## End of advanced options: -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - blastp - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. 
-For large input datasets it is advisable to allow overnight processing. - ------ - -**What it does** - -Search a *protein database* using a *protein query*, -using the NCBI BLAST+ blastp command line tool. - ------ - -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. - -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). - -------- - -**References** - -Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402. - -Schaffer et al. Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements. 2001. Nucleic Acids Res. 29:2994-3005. 
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,240 +0,0 @@ - - Search protein database with translated nucleotide query sequence(s) - blastx -version - hide_stderr.py -## The command is a Cheetah template which allows some Python based syntax. -## Lines starting hash hash are comments. Galaxy will turn newlines into spaces -blastx --query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#else: - -subject "$db_opts.subject" -#end if --evalue $evalue_cutoff --out $output1 -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 -#if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query -$adv_opts.strand --matrix $adv_opts.matrix -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if -$adv_opts.ungapped -$adv_opts.parse_deflines -## End of advanced options: -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - blastx - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ - -**What it does** - -Search a *protein database* using a *translated nucleotide query*, -using the NCBI BLAST+ blastx command line tool. - ------ - -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. 
- -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). - -------- - -**References** - -Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,286 +0,0 @@ - - Search translated nucleotide database with protein query sequence(s) - tblastn -version - hide_stderr.py -## The command is a Cheetah template which allows some Python based syntax. -## Lines starting hash hash are comments. Galaxy will turn newlines into spaces -tblastn --query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#else: - -subject "$db_opts.subject" -#end if --evalue $evalue_cutoff --out $output1 -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 -#if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query --matrix $adv_opts.matrix -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if -##Ungapped disabled for now - see comments below -##$adv_opts.ungapped -$adv_opts.parse_deflines -## End of advanced options: -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - tblastn - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**Note**. 
Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ - -**What it does** - -Search a *translated nucleotide database* using a *protein query*, -using the NCBI BLAST+ tblastn command line tool. - ------ - -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. - -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). - -------- - -**References** - -Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,206 +0,0 @@ - - Search translated nucleotide database with translated nucleotide query sequence(s) - tblastx -version - hide_stderr.py -## The command is a Cheetah template which allows some Python based syntax. 
-## Lines starting hash hash are comments. Galaxy will turn newlines into spaces -tblastx --query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#else: - -subject "$db_opts.subject" -#end if --evalue $evalue_cutoff --out $output1 -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 -#if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query -$adv_opts.strand --matrix $adv_opts.matrix -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if -$adv_opts.parse_deflines -## End of advanced options: -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - tblastx - - - -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ - -**What it does** - -Search a *translated nucleotide database* using a *protein query*, -using the NCBI BLAST+ tblastx command line tool. - ------ - -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. 
- -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). - -------- - -**References** - -Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/basecoverage.xml --- a/tools/new_operations/basecoverage.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ - - of all intervals - gops_basecoverage.py $input1 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - -This operation counts the total bases covered by a set of intervals. Bases that are covered by more than one interval are **not** counted more than once towards the total. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/cluster.xml --- a/tools/new_operations/cluster.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ - - the intervals of a dataset - gops_cluster.py $input1 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -d $distance -m $minregions -o $returntype - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. 
_Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations
-
------
-
-**Syntax**
-
-- **Maximum distance** is the greatest distance in base pairs allowed between intervals that will be considered "clustered". **Negative** values for distance are allowed, and are useful for clustering intervals that overlap.
-- **Minimum intervals per cluster** allows a threshold to be set on the minimum number of intervals to be considered a cluster. Any area with fewer intervals than this minimum will not be included in the output.
-- **Merge clusters into single intervals** outputs intervals that span the entire cluster.
-- **Find cluster intervals; preserve comments and order** filters out non-cluster intervals while maintaining the original ordering and comments in the file.
-- **Find cluster intervals; output grouped by clusters** filters out non-cluster intervals, but outputs the cluster intervals so that they are grouped together. Comments and the original ordering in the file are lost.
-
------
-
-**Example**
-
-.. image:: ./static/operation_icons/gops_cluster.gif
-
-
\ No newline at end of file
diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/column_join.py
--- a/tools/new_operations/column_join.py	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,290 +0,0 @@
-#!/usr/bin/env python
-
-"""
-This tool joins two or more tab-delimited files on a set of shared leading 'hinge' columns. The tool will skip over invalid lines within the files, informing the user about the number of lines skipped.
-
-usage: %prog -o output -1 input1 -2 input2 -c column1[,column2[,column3[,...]]] -g hinge1[,hinge2[,hinge3[,...]]] -f [other_input1 [other_input2 [other_input3 ...]]]
-    -o, output=0: the output file
-    -1, input1=1: the first input file to join
-    -2, input2=2: the second input file to join
-    -g, hinge=h: the columns to be used for matching
-    -c, columns=c: the columns that should appear in the output
-    -f, fill_options_file=f: the file specifying the fill value to use
-    other_inputs: the other input files to join
-"""
-
-import optparse, os, re, struct, sys, tempfile
-
-try:
-    simplejson_exception = None
-    from galaxy import eggs
-    from galaxy.util.bunch import Bunch
-    from galaxy.util import stringify_dictionary_keys
-    import pkg_resources
-    pkg_resources.require("simplejson")
-    import simplejson
-except Exception, e:
-    # remember the import error so it can be re-raised if fill options are requested
-    simplejson_exception = e
-    simplejson = None
-
-def stop_err( msg ):
-    sys.stderr.write( msg )
-    sys.exit()
-
-def split_nums( text ):
-    """
-    Splits a string into pieces of numbers and non-numbers, like 'abc23B3' --> [ 'abc', 23, 'B', 3 ]
-    """
-    split_t = []
-    c = ''
-    n = ''
-    for ch in text:
-        try:
-            v = int( ch )
-            n += ch
-            if c:
-                split_t.append( ''.join( c ) )
-                c = ''
-        except ValueError:
-            c += ch
-            if n:
-                split_t.append( int( ''.join( n ) ) )
-                n = ''
-    if c:
-        split_t.append( ''.join( c ) )
-    if n:
-        split_t.append( int( ''.join( n ) ) )
-    return split_t
-
-def hinge_compare( hinge1, hinge2 ):
-    """
-    Compares items like 'chr10' and 'chrM' or 'scaffold2' and 'scaffold10' so that
-    the first part is handled as text but the last part as a number
-    """
-    split_hinge1 = hinge1.split( '\t' )
-    split_hinge2 = hinge2.split( '\t' )
-    # quick check if either hinge is empty
-    if not ''.join( split_hinge2 ):
-        if ''.join( split_hinge1 ):
-            return 1
-        elif not ''.join( split_hinge1 ):
-            return 0
-    else:
-        if not ''.join( split_hinge1 ):
-            return -1
-    # go through all parts of the hinges and compare
-    for i, sh1 in enumerate( split_hinge1 ):
-        # if these hinge segments are the same, just move on to the next ones
-        if sh1 == split_hinge2[ i ]:
-            continue
-        # check all parts of each hinge
-        h1 = split_nums( sh1 )
-        h2 = split_nums( split_hinge2[ i ] )
-        for j, h in enumerate( h1 ):
-            # if the second hinge has no more parts, the first is considered larger
-            if j > 0 and len( h2 ) <= j:
-                return 1
-            # if these two parts are the same, move on to the next
-            if h == h2[ j ]:
-                continue
-            # do the actual comparison, depending on whether letter or number
-            if type( h ) == int:
-                if type( h2[ j ] ) == int:
-                    if h > h2[ j ]:
-                        return 1
-                    elif h < h2[ j ]:
-                        return -1
-                # numbers are less than letters
-                elif type( h2[ j ] ) == str:
-                    return -1
-            elif type( h ) == str:
-                if type( h2[ j ] ) == str:
-                    if h > h2[ j ]:
-                        return 1
-                    elif h < h2[ j ]:
-                        return -1
-                # numbers are less than letters
-                elif type( h2[ j ] ) == int:
-                    return 1
-    # if all else has failed, just do a basic string comparison
-    if hinge1 > hinge2:
-        return 1
-    elif hinge1 == hinge2:
-        return 0
-    elif hinge1 < hinge2:
-        return -1
-
-def hinge_sort( infile, outfile, hinge ):
-    """Given an input file name, sorts logically (text vs. numeric) into the provided output file name."""
-    hinge_locs = {}
-    bad_lines = []
-    fin = open( infile, 'rb' )
-    line = fin.readline()
-    while line.strip():
-        try:
-            hinge_parts = line.split( '\t' )[ :hinge ]
-            try:
-                hinge_locs[ '\t'.join( hinge_parts ) ].append( fin.tell() - len( line ) )
-            except KeyError:
-                hinge_locs[ '\t'.join( hinge_parts ) ] = [ fin.tell() - len( line ) ]
-        except ValueError:
-            bad_lines.append( line )
-        line = fin.readline()
-    fin.close()
-    fin = open( infile, 'rb' )
-    fout = open( outfile, 'wb' )
-    hinge_locs_sorted = hinge_locs.keys()
-    hinge_locs_sorted.sort( hinge_compare )
-    for hinge_loc in hinge_locs_sorted:
-        locs = hinge_locs[ hinge_loc ]
-        for loc in locs:
-            fin.seek( loc )
-            fout.write( fin.readline() )
-    fout.close()
-    fin.close()
-
-def __main__():
-    parser = optparse.OptionParser()
-    parser.add_option( '-o', '--output', dest='output', help='The name of the output file' )
-    parser.add_option( '-1', '--input1', dest='input1', help='The name of the first input file' )
-    parser.add_option( '-2', '--input2', dest='input2', help='The name of the second input file' )
-    parser.add_option( '-g', '--hinge', dest='hinge', help='The "hinge" to use (the value to compare)' )
-    parser.add_option( '-c', '--columns', dest='columns', help='The columns to include in the output file' )
-    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', default=None, help='The file specifying the fill value to use' )
-    (options, args) = parser.parse_args()
-    hinge = int( options.hinge )
-    cols = [ int( c ) for c in str( options.columns ).split( ',' ) if int( c ) > hinge ]
-    inputs = [ options.input1, options.input2 ]
-    if options.fill_options_file == 'None':
-        inputs.extend( args )
-    elif len( args ) > 0:
-        inputs.extend( args )
-    fill_options = None
-    if options.fill_options_file != 'None' and options.fill_options_file is not None:
-        try:
-            if simplejson is None:
-                raise simplejson_exception
-            fill_options = Bunch( **stringify_dictionary_keys( simplejson.load( open( options.fill_options_file ) ) ) )
-        except Exception, e:
-            print 'Warning: Ignoring fill options due to simplejson error (%s).'
% e - if fill_options is None: - fill_options = Bunch() - if 'file1_columns' not in fill_options: - fill_options.file1_columns = None - if fill_options and fill_options.file1_columns: - fill_empty = {} - for col in cols: - fill_empty[ col ] = fill_options.file1_columns[ col - 1 ] - else: - fill_empty = None - assert len( cols ) > 0, 'You need to select at least one column in addition to the hinge' - delimiter = '\t' - # make sure all files are sorted in same way, ascending - tmp_input_files = [] - input_files = inputs[:] - for in_file in input_files: - tmp_file = tempfile.NamedTemporaryFile() - tmp_file_name = tmp_file.name - tmp_file.close() - hinge_sort( in_file, tmp_file_name, hinge ) - tmp_file = open( tmp_file_name, 'rb' ) - tmp_input_files.append( tmp_file ) - # cycle through files, getting smallest line of all files one at a time - # also have to keep track of vertical position of extra columns - fout = file( options.output, 'w' ) - old_current = '' - first_line = True - current_lines = [ f.readline().rstrip( '\r\n' ) for f in tmp_input_files ] - last_lines = ''.join( current_lines ) - last_loc = -1 - while last_lines: - # get the "minimum" hinge, which should come first, and the file location in list - hinges = [ delimiter.join( line.split( delimiter )[ :hinge ] ) for line in current_lines ] - hinge_dict = {} - for i in range( len( hinges ) ): - if not hinge_dict.has_key( hinges[ i ] ): - hinge_dict[ hinges[ i ] ] = i - hinges.sort( hinge_compare ) - hinges = [ h for h in hinges if h ] - current, loc = hinges[0], hinge_dict[ hinges[0] ] - # first output empty columns for vertical alignment (account for "missing" files) - # write output for leading and trailing empty columns - # columns missing from actual file handled further below - current_data = [] - if current != old_current: - # fill trailing empty columns with appropriate fill value - if not first_line: - if last_loc < len( inputs ) - 1: - if not fill_empty: - filler = [ '' for col in range( ( len( inputs ) - last_loc - 1 ) * len( cols ) ) ] - else: - filler = [ fill_empty[ cols[ col % len( cols ) ] ] for col in range( ( len( inputs ) - last_loc - 1 ) * len( cols ) ) ] - fout.write( '%s%s' % ( delimiter, delimiter.join( filler ) ) ) - # insert line break before current line - fout.write( '\n' ) - # fill leading empty columns with appropriate fill value - if loc > 0: - if not fill_empty: - current_data = [ '' for col in range( loc * len( cols ) ) ] - else: - current_data = [ fill_empty[ cols[ col % len( cols ) ] ] for col in range( loc * len( cols ) ) ] - else: - if loc - last_loc > 1: - if not fill_empty: - current_data = [ '' for col in range( ( loc - last_loc - 1 ) * len( cols ) ) ] - else: - current_data = [ fill_empty[ cols[ col % len( cols ) ] ] for col in range( ( loc - last_loc - 1 ) * len( cols ) ) ] - # now output actual data - split_line = current_lines[ loc ].split( delimiter ) - # fill empties within actual line if appropriate - if fill_empty: - new_split_line = split_line[:] - split_line = [] - for i, item in enumerate( new_split_line ): - col = i + 1 - if not item: - try: - split_line.append( fill_empty[ i + 1 ] ) - except KeyError: - split_line.append( item ) - else: - split_line.append( item ) - # add actual data to be output below - if ''.join( split_line ): - for col in cols: - if col > hinge: - # if this column doesn't exist, add the appropriate filler or empty column - try: - new_item = split_line[ col - 1 ] - except IndexError: - if fill_empty: - new_item = fill_empty[ col ] - else: - new_item = '' - 
current_data.append( new_item ) - # grab next line for selected file - current_lines[ loc ] = tmp_input_files[ loc ].readline().rstrip( '\r\n' ) - # write relevant data to file - if current == old_current and current_data: - fout.write( '%s%s' % ( delimiter, delimiter.join( current_data ) ) ) - elif current_data: - fout.write( '%s%s%s' % ( current, delimiter, delimiter.join( current_data ) ) ) - last_lines = ''.join( current_lines ) - else: - last_lines = None - last_loc = loc - old_current = current - first_line = False - # fill trailing empty columns for final line - if last_loc < len( inputs ) - 1: - if not fill_empty: - filler = [ '' for col in range( ( len( inputs ) - last_loc - 1 ) * len( cols ) ) ] - else: - filler = [ fill_empty[ cols[ col % len( cols ) ] ] for col in range( ( len( inputs ) - last_loc - 1 ) * len( cols ) ) ] - fout.write( '%s%s' % ( delimiter, delimiter.join( filler ) ) ) - fout.write( '\n' ) - fout.close() - for f in tmp_input_files: - os.unlink( f.name ) - -if __name__ == "__main__" : __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/column_join.xml --- a/tools/new_operations/column_join.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,260 +0,0 @@ - - - - column_join.py - --output=$output - --input1=$input1 - --input2=$input2 - --hinge=$hinge - --columns=$columns - #if $fill_empty_columns.fill_empty_columns_switch == "fill_empty": - --fill_options_file=$fill_options_file - #end if - #for $f in $file_chooser: - ${f.input} - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - <% -import simplejson -%> -#set $__fill_options = {} -#if $fill_empty_columns['fill_empty_columns_switch'] == 'fill_empty': - #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'single_fill_value': - #set $__start_fill = $fill_empty_columns['do_fill_empty_columns']['fill_value'].value - #else: - #set $__start_fill = "" - #end if - #set $__fill_options['file1_columns'] = [ __start_fill for i in range( int( $input1.metadata.columns ) ) ] - #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'fill_value_by_column': - #for column_fill in $fill_empty_columns['do_fill_empty_columns']['column_fill']: - #set $__fill_options['file1_columns'][ int( column_fill['column_number'].value ) - 1 ] = column_fill['fill_value'].value - #end for - #end if -#end if -${simplejson.dumps( __fill_options )} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows you to join several files with the same column structure into one file, removing certain columns if necessary. The user needs to select a 'hinge', which is the number of left-most columns to match on. They also need to select the columns to include in the join, which should include the hinge columns, too. - -Note that the files are expected to have the same number of columns. If for some reason the join column is missing (this only applies to the last column(s)), the tool attempts to handle this situation by inserting an empty item (or the appropriate filler) for that column on that row. This could lead to the situation where a row has a hinge but entirely empty or filled columns, if the hinge exists in at least one file but every file that has it is missing the join column. 
Also, note that the tool does not distinguish between a file missing the hinge altogether and a file having the hinge but missing the column (in both cases the column would be empty or filled). There is an example of this below. - ------ - -**General Example** - -Given the following files:: - - FILE 1 - chr2 1 T 6 .C..., I$$III - chr2 2 G 6 ..N.., III@II - chr2 3 C 7 ..C..., I$IIIII - chr2 4 G 7 .G...., I#IIIII - chr2 5 G 7 ...N.., IIII#BI - chr2 6 A 7 ..T..., I$IDIII - chr1 1 C 1 ^:. I - chr1 2 G 2 .^:. $I - chr1 3 A 2 .. I% - chr1 4 C 2 .. I$ - chr1 5 T 3 ..^:. I#I - chr1 6 G 3 ..^:, I#I - - FILE 2 - chr1 3 T 1 ^:. I - chr1 4 G 2 .^:. $I - chr1 5 T 2 .. I% - chr1 6 C 3 ..^:. III - chr1 7 G 3 ..^:. I#I - chr1 8 T 4 ...^:, I#II - chr2 77 C 6 .G..., I$$III - chr2 78 G 6 ..N.., III@II - chr2 79 T 7 ..N..., I$IIIII - chr2 80 C 7 .G...., I#IIIII - chr2 81 G 7 ...A.., IIII#BI - chr2 82 A 8 ...G..., I$IDIIII - chr2 83 T 8 .A.....N IIIIIIII - chr2 84 A 9 ......T. I$IIIIIII - - FILE 3 - chr1 1 A 1 . I - chr1 2 T 2 G. I$ - chr1 3 C 2 ., I@ - chr1 4 C 3 ..N III - chr1 42 C 5 ...N^:. III@I - chr1 43 C 5 .N..^:. IIIII - chr1 44 T 5 .A.., IA@II - chr1 45 A 6 .N...^:. IIIII$ - chr1 46 G 6 .GN..^:. I@IIII - chr1 47 A 7 ....^:.., IIIII$I - chr2 73 T 5 .N.., II$II - chr2 74 A 5 ...., IIIII - chr2 75 T 5 ...., IIIII - chr2 76 T 5 ...., IIIII - chr2 77 C 5 ...., IIIBI - chr2 78 T 5 ...., IDIII - -To join on columns 3 and 4 combining on columns 1 and 2, columns 1-4 should be selected for the 'Include these columns' option, and column 2 selected for the 'hinge'. With these settings, the following would be output:: - - chr1 1 C 1 A 1 - chr1 2 G 2 T 2 - chr1 3 A 2 T 1 C 2 - chr1 4 C 2 G 2 C 3 - chr1 5 T 3 T 2 - chr1 6 G 3 C 3 - chr1 7 G 3 - chr1 8 T 4 - chr1 42 C 5 - chr1 43 C 5 - chr1 44 T 5 - chr1 45 A 6 - chr1 46 G 6 - chr1 47 A 7 - chr2 1 T 6 - chr2 2 G 6 - chr2 3 C 7 - chr2 4 G 7 - chr2 5 G 7 - chr2 6 A 7 - chr2 73 T 5 - chr2 74 A 5 - chr2 75 T 5 - chr2 76 T 5 - chr2 77 C 6 C 5 - chr2 78 G 6 T 5 - chr2 79 T 7 - chr2 80 C 7 - chr2 81 G 7 - chr2 82 A 8 - chr2 83 T 8 - chr2 84 A 9 - -**Example with missing columns** - -Given the following input files:: - - FILE 1 - 1 A - 2 B b - 4 C c - 5 D - 6 E e - - FILE 2 - 1 M m - 2 N - 3 O o - 4 P p - 5 Q - 7 R r - -if we join only column 3 using column 1 as the hinge and with a fill value of '0', this is what will be output:: - - 1 0 m - 2 b 0 - 3 0 o - 4 c p - 5 0 0 - 6 e 0 - 7 0 r - -Row 5 appears in both files with the missing column, so it's got nothing but fill values in the output file. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/complement.xml --- a/tools/new_operations/complement.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ - - intervals of a dataset - gops_complement.py $input1 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -l ${chromInfo} $allchroms - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - -This operation complements the regions of a set of intervals. Regions are returned that represent the empty space in the input interval. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. 
_Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -**Syntax** - -- **Genome-wide complement** will complement all chromosomes of the genome. Leaving this option unchecked will only complement chromosomes present in the dataset. - ------ - -**Example** - -.. image:: ./static/operation_icons/gops_complement.gif - - - diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/concat.xml --- a/tools/new_operations/concat.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ - - two datasets into one dataset - gops_concat.py $input1 $input2 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} $sameformat - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu -> it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -**Syntax** - -- **Both datasets are exactly the same filetype** will preserve all extra fields in both files. Leaving this unchecked will force the second dataset to use the same column assignments for chrom, start, end and strand, but will fill extra fields with a period(.). In both cases, the output fields are truncated or padded with fields of periods to maintain a truly tabular output. - ------ - -**Example** - -.. image:: ./static/operation_icons/gops_concatenate.gif - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/coverage.xml --- a/tools/new_operations/coverage.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,91 +0,0 @@ - - of a set of intervals on second set of intervals - gops_coverage.py $input1 $input2 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu -> it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - -Find the coverage of intervals in the first dataset on intervals in the second dataset. The coverage is added as two columns, the first being bases covered, and the second being the fraction of bases covered by that interval. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. 
_Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -**Example** - - - if **First dataset** are genes :: - - chr11 5203271 5204877 NM_000518 0 - - chr11 5210634 5212434 NM_000519 0 - - chr11 5226077 5227663 NM_000559 0 - - chr11 5226079 5232587 BC020719 0 - - chr11 5230996 5232587 NM_000184 0 - - - and **Second dataset** are repeats:: - - chr11 5203895 5203991 L1MA6 500 + - chr11 5204163 5204239 A-rich 219 + - chr11 5211034 5211167 (CATATA)n 245 + - chr11 5211642 5211673 AT_rich 24 + - chr11 5226551 5226606 (CA)n 303 + - chr11 5228782 5228825 (TTTTTG)n 208 + - chr11 5229045 5229121 L1PA11 440 + - chr11 5229133 5229319 MER41A 1106 + - chr11 5229374 5229485 L2 244 - - chr11 5229751 5230083 MLT1A 913 - - chr11 5231469 5231526 (CA)n 330 + - - the Result is the coverage density of repeats in the genes:: - - chr11 5203271 5204877 NM_000518 0 - 172 0.107098 - chr11 5210634 5212434 NM_000519 0 - 164 0.091111 - chr11 5226077 5227663 NM_000559 0 - 55 0.034678 - chr11 5226079 5232587 BC020719 0 - 860 0.132145 - chr11 5230996 5232587 NM_000184 0 - 57 0.035827 - - For example, the following line of output:: - - chr11 5203271 5204877 NM_000518 0 - 172 0.107098 - - implies that 172 nucleotides accounting for 10.7% of the this interval (chr11:5203271-5204877) overlap with repetitive elements. - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/flanking_features.py --- a/tools/new_operations/flanking_features.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,214 +0,0 @@ -#!/usr/bin/env python -#By: Guruprasad Ananda -""" -Fetch closest up/downstream interval from features corresponding to every interval in primary - -usage: %prog primary_file features_file out_file direction - -1, --cols1=N,N,N,N: Columns for start, end, strand in first file - -2, --cols2=N,N,N,N: Columns for start, end, strand in second file - -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval - -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * -from bx.intervals.io import * -from bx.intervals.operations import quicksect -from galaxy.datatypes.util.gff_util import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down): - #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases - #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand - #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand - if direction == 1: - if node.maxend <= threshold_up: - if node.end == node.maxend: - report_func_up(node) - elif node.right and node.left: - if node.right.maxend == node.maxend: - get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) - elif node.left.maxend == node.maxend: - get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) - elif node.right and node.right.maxend == node.maxend: - get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, 
report_func_down) - elif node.left and node.left.maxend == node.maxend: - get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) - elif node.minend <= threshold_up: - if node.end <= threshold_up: - report_func_up(node) - if node.left and node.right: - if node.right.minend <= threshold_up: - get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) - if node.left.minend <= threshold_up: - get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) - elif node.left: - if node.left.minend <= threshold_up: - get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) - elif node.right: - if node.right.minend <= threshold_up: - get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) - elif direction == 0: - if node.start > threshold_down: - report_func_down(node) - if node.left: - get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) - else: - if node.right: - get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) - -def proximal_region_finder(readers, region, comments=True): - """ - Returns an iterator that yields elements of the form [ , ]. - Intervals are GenomicInterval objects. - """ - primary = readers[0] - features = readers[1] - either = False - if region == 'Upstream': - up, down = True, False - elif region == 'Downstream': - up, down = False, True - else: - up, down = True, True - if region == 'Either': - either = True - - # Read features into memory: - rightTree = quicksect.IntervalTree() - for item in features: - if type( item ) is GenomicInterval: - rightTree.insert( item, features.linenum, item ) - - for interval in primary: - if type( interval ) is Header: - yield interval - if type( interval ) is Comment and comments: - yield interval - elif type( interval ) == GenomicInterval: - chrom = interval.chrom - start = int(interval.start) - end = int(interval.end) - strand = interval.strand - if chrom not in rightTree.chroms: - continue - else: - root = rightTree.chroms[chrom] #root node for the chrom tree - result_up = [] - result_down = [] - if (strand == '+' and up) or (strand == '-' and down): - #upstream +ve strand and downstream -ve strand cases - get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None) - - if (strand == '+' and down) or (strand == '-' and up): - #downstream +ve strand and upstream -ve strand case - get_closest_feature (root, 0, None, end-1, None, lambda node: result_down.append( node )) - - if result_up: - if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval. - ends = [] - for n in result_up: - ends.append(n.end) - res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. 
the interval with the max end from the results_up list - else: - res_ind = 0 - if not(either): - yield [ interval, result_up[res_ind].other ] - - if result_down: - if not(either): - #The last element of result_down will be the closest element to the given interval - yield [ interval, result_down[-1].other ] - - if either and (result_up or result_down): - iter_val = [] - if result_up and result_down: - if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)): - iter_val = [ interval, result_up[res_ind].other ] - else: - #The last element of result_down will be the closest element to the given interval - iter_val = [ interval, result_down[-1].other ] - elif result_up: - iter_val = [ interval, result_up[res_ind].other ] - elif result_down: - #The last element of result_down will be the closest element to the given interval - iter_val = [ interval, result_down[-1].other ] - yield iter_val - -def main(): - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) - in1_gff_format = bool( options.gff1 ) - in2_gff_format = bool( options.gff2 ) - in_fname, in2_fname, out_fname, direction = args - except: - doc_optparse.exception() - - # Set readers to handle either GFF or default format. - if in1_gff_format: - in1_reader_wrapper = GFFIntervalToBEDReaderWrapper - else: - in1_reader_wrapper = NiceReaderWrapper - if in2_gff_format: - in2_reader_wrapper = GFFIntervalToBEDReaderWrapper - else: - in2_reader_wrapper = NiceReaderWrapper - - g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - - # Find flanking features. - out_file = open( out_fname, "w" ) - try: - for result in proximal_region_finder([g1,g2], direction): - if type( result ) is list: - line, closest_feature = result - # Need to join outputs differently depending on file types. - if in1_gff_format: - # Output is GFF with added attribute 'closest feature.' - - # Invervals are in BED coordinates; need to convert to GFF. - line = convert_bed_coords_to_gff( line ) - closest_feature = convert_bed_coords_to_gff( closest_feature ) - - # Replace double quotes with single quotes in closest feature's attributes. - out_file.write( "%s closest_feature \"%s\" \n" % - ( "\t".join( line.fields ), \ - "\t".join( closest_feature.fields ).replace( "\"", "\\\"" ) - ) ) - else: - # Output is BED + closest feature fields. 
- output_line_fields = [] - output_line_fields.extend( line.fields ) - output_line_fields.extend( closest_feature.fields ) - out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) ) - else: - out_file.write( "%s\n" % result ) - except ParseError, exc: - fail( "Invalid file format: %s" % str( exc ) ) - - print "Direction: %s" %(direction) - if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) - if g2.skipped > 0: - print skipped( g2, filedesc=" of 2nd dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/flanking_features.xml --- a/tools/new_operations/flanking_features.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,127 +0,0 @@ - - for every interval - - flanking_features.py $input1 $input2 $out_file1 $direction - - #if isinstance( $input1.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -1 1,4,5,7 --gff1 - #else: - -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} - #end if - - #if isinstance( $input2.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -2 1,4,5,7 --gff2 - #else: - -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -For every interval in the **interval** dataset, this tool fetches the **closest non-overlapping** upstream and / or downstream features from the **features** dataset. - ------ - -.. class:: warningmark - -**Note:** - -Every line should contain at least 3 columns: chromosome number, start and stop coordinates. If any of these columns is missing or if start and stop coordinates are not numerical, the lines will be treated as invalid and skipped. The number of skipped lines is documented in the resulting history item as a "data issue". - -If the strand column is missing from your input interval dataset, the intervals will be considered to be on positive strand. You can add a strand column to your input dataset by using the *Text Manipulation->Add column* tool. - -For GFF files, features are added as a GTF-style attribute at the end of the line. - ------ - -**Example** - -If the **intervals** are:: - - chr1 10 100 Query1.1 - chr1 500 1000 Query1.2 - chr1 1100 1250 Query1.3 - -and the **features** are:: - - chr1 120 180 Query2.1 - chr1 140 200 Query2.2 - chr1 580 1050 Query2.3 - chr1 2000 2204 Query2.4 - chr1 2500 3000 Query2.5 - -Running this tool for **Both Upstream and Downstream** will return:: - - chr1 10 100 Query1.1 chr1 120 180 Query2.1 - chr1 500 1000 Query1.2 chr1 140 200 Query2.2 - chr1 500 1000 Query1.2 chr1 2000 2204 Query2.4 - chr1 1100 1250 Query1.3 chr1 580 1050 Query2.3 - chr1 1100 1250 Query1.3 chr1 2000 2204 Query2.4 - - - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/get_flanks.py --- a/tools/new_operations/get_flanks.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,191 +0,0 @@ -#!/usr/bin/env python -#Done by: Guru - -""" -Get Flanking regions. 
- -usage: %prog input out_file size direction region - -l, --cols=N,N,N,N: Columns for chrom, start, end, strand in file - -o, --off=N: Offset -""" - -import sys, re, os -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - try: - if int( sys.argv[3] ) < 0: - raise Exception - except: - stop_err( "Length of flanking region(s) must be a non-negative integer." ) - - # Parsing Command Line here - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols ) - inp_file, out_file, size, direction, region = args - if strand_col_1 <= 0: - strand = "+" #if strand is not defined, default it to + - except: - stop_err( "Metadata issue, correct the metadata attributes by clicking on the pencil icon in the history item." ) - try: - offset = int(options.off) - size = int(size) - except: - stop_err( "Invalid offset or length entered. Try again by entering valid integer values." ) - - fo = open(out_file,'w') - - skipped_lines = 0 - first_invalid_line = 0 - invalid_line = None - elems = [] - j=0 - for i, line in enumerate( file( inp_file ) ): - line = line.strip() - if line and (not line.startswith( '#' )) and line != '': - j+=1 - try: - elems = line.split('\t') - #if the start and/or end columns are not numbers, skip that line. - assert int(elems[start_col_1]) - assert int(elems[end_col_1]) - if strand_col_1 != -1: - strand = elems[strand_col_1] - #if the stand value is not + or -, skip that line. - assert strand in ['+', '-'] - if direction == 'Upstream': - if strand == '+': - if region == 'end': - elems[end_col_1] = str(int(elems[end_col_1]) + offset) - elems[start_col_1] = str( int(elems[end_col_1]) - size ) - else: - elems[end_col_1] = str(int(elems[start_col_1]) + offset) - elems[start_col_1] = str( int(elems[end_col_1]) - size ) - elif strand == '-': - if region == 'end': - elems[start_col_1] = str(int(elems[start_col_1]) - offset) - elems[end_col_1] = str(int(elems[start_col_1]) + size) - else: - elems[start_col_1] = str(int(elems[end_col_1]) - offset) - elems[end_col_1] = str(int(elems[start_col_1]) + size) - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - - elif direction == 'Downstream': - if strand == '-': - if region == 'start': - elems[end_col_1] = str(int(elems[end_col_1]) - offset) - elems[start_col_1] = str( int(elems[end_col_1]) - size ) - else: - elems[end_col_1] = str(int(elems[start_col_1]) - offset) - elems[start_col_1] = str( int(elems[end_col_1]) - size ) - elif strand == '+': - if region == 'start': - elems[start_col_1] = str(int(elems[start_col_1]) + offset) - elems[end_col_1] = str(int(elems[start_col_1]) + size) - else: - elems[start_col_1] = str(int(elems[end_col_1]) + offset) - elems[end_col_1] = str(int(elems[start_col_1]) + size) - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - - elif direction == 'Both': - if strand == '-': - if region == 'start': - start = str(int(elems[end_col_1]) - offset) - end1 = str(int(start) + size) - end2 = str(int(start) - size) - elems[start_col_1]=start - elems[end_col_1]=end1 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elems[start_col_1]=end2 - elems[end_col_1]=start - assert 
int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elif region == 'end': - start = str(int(elems[start_col_1]) - offset) - end1 = str(int(start) + size) - end2 = str(int(start) - size) - elems[start_col_1]=start - elems[end_col_1]=end1 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elems[start_col_1]=end2 - elems[end_col_1]=start - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - else: - start1 = str(int(elems[end_col_1]) - offset) - end1 = str(int(start1) + size) - start2 = str(int(elems[start_col_1]) - offset) - end2 = str(int(start2) - size) - elems[start_col_1]=start1 - elems[end_col_1]=end1 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elems[start_col_1]=end2 - elems[end_col_1]=start2 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elif strand == '+': - if region == 'start': - start = str(int(elems[start_col_1]) + offset) - end1 = str(int(start) - size) - end2 = str(int(start) + size) - elems[start_col_1]=end1 - elems[end_col_1]=start - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elems[start_col_1]=start - elems[end_col_1]=end2 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elif region == 'end': - start = str(int(elems[end_col_1]) + offset) - end1 = str(int(start) - size) - end2 = str(int(start) + size) - elems[start_col_1]=end1 - elems[end_col_1]=start - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elems[start_col_1]=start - elems[end_col_1]=end2 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - else: - start1 = str(int(elems[start_col_1]) + offset) - end1 = str(int(start1) - size) - start2 = str(int(elems[end_col_1]) + offset) - end2 = str(int(start2) + size) - elems[start_col_1]=end1 - elems[end_col_1]=start1 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elems[start_col_1]=start2 - elems[end_col_1]=end2 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - except: - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - fo.close() - - if skipped_lines == j: - stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." 
) - if skipped_lines > 0: - print 'Skipped %d invalid lines starting with #%dL "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) - print 'Location: %s, Region: %s, Flank-length: %d, Offset: %d ' %( direction, region, size, offset ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/get_flanks.xml --- a/tools/new_operations/get_flanks.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ - - returns flanking region/s for every gene - get_flanks.py $input $out_file1 $size $direction $region -o $offset -l ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool finds the upstream and/or downstream flanking region(s) of all the selected regions in the input file. - -**Note:** Every line should contain at least 3 columns: Chromosome number, Start and Stop co-ordinates. If any of these columns is missing or if start and stop co-ordinates are not numerical, the tool may encounter exceptions and such lines are skipped as invalid. The number of invalid skipped lines is documented in the resulting history item as a "Data issue". - ------ - - -**Example 1** - -- For the following query:: - - chr22 1000 7000 NM_174568 0 + - -- running get flanks with Region: Around start, Offset: -200, Flank-length: 300 and Location: Upstream will return **(Red: Query positive strand; Blue: Flanks output)**:: - - chr22 500 800 NM_174568 0 + - -.. image:: ./static/operation_icons/flanks_ex1.gif - -**Example 2** - -- For the following query:: - - chr22 1000 7000 NM_028946 0 - - -- running get flanks with Region: Whole, Offset: 200, Flank-length: 300 and Location: Downstream will return **(Orange: Query negative strand; Magenta: Flanks output)**:: - - chr22 500 800 NM_028946 0 - - -.. image:: ./static/operation_icons/flanks_ex2.gif - - - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_basecoverage.py --- a/tools/new_operations/gops_basecoverage.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ -#!/usr/bin/env python -""" -Count total base coverage. 
- -usage: %prog in_file out_file - -1, --cols1=N,N,N,N: Columns for start, end, strand in first file -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.base_coverage import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - in_fname, out_fname = args - except: - doc_optparse.exception() - - g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col = strand_col_1, - fix_strand=True ) - - try: - bases = base_coverage(g1) - except ParseError, exc: - fail( "Invalid file format: %s" % str( exc ) ) - out_file = open( out_fname, "w" ) - out_file.write( "%s\n" % str( bases ) ) - out_file.close() - if g1.skipped > 0: - print skipped( g1, filedesc="" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_cluster.py --- a/tools/new_operations/gops_cluster.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,132 +0,0 @@ -#!/usr/bin/env python -""" -Cluster regions of intervals. - -usage: %prog in_file out_file - -1, --cols1=N,N,N,N: Columns for start, end, strand in file - -d, --distance=N: Maximum distance between clustered intervals - -v, --overlap=N: Minimum overlap require (negative distance) - -m, --minregions=N: Minimum regions per cluster - -o, --output=N: 1)merged 2)filtered 3)clustered 4) minimum 5) maximum -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.find_clusters import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - distance = 0 - minregions = 2 - output = 1 - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - if options.distance: distance = int( options.distance ) - if options.overlap: distance = -1 * int( options.overlap ) - if options.output: output = int( options.output ) - if options.minregions: minregions = int( options.minregions ) - in_fname, out_fname = args - except: - doc_optparse.exception() - - g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - - # Get the cluster tree - try: - clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions) - except ParseError, exc: - fail( "Invalid file format: %s" % str( exc ) ) - - f1 = open( in_fname, "r" ) - out_file = open( out_fname, "w" ) - - # If "merge" - if output == 1: - fields = ["." 
for x in range(max(g1.chrom_col, g1.start_col, g1.end_col)+1)] - for chrom, tree in clusters.items(): - for start, end, lines in tree.getregions(): - fields[g1.chrom_col] = chrom - fields[g1.start_col] = str(start) - fields[g1.end_col] = str(end) - out_file.write( "%s\n" % "\t".join( fields ) ) - - # If "filtered" we preserve order of file and comments, etc. - if output == 2: - linenums = dict() - for chrom, tree in clusters.items(): - for linenum in tree.getlines(): - linenums[linenum] = 0 - linenum = -1 - f1.seek(0) - for line in f1.readlines(): - linenum += 1 - if linenum in linenums or linenum in extra: - out_file.write( "%s\n" % line.rstrip( "\n\r" ) ) - - # If "clustered" we output original intervals, but near each other (i.e. clustered) - if output == 3: - linenums = list() - f1.seek(0) - fileLines = f1.readlines() - for chrom, tree in clusters.items(): - for linenum in tree.getlines(): - out_file.write( "%s\n" % fileLines[linenum].rstrip( "\n\r" ) ) - - # If "minimum" we output the smallest interval in each cluster - if output == 4 or output == 5: - linenums = list() - f1.seek(0) - fileLines = f1.readlines() - for chrom, tree in clusters.items(): - regions = tree.getregions() - for start, end, lines in tree.getregions(): - outsize = -1 - outinterval = None - for line in lines: - # three nested for loops? - # should only execute this code once per line - fileline = fileLines[line].rstrip("\n\r") - try: - cluster_interval = GenomicInterval( g1, fileline.split("\t"), - g1.chrom_col, - g1.start_col, - g1.end_col, - g1.strand_col, - g1.default_strand, - g1.fix_strand ) - except Exception, exc: - print >> sys.stderr, str( exc ) - f1.close() - sys.exit() - interval_size = cluster_interval.end - cluster_interval.start - if outsize == -1 or \ - ( outsize > interval_size and output == 4 ) or \ - ( outsize < interval_size and output == 5 ) : - outinterval = cluster_interval - outsize = interval_size - out_file.write( "%s\n" % outinterval ) - - f1.close() - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc="" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_complement.py --- a/tools/new_operations/gops_complement.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ -#!/usr/bin/env python -""" -Complement regions. 
- -usage: %prog in_file out_file - -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in file - -l, --lengths=N: Filename of .len file for species (chromosome lengths) - -a, --all: Complement all chromosomes (Genome-wide complement) -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.complement import complement -from bx.intervals.operations.subtract import subtract -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - allchroms = False - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - lengths = options.lengths - if options.all: allchroms = True - in_fname, out_fname = args - except: - doc_optparse.exception() - - g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - - lens = dict() - chroms = list() - # dbfile is used to determine the length of each chromosome. The lengths - # are added to the lens dict and passed copmlement operation code in bx. - dbfile = fileinput.FileInput( lengths ) - - if dbfile: - if not allchroms: - try: - for line in dbfile: - fields = line.split("\t") - lens[fields[0]] = int(fields[1]) - except: - # assume LEN doesn't exist or is corrupt somehow - pass - elif allchroms: - try: - for line in dbfile: - fields = line.split("\t") - end = int(fields[1]) - chroms.append("\t".join([fields[0],"0",str(end)])) - except: - pass - - # Safety...if the dbfile didn't exist and we're on allchroms, then - # default to generic complement - if allchroms and len(chroms) == 0: - allchroms = False - - if allchroms: - chromReader = GenomicIntervalReader(chroms) - generator = subtract([chromReader, g1]) - else: - generator = complement(g1, lens) - - out_file = open( out_fname, "w" ) - - try: - for interval in generator: - if type( interval ) is GenomicInterval: - out_file.write( "%s\n" % "\t".join( interval ) ) - else: - out_file.write( "%s\n" % interval ) - except ParseError, exc: - out_file.close() - fail( "Invalid file format: %s" % str( exc ) ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc="" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_concat.py --- a/tools/new_operations/gops_concat.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -#!/usr/bin/env python -""" -Concatenate two bed files. The concatenated files are returned in the -same format as the first. If --sameformat is specified, then all -columns will be treated as the same, and all fields will be saved, -although the output will be trimmed to match the primary input. In -addition, if --sameformat is specified, missing fields will be padded -with a period(.). - -usage: %prog in_file_1 in_file_2 out_file - -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in first file - -2, --cols2=N,N,N,N: Columns for chrom, start, end, strand in second file - -s, --sameformat: All files are precisely the same format. 
-""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.concat import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - sameformat=False - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) - if options.sameformat: sameformat = True - in_file_1, in_file_2, out_fname = args - except: - doc_optparse.exception() - - g1 = NiceReaderWrapper( fileinput.FileInput( in_file_1 ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - fix_strand=True ) - - g2 = NiceReaderWrapper( fileinput.FileInput( in_file_2 ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - - if strand_col_1 >= 0: - g1.strand_col = strand_col_1 - - out_file = open( out_fname, "w" ) - - try: - for line in concat( [g1, g2], sameformat=sameformat ): - if type( line ) is GenomicInterval: - out_file.write( "%s\n" % "\t".join( line.fields ) ) - else: - out_file.write( "%s\n" % line ) - except ParseError, exc: - out_file.close() - fail( "Invalid file format: %s" % str( exc ) ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) - if g2.skipped > 0: - print skipped( g2, filedesc=" of 2nd dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_coverage.py --- a/tools/new_operations/gops_coverage.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ -#!/usr/bin/env python -""" -Calculate coverage of one query on another, and append the coverage to -the last two columns as bases covered and percent coverage. 
- -usage: %prog bed_file_1 bed_file_2 out_file - -1, --cols1=N,N,N,N: Columns for start, end, strand in first file - -2, --cols2=N,N,N,N: Columns for start, end, strand in second file -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.coverage import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) - in_fname, in2_fname, out_fname = args - except: - doc_optparse.exception() - - g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - - out_file = open( out_fname, "w" ) - - try: - for line in coverage( [g1,g2] ): - if type( line ) is GenomicInterval: - out_file.write( "%s\n" % "\t".join( line.fields ) ) - else: - out_file.write( "%s\n" % line ) - except ParseError, exc: - out_file.close() - fail( "Invalid file format: %s" % str( exc ) ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) - if g2.skipped > 0: - print skipped( g2, filedesc=" of 2nd dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_intersect.py --- a/tools/new_operations/gops_intersect.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ -#!/usr/bin/env python -""" -Find regions of first interval file that overlap regions in a second interval file. -Interval files can either be BED or GFF format. 
- -usage: %prog interval_file_1 interval_file_2 out_file - -1, --cols1=N,N,N,N: Columns for start, end, strand in first file - -2, --cols2=N,N,N,N: Columns for start, end, strand in second file - -m, --mincols=N: Require this much overlap (default 1bp) - -p, --pieces: just print pieces of second set (after padding) - -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval - -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.intersect import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * -from galaxy.datatypes.util.gff_util import GFFFeature, GFFReaderWrapper, convert_bed_coords_to_gff - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - mincols = 1 - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) - if options.mincols: mincols = int( options.mincols ) - pieces = bool( options.pieces ) - in1_gff_format = bool( options.gff1 ) - in2_gff_format = bool( options.gff2 ) - in_fname, in2_fname, out_fname = args - except: - doc_optparse.exception() - - # Set readers to handle either GFF or default format. - if in1_gff_format: - in1_reader_wrapper = GFFReaderWrapper - else: - in1_reader_wrapper = NiceReaderWrapper - if in2_gff_format: - in2_reader_wrapper = GFFReaderWrapper - else: - in2_reader_wrapper = NiceReaderWrapper - - g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - if in1_gff_format: - # Intersect requires coordinates in BED format. - g1.convert_to_bed_coord=True - g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - if in2_gff_format: - # Intersect requires coordinates in BED format. - g2.convert_to_bed_coord=True - - out_file = open( out_fname, "w" ) - try: - for feature in intersect( [g1,g2], pieces=pieces, mincols=mincols ): - if isinstance( feature, GFFFeature ): - # Convert back to GFF coordinates since reader converted automatically. - convert_bed_coords_to_gff( feature ) - for interval in feature.intervals: - out_file.write( "%s\n" % "\t".join( interval.fields ) ) - elif isinstance( feature, GenomicInterval ): - out_file.write( "%s\n" % "\t".join( feature.fields ) ) - else: - out_file.write( "%s\n" % feature ) - except ParseError, e: - out_file.close() - fail( "Invalid file format: %s" % str( e ) ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) - if g2.skipped > 0: - print skipped( g2, filedesc=" of 2nd dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_join.py --- a/tools/new_operations/gops_join.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ -#!/usr/bin/env python -""" -Join two sets of intervals using their overlap as the key. 
- -usage: %prog bed_file_1 bed_file_2 out_file - -1, --cols1=N,N,N,N: Columns for start, end, strand in first file - -2, --cols2=N,N,N,N: Columns for start, end, strand in second file - -m, --mincols=N: Require this much overlap (default 1bp) - -f, --fill=N: none, right, left, both -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.join import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - mincols = 1 - upstream_pad = 0 - downstream_pad = 0 - leftfill = False - rightfill = False - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) - if options.mincols: mincols = int( options.mincols ) - if options.fill: - if options.fill == "both": - rightfill = leftfill = True - else: - rightfill = options.fill == "right" - leftfill = options.fill == "left" - in_fname, in2_fname, out_fname = args - except: - doc_optparse.exception() - - g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - - out_file = open( out_fname, "w" ) - - try: - for outfields in join(g1, g2, mincols=mincols, rightfill=rightfill, leftfill=leftfill): - if type( outfields ) is list: - out_file.write( "%s\n" % "\t".join( outfields ) ) - else: - out_file.write( "%s\n" % outfields ) - except ParseError, exc: - out_file.close() - fail( "Invalid file format: %s" % str( exc ) ) - except MemoryError: - out_file.close() - fail( "Input datasets were too large to complete the join operation." ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) - if g2.skipped > 0: - print skipped( g2, filedesc=" of 2nd dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_merge.py --- a/tools/new_operations/gops_merge.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -#!/usr/bin/env python -""" -Merge overlaping regions. 
- -usage: %prog in_file out_file - -1, --cols1=N,N,N,N: Columns for start, end, strand in first file - -m, --mincols=N: Require this much overlap (default 1bp) - -3, --threecol: Output 3 column bed -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.merge import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - mincols = 1 - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - if options.mincols: mincols = int( options.mincols ) - in_fname, out_fname = args - except: - doc_optparse.exception() - - g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col = strand_col_1, - fix_strand=True ) - - out_file = open( out_fname, "w" ) - - try: - for line in merge(g1,mincols=mincols): - if options.threecol: - if type( line ) is GenomicInterval: - out_file.write( "%s\t%s\t%s\n" % ( line.chrom, str( line.startCol ), str( line.endCol ) ) ) - elif type( line ) is list: - out_file.write( "%s\t%s\t%s\n" % ( line[chr_col_1], str( line[start_col_1] ), str( line[end_col_1] ) ) ) - else: - out_file.write( "%s\n" % line ) - else: - if type( line ) is GenomicInterval: - out_file.write( "%s\n" % "\t".join( line.fields ) ) - elif type( line ) is list: - out_file.write( "%s\n" % "\t".join( line ) ) - else: - out_file.write( "%s\n" % line ) - except ParseError, exc: - out_file.close() - fail( "Invalid file format: %s" % str( exc ) ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_subtract.py --- a/tools/new_operations/gops_subtract.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,99 +0,0 @@ -#!/usr/bin/env python -""" -Find regions of first interval file that do not overlap regions in a second -interval file. Interval files can either be BED or GFF format. 
- -usage: %prog interval_file_1 interval_file_2 out_file - -1, --cols1=N,N,N,N: Columns for start, end, strand in first file - -2, --cols2=N,N,N,N: Columns for start, end, strand in second file - -m, --mincols=N: Require this much overlap (default 1bp) - -p, --pieces: just print pieces of second set (after padding) - -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval - -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.subtract import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * -from galaxy.datatypes.util.gff_util import GFFFeature, GFFReaderWrapper, convert_bed_coords_to_gff - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - mincols = 1 - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) - if options.mincols: mincols = int( options.mincols ) - pieces = bool( options.pieces ) - in1_gff_format = bool( options.gff1 ) - in2_gff_format = bool( options.gff2 ) - in_fname, in2_fname, out_fname = args - except: - doc_optparse.exception() - - # Set readers to handle either GFF or default format. - if in1_gff_format: - in1_reader_wrapper = GFFReaderWrapper - else: - in1_reader_wrapper = NiceReaderWrapper - if in2_gff_format: - in2_reader_wrapper = GFFReaderWrapper - else: - in2_reader_wrapper = NiceReaderWrapper - - g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - if in1_gff_format: - # Subtract requires coordinates in BED format. - g1.convert_to_bed_coord=True - - g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - if in2_gff_format: - # Subtract requires coordinates in BED format. - g2.convert_to_bed_coord=True - - out_file = open( out_fname, "w" ) - try: - for feature in subtract( [g1,g2], pieces=pieces, mincols=mincols ): - if isinstance( feature, GFFFeature ): - # Convert back to GFF coordinates since reader converted automatically. 
- convert_bed_coords_to_gff( feature ) - for interval in feature.intervals: - out_file.write( "%s\n" % "\t".join( interval.fields ) ) - elif isinstance( feature, GenomicInterval ): - out_file.write( "%s\n" % "\t".join( feature.fields ) ) - else: - out_file.write( "%s\n" % feature ) - except ParseError, exc: - out_file.close() - fail( "Invalid file format: %s" % str( exc ) ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc=" of 2nd dataset" ) - if g2.skipped > 0: - print skipped( g2, filedesc=" of 1st dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/intersect.xml --- a/tools/new_operations/intersect.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,137 +0,0 @@ - - the intervals of two datasets - gops_intersect.py - $input1 $input2 $output - - #if isinstance( $input1.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -1 1,4,5,7 --gff1 - #else: - -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} - #end if - - #if isinstance( $input2.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -2 1,4,5,7 --gff2 - #else: - -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} - #end if - - -m $min $returntype - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -**Syntax** - -- **Where overlap is at least** sets the minimum length (in base pairs) of overlap between elements of the two datasets -- **Overlapping Intervals** returns entire intervals from the first dataset that overlap the second dataset. The returned intervals are completely unchanged, and this option only filters out intervals that do not overlap with the second dataset. -- **Overlapping pieces of Intervals** returns intervals that indicate the exact base pair overlap between the first dataset and the second dataset. The intervals returned are from the first dataset, and all fields besides start and end are guaranteed to remain unchanged. - ------ - -**Example** - -.. image:: ./static/operation_icons/gops_intersect.gif - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/join.xml --- a/tools/new_operations/join.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ - - the intervals of two datasets side-by-side - gops_join.py $input1 $input2 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} -m $min -f $fill - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. 
class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -**Syntax** - -- **Where overlap** specifies the minimum overlap between intervals that allows them to be joined. -- **Return only records that are joined** returns only the records of the first dataset that join to a record in the second dataset. This is analogous to an INNER JOIN. -- **Return all records of first dataset (fill null with ".")** returns all intervals of the first dataset, and any intervals that do not join an interval from the second dataset are filled in with a period(.). This is analogous to a LEFT JOIN. -- **Return all records of second dataset (fill null with ".")** returns all intervals of the second dataset, and any intervals that do not join an interval from the first dataset are filled in with a period(.). **Note that this may produce an invalid interval file, since a period(.) is not a valid chrom, start, end or strand.** -- **Return all records of both datasets (fill nulls with ".")** returns all records from both datasets, and fills on either the right or left with periods. **Note that this may produce an invalid interval file, since a period(.) is not a valid chrom, start, end or strand.** - ------ - -**Example** - -If **First dataset** is:: - - chr1 10 100 Query1.1 - chr1 500 1000 Query1.2 - chr1 1100 1250 Query1.3 - -and **Second dataset** is:: - - chr1 20 80 Query2.1 - chr1 2000 2204 Query2.2 - chr1 2500 3000 Query2.3 - - -The four return options will generate: - - -- **Return only records that are joined**:: - - chr1 10 100 Query1.1 chr1 20 80 Query2.1 - -- **Return all records of first dataset**:: - - chr1 10 100 Query1.1 chr1 20 80 Query2.1 - chr1 500 1000 Query1.2 . . . . - chr1 1100 1250 Query1.3 . . . . - -- **Return all records of second dataset**:: - - chr1 10 100 Query1.1 chr1 20 80 Query2.1 - . . . . chr1 2000 2204 Query2.2 - . . . . chr1 2500 3000 Query2.3 - -- **Return all records of both datasets**:: - - chr1 10 100 Query1.1 chr1 20 80 Query2.1 - chr1 500 1000 Query1.2 . . . . - chr1 1100 1250 Query1.3 . . . . - . . . . chr1 2000 2204 Query2.2 - . . . . chr1 2500 3000 Query2.3 - - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/merge.xml --- a/tools/new_operations/merge.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - - the overlapping intervals of a dataset - gops_merge.py $input1 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} $returntype - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -This operation merges all overlapping intervals into single intervals. - -**Example** - -.. 
image:: ./static/operation_icons/gops_merge.gif - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/operation_filter.py --- a/tools/new_operations/operation_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ -# runs after the job (and after the default post-filter) -import os -from galaxy import eggs -from galaxy import jobs -from galaxy.tools.parameters import DataToolParameter -# Older py compatibility -try: - set() -except: - from sets import Set as set - -#def exec_before_process(app, inp_data, out_data, param_dict, tool=None): -# """Sets the name of the data""" -# dbkeys = sets.Set( [data.dbkey for data in inp_data.values() ] ) -# if len(dbkeys) != 1: -# raise Exception, 'Both Queries must be from the same genome build
' - -def validate_input( trans, error_map, param_values, page_param_map ): - dbkeys = set() - data_param_names = set() - data_params = 0 - for name, param in page_param_map.iteritems(): - if isinstance( param, DataToolParameter ): - # for each dataset parameter - if param_values.get(name, None) != None: - dbkeys.add( param_values[name].dbkey ) - data_params += 1 - # check meta data - try: - param = param_values[name] - if isinstance( param.datatype, trans.app.datatypes_registry.get_datatype_by_extension( 'gff' ).__class__ ): - # TODO: currently cannot validate GFF inputs b/c they are not derived from interval. - pass - else: # Validate interval datatype. - startCol = int( param.metadata.startCol ) - endCol = int( param.metadata.endCol ) - chromCol = int( param.metadata.chromCol ) - if param.metadata.strandCol is not None: - strandCol = int ( param.metadata.strandCol ) - else: - strandCol = 0 - except: - error_msg = "The attributes of this dataset are not properly set. " + \ - "Click the pencil icon in the history item to set the chrom, start, end and strand columns." - error_map[name] = error_msg - data_param_names.add( name ) - if len( dbkeys ) > 1: - for name in data_param_names: - error_map[name] = "All datasets must belong to same genomic build, " \ - "this dataset is linked to build '%s'" % param_values[name].dbkey - if data_params != len(data_param_names): - for name in data_param_names: - error_map[name] = "A dataset of the appropriate type is required" - -# Commented out by INS, 5/30/2007. What is the PURPOSE of this? -def exec_after_process(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None): - """Verify the output data after each run""" - items = out_data.items() - - for name, data in items: - try: - if stderr and len( stderr ) > 0: - raise Exception( stderr ) - - except Exception, exc: - data.blurb = jobs.JOB_ERROR - data.state = jobs.JOB_ERROR - -## def exec_after_process(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None): -## pass - - -def exec_after_merge(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None): - exec_after_process( - app, inp_data, out_data, param_dict, tool=tool, stdout=stdout, stderr=stderr) - - # strip strand column if clusters were merged - items = out_data.items() - for name, data in items: - if param_dict['returntype'] == True: - data.metadata.chromCol = 1 - data.metadata.startCol = 2 - data.metadata.endCol = 3 - # merge always clobbers strand - data.metadata.strandCol = None - - -def exec_after_cluster(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None): - exec_after_process( - app, inp_data, out_data, param_dict, tool=tool, stdout=stdout, stderr=stderr) - - # strip strand column if clusters were merged - if param_dict["returntype"] == '1': - items = out_data.items() - for name, data in items: - data.metadata.strandCol = None diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/subtract.xml --- a/tools/new_operations/subtract.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,118 +0,0 @@ - - the intervals of two datasets - gops_subtract.py - $input1 $input2 $output - - #if isinstance( $input1.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -1 1,4,5,7 --gff1 - #else: - -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} - #end if - - #if isinstance( $input2.datatype, 
$__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -2 1,4,5,7 --gff2 - #else: - -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} - #end if - - -m $min $returntype - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -**Syntax** - -- **Where overlap is at least** sets the minimum length (in base pairs) of overlap between elements of the two datasets. -- **Intervals with no overlap** returns entire intervals from the first dataset that do not overlap the second dataset. The returned intervals are completely unchanged, and this option only filters out intervals that overlap with the second dataset. -- **Non-overlapping pieces of intervals** returns intervals from the first dataset that have the intervals from the second dataset removed. Any overlapping base pairs are removed from the range of the interval. All fields besides start and end are guaranteed to remain unchanged. - ------ - -**Example** - -.. image:: ./static/operation_icons/gops_subtract.gif - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/subtract_query.py --- a/tools/new_operations/subtract_query.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ -#!/usr/bin/env python -# Greg Von Kuster - -""" -Subtract an entire query from another query -usage: %prog in_file_1 in_file_2 begin_col end_col output -""" -import sys, re -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -# Older py compatibility -try: - set() -except: - from sets import Set as set - -assert sys.version_info[:2] >= ( 2, 4 ) - -def get_lines(fname, begin_col='', end_col=''): - lines = set([]) - i = 0 - for i, line in enumerate(file(fname)): - line = line.rstrip('\r\n') - if line and not line.startswith('#'): - if begin_col and end_col: - """Both begin_col and end_col must be integers at this point.""" - try: - line = line.split('\t') - line = '\t'.join([line[j] for j in range(begin_col-1, end_col)]) - lines.add( line ) - except: pass - else: - lines.add( line ) - if i: return (i+1, lines) - else: return (i, lines) - -def main(): - - # Parsing Command Line here - options, args = doc_optparse.parse( __doc__ ) - - try: - inp1_file, inp2_file, begin_col, end_col, out_file = args - except: - doc_optparse.exception() - - begin_col = begin_col.strip() - end_col = end_col.strip() - - if begin_col != 'None' or end_col != 'None': - """ - The user selected columns for restriction. We'll allow default - values for both begin_col and end_col as long as the user selected - at least one of them for restriction. 
- """ - if begin_col == 'None': - begin_col = end_col - elif end_col == 'None': - end_col = begin_col - begin_col = int(begin_col) - end_col = int(end_col) - """Make sure that begin_col <= end_col (switch if not)""" - if begin_col > end_col: - tmp_col = end_col - end_col = begin_col - begin_col = tmp_col - else: - begin_col = end_col = '' - - try: - fo = open(out_file,'w') - except: - print >> sys.stderr, "Unable to open output file" - sys.exit() - - """ - len1 is the number of lines in inp1_file - lines1 is the set of unique lines in inp1_file - diff1 is the number of duplicate lines removed from inp1_file - """ - len1, lines1 = get_lines(inp1_file, begin_col, end_col) - diff1 = len1 - len(lines1) - len2, lines2 = get_lines(inp2_file, begin_col, end_col) - - lines1.difference_update(lines2) - """lines1 is now the set of unique lines in inp1_file - the set of unique lines in inp2_file""" - - for line in lines1: - print >> fo, line - - fo.close() - - info_msg = 'Subtracted %d lines. ' %((len1 - diff1) - len(lines1)) - - if begin_col and end_col: - info_msg += 'Restricted to columns c' + str(begin_col) + ' thru c' + str(end_col) + '. ' - - if diff1 > 0: - info_msg += 'Eliminated %d duplicate/blank/comment/invalid lines from first query.' %diff1 - - print info_msg - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/subtract_query.xml --- a/tools/new_operations/subtract_query.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,110 +0,0 @@ - - from another dataset - subtract_query.py $input1 $input2 $begin_col $end_col $output - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** This tool complements the tool in the **Operate on Genomic Intervals** tool set which subtracts the intervals of two datasets. - - ------ - -**Syntax** - -This tool subtracts an entire dataset from another dataset. - -- Any text format is valid. -- If both dataset formats are tabular, you may restrict the subtraction to specific columns **contained in both datasets** and the resulting dataset will include only the columns specified. -- The begin column must be less than or equal to the end column. If it is not, begin column is switched with end column. -- If begin column is specified but end column is not, end column will default to begin_column (and vice versa). -- All blank and comment lines are skipped and not included in the resulting dataset (comment lines are lines beginning with a # character). -- Duplicate lines are eliminated from both dataset prior to subtraction. If any duplicate lines were eliminated from the first dataset, the number is displayed in the resulting history item. - ------ - -**Example** - -If this is the **First dataset**:: - - chr1 4225 19670 - chr10 6 8 - chr1 24417 24420 - chr6_hla_hap2 0 150 - chr2 1 5 - chr10 2 10 - chr1 30 55 - chrY 1 20 - chr1 1225979 42287290 - chr10 7 8 - -and this is the **Second dataset**:: - - chr1 4225 19670 - chr10 6 8 - chr1 24417 24420 - chr6_hla_hap2 0 150 - chr2 1 5 - chr1 30 55 - chrY 1 20 - chr1 1225979 42287290 - -Subtracting the **Second dataset** from the **First dataset** (including all columns) will yield:: - - chr10 7 8 - chr10 2 10 - -Conversely, subtracting the **First dataset** from the **Second dataset** (including all columns) will result in an empty dataset. 
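Internally the subtraction is a set difference over unique (optionally column-restricted) lines, as implemented by subtract_query.py above. A minimal Python sketch of the same idea, assuming two tab-delimited files whose names here are placeholders::

    def line_set(path, begin_col=None, end_col=None):
        # Collect unique, non-blank, non-comment lines; optionally keep
        # only columns begin_col..end_col (1-based, inclusive).
        lines = set()
        for line in open(path):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            if begin_col and end_col:
                fields = line.split('\t')
                line = '\t'.join(fields[begin_col - 1:end_col])
            lines.add(line)
        return lines

    # First dataset minus second dataset, restricted to columns c1-c2:
    for line in line_set('first.tabular', 1, 2) - line_set('second.tabular', 1, 2):
        print line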
- -Subtracting the **Second dataset** from the **First dataset** (restricting to columns c1 and c2) will yield:: - - chr10 7 - chr10 2 - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/tables_arithmetic_operations.pl --- a/tools/new_operations/tables_arithmetic_operations.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,117 +0,0 @@ -# A program to implement arithmetic operations on tabular files data. The program takes three inputs: -# The first input is a TABULAR format file containing numbers only. -# The second input is a TABULAR format file containing numbers only. -# The two files must have the same number of columns and the same number of rows -# The third input is an arithmetic operation: +, -, *, or / for addition, subtraction, multiplication, or division, respectively -# The output file is a TABULAR format file containing the result of implementing the arithmetic operation on both input files. -# The output file has the same number of columns and the same number of rows as each of the two input files. -# Note: in case of division, none of the values in the second input file could be 0. - -use strict; -use warnings; - -#variables to handle information of the first input tabular file -my $lineData1 = ""; -my @lineDataArray1 = (); -my $lineArraySize = 0; -my $lineCounter1 = 0; - -#variables to handle information of the second input tabular file -my $lineData2= ""; -my @lineDataArray2 = (); -my $lineCounter2 = 0; - -my $result = 0; - -# check to make sure having the correct number of arguments -my $usage = "usage: tables_arithmetic_operations.pl [TABULAR.in] [TABULAR.in] [ArithmeticOperation] [TABULAR.out] \n"; -die $usage unless @ARGV == 4; - -#variables to store the names of input and output files -my $inputTabularFile1 = $ARGV[0]; -my $inputTabularFile2 = $ARGV[1]; -my $arithmeticOperation = $ARGV[2]; -my $outputTabularFile = $ARGV[3]; - -#open the input and output files -open (INPUT1, "<", $inputTabularFile1) || die("Could not open file $inputTabularFile1 \n"); -open (INPUT2, "<", $inputTabularFile2) || die("Could not open file $inputTabularFile2 \n"); -open (OUTPUT, ">", $outputTabularFile) || die("Could not open file $outputTabularFile \n"); - -#store the first input file in the array @motifsFrequencyData1 -my @tabularData1 = ; - -#store the second input file in the array @motifsFrequencyData2 -my @tabularData2 = ; - -#reset the $lineCounter1 to 0 -$lineCounter1 = 0; - -#iterated through the lines of the first input file -INDEL1: -foreach $lineData1 (@tabularData1){ - chomp ($lineData1); - $lineCounter1++; - - #reset the $lineCounter2 to 0 - $lineCounter2 = 0; - - #iterated through the lines of the second input file - foreach $lineData2 (@tabularData2){ - chomp ($lineData2); - $lineCounter2++; - - #check if the two motifs are the same in the two input files - if ($lineCounter1 == $lineCounter2){ - - @lineDataArray1 = split(/\t/, $lineData1); - @lineDataArray2 = split(/\t/, $lineData2); - - $lineArraySize = @lineDataArray1; - - for (my $index = 0; $index < $lineArraySize; $index++){ - - if ($arithmeticOperation eq "Addition"){ - #compute the additin of both values - $result = $lineDataArray1[$index] + $lineDataArray2[$index]; - } - - if ($arithmeticOperation eq "Subtraction"){ - #compute the subtraction of both values - $result = $lineDataArray1[$index] - $lineDataArray2[$index]; - } - - if ($arithmeticOperation eq "Multiplication"){ - #compute the multiplication of both values - $result = 
$lineDataArray1[$index] * $lineDataArray2[$index]; - } - - if ($arithmeticOperation eq "Division"){ - - #check if the denominator is 0 - if ($lineDataArray2[$index] != 0){ - #compute the division of both values - $result = $lineDataArray1[$index] / $lineDataArray2[$index]; - } - else{ - die("A denominator could not be zero \n"); - } - } - - #store the result in the output file - if ($index < $lineArraySize - 1){ - print OUTPUT $result . "\t"; - } - else{ - print OUTPUT $result . "\n"; - } - } - next INDEL1; - } - } -} - -#close the input and output files -close(OUTPUT); -close(INPUT2); -close(INPUT1); \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/tables_arithmetic_operations.xml --- a/tools/new_operations/tables_arithmetic_operations.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,105 +0,0 @@ - - on tables - - - tables_arithmetic_operations.pl $inputFile1 $inputFile2 $inputArithmeticOperation3 $outputFile1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -This program implements arithmetic operations on tabular files data. The program takes three inputs: - -- The first input is a TABULAR format file containing numbers only. -- The second input is a TABULAR format file containing numbers only. -- The third input is an arithmetic operation: +, -, x, or / for addition, subtraction, multiplication, or division, respectively. -- The output file is a TABULAR format file containing the result of implementing the arithmetic operation on both input files. - - -Notes: - -- The two files must have the same number of columns and the same number of rows. -- The output file has the same number of columns and the same number of rows as each of the two input files. -- In case of division, none of the values in the second input file could be 0, otherwise the program will stop and report an error. - -**Example** - -Let us have the first input file as follows:: - - 5 4 0 - 10 11 12 - 1 3 1 - 1 2 1 - 2 0 4 - -And the second input file as follows:: - - 5 4 4 - 2 5 8 - 1 2 1 - 3 2 5 - 2 4 4 - -Running the program and choosing "Addition" as an arithmetic operation will give the following output:: - - 10 8 4 - 12 16 20 - 2 5 2 - 4 4 6 - 4 4 8 - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/bwa_solid2fastq_modified.pl --- a/tools/next_gen_conversion/bwa_solid2fastq_modified.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ -#!/usr/bin/perl -w - -# Author: lh3 -# Note: Ideally, this script should be written in C. It is a bit slow at present. - -use strict; -use warnings; -use Getopt::Std; - -my %opts; -my $version = '0.1.3'; -my $usage = qq{ -Usage: solid2fastq.pl - -Note: is the string showed in the `# Title:' line of a - ".csfasta" read file. Then F3.csfasta is read sequence - file and F3_QV.qual is the quality file. If - R3.csfasta is present, this script assumes reads are - paired; otherwise reads will be regarded as single-end. - - The read name will be :panel_x_y/[12] with `1' for R3 - tag and `2' for F3. Usually you may want to use short - to save diskspace. Long also causes troubles to maq. - -}; - -getopts('', \%opts); -die($usage) if (@ARGV != 7); -my ($is_paired,$outfile1,$outfile2,$f3reads,$f3qual,$r3reads,$r3qual) = @ARGV; -my (@fhr, @fhw); -my $fn = ''; -my @fn_suff = ($f3reads,$f3qual,$r3reads,$r3qual); -if ($is_paired eq "yes") { # paired end - for (0 .. 
3) { - $fn = $fn_suff[$_]; - $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); - open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); - } - open($fhw[0], "|gzip >$outfile2") || die; - open($fhw[1], "|gzip >$outfile1") || die; - my (@df, @dr); - @df = &read1(1); @dr = &read1(2); - while (@df && @dr) { - if ($df[0] eq $dr[0]) { # mate pair - print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1]; - @df = &read1(1); @dr = &read1(2); - } - } - close($fhr[$_]) for (0 .. $#fhr); - close($fhw[$_]) for (0 .. $#fhw); -} else { # single end - for (0 .. 1) { - my $fn = "$fn_suff[$_]"; - $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); - open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); - } - open($fhw[2], "|gzip >$outfile1") || die; - my @df; - while (@df = &read1(1, $fhr[0], $fhr[1])) { - print {$fhw[2]} $df[1]; - } - close($fhr[$_]) for (0 .. $#fhr); - close($fhw[2]); -} - -sub read1 { - my $i = shift(@_); - my $j = ($i-1)<<1; - my ($key, $seq); - my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]); - while (<$fhs>) { - my $t = <$fhq>; - if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) { - $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines - #print $key; - die(qq/** unmatched read name: '$_' != '$t'\n/) unless ($_ eq $t); - my $name = "$1_$2_$3/$i"; - $_ = substr(<$fhs>, 2); - tr/0123./ACGTN/; - my $s = $_; - $_ = <$fhq>; - s/^(\d+)\s*//; - s/(\d+)\s*/chr($1+33)/eg; - $seq = qq/\@$name\n$s+\n$_\n/; - last; - } - } - return defined($seq)? ($key, $seq) : (); -} diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/fastq_conversions.py --- a/tools/next_gen_conversion/fastq_conversions.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -#!/usr/bin/env python - -""" -Performs various conversions around Sanger FASTQ data - -usage: %prog [options] - -c, --command=c: Command to run - -i, --input=i: Input file to be converted - -o, --outputFastqsanger=o: FASTQ Sanger converted output file for sol2std - -s, --outputFastqsolexa=s: FASTQ Solexa converted output file - -f, --outputFasta=f: FASTA converted output file - -usage: %prog command input_file output_file -""" - -import os, sys, tempfile -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - - cmd = "fq_all2std.pl %s %s > %s" - if options.command == 'sol2std': - cmd = cmd % (options.command, options.input, options.outputFastqsanger) - elif options.command == 'std2sol': - cmd = cmd % (options.command, options.input, options.outputFastqsolexa) - elif options.command == 'fq2fa': - cmd = cmd % (options.command, options.input, options.outputFasta) - try: - os.system(cmd) - except Exception, eq: - stop_err("Error converting data format.\n" + str(eq)) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/fastq_conversions.xml --- a/tools/next_gen_conversion/fastq_conversions.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,133 +0,0 @@ - - converts between FASTQ data and other data formats - - fastq_conversions.py - --command=$conversionType.type - --input=$input - #if $conversionType.type == "sol2std": - --outputFastqsanger=$outputFastqsanger - #else: - --outputFastqsanger="None" - #end if - #if $conversionType.type == "std2sol": - 
--outputFastqsolexa=$outputFastqsolexa - #else: - --outputFastqsolexa="None" - #end if - #if $conversionType.type == "fq2fa": - --outputFasta=$outputFasta - #else: - --outputFasta="None" - #end if - - - - - - - - - - - - - - - - - - - - - - conversionType['type'] == 'sol2std' - - - conversionType['type'] == 'std2sol' - - - conversionType['type'] == 'fq2fa' - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool offers several conversions options relating to the FASTQ format. - ------ - -**Examples** - -- Converting the Solexa/Illumina FASTQ data:: - - @081017-and-081020:1:1:1715:1759 - GGACTCAGATAGTAATCCACGCTCCTTTAAAATATC - + - II#IIIIIII$5+.(9IIIIIII$%*$G$A31I&&B - -- will produce the following Sanger FASTQ data:: - - @081017-and-081020:1:1:1715:1759 - GGACTCAGATAGTAATCCACGCTCCTTTAAAATATC - + - ++!+++++++!!!!!"+++++++!!!!)!%!!+!!%! - -- Converting standard Sanger FASTQ:: - - @1831_573_1004/1 - AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG - + - ><C&&9952+C>5<.?<79,=42<292:<(9/-7 - @1831_573_1050/1 - TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT - + - ;@@17?@=>7??@A8?==@4A?A4)&+.'&+'1, - -- will produce the following Solexa/Illumina FASTQ data:: - - @1831_573_1004/1 - AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG - + - ][bEEXXTQJb]T[M^[VXK\SQ[QXQY[GXNLV - @1831_573_1050/1 - TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT - + - Z__PV^_\]V^^_`W^\\_S`^`SHEJMFEJFPK - -- Converting the Sanger FASTQ data:: - - @1831_573_1004/1 - AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG - + - ><C&&9952+C>5<.?<79,=42<292:<(9/-7 - @1831_573_1050/1 - TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT - + - ;@@17?@=>7??@A8?==@4A?A4)&+.'&+'1, - -- will produce the following FASTA data:: - - >1831_573_1004/1 - AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG - >1831_573_1050/1 - TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT - - - diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/fastq_gen_conv.py --- a/tools/next_gen_conversion/fastq_gen_conv.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,172 +0,0 @@ -""" -Converts any type of FASTQ file to Sanger type and makes small adjustments if necessary. 
- -usage: %prog [options] - -i, --input=i: Input FASTQ candidate file - -r, --origType=r: Original type - -a, --allOrNot=a: Whether or not to check all blocks - -b, --blocks=b: Number of blocks to check - -o, --output=o: Output file - -usage: %prog input_file oroutput_file -""" - -import math, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def all_bases_valid(seq): - """Confirm that the sequence contains only bases""" - valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', 'N'] - for base in seq: - if base not in valid_bases: - return False - return True - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - orig_type = options.origType - if orig_type == 'sanger' and options.allOrNot == 'not': - max_blocks = int(options.blocks) - else: - max_blocks = -1 - fin = file(options.input, 'r') - fout = file(options.output, 'w') - range_min = 1000 - range_max = -5 - block_num = 0 - bad_blocks = 0 - base_len = -1 - line_count = 0 - lines = [] - line = fin.readline() - while line: - if line.strip() and max_blocks >= 0 and block_num > 0 and orig_type == 'sanger' and block_num >= max_blocks: - fout.write(line) - if line_count % 4 == 0: - block_num += 1 - line_count += 1 - elif line.strip(): - # the line that starts a block, with a name - if line_count % 4 == 0 and line.startswith('@'): - lines.append(line) - else: - # if we expect a sequence of bases - if line_count % 4 == 1 and all_bases_valid(line.strip()): - lines.append(line) - base_len = len(line.strip()) - # if we expect the second name line - elif line_count % 4 == 2 and line.startswith('+'): - lines.append(line) - # if we expect a sequence of qualities and it's the expected length - elif line_count % 4 == 3: - split_line = line.strip().split() - # decimal qualities - if len(split_line) == base_len: - # convert - phred_list = [] - for ch in split_line: - int_ch = int(ch) - if int_ch < range_min: - range_min = int_ch - if int_ch > range_max: - range_max = int_ch - if int_ch >= 0 and int_ch <= 93: - phred_list.append(chr(int_ch + 33)) - # make sure we haven't lost any quality values - if len(phred_list) == base_len: - # print first three lines - for l in lines: - fout.write(l) - # print converted quality line - fout.write(''.join(phred_list)) - # reset - lines = [] - base_len = -1 - # abort if so - else: - bad_blocks += 1 - lines = [] - base_len = -1 - # ascii qualities - elif len(split_line[0]) == base_len: - qualities = [] - # print converted quality line - if orig_type == 'illumina': - for c in line.strip(): - if ord(c) - 64 < range_min: - range_min = ord(c) - 64 - if ord(c) - 64 > range_max: - range_max = ord(c) - 64 - if ord(c) < 64 or ord(c) > 126: - bad_blocks += 1 - base_len = -1 - lines = [] - break - else: - qualities.append( chr( ord(c) - 31 ) ) - quals = ''.join(qualities) - elif orig_type == 'solexa': - for c in line.strip(): - if ord(c) - 64 < range_min: - range_min = ord(c) - 64 - if ord(c) - 64 > range_max: - range_max = ord(c) - 64 - if ord(c) < 59 or ord(c) > 126: - bad_blocks += 1 - base_len = -1 - lines = [] - break - else: - p = 10.0**( ( ord(c) - 64 ) / -10.0 ) / ( 1 + 10.0**( ( ord(c) - 64 ) / -10.0 ) ) - qualities.append( chr( int( -10.0*math.log10( p ) ) + 33 ) ) - quals = ''.join(qualities) - else: # 'sanger' - for c in line.strip(): - if ord(c) - 33 < range_min: - range_min = ord(c) - 33 - if ord(c) - 33 > range_max: - range_max = 
ord(c) - 33 - if ord(c) < 33 or ord(c) > 126: - bad_blocks += 1 - base_len = -1 - lines = [] - break - else: - qualities.append(c) - quals = ''.join(qualities) - # make sure we don't have bad qualities - if len(quals) == base_len: - # print first three lines - for l in lines: - fout.write(l) - # print out quality line - fout.write(quals+'\n') - # reset - lines = [] - base_len = -1 - else: - bad_blocks += 1 - base_len = -1 - lines = [] - # mark the successful end of a block - block_num += 1 - line_count += 1 - line = fin.readline() - fout.close() - fin.close() - if range_min != 1000 and range_min != -5: - outmsg = 'The range of quality values found were: %s to %s' % (range_min, range_max) - else: - outmsg = '' - if bad_blocks > 0: - outmsg += '\nThere were %s bad blocks skipped' % (bad_blocks) - sys.stdout.write(outmsg) - -if __name__=="__main__": __main__() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/fastq_gen_conv.xml --- a/tools/next_gen_conversion/fastq_gen_conv.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ - - converts any FASTQ to Sanger - - fastq_gen_conv.py - --input=$input - --origType=$origTypeChoice.origType - #if $origTypeChoice.origType == "sanger": - --allOrNot=$origTypeChoice.howManyBlocks.allOrNot - #if $origTypeChoice.howManyBlocks.allOrNot == "not": - --blocks=$origTypeChoice.howManyBlocks.blocks - #else: - --blocks="None" - #end if - #else: - --allOrNot="None" - --blocks="None" - #end if - --output=$output - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Galaxy pipeline for mapping of Illumina data requires data to be in fastq format with quality values conforming to so called "Sanger" format. Unfortunately there are many other types of fastq. Thus the main objective of this tool is to "groom" multiple types of fastq into Sanger-conforming fastq that can be used in downstream application such as mapping. - -.. class:: infomark - -**TIP**: If the input dataset is already in Sanger format the tool does not perform conversion. However validation (described below) is still performed. - ------ - -**Types of fastq datasets** - -A good description of fastq datasets can be found `here`__, while a description of Galaxy's fastq "logic" can be found `here`__. Because ranges of quality values within different types of fastq datasets overlap it very difficult to detect them automatically. This tool supports conversion of two commonly found types (Solexa/Illumina 1.0 and Illumina 1.3+) into fastq Sanger. - - .. __: http://en.wikipedia.org/wiki/FASTQ_format - .. __: http://wiki.g2.bx.psu.edu/Admin/NGS%20Local%20Setup - -.. class:: warningmark - -**NOTE** that there is also a type of fastq format where quality values are represented by a list of space-delimited integers (e.g., 40 40 20 15 -5 20 ...). This tool **does not** handle such fastq. If you have such a dataset, it needs to be converted into ASCII-type fastq (where quality values are encoded by characters) by "Numeric-to-ASCII" utility before it can accepted by this tool. - ------ - -**Validation** - -In addition to converting quality values to Sanger format the tool also checks the input dataset for consistency. Specifically, it performs these four checks: - -- skips empty lines -- checks that blocks are properly formed by making sure that: - - #. there are four lines per block - #. the first line starts with "@" - #. the third line starts with "+" - #. 
lengths of second line (sequences) and the fourth line (quality string) are identical - -- checks that quality values are within range for the chosen fastq format (e.g., the format provided by the user in **How do you think quality values are scaled?** drop down. - -To see exactly what the tool does you can take a look at its source code `here`__. - - .. __: http://bitbucket.org/galaxy/galaxy-central/src/tip/tools/next_gen_conversion/fastq_gen_conv.py - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/solid2fastq.py --- a/tools/next_gen_conversion/solid2fastq.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,214 +0,0 @@ -#!/usr/bin/env python - -import sys -import string -import optparse -import tempfile -import sqlite3 - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def solid2sanger( quality_string, min_qual = 0 ): - sanger = "" - quality_string = quality_string.rstrip( " " ) - for qv in quality_string.split(" "): - try: - if int( qv ) < 0: - qv = '0' - if int( qv ) < min_qual: - return False - break - sanger += chr( int( qv ) + 33 ) - except: - pass - return sanger - -def Translator(frm='', to='', delete='', keep=None): - allchars = string.maketrans('','') - if len(to) == 1: - to = to * len(frm) - trans = string.maketrans(frm, to) - if keep is not None: - delete = allchars.translate(allchars, keep.translate(allchars, delete)) - def callable(s): - return s.translate(trans, delete) - return callable - -def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ): - - # Reads from two files f_csfasta (reads) and f_qual (quality values) and produces output in three formats depending on out parameter, - # which can have three values: fastq, txt, and db - # fastq = fastq format - # txt = space delimited format with defline, reads, and qvs - # dp = dump data into sqlite3 db. - # IMPORTNAT! If out = db two optins must be provided: - # 1. f_out must be a db connection object initialized with sqlite3.connect() - # 2. table_name must be provided - - if out == 'db': - cursor = f_out.cursor() - sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name - cursor.execute(sql) - - lines = [] - line = " " - while line: - for f in [ f_reads, f_qual ]: - line = f.readline().rstrip( '\n\r' ) - while line.startswith( '#' ): - line = f.readline().rstrip( '\n\r' ) - lines.append( line ) - - - if lines[0].startswith( '>' ) and lines[1].startswith( '>' ): - - if lines[0] != lines[1]: - stop_err('Files reads and quality score files are out of sync and likely corrupted. 
Please, check your input data') - - defline = lines[0][1:] - if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ): - defline = defline[ : len( defline )-3 ] - - elif ( not lines[0].startswith( '>' ) and not lines[1].startswith( '>' ) and len( lines[0] ) > 0 and len( lines[1] ) > 0 ): - - if trim_first_base: - lines[0] = lines[0][1:] - if double_encode: - de = Translator(frm="0123.", to="ACGTN") - lines[0] = de(lines[0]) - qual = solid2sanger( lines[1], int( min_qual ) ) - if qual: - if out == 'fastq': - f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) ) - if out == 'txt': - f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) ) - if out == 'db': - cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) ) - lines = [] - -def main(): - - usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]" - parser = optparse.OptionParser(usage=usage) - - - parser.add_option( - '--fr','--f_reads', - metavar="F3_CSFASTA_FILE", - dest='fr', - help='Name of F3 file with color space reads') - - parser.add_option( - '--fq','--f_qual', - metavar="F3_QUAL_FILE", - dest='fq', - help='Name of F3 file with color quality values') - - parser.add_option( - '--fout','--f3_fastq_output', - metavar="F3_OUTPUT", - dest='fout', - help='Name for F3 output file') - - parser.add_option( - '--rr','--r_reads', - metavar="R3_CSFASTA_FILE", - dest='rr', - default = False, - help='Name of R3 file with color space reads') - - parser.add_option( - '--rq','--r_qual', - metavar="R3_QUAL_FILE", - dest='rq', - default = False, - help='Name of R3 file with color quality values') - - parser.add_option( - '--rout', - metavar="R3_OUTPUT", - dest='rout', - help='Name for F3 output file') - - parser.add_option( - '-q','--min_qual', - dest='min_qual', - default = '-1000', - help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000') - - parser.add_option( - '-t','--trim_name', - dest='trim_name', - action='store_true', - default = False, - help='Trim _R3 and _F3 off read names. Default is False') - - parser.add_option( - '-f','--trim_first_base', - dest='trim_first_base', - action='store_true', - default = False, - help='Remove the first base of reads in color-space. Default is False') - - parser.add_option( - '-d','--double_encode', - dest='de', - action='store_true', - default = False, - help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False') - - options, args = parser.parse_args() - - if not ( options.fout and options.fr and options.fq ): - parser.error(""" - One or more of the three required paremetrs is missing: - (1) --fr F3.csfasta file - (2) --fq F3.qual file - (3) --fout name of output file - Use --help for more info - """) - - fr = open ( options.fr , 'r' ) - fq = open ( options.fq , 'r' ) - f_out = open ( options.fout , 'w' ) - - if options.rr and options.rq: - rr = open ( options.rr , 'r' ) - rq = open ( options.rq , 'r' ) - if not options.rout: - parser.error("Provide the name for f3 output using --rout option. 
Use --help for more info") - r_out = open ( options.rout, 'w' ) - - db = tempfile.NamedTemporaryFile() - - try: - con = sqlite3.connect(db.name) - cur = con.cursor() - except: - stop_err('Cannot connect to %s\n') % db.name - - - merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" ) - merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" ) - cur.execute('create index f3_name on f3( name )') - cur.execute('create index r3_name on r3( name )') - - cur.execute('select * from f3,r3 where f3.name = r3.name') - for item in cur: - f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) ) - r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) ) - - - else: - merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual ) - - - - f_out.close() - -if __name__ == "__main__": - main() - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/solid2fastq.xml --- a/tools/next_gen_conversion/solid2fastq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,160 +0,0 @@ - - SOLiD output to fastq - - #if $is_run.paired == "no" #solid2fastq.py --fr=$input1 --fq=$input2 --fout=$out_file1 -q $qual $trim_name $trim_first_base $double_encode - #elif $is_run.paired == "yes" #solid2fastq.py --fr=$input1 --fq=$input2 --fout=$out_file1 --rr=$input3 --rq=$input4 --rout=$out_file2 -q $qual $trim_name $trim_first_base $double_encode - #end if# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - is_run['paired'] == 'yes' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Converts output of SOLiD instrument (versions 3.5 and earlier) to fastq format suitable for bowtie, bwa, and PerM mappers. - --------- - -**Input datasets** - -Below are examples of forward (F3) reads and quality scores: - -Reads:: - - >1831_573_1004_F3 - T00030133312212111300011021310132222 - >1831_573_1567_F3 - T03330322230322112131010221102122113 - -Quality scores:: - - >1831_573_1004_F3 - 4 29 34 34 32 32 24 24 20 17 10 34 29 20 34 13 30 34 22 24 11 28 19 17 34 17 24 17 25 34 7 24 14 12 22 - >1831_573_1567_F3 - 8 26 31 31 16 22 30 31 28 29 22 30 30 31 32 23 30 28 28 31 19 32 30 32 19 8 32 10 13 6 32 10 6 16 11 - - -**Mate pairs** - -If your data is from a mate-paired run, you will have additional read and quality datasets that will look similar to the ones above with one exception: the names of reads will be ending with "_R3". -In this case choose **Yes** from the *Is this a mate-pair run?* drop down and you will be able to select R reads. When processing mate pairs this tool generates two output files: one for F3 reads and the other for R3 reads. -The reads are guaranteed to be paired -- mated reads will be in the same position in F3 and R3 fastq file. However, because pairing is verified it may take a while to process an entire SOLiD run (several hours). - ------- - -**Explanation of parameters** - -**Remove reads containing color qualities below this value** - any read that contains as least one color call with quality lower than the specified value **will not** be reported. - -**Trim trailing "_F3" and "_R3"?** - does just that. 
Not necessary for bowtie. Required for BWA. - -**Trim first base?** - SOLiD reads contain an adapter base such as the first T in this read:: - - >1831_573_1004_F3 - T00030133312212111300011021310132222 - -this option removes this base leaving only color calls. Not necessary for bowtie. Required for BWA. - -**Double encode?** - converts color calls (0123.) to pseudo-nucleotides (ACGTN). Not necessary for bowtie. Required for BWA. - ------- - -**Examples of output** - -When all parameters are left "as-is" you will get this (using reads and qualities shown above):: - - @1831_573_1004 - T00030133312212111300011021310132222 - + - %>CCAA9952+C>5C.?C79,=42C292:C(9/-7 - @1831_573_1004 - T03330322230322112131010221102122113 - + - );@@17?@=>7??@A8?==@4A?A4)A+.'A+'1, - -Setting *Trim first base from reads* to **Yes** will produce this:: - - @1831_573_1004 - 00030133312212111300011021310132222 - + - %>CCAA9952+C>5C.?C79,=42C292:C(9/-7 - @1831_573_1004 - 03330322230322112131010221102122113 - + - );@@17?@=>7??@A8?==@4A?A4)A+.'A+'1, - -Finally, setting *Double encode* to **Yes** will yield:: - - @1831_573_1004 - TAAATACTTTCGGCGCCCTAAACCAGCTCACTGGGG - + - %>CCAA9952+C>5C.?C79,=42C292:C(9/-7 - @1831_573_1004 - TATTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT - + - );@@17?@=>7??@A8?==@4A?A4)A+.'A+'1, - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/solid_to_fastq.py --- a/tools/next_gen_conversion/solid_to_fastq.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ -#!/usr/bin/env python - -""" -Converts SOLiD data to Sanger FASTQ format. - -usage: %prog [options] - -i, --input1=i: Forward reads file - -q, --input2=q: Forward qual file - -I, --input3=I: Reverse reads file - -Q, --input4=Q: Reverse qual file - -o, --output1=o: Forward output - -r, --output2=r: Reverse output - -usage: %prog forward_reads_file forwards_qual_file reverse_reads_file(or_None) reverse_qual_file(or_None) output_file ouptut_id output_dir -""" - -import os, sys, tempfile -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def replaceNeg1(fin, fout): - line = fin.readline() - while line.strip(): - fout.write(line.replace('-1', '1')) - line = fin.readline() - fout.seek(0) - return fout - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - # common temp file setup - tmpf = tempfile.NamedTemporaryFile() #forward reads - tmpqf = tempfile.NamedTemporaryFile() - tmpqf = replaceNeg1(file(options.input2,'r'), tmpqf) - # if paired-end data (have reverse input files) - if options.input3 != "None" and options.input4 != "None": - tmpr = tempfile.NamedTemporaryFile() #reverse reads - # replace the -1 in the qualities file - tmpqr = tempfile.NamedTemporaryFile() - tmpqr = replaceNeg1(file(options.input4,'r'), tmpqr) - cmd1 = "%s/bwa_solid2fastq_modified.pl 'yes' %s %s %s %s %s %s 2>&1" %(os.path.split(sys.argv[0])[0], tmpf.name, tmpr.name, options.input1, tmpqf.name, options.input3, tmpqr.name) - try: - os.system(cmd1) - os.system('gunzip -c %s >> %s' %(tmpf.name,options.output1)) - os.system('gunzip -c %s >> %s' %(tmpr.name,options.output2)) - except Exception, eq: - stop_err("Error converting data to fastq format.\n" + str(eq)) - tmpr.close() - tmpqr.close() - # if single-end data - else: - cmd1 = "%s/bwa_solid2fastq_modified.pl 'no' %s %s %s %s %s %s 2>&1" % (os.path.split(sys.argv[0])[0], tmpf.name, None, 
options.input1, tmpqf.name, None, None) - try: - os.system(cmd1) - os.system('gunzip -c %s >> %s' % (tmpf.name, options.output1)) - except Exception, eq: - stop_err("Error converting data to fastq format.\n" + str(eq)) - tmpqf.close() - tmpf.close() - sys.stdout.write('converted SOLiD data') - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/solid_to_fastq.xml --- a/tools/next_gen_conversion/solid_to_fastq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ - - converts SOLiD data to FASTQ data - - solid_to_fastq.py - --input1=$input1 - --input2=$input2 - #if $paired.pairedSingle == "single": - --input3="None" - --input4="None" - #else: - --input3=$input3 - --input4=$input4 - #end if - --output1=$output1 - #if $paired.pairedSingle == "single": - --output2="None" - #else: - --output2=$output2 - #end if - - - - - - - - - - - - - - - - - - - - - - - - paired['pairedSingle'] == 'paired' - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool takes reads and quality files and converts them to FASTQ data ( Sanger variant ). Any -1 qualities are converted to 1 before being converted to FASTQ. Note that it also converts sequences to base pairs. - ------ - -**Example** - -- Converting the following sequences:: - - >1831_573_1004_F3 - T00030133312212111300011021310132222 - >1831_573_1567_F3 - T03330322230322112131010221102122113 - -- and quality scores:: - - >1831_573_1004_F3 - 4 29 34 34 32 32 24 24 20 17 10 34 29 20 34 13 30 34 22 24 11 28 19 17 34 17 24 17 25 34 7 24 14 12 22 - >1831_573_1567_F3 - 8 26 31 31 16 22 30 31 28 29 22 30 30 31 32 23 30 28 28 31 19 32 30 32 19 8 32 10 13 6 32 10 6 16 11 - -- will produce the following Sanger FASTQ data:: - - @1831_573_1004/1 - AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG - + - >CCAA9952+C>5C.?C79,=42C292:C(9/-7 - @1831_573_1567/1 - TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT - + - ;@@17?@=>7??@A8?==@4A?A4)A+.'A+'1, - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/cuffcompare_wrapper.py --- a/tools/ngs_rna/cuffcompare_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,131 +0,0 @@ -#!/usr/bin/env python - -import optparse, os, shutil, subprocess, sys, tempfile - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -# Copied from sam_to_bam.py: -def check_seq_file( dbkey, cached_seqs_pointer_file ): - seq_path = '' - for line in open( cached_seqs_pointer_file ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ) and line.startswith( 'index' ): - fields = line.split( '\t' ) - if len( fields ) < 3: - continue - if fields[1] == dbkey: - seq_path = fields[2].strip() - break - return seq_path - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - parser.add_option( '-r', dest='ref_annotation', help='An optional "reference" annotation GTF. Each sample is matched against this file, and sample isoforms are tagged as overlapping, matching, or novel where appropriate. See the refmap and tmap output file descriptions below.' ) - parser.add_option( '-R', action="store_true", dest='ignore_nonoverlap', help='If -r was specified, this option causes cuffcompare to ignore reference transcripts that are not overlapped by any transcript in one of cuff1.gtf,...,cuffN.gtf. 
Useful for ignoring annotated transcripts that are not present in your RNA-Seq samples and thus adjusting the "sensitivity" calculation in the accuracy report written in the transcripts accuracy file' ) - parser.add_option( '-s', dest='use_seq_data', action="store_true", help='Causes cuffcompare to look into for fasta files with the underlying genomic sequences (one file per contig) against which your reads were aligned for some optional classification functions. For example, Cufflinks transcripts consisting mostly of lower-case bases are classified as repeats. Note that must contain one fasta file per reference chromosome, and each file must be named after the chromosome, and have a .fa or .fasta extension.') - - # Wrapper / Galaxy options. - parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' ) - parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' ) - parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' ) - - # Outputs. - parser.add_option( '', '--combined-transcripts', dest='combined_transcripts' ) - - (options, args) = parser.parse_args() - - # output version # of tool - try: - tmp = tempfile.NamedTemporaryFile().name - tmp_stdout = open( tmp, 'wb' ) - proc = subprocess.Popen( args='cuffcompare 2>&1', shell=True, stdout=tmp_stdout ) - tmp_stdout.close() - returncode = proc.wait() - stdout = None - for line in open( tmp_stdout.name, 'rb' ): - if line.lower().find( 'cuffcompare v' ) >= 0: - stdout = line.strip() - break - if stdout: - sys.stdout.write( '%s\n' % stdout ) - else: - raise Exception - except: - sys.stdout.write( 'Could not determine Cuffcompare version\n' ) - - # Set/link to sequence file. - if options.use_seq_data: - cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' ) - if not os.path.exists( cached_seqs_pointer_file ): - stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file ) - # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa, - # and the equCab2.fa file will contain fasta sequences. - seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file ) - if options.ref_file != 'None': - # Create symbolic link to ref_file so that index will be created in working directory. - seq_path = "ref.fa" - os.symlink( options.ref_file, seq_path ) - - # Build command. - - # Base. - cmd = "cuffcompare -o cc_output " - - # Add options. - if options.ref_annotation: - cmd += " -r %s " % options.ref_annotation - if options.ignore_nonoverlap: - cmd += " -R " - if options.use_seq_data: - cmd += " -s %s " % seq_path - - # Add input files. - - # Need to symlink inputs so that output files are written to temp directory. - for i, arg in enumerate( args ): - input_file_name = "./input%i" % ( i+1 ) - os.symlink( arg, input_file_name ) - cmd += "%s " % input_file_name - - # Debugging. - print cmd - - # Run command. - try: - tmp_name = tempfile.NamedTemporaryFile( dir="." ).name - tmp_stderr = open( tmp_name, 'wb' ) - proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() ) - returncode = proc.wait() - tmp_stderr.close() - - # Get stderr, allowing for case where it's very large. - tmp_stderr = open( tmp_name, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stderr.close() - - # Error checking. 
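# Note: cuffcompare reports failure only through its exit status, so a
# non-zero return code is turned into an exception carrying the captured
# stderr; stop_err() then surfaces that message to Galaxy.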
- if returncode != 0: - raise Exception, stderr - - # Copy outputs. - shutil.copyfile( "cc_output.combined.gtf" , options.combined_transcripts ) - - # check that there are results in the output file - cc_output_fname = "cc_output.stats" - if len( open( cc_output_fname, 'rb' ).read().strip() ) == 0: - raise Exception, 'The main output file is empty, there may be an error with your input file or settings.' - except Exception, e: - stop_err( 'Error running cuffcompare. ' + str( e ) ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/cuffcompare_wrapper.xml --- a/tools/ngs_rna/cuffcompare_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,217 +0,0 @@ - - - compare assembled transcripts to a reference annotation and track Cufflinks transcripts across multiple experiments - - cufflinks - - - cuffcompare_wrapper.py - - ## Use annotation reference? - #if $annotation.use_ref_annotation == "Yes": - -r $annotation.reference_annotation - #if $annotation.ignore_nonoverlapping_reference: - -R - #end if - #end if - - ## Use sequence data? - #if $seq_data.use_seq_data == "Yes": - -s - #if $seq_data.seq_source.index_source == "history": - --ref_file=$seq_data.seq_source.ref_file - #else: - --ref_file="None" - #end if - --dbkey=${first_input.metadata.dbkey} - --index_dir=${GALAXY_DATA_INDEX_DIR} - #end if - - ## Outputs. - --combined-transcripts=${transcripts_combined} - - ## Inputs. - ${first_input} - #for $input_file in $input_files: - ${input_file.additional_input} - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - len( input_files ) > 0 - - - len( input_files ) > 0 - - - len( input_files ) > 0 - - - - - - - - - - - - - - - - - - - - - - - - - -**Cuffcompare Overview** - -Cuffcompare is part of Cufflinks_. Cuffcompare helps you: (a) compare your assembled transcripts to a reference annotation and (b) track Cufflinks transcripts across multiple experiments (e.g. across a time course). Please cite: Trapnell C, Williams BA, Pertea G, Mortazavi AM, Kwan G, van Baren MJ, Salzberg SL, Wold B, Pachter L. Transcript assembly and abundance estimation from RNA-Seq reveals thousands of new transcripts and switching among isoforms. Nature Biotechnology doi:10.1038/nbt.1621 - -.. _Cufflinks: http://cufflinks.cbcb.umd.edu/ - ------- - -**Know what you are doing** - -.. class:: warningmark - -There is no such thing (yet) as an automated gearshift in expression analysis. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. - -.. __: http://cufflinks.cbcb.umd.edu/manual.html#cuffcompare - ------- - -**Input format** - -Cuffcompare takes Cufflinks' GTF output as input, and optionally can take a "reference" annotation (such as from Ensembl_) - -.. _Ensembl: http://www.ensembl.org - ------- - -**Outputs** - -Cuffcompare produces the following output files: - -Transcripts Accuracy File: - -Cuffcompare reports various statistics related to the "accuracy" of the transcripts in each sample when compared to the reference annotation data. The typical gene finding measures of "sensitivity" and "specificity" (as defined in Burset, M., Guigó, R. : Evaluation of gene structure prediction programs (1996) Genomics, 34 (3), pp. 353-367. 
doi: 10.1006/geno.1996.0298) are calculated at various levels (nucleotide, exon, intron, transcript, gene) for each input file and reported in this file. The Sn and Sp columns show specificity and sensitivity values at each level, while the fSn and fSp columns are "fuzzy" variants of these same accuracy calculations, allowing for a very small variation in exon boundaries to still be counted as a "match". - -Transcripts Combined File: - -Cuffcompare reports a GTF file containing the "union" of all transfrags in each sample. If a transfrag is present in both samples, it is thus reported once in the combined gtf. - -Transcripts Tracking File: - -This file matches transcripts up between samples. Each row contains a transcript structure that is present in one or more input GTF files. Because the transcripts will generally have different IDs (unless you assembled your RNA-Seq reads against a reference transcriptome), cuffcompare examines the structure of each the transcripts, matching transcripts that agree on the coordinates and order of all of their introns, as well as strand. Matching transcripts are allowed to differ on the length of the first and last exons, since these lengths will naturally vary from sample to sample due to the random nature of sequencing. -If you ran cuffcompare with the -r option, the first and second columns contain the closest matching reference transcript to the one described by each row. - -Here's an example of a line from the tracking file:: - - TCONS_00000045 XLOC_000023 Tcea|uc007afj.1 j \ - q1:exp.115|exp.115.0|100|3.061355|0.350242|0.350207 \ - q2:60hr.292|60hr.292.0|100|4.094084|0.000000|0.000000 - -In this example, a transcript present in the two input files, called exp.115.0 in the first and 60hr.292.0 in the second, doesn't match any reference transcript exactly, but shares exons with uc007afj.1, an isoform of the gene Tcea, as indicated by the class code j. The first three columns are as follows:: - - Column number Column name Example Description - ----------------------------------------------------------------------- - 1 Cufflinks transfrag id TCONS_00000045 A unique internal id for the transfrag - 2 Cufflinks locus id XLOC_000023 A unique internal id for the locus - 3 Reference gene id Tcea The gene_name attribute of the reference GTF record for this transcript, or '-' if no reference transcript overlaps this Cufflinks transcript - 4 Reference transcript id uc007afj.1 The transcript_id attribute of the reference GTF record for this transcript, or '-' if no reference transcript overlaps this Cufflinks transcript - 5 Class code c The type of match between the Cufflinks transcripts in column 6 and the reference transcript. See class codes - -Each of the columns after the fifth have the following format: - qJ:gene_id|transcript_id|FMI|FPKM|conf_lo|conf_hi - -A transcript need be present in all samples to be reported in the tracking file. A sample not containing a transcript will have a "-" in its entry in the row for that transcript. - -Class Codes - -If you ran cuffcompare with the -r option, tracking rows will contain the following values. If you did not use -r, the rows will all contain "-" in their class code column:: - - Priority Code Description - --------------------------------- - 1 = Match - 2 c Contained - 3 j New isoform - 4 e A single exon transcript overlapping a reference exon and at least 10 bp of a reference intron, indicating a possible pre-mRNA fragment. - 5 i A single exon transcript falling entirely with a reference intron - 6 r Repeat. 
Currently determined by looking at the reference sequence and applied to transcripts where at least 50% of the bases are lower case - 7 p Possible polymerase run-on fragment - 8 u Unknown, intergenic transcript - 9 o Unknown, generic overlap with reference - 10 . (.tracking file only, indicates multiple classifications) - -------- - -**Settings** - -All of the options have a default value. You can change any of them. Most of the options in Cuffcompare have been implemented here. - ------- - -**Cuffcompare parameter list** - -This is a list of implemented Cuffcompare options:: - - -r An optional "reference" annotation GTF. Each sample is matched against this file, and sample isoforms are tagged as overlapping, matching, or novel where appropriate. See the refmap and tmap output file descriptions below. - -R If -r was specified, this option causes cuffcompare to ignore reference transcripts that are not overlapped by any transcript in one of cuff1.gtf,...,cuffN.gtf. Useful for ignoring annotated transcripts that are not present in your RNA-Seq samples and thus adjusting the "sensitivity" calculation in the accuracy report written in the transcripts_accuracy file - - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/cuffdiff_wrapper.py --- a/tools/ngs_rna/cuffdiff_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,233 +0,0 @@ -#!/usr/bin/env python - -import optparse, os, shutil, subprocess, sys, tempfile - -def group_callback( option, op_str, value, parser ): - groups = [] - flist = [] - for arg in parser.rargs: - arg = arg.strip() - if arg[0] is "-": - break - elif arg[0] is ",": - groups.append(flist) - flist = [] - else: - flist.append(arg) - groups.append(flist) - - setattr(parser.values, option.dest, groups) - -def label_callback( option, op_str, value, parser ): - labels = [] - for arg in parser.rargs: - arg = arg.strip() - if arg[0] is "-": - break - else: - labels.append(arg) - - setattr(parser.values, option.dest, labels) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -# Copied from sam_to_bam.py: -def check_seq_file( dbkey, cached_seqs_pointer_file ): - seq_path = '' - for line in open( cached_seqs_pointer_file ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ) and line.startswith( 'index' ): - fields = line.split( '\t' ) - if len( fields ) < 3: - continue - if fields[1] == dbkey: - seq_path = fields[2].strip() - break - return seq_path - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - - # Cuffdiff options. - parser.add_option( '-s', '--inner-dist-std-dev', dest='inner_dist_std_dev', help='The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.' ) - parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' ) - parser.add_option( '-m', '--inner-mean-dist', dest='inner_mean_dist', help='This is the expected (mean) inner distance between mate pairs. \ - For, example, for paired end runs with fragments selected at 300bp, \ - where each end is 50bp, you should set -r to be 200. The default is 45bp.') - parser.add_option( '-c', '--min-alignment-count', dest='min_alignment_count', help='The minimum number of alignments in a locus for needed to conduct significance testing on changes in that locus observed between samples. 
If no testing is performed, changes in the locus are deemed not signficant, and the locus\' observed changes don\'t contribute to correction for multiple testing. The default is 1,000 fragment alignments (up to 2,000 paired reads).' ) - parser.add_option( '--FDR', dest='FDR', help='The allowed false discovery rate. The default is 0.05.' ) - - # Advanced Options: - parser.add_option( '--num-importance-samples', dest='num_importance_samples', help='Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000' ) - parser.add_option( '--max-mle-iterations', dest='max_mle_iterations', help='Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000' ) - - # Wrapper / Galaxy options. - parser.add_option( '-f', '--files', dest='groups', action="callback", callback=group_callback, help="Groups to be processed, groups are separated by spaces, replicates in a group comma separated. group1_rep1,group1_rep2 group2_rep1,group2_rep2, ..., groupN_rep1, groupN_rep2" ) - parser.add_option( '-A', '--inputA', dest='inputA', help='A transcript GTF file produced by cufflinks, cuffcompare, or other source.') - parser.add_option( '-1', '--input1', dest='input1', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' ) - parser.add_option( '-2', '--input2', dest='input2', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' ) - - # Label options - parser.add_option('-L', '--labels', dest='labels', action="callback", callback=label_callback, help="Labels for the groups the replicates are in.") - - # Normalization options. - parser.add_option( "-N", "--quartile-normalization", dest="do_normalization", action="store_true" ) - - # Bias correction options. - parser.add_option( '-b', dest='do_bias_correction', action="store_true", help='Providing Cufflinks with a multifasta file via this option instructs it to run our new bias detection and correction algorithm which can significantly improve accuracy of transcript abundance estimates.') - parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' ) - parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' ) - parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' ) - - # Outputs. 
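# Each option below receives a Galaxy-managed destination path for one of
# the files cuffdiff writes into its temp output directory; after a
# successful run they are filled in by the shutil.copyfile block near the
# end of __main__().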
- parser.add_option( "--isoforms_fpkm_tracking_output", dest="isoforms_fpkm_tracking_output" ) - parser.add_option( "--genes_fpkm_tracking_output", dest="genes_fpkm_tracking_output" ) - parser.add_option( "--cds_fpkm_tracking_output", dest="cds_fpkm_tracking_output" ) - parser.add_option( "--tss_groups_fpkm_tracking_output", dest="tss_groups_fpkm_tracking_output" ) - parser.add_option( "--isoforms_exp_output", dest="isoforms_exp_output" ) - parser.add_option( "--genes_exp_output", dest="genes_exp_output" ) - parser.add_option( "--tss_groups_exp_output", dest="tss_groups_exp_output" ) - parser.add_option( "--cds_exp_fpkm_tracking_output", dest="cds_exp_fpkm_tracking_output" ) - parser.add_option( "--splicing_diff_output", dest="splicing_diff_output" ) - parser.add_option( "--cds_diff_output", dest="cds_diff_output" ) - parser.add_option( "--promoters_diff_output", dest="promoters_diff_output" ) - - (options, args) = parser.parse_args() - - # output version # of tool - try: - tmp = tempfile.NamedTemporaryFile().name - tmp_stdout = open( tmp, 'wb' ) - proc = subprocess.Popen( args='cuffdiff --no-update-check 2>&1', shell=True, stdout=tmp_stdout ) - tmp_stdout.close() - returncode = proc.wait() - stdout = None - for line in open( tmp_stdout.name, 'rb' ): - if line.lower().find( 'cuffdiff v' ) >= 0: - stdout = line.strip() - break - if stdout: - sys.stdout.write( '%s\n' % stdout ) - else: - raise Exception - except: - sys.stdout.write( 'Could not determine Cuffdiff version\n' ) - - # Make temp directory for output. - tmp_output_dir = tempfile.mkdtemp() - - # If doing bias correction, set/link to sequence file. - if options.do_bias_correction: - cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' ) - if not os.path.exists( cached_seqs_pointer_file ): - stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file ) - # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa, - # and the equCab2.fa file will contain fasta sequences. - seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file ) - if options.ref_file != 'None': - # Create symbolic link to ref_file so that index will be created in working directory. - seq_path = os.path.join( tmp_output_dir, "ref.fa" ) - os.symlink( options.ref_file, seq_path ) - - # Build command. - - # Base; always use quiet mode to avoid problems with storing log output. - cmd = "cuffdiff --no-update-check -q" - - # Add options. - if options.inner_dist_std_dev: - cmd += ( " -s %i" % int ( options.inner_dist_std_dev ) ) - if options.num_threads: - cmd += ( " -p %i" % int ( options.num_threads ) ) - if options.inner_mean_dist: - cmd += ( " -m %i" % int ( options.inner_mean_dist ) ) - if options.min_alignment_count: - cmd += ( " -c %i" % int ( options.min_alignment_count ) ) - if options.FDR: - cmd += ( " --FDR %f" % float( options.FDR ) ) - if options.num_importance_samples: - cmd += ( " --num-importance-samples %i" % int ( options.num_importance_samples ) ) - if options.max_mle_iterations: - cmd += ( " --max-mle-iterations %i" % int ( options.max_mle_iterations ) ) - if options.do_normalization: - cmd += ( " -N" ) - if options.do_bias_correction: - cmd += ( " -b %s" % seq_path ) - - # Add inputs. 
- # For replicate analysis: group1_rep1,group1_rep2 groupN_rep1,groupN_rep2 - if options.groups: - cmd += " --labels " - for label in options.labels: - cmd += label + "," - cmd = cmd[:-1] - - cmd += " " + options.inputA + " " - - for group in options.groups: - for filename in group: - cmd += filename + "," - cmd = cmd[:-1] + " " - else: - cmd += " " + options.inputA + " " + options.input1 + " " + options.input2 - - # Debugging. - print cmd - - # Run command. - try: - tmp_name = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name - tmp_stderr = open( tmp_name, 'wb' ) - proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_output_dir, stderr=tmp_stderr.fileno() ) - returncode = proc.wait() - tmp_stderr.close() - - # Get stderr, allowing for case where it's very large. - tmp_stderr = open( tmp_name, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stderr.close() - - # Error checking. - if returncode != 0: - raise Exception, stderr - - # check that there are results in the output file - if len( open( os.path.join( tmp_output_dir, "isoforms.fpkm_tracking" ), 'rb' ).read().strip() ) == 0: - raise Exception, 'The main output file is empty, there may be an error with your input file or settings.' - except Exception, e: - stop_err( 'Error running cuffdiff. ' + str( e ) ) - - - # Copy output files from tmp directory to specified files. - try: - try: - shutil.copyfile( os.path.join( tmp_output_dir, "isoforms.fpkm_tracking" ), options.isoforms_fpkm_tracking_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "genes.fpkm_tracking" ), options.genes_fpkm_tracking_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "cds.fpkm_tracking" ), options.cds_fpkm_tracking_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "tss_groups.fpkm_tracking" ), options.tss_groups_fpkm_tracking_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "isoform_exp.diff" ), options.isoforms_exp_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "gene_exp.diff" ), options.genes_exp_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "tss_group_exp.diff" ), options.tss_groups_exp_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "splicing.diff" ), options.splicing_diff_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "cds.diff" ), options.cds_diff_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "cds_exp.diff" ), options.cds_exp_fpkm_tracking_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "promoters.diff" ), options.promoters_diff_output ) - except Exception, e: - stop_err( 'Error in cuffdiff:\n' + str( e ) ) - finally: - # Clean up temp dirs - if os.path.exists( tmp_output_dir ): - shutil.rmtree( tmp_output_dir ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/cuffdiff_wrapper.xml --- a/tools/ngs_rna/cuffdiff_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,238 +0,0 @@ - - - find significant changes in transcript expression, splicing, and promoter use - - cufflinks - - - cuffdiff_wrapper.py - --FDR=$fdr - --num-threads="4" - --min-alignment-count=$min_alignment_count - - --isoforms_fpkm_tracking_output=$isoforms_fpkm_tracking - --genes_fpkm_tracking_output=$genes_fpkm_tracking - --cds_fpkm_tracking_output=$cds_fpkm_tracking - --tss_groups_fpkm_tracking_output=$tss_groups_fpkm_tracking - --isoforms_exp_output=$isoforms_exp - 
--genes_exp_output=$genes_exp - --tss_groups_exp_output=$tss_groups_exp - --cds_exp_fpkm_tracking_output=$cds_exp_fpkm_tracking - --splicing_diff_output=$splicing_diff - --cds_diff_output=$cds_diff - --promoters_diff_output=$promoters_diff - - ## Set paired-end data parameters? - #if $singlePaired.sPaired == "Yes": - -m $singlePaired.mean_inner_distance - -s $singlePaired.inner_distance_std_dev - #end if - - ## Normalization? - #if str($do_normalization) == "Yes": - -N - #end if - - - ## Bias correction? - #if $bias_correction.do_bias_correction == "Yes": - -b - #if $bias_correction.seq_source.index_source == "history": - --ref_file=$bias_correction.seq_source.ref_file - #else: - --ref_file="None" - #end if - --dbkey=${gtf_input.metadata.dbkey} - --index_dir=${GALAXY_DATA_INDEX_DIR} - #end if - - ## Inputs. - --inputA=$gtf_input - #if $group_analysis.do_groups == "No": - --input1=$aligned_reads1 - --input2=$aligned_reads2 - #else: - ## Replicates. - --labels - #for $group in $group_analysis.groups - ${group.group} - #end for - --files - #for $group in $group_analysis.groups - #for $file in $group.files: - ${file.file} - #end for - , - #end for - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Cuffdiff Overview** - -Cuffdiff is part of Cufflinks_. Cuffdiff finds significant changes in transcript expression, splicing, and promoter use. Please cite: Trapnell C, Williams BA, Pertea G, Mortazavi A, Kwan G, van Baren MJ, Salzberg SL, Wold B, Pachter L. Transcript assembly and abundance estimation from RNA-Seq reveals thousands of new transcripts and switching among isoforms. Nature Biotechnology doi:10.1038/nbt.1621 - -.. _Cufflinks: http://cufflinks.cbcb.umd.edu/ - ------- - -**Know what you are doing** - -.. class:: warningmark - -There is no such thing (yet) as an automated gearshift in expression analysis. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. - -.. __: http://cufflinks.cbcb.umd.edu/manual.html#cuffdiff - ------- - -**Input format** - -Cuffdiff takes Cufflinks or Cuffcompare GTF files as input along with two SAM files containing the fragment alignments for two or more samples. - ------- - -**Outputs** - -Cuffdiff produces many output files: - -1. Transcript FPKM expression tracking. -2. Gene FPKM expression tracking; tracks the summed FPKM of transcripts sharing each gene_id -3. Primary transcript FPKM tracking; tracks the summed FPKM of transcripts sharing each tss_id -4. Coding sequence FPKM tracking; tracks the summed FPKM of transcripts sharing each p_id, independent of tss_id -5. Transcript differential FPKM. -6. Gene differential FPKM. Tests differences in the summed FPKM of transcripts sharing each gene_id -7. Primary transcript differential FPKM. Tests differences in the summed FPKM of transcripts sharing each tss_id -8. Coding sequence differential FPKM. Tests differences in the summed FPKM of transcripts sharing each p_id independent of tss_id -9. Differential splicing tests: this tab delimited file lists, for each primary transcript, the amount of overloading detected among its isoforms, i.e. 
how much differential splicing exists between isoforms processed from a single primary transcript. Only primary transcripts from which two or more isoforms are spliced are listed in this file. -10. Differential promoter tests: this tab delimited file lists, for each gene, the amount of overloading detected among its primary transcripts, i.e. how much differential promoter use exists between samples. Only genes producing two or more distinct primary transcripts (i.e. multi-promoter genes) are listed here. -11. Differential CDS tests: this tab delimited file lists, for each gene, the amount of overloading detected among its coding sequences, i.e. how much differential CDS output exists between samples. Only genes producing two or more distinct CDS (i.e. multi-protein genes) are listed here. - -------- - -**Settings** - -All of the options have a default value. You can change any of them. Most of the options in Cuffdiff have been implemented here. - ------- - -**Cuffdiff parameter list** - -This is a list of implemented Cuffdiff options:: - - -m INT This is the expected (mean) inner distance between mate pairs. For example, for paired end runs with fragments selected at 300bp, where each end is 50bp, you should set -r to be 200. The default is 45bp. - -s INT The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp. - -c INT The minimum number of alignments in a locus needed to conduct significance testing on changes in that locus observed between samples. If no testing is performed, changes in the locus are deemed not significant, and the locus' observed changes don't contribute to correction for multiple testing. The default is 1,000 fragment alignments (up to 2,000 paired reads). - --FDR FLOAT The allowed false discovery rate. The default is 0.05. - --num-importance-samples INT Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000 - --max-mle-iterations INT Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000 - -N With this option, Cufflinks excludes the contribution of the top 25 percent most highly expressed genes from the number of mapped fragments used in the FPKM denominator. This can improve robustness of differential expression calls for less abundant genes and transcripts. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/cufflinks_wrapper.py --- a/tools/ngs_rna/cufflinks_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,155 +0,0 @@ -#!/usr/bin/env python - -import optparse, os, shutil, subprocess, sys, tempfile - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -# Copied from sam_to_bam.py: -def check_seq_file( dbkey, cached_seqs_pointer_file ): - seq_path = '' - for line in open( cached_seqs_pointer_file ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ) and line.startswith( 'index' ): - fields = line.split( '\t' ) - if len( fields ) < 3: - continue - if fields[1] == dbkey: - seq_path = fields[2].strip() - break - return seq_path - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - parser.add_option( '-1', '--input', dest='input', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment format that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. 
Please see Input formats for more details.' ) - parser.add_option( '-s', '--inner-dist-std-dev', dest='inner_dist_std_dev', help='The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.' ) - parser.add_option( '-I', '--max-intron-length', dest='max_intron_len', help='The maximum intron length. Cufflinks will not report transcripts with introns longer than this, and will ignore SAM alignments with REF_SKIP CIGAR operations longer than this. The default is 300,000.' ) - parser.add_option( '-F', '--min-isoform-fraction', dest='min_isoform_fraction', help='After calculating isoform abundance for a gene, Cufflinks filters out transcripts that it believes are very low abundance, because isoforms expressed at extremely low levels often cannot reliably be assembled, and may even be artifacts of incompletely spliced precursors of processed transcripts. This parameter is also used to filter out introns that have far fewer spliced alignments supporting them. The default is 0.05, or 5% of the most abundant isoform (the major isoform) of the gene.' ) - parser.add_option( '-j', '--pre-mrna-fraction', dest='pre_mrna_fraction', help='Some RNA-Seq protocols produce a significant amount of reads that originate from incompletely spliced transcripts, and these reads can confound the assembly of fully spliced mRNAs. Cufflinks uses this parameter to filter out alignments that lie within the intronic intervals implied by the spliced alignments. The minimum depth of coverage in the intronic region covered by the alignment is divided by the number of spliced reads, and if the result is lower than this parameter value, the intronic alignments are ignored. The default is 5%.' ) - parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' ) - parser.add_option( '-m', '--inner-mean-dist', dest='inner_mean_dist', help='This is the expected (mean) inner distance between mate pairs. \ - For example, for paired end runs with fragments selected at 300bp, \ - where each end is 50bp, you should set -r to be 200. The default is 45bp.') - parser.add_option( '-G', '--GTF', dest='GTF', help='Tells Cufflinks to use the supplied reference annotation to estimate isoform expression. It will not assemble novel transcripts, and the program will ignore alignments not structurally compatible with any reference transcript.' ) - parser.add_option( '-g', '--GTF-guide', dest='GTFguide', help='use reference transcript annotation to guide assembly' ) - - # Normalization options. - parser.add_option( "-N", "--quartile-normalization", dest="do_normalization", action="store_true" ) - - # Wrapper / Galaxy options. - parser.add_option( '-A', '--assembled-isoforms-output', dest='assembled_isoforms_output_file', help='Assembled isoforms output file; format is GTF.' ) - - # Advanced Options: - parser.add_option( '--num-importance-samples', dest='num_importance_samples', help='Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000' ) - parser.add_option( '--max-mle-iterations', dest='max_mle_iterations', help='Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000' ) - - # Bias correction options. 
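 - # (Note, added for clarity: -b itself only records that bias correction was requested;
 - #  the --dbkey/--index_dir/--ref_file options below are used to locate or symlink the
 - #  reference FASTA that is ultimately passed to cufflinks as "-b <fasta>".)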
- parser.add_option( '-b', dest='do_bias_correction', action="store_true", help='Providing Cufflinks with a multifasta file via this option instructs it to run our new bias detection and correction algorithm which can significantly improve accuracy of transcript abundance estimates.') - parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' ) - parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' ) - parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' ) - - (options, args) = parser.parse_args() - - # output version # of tool - try: - tmp = tempfile.NamedTemporaryFile().name - tmp_stdout = open( tmp, 'wb' ) - proc = subprocess.Popen( args='cufflinks --no-update-check 2>&1', shell=True, stdout=tmp_stdout ) - tmp_stdout.close() - returncode = proc.wait() - stdout = None - for line in open( tmp_stdout.name, 'rb' ): - if line.lower().find( 'cufflinks v' ) >= 0: - stdout = line.strip() - break - if stdout: - sys.stdout.write( '%s\n' % stdout ) - else: - raise Exception - except: - sys.stdout.write( 'Could not determine Cufflinks version\n' ) - - # If doing bias correction, set/link to sequence file. - if options.do_bias_correction: - cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' ) - if not os.path.exists( cached_seqs_pointer_file ): - stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file ) - # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa, - # and the equCab2.fa file will contain fasta sequences. - seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file ) - if options.ref_file != 'None': - # Create symbolic link to ref_file so that index will be created in working directory. - seq_path = "ref.fa" - os.symlink( options.ref_file, seq_path ) - - # Build command. - - # Base; always use quiet mode to avoid problems with storing log output. - cmd = "cufflinks -q --no-update-check" - - # Add options. - if options.inner_dist_std_dev: - cmd += ( " -s %i" % int ( options.inner_dist_std_dev ) ) - if options.max_intron_len: - cmd += ( " -I %i" % int ( options.max_intron_len ) ) - if options.min_isoform_fraction: - cmd += ( " -F %f" % float ( options.min_isoform_fraction ) ) - if options.pre_mrna_fraction: - cmd += ( " -j %f" % float ( options.pre_mrna_fraction ) ) - if options.num_threads: - cmd += ( " -p %i" % int ( options.num_threads ) ) - if options.inner_mean_dist: - cmd += ( " -m %i" % int ( options.inner_mean_dist ) ) - if options.GTF: - cmd += ( " -G %s" % options.GTF ) - if options.GTFguide: - cmd += ( " -g %s" % options.GTFguide ) - if options.num_importance_samples: - cmd += ( " --num-importance-samples %i" % int ( options.num_importance_samples ) ) - if options.max_mle_iterations: - cmd += ( " --max-mle-iterations %i" % int ( options.max_mle_iterations ) ) - if options.do_normalization: - cmd += ( " -N" ) - if options.do_bias_correction: - cmd += ( " -b %s" % seq_path ) - - # Debugging. - print cmd - - # Add input files. - cmd += " " + options.input - - # Run command. - try: - tmp_name = tempfile.NamedTemporaryFile( dir="." ).name - tmp_stderr = open( tmp_name, 'wb' ) - proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() ) - returncode = proc.wait() - tmp_stderr.close() - - # Get stderr, allowing for case where it's very large. 
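 - # (The loop below reads stderr in 1 MB chunks -- buffsize = 1048576 -- and stops once
 - #  a short or empty read indicates the stream is exhausted.)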
- tmp_stderr = open( tmp_name, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stderr.close() - - # Copy outputs. - shutil.copyfile( "transcripts.gtf" , options.assembled_isoforms_output_file ) - - # Error checking. - if returncode != 0: - raise Exception, stderr - except Exception, e: - stop_err( 'Error running cufflinks. ' + str( e ) ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/cufflinks_wrapper.xml --- a/tools/ngs_rna/cufflinks_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,222 +0,0 @@ - - - transcript assembly and FPKM (RPKM) estimates for RNA-Seq data - - cufflinks - - - cufflinks_wrapper.py - --input=$input - --assembled-isoforms-output=$assembled_isoforms - --num-threads="4" - -I $max_intron_len - -F $min_isoform_fraction - -j $pre_mrna_fraction - - ## Include reference annotation? - #if $reference_annotation.use_ref == "use reference annotation": - -G $reference_annotation.reference_annotation_file - #end if - #if $reference_annotation.use_ref == "use reference annotation guide": - -g $reference_annotation_guide.reference_annotation_guide_file - #end if - - ## Set paired-end parameters? - #if $singlePaired.sPaired == "Yes": - -m $singlePaired.mean_inner_distance - -s $singlePaired.inner_distance_std_dev - #end if - - ## Normalization? - #if str($do_normalization) == "Yes": - -N - #end if - - ## Bias correction? - #if $bias_correction.do_bias_correction == "Yes": - -b - #if $bias_correction.seq_source.index_source == "history": - --ref_file=$bias_correction.seq_source.ref_file - #else: - --ref_file="None" - #end if - --dbkey=${input.metadata.dbkey} - --index_dir=${GALAXY_DATA_INDEX_DIR} - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Cufflinks Overview** - -Cufflinks_ assembles transcripts, estimates their abundances, and tests for differential expression and regulation in RNA-Seq samples. It accepts aligned RNA-Seq reads and assembles the alignments into a parsimonious set of transcripts. Cufflinks then estimates the relative abundances of these transcripts based on how many reads support each one. Please cite: Trapnell C, Williams BA, Pertea G, Mortazavi AM, Kwan G, van Baren MJ, Salzberg SL, Wold B, Pachter L. Transcript assembly and abundance estimation from RNA-Seq reveals thousands of new transcripts and switching among isoforms. Nature Biotechnology doi:10.1038/nbt.1621 - -.. _Cufflinks: http://cufflinks.cbcb.umd.edu/ - ------- - -**Know what you are doing** - -.. class:: warningmark - -There is no such thing (yet) as an automated gearshift in expression analysis. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. - -.. __: http://cufflinks.cbcb.umd.edu/manual.html - ------- - -**Input formats** - -Cufflinks takes a text file of SAM alignments as input. The RNA-Seq read mapper TopHat produces output in this format, and is recommended for use with Cufflinks. However Cufflinks will accept SAM alignments generated by any read mapper. 
Here's an example of an alignment Cufflinks will accept:: - - s6.25mer.txt-913508 16 chr1 4482736 255 14M431N11M * 0 0 \ - CAAGATGCTAGGCAAGTCTTGGAAG IIIIIIIIIIIIIIIIIIIIIIIII NM:i:0 XS:A:- - -Note the use of the custom tag XS. This attribute, which must have a value of "+" or "-", indicates which strand the RNA that produced this read came from. While this tag can be applied to any alignment, including unspliced ones, it must be present for all spliced alignment records (those with a 'N' operation in the CIGAR string). -The SAM file supplied to Cufflinks must be sorted by reference position. If you aligned your reads with TopHat, your alignments will be properly sorted already. If you used another tool, you may want to make sure they are properly sorted as follows:: - - sort -k 3,3 -k 4,4n hits.sam > hits.sam.sorted - -NOTE: Cufflinks currently only supports SAM alignments with the CIGAR match ('M') and reference skip ('N') operations. Support for the other operations, such as insertions, deletions, and clipping, will be added in the future. - ------- - -**Outputs** - -Cufflinks produces three output files: - -Transcripts and Genes: - -This GTF file contains Cufflinks' assembled isoforms. The first 8 columns are standard GTF, and the last column contains attributes, some of which are also standardized (e.g. gene_id, transcript_id). There is one GTF record per row, and each record represents either a transcript or an exon within a transcript. The columns are defined as follows:: - - Column number Column name Example Description - ----------------------------------------------------- - 1 seqname chrX Chromosome or contig name - 2 source Cufflinks The name of the program that generated this file (always 'Cufflinks') - 3 feature exon The type of record (always either "transcript" or "exon"). - 4 start 77696957 The leftmost coordinate of this record (where 0 is the leftmost possible coordinate) - 5 end 77712009 The rightmost coordinate of this record, inclusive. - 6 score 1000 The most abundant isoform for each gene is assigned a score of 1000. Minor isoforms are scored by the ratio (minor FPKM/major FPKM) - 7 strand + Cufflinks' guess for which strand the isoform came from. Always one of '+', '-', '.' - 8 frame . Cufflinks does not predict where the start and stop codons (if any) are located within each transcript, so this field is not used. - 9 attributes See below - -Each GTF record is decorated with the following attributes:: - - Attribute Example Description - ----------------------------------------- - gene_id CUFF.1 Cufflinks gene id - transcript_id CUFF.1.1 Cufflinks transcript id - FPKM 101.267 Isoform-level relative abundance in Reads Per Kilobase of exon model per Million mapped reads - frac 0.7647 Reserved. Please ignore, as this attribute may be deprecated in the future - conf_lo 0.07 Lower bound of the 95% confidence interval of the abundance of this isoform, as a fraction of the isoform abundance. That is, lower bound = FPKM * (1.0 - conf_lo) - conf_hi 0.1102 Upper bound of the 95% confidence interval of the abundance of this isoform, as a fraction of the isoform abundance. That is, upper bound = FPKM * (1.0 + conf_hi) - cov 100.765 Estimate for the absolute depth of read coverage across the whole transcript - - -Transcripts only: - This file is simply a tab delimited file containing one row per transcript, with columns containing the attributes above. 
There are a few additional attributes not in the table above, but these are reserved for debugging, and may change or disappear in the future. - -Genes only: -This file contains gene-level coordinates and expression values. - -------- - -**Cufflinks settings** - -All of the options have a default value. You can change any of them. Most of the options in Cufflinks have been implemented here. - ------- - -**Cufflinks parameter list** - -This is a list of implemented Cufflinks options:: - - -m INT This is the expected (mean) inner distance between mate pairs. For example, for paired end runs with fragments selected at 300bp, where each end is 50bp, you should set -r to be 200. The default is 45bp. - -s INT The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp. - -I INT The maximum intron length. Cufflinks will not report transcripts with introns longer than this, and will ignore SAM alignments with REF_SKIP CIGAR operations longer than this. The default is 300,000. - -F After calculating isoform abundance for a gene, Cufflinks filters out transcripts that it believes are very low abundance, because isoforms expressed at extremely low levels often cannot reliably be assembled, and may even be artifacts of incompletely spliced precursors of processed transcripts. This parameter is also used to filter out introns that have far fewer spliced alignments supporting them. The default is 0.05, or 5% of the most abundant isoform (the major isoform) of the gene. - -j Some RNA-Seq protocols produce a significant amount of reads that originate from incompletely spliced transcripts, and these reads can confound the assembly of fully spliced mRNAs. Cufflinks uses this parameter to filter out alignments that lie within the intronic intervals implied by the spliced alignments. The minimum depth of coverage in the intronic region covered by the alignment is divided by the number of spliced reads, and if the result is lower than this parameter value, the intronic alignments are ignored. The default is 5%. - -G Tells Cufflinks to use the supplied reference annotation to estimate isoform expression. It will not assemble novel transcripts, and the program will ignore alignments not structurally compatible with any reference transcript. - -N With this option, Cufflinks excludes the contribution of the top 25 percent most highly expressed genes from the number of mapped fragments used in the FPKM denominator. This can improve robustness of differential expression calls for less abundant genes and transcripts. - - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/filter_transcripts_via_tracking.py --- a/tools/ngs_rna/filter_transcripts_via_tracking.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,70 +0,0 @@ -#!/usr/bin/env python -import os, sys, tempfile - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - """ - Utility script for analyzing Cufflinks data: uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts - produced by cufflinks). Filtering is done by extracting transcript IDs from the tracking file and then filtering the GTF so that the output GTF contains only - transcripts found in the tracking file. Because a tracking file has multiple samples, a sample number is used to filter transcripts for - a particular sample. - """ - # Read params. 
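 - # (Positional arguments, as wired up by the tool XML: the cuffcompare .tracking file,
 - #  the GTF to filter, the output GTF path, and the 1-based sample number whose
 - #  q-column is consulted. A hypothetical invocation:
 - #    python filter_transcripts_via_tracking.py cc.tracking transcripts.gtf out.gtf 1)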
- tracking_file_name = sys.argv[1] - transcripts_file_name = sys.argv[2] - output_file_name = sys.argv[3] - sample_number = int ( sys.argv[4] ) - - # Open files. - transcripts_file = open( transcripts_file_name, 'r' ) - output_file = open( output_file_name, 'w' ) - - # Read transcript IDs from tracking file. - transcript_ids = {} - for i, line in enumerate( file( tracking_file_name ) ) : - # Split line into elements. Line format is - # [Transfrag ID] [Locus ID] [Ref Gene ID] [Ref Transcript ID] [Class code] [qJ:|||||] - line = line.rstrip( '\r\n' ) - elems = line.split( '\t' ) - - # Get transcript info. - if sample_number == 1: - transcript_info = elems[4] - elif sample_number == 2: - transcript_info = elems[5] - if not transcript_info.startswith('q'): - # No transcript for this sample. - continue - - # Get and store transcript id. - transcript_id = transcript_info.split('|')[1] - transcript_id = transcript_id.strip('"') - transcript_ids[transcript_id] = "" - - # Filter transcripts file using transcript_ids - for i, line in enumerate( file( transcripts_file_name ) ): - # GTF format: chrom source, name, chromStart, chromEnd, score, strand, frame, attributes. - elems = line.split( '\t' ) - - # Get attributes. - attributes_list = elems[8].split(";") - attributes = {} - for name_value_pair in attributes_list: - pair = name_value_pair.strip().split(" ") - name = pair[0].strip() - if name == '': - continue - # Need to strip double quote from values - value = pair[1].strip(" \"") - attributes[name] = value - - # Get element's transcript id. - transcript_id = attributes['transcript_id'] - if transcript_id in transcript_ids: - output_file.write(line) - - # Clean up. - output_file.close() - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/filter_transcripts_via_tracking.xml --- a/tools/ngs_rna/filter_transcripts_via_tracking.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ - - using tracking file - - filter_transcripts_via_tracking.py - $tracking_file - $transcripts_file - $filtered_transcripts - $sample_num - - - - - - - - - - - - - - - - - - - Uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts produced by - cufflinks). Filtering is done by extracting transcript IDs from tracking file and then filtering the - GTF so that the output GTF contains only transcript found in the tracking file. Because a tracking file has multiple - samples, a sample number is used to filter transcripts for a particular sample. - - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/tophat_color_wrapper.xml --- a/tools/ngs_rna/tophat_color_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,683 +0,0 @@ - - Find splice junctions using RNA-seq data - - tophat - - - tophat_wrapper.py - ## Change this to accommodate the number of threads you have available. - --num-threads="4" - - ## base- or color-space - --color-space - - ## Provide outputs. - --junctions-output=$junctions - --hits-output=$accepted_hits - - ## Handle reference file. - #if $refGenomeSource.genomeSource == "history": - --own-file=$refGenomeSource.ownFile - #else: - --indexes-path="${ filter( lambda x: str( x[0] ) == str( $refGenomeSource.index ), $__app__.tool_data_tables[ 'tophat_indexes_color' ].get_fields() )[0][-1] }" - #end if - - ## Are reads single-end or paired? - --single-paired=$singlePaired.sPaired - - ## First input file always required. 
- --input1=$input1 - - ## Set params based on whether reads are single-end or paired. - #if $singlePaired.sPaired == "single": - --settings=$singlePaired.sParams.sSettingsType - #if $singlePaired.sParams.sSettingsType == "full": - -a $singlePaired.sParams.anchor_length - -m $singlePaired.sParams.splice_mismatches - -i $singlePaired.sParams.min_intron_length - -I $singlePaired.sParams.max_intron_length - -F $singlePaired.sParams.junction_filter - -g $singlePaired.sParams.max_multihits - --min-segment-intron $singlePaired.sParams.min_segment_intron - --max-segment-intron $singlePaired.sParams.max_segment_intron - --seg-mismatches=$singlePaired.sParams.seg_mismatches - --seg-length=$singlePaired.sParams.seg_length - --library-type=$singlePaired.sParams.library_type - - ## Indel search. - #if $singlePaired.sParams.indel_search.allow_indel_search == "Yes": - --allow-indels - --max-insertion-length $singlePaired.sParams.indel_search.max_insertion_length - --max-deletion-length $singlePaired.sParams.indel_search.max_deletion_length - #end if - - ## Supplying junctions parameters. - #if $singlePaired.sParams.own_junctions.use_junctions == "Yes": - #if $singlePaired.sParams.own_junctions.gene_model_ann.use_annotations == "Yes": - -G $singlePaired.sParams.own_junctions.gene_model_ann.gene_annotation_model - #end if - #if $singlePaired.sParams.own_junctions.raw_juncs.use_juncs == "Yes": - -j $singlePaired.sParams.own_junctions.raw_juncs.raw_juncs - #end if - ## TODO: No idea why a string cast is necessary, but it is: - #if str($singlePaired.sParams.own_junctions.no_novel_juncs) == "Yes": - --no-novel-juncs - #end if - #end if - - #if $singlePaired.sParams.closure_search.use_search == "Yes": - --closure-search - --min-closure-exon $singlePaired.sParams.closure_search.min_closure_exon - --min-closure-intron $singlePaired.sParams.closure_search.min_closure_intron - --max-closure-intron $singlePaired.sParams.closure_search.max_closure_intron - #else: - --no-closure-search - #end if - #if $singlePaired.sParams.coverage_search.use_search == "Yes": - --coverage-search - --min-coverage-intron $singlePaired.sParams.coverage_search.min_coverage_intron - --max-coverage-intron $singlePaired.sParams.coverage_search.max_coverage_intron - #else: - --no-coverage-search - #end if - ## TODO: No idea why the type conversion is necessary, but it seems to be. - #if str($singlePaired.sParams.microexon_search) == "Yes": - --microexon-search - #end if - #end if - #else: - --input2=$singlePaired.input2 - -r $singlePaired.mate_inner_distance - --settings=$singlePaired.pParams.pSettingsType - #if $singlePaired.pParams.pSettingsType == "full": - --mate-std-dev=$singlePaired.pParams.mate_std_dev - -a $singlePaired.pParams.anchor_length - -m $singlePaired.pParams.splice_mismatches - -i $singlePaired.pParams.min_intron_length - -I $singlePaired.pParams.max_intron_length - -F $singlePaired.pParams.junction_filter - -g $singlePaired.pParams.max_multihits - --min-segment-intron $singlePaired.pParams.min_segment_intron - --max-segment-intron $singlePaired.pParams.max_segment_intron - --seg-mismatches=$singlePaired.pParams.seg_mismatches - --seg-length=$singlePaired.pParams.seg_length - --library-type=$singlePaired.pParams.library_type - - ## Indel search. 
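 - ## (Added note: this block mirrors the single-end indel block above; the three flags
 - ## are forwarded only when the paired-end parameter group enables indel search.)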
- #if $singlePaired.pParams.indel_search.allow_indel_search == "Yes": - --allow-indels - --max-insertion-length $singlePaired.pParams.indel_search.max_insertion_length - --max-deletion-length $singlePaired.pParams.indel_search.max_deletion_length - #end if - - ## Supplying junctions parameters. - #if $singlePaired.pParams.own_junctions.use_junctions == "Yes": - #if $singlePaired.pParams.own_junctions.gene_model_ann.use_annotations == "Yes": - -G $singlePaired.pParams.own_junctions.gene_model_ann.gene_annotation_model - #end if - #if $singlePaired.pParams.own_junctions.raw_juncs.use_juncs == "Yes": - -j $singlePaired.pParams.own_junctions.raw_juncs.raw_juncs - #end if - ## TODO: No idea why type cast is necessary, but it is: - #if str($singlePaired.pParams.own_junctions.no_novel_juncs) == "Yes": - --no-novel-juncs - #end if - #end if - - #if $singlePaired.pParams.closure_search.use_search == "Yes": - --closure-search - --min-closure-exon $singlePaired.pParams.closure_search.min_closure_exon - --min-closure-intron $singlePaired.pParams.closure_search.min_closure_intron - --max-closure-intron $singlePaired.pParams.closure_search.max_closure_intron - #else: - --no-closure-search - #end if - #if $singlePaired.pParams.coverage_search.use_search == "Yes": - --coverage-search - --min-coverage-intron $singlePaired.pParams.coverage_search.min_coverage_intron - --max-coverage-intron $singlePaired.pParams.coverage_search.max_coverage_intron - #else: - --no-coverage-search - #end if - ## TODO: No idea why the type conversion is necessary, but it seems to be. - #if str ($singlePaired.pParams.microexon_search) == "Yes": - --microexon-search - #end if - #end if - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ( - ( ( 'sParams' in singlePaired ) and ( 'indel_search' in singlePaired['sParams'] ) and - ( singlePaired['sParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) or - ( ( 'pParams' in singlePaired ) and ( 'indel_search' in singlePaired['pParams'] ) and - ( singlePaired['pParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) - ) - - - - - - - - - - - - - - - - - - ( - ( ( 'sParams' in singlePaired ) and ( 'indel_search' in singlePaired['sParams'] ) and - ( singlePaired['sParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) or - ( ( 'pParams' in singlePaired ) and ( 'indel_search' in singlePaired['pParams'] ) and - ( singlePaired['pParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) - ) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Tophat Overview** - -TopHat_ is a fast splice junction mapper for RNA-Seq reads. It aligns RNA-Seq reads to mammalian-sized genomes using the ultra high-throughput short read aligner Bowtie, and then analyzes the mapping results to identify splice junctions between exons. Please cite: Trapnell, C., Pachter, L. and Salzberg, S.L. 
TopHat: discovering splice junctions with RNA-Seq. Bioinformatics 25, 1105-1111 (2009). - -.. _Tophat: http://tophat.cbcb.umd.edu/ - ------- - -**Know what you are doing** - -.. class:: warningmark - -There is no such thing (yet) as an automated gearshift in splice junction identification. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. - -.. __: http://tophat.cbcb.umd.edu/manual.html - ------- - -**Input formats** - -Tophat accepts files in Sanger FASTQ format. Use the FASTQ Groomer to prepare your files. - ------- - -**Outputs** - -Tophat produces two main output files: - -- junctions -- A UCSC BED_ track of junctions reported by TopHat. Each junction consists of two connected BED blocks, where each block is as long as the maximal overhang of any read spanning the junction. The score is the number of alignments spanning the junction. -- accepted_hits -- A list of read alignments in BAM_ format. - -.. _BED: http://genome.ucsc.edu/FAQ/FAQformat.html#format1 -.. _BAM: http://samtools.sourceforge.net/ - -Two other possible outputs, depending on the options you choose, are insertions and deletions, both of which are in BED format. - -------- - -**Tophat settings** - -All of the options have a default value. You can change any of them. Some of the options in Tophat have been implemented here. - ------- - -**Tophat parameter list** - -This is a list of implemented Tophat options:: - - -r This is the expected (mean) inner distance between mate pairs. For example, for paired end runs with fragments - selected at 300bp, where each end is 50bp, you should set -r to be 200. There is no default, and this parameter - is required for paired end runs. - --mate-std-dev INT The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp. - -a/--min-anchor-length INT The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction. Note that individual spliced - alignments may span a junction with fewer than this many bases on one side. However, every junction involved in spliced alignments is supported by at least one - read with this many bases on each side. This must be at least 3 and the default is 8. - -m/--splice-mismatches INT The maximum number of mismatches that may appear in the "anchor" region of a spliced alignment. The default is 0. - -i/--min-intron-length INT The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart. The default is 70. - -I/--max-intron-length INT The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read. The default is 500000. - -F/--min-isoform-fraction 0.0-1.0 TopHat filters out junctions supported by too few alignments. Suppose a junction spanning two exons is supported by S reads. Let the average depth of coverage of - exon A be D, and assume that it is higher than that of exon B. If S / D is less than the minimum isoform fraction, the junction is not reported. A value of zero disables the - filter. The default is 0.15. 
-g/--max-multihits INT Instructs TopHat to allow up to this many alignments to the reference for a given read, and suppresses all alignments for reads with more than this many - alignments. The default is 40. - -G/--GTF [GTF 2.2 file] Supply TopHat with a list of gene model annotations. TopHat will use the exon records in this file to build a set of known splice junctions for each gene, and will attempt to align reads to these junctions even if they would not normally be covered by the initial mapping. - -j/--raw-juncs [juncs file] Supply TopHat with a list of raw junctions. Junctions are specified one per line, in a tab-delimited format. Records look like: [chrom] [left] [right] [+/-], left and right are zero-based coordinates, and specify the last character of the left sequence to be spliced to the first character of the right sequence, inclusive. - --no-novel-juncs Only look for junctions indicated in the supplied GFF file. (ignored without -G) - --no-closure-search Disables the mate pair closure-based search for junctions. Currently, has no effect - closure search is off by default. - --closure-search Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (about or less than 50bp) - --no-coverage-search Disables the coverage based search for junctions. - --coverage-search Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity. - --microexon-search With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer. - --butterfly-search TopHat will use a slower but potentially more sensitive algorithm to find junctions in addition to its standard search. Consider using this if you expect that your experiment produced a lot of reads from pre-mRNA that fall within the introns of your transcripts. - --segment-mismatches Read segments are mapped independently, allowing up to this many mismatches in each segment alignment. The default is 2. - --segment-length Each read is cut up into segments, each at least this long. These segments are mapped independently. The default is 25. - --min-closure-exon During closure search for paired end reads, exonic hops in the potential splice graph must be at least this long. The default is 50. - --min-closure-intron The minimum intron length that may be found during closure search. The default is 50. - --max-closure-intron The maximum intron length that may be found during closure search. The default is 5000. - --min-coverage-intron The minimum intron length that may be found during coverage search. The default is 50. - --max-coverage-intron The maximum intron length that may be found during coverage search. The default is 20000. - --min-segment-intron The minimum intron length that may be found during split-segment search. The default is 50. - --max-segment-intron The maximum intron length that may be found during split-segment search. The default is 500000. 
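For orientation, a rough sketch of a full-parameter, single-end command line this wrapper might assemble, using the defaults listed above (the index path and read file are placeholders, and the color-space variant of the tool additionally inserts TopHat's -C flag)::

  tophat -p 4 -a 8 -m 0 -i 70 -I 500000 -F 0.15 -g 40 /path/to/bowtie_index reads.fastq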
- - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/tophat_wrapper.py --- a/tools/ngs_rna/tophat_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,236 +0,0 @@ -#!/usr/bin/env python - -import optparse, os, shutil, subprocess, sys, tempfile, fileinput - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' ) - parser.add_option( '-C', '--color-space', dest='color_space', action='store_true', help='This indicates color-space data' ) - parser.add_option( '-J', '--junctions-output', dest='junctions_output_file', help='Junctions output file; format is BED.' ) - parser.add_option( '-H', '--hits-output', dest='accepted_hits_output_file', help='Accepted hits output file; format is BAM.' ) - parser.add_option( '', '--own-file', dest='own_file', help='' ) - parser.add_option( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) - parser.add_option( '-r', '--mate-inner-dist', dest='mate_inner_dist', help='This is the expected (mean) inner distance between mate pairs. \ - For example, for paired end runs with fragments selected at 300bp, \ - where each end is 50bp, you should set -r to be 200. There is no default, \ - and this parameter is required for paired end runs.') - parser.add_option( '', '--mate-std-dev', dest='mate_std_dev', help='Standard deviation of distribution on inner distances between mate pairs.' ) - parser.add_option( '-a', '--min-anchor-length', dest='min_anchor_length', - help='The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction.' ) - parser.add_option( '-m', '--splice-mismatches', dest='splice_mismatches', help='The maximum number of mismatches that can appear in the anchor region of a spliced alignment.' ) - parser.add_option( '-i', '--min-intron-length', dest='min_intron_length', - help='The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart.' ) - parser.add_option( '-I', '--max-intron-length', dest='max_intron_length', - help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.' ) - parser.add_option( '-F', '--junction_filter', dest='junction_filter', help='Filter out junctions supported by too few alignments (number of reads divided by average depth of coverage)' ) - parser.add_option( '-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed' ) - parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' ) - parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' ) - parser.add_option( '', '--library-type', dest='library_type', help='TopHat will treat the reads as strand specific. Every read alignment will have an XS attribute tag. Consider supplying library type options below to select the correct RNA-seq protocol.' ) - parser.add_option( '', '--allow-indels', action="store_true", help='Allow indel search. Indel search is disabled by default.' 
) - parser.add_option( '', '--max-insertion-length', dest='max_insertion_length', help='The maximum insertion length. The default is 3.' ) - parser.add_option( '', '--max-deletion-length', dest='max_deletion_length', help='The maximum deletion length. The default is 3.' ) - - # Options for supplying own junctions - parser.add_option( '-G', '--GTF', dest='gene_model_annotations', help='Supply TopHat with a list of gene model annotations. \ - TopHat will use the exon records in this file to build \ - a set of known splice junctions for each gene, and will \ - attempt to align reads to these junctions even if they \ - would not normally be covered by the initial mapping.') - parser.add_option( '-j', '--raw-juncs', dest='raw_juncs', help='Supply TopHat with a list of raw junctions. Junctions are \ - specified one per line, in a tab-delimited format. Records \ - look like: [chrom] [left] [right] [+/-]; left and right are \ - zero-based coordinates, and specify the last character of the \ - left sequence to be spliced to the first character of the right \ - sequence, inclusive.') - parser.add_option( '', '--no-novel-juncs', action="store_true", dest='no_novel_juncs', help="Only look for junctions indicated in the \ - supplied GFF file. (ignored without -G)") - # Types of search. - parser.add_option( '', '--microexon-search', action="store_true", dest='microexon_search', help='With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.') - parser.add_option( '', '--closure-search', action="store_true", dest='closure_search', help='Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (<= 50bp)') - parser.add_option( '', '--no-closure-search', action="store_false", dest='closure_search' ) - parser.add_option( '', '--coverage-search', action="store_true", dest='coverage_search', help='Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.') - parser.add_option( '', '--no-coverage-search', action="store_false", dest='coverage_search' ) - parser.add_option( '', '--min-segment-intron', dest='min_segment_intron', help='Minimum intron length that may be found during split-segment search' ) - parser.add_option( '', '--max-segment-intron', dest='max_segment_intron', help='Maximum intron length that may be found during split-segment search' ) - parser.add_option( '', '--min-closure-exon', dest='min_closure_exon', help='Minimum length for exonic hops in potential splice graph' ) - parser.add_option( '', '--min-closure-intron', dest='min_closure_intron', help='Minimum intron length that may be found during closure search' ) - parser.add_option( '', '--max-closure-intron', dest='max_closure_intron', help='Maximum intron length that may be found during closure search' ) - parser.add_option( '', '--min-coverage-intron', dest='min_coverage_intron', help='Minimum intron length that may be found during coverage search' ) - parser.add_option( '', '--max-coverage-intron', dest='max_coverage_intron', help='Maximum intron length that may be found during coverage search' ) - - # Wrapper options. 
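 - # (Added note: the options below are Galaxy-side plumbing rather than TopHat flags;
 - #  --input1/--input2 carry the FASTQ datasets, while --single-paired and --settings
 - #  record which branch of the tool XML generated the command line.)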
- parser.add_option( '-1', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' ) - parser.add_option( '-2', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' ) - parser.add_option( '', '--single-paired', dest='single_paired', help='' ) - parser.add_option( '', '--settings', dest='settings', help='' ) - - (options, args) = parser.parse_args() - - # output version # of tool - try: - tmp = tempfile.NamedTemporaryFile().name - tmp_stdout = open( tmp, 'wb' ) - proc = subprocess.Popen( args='tophat -v', shell=True, stdout=tmp_stdout ) - tmp_stdout.close() - returncode = proc.wait() - stdout = open( tmp_stdout.name, 'rb' ).readline().strip() - if stdout: - sys.stdout.write( '%s\n' % stdout ) - else: - raise Exception - except: - sys.stdout.write( 'Could not determine Tophat version\n' ) - - # Color or base space - space = '' - if options.color_space: - space = '-C' - - # Creat bowtie index if necessary. - tmp_index_dir = tempfile.mkdtemp() - if options.own_file: - index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( options.own_file )[1].split( '.' )[:-1] ) ) - try: - os.link( options.own_file, index_path + '.fa' ) - except: - # Tophat prefers (but doesn't require) fasta file to be in same directory, with .fa extension - pass - cmd_index = 'bowtie-build %s -f %s %s' % ( space, options.own_file, index_path ) - try: - tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name - tmp_stderr = open( tmp, 'wb' ) - proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() ) - returncode = proc.wait() - tmp_stderr.close() - # get stderr, allowing for case where it's very large - tmp_stderr = open( tmp, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stderr.close() - if returncode != 0: - raise Exception, stderr - except Exception, e: - if os.path.exists( tmp_index_dir ): - shutil.rmtree( tmp_index_dir ) - stop_err( 'Error indexing reference sequence\n' + str( e ) ) - else: - index_path = options.index_path - - # Build tophat command. - cmd = 'tophat %s %s %s' - reads = options.input1 - if options.input2: - reads += ' ' + options.input2 - opts = '-p %s %s' % ( options.num_threads, space ) - if options.single_paired == 'paired': - opts += ' -r %s' % options.mate_inner_dist - if options.settings == 'preSet': - cmd = cmd % ( opts, index_path, reads ) - else: - try: - if int( options.min_anchor_length ) >= 3: - opts += ' -a %s' % options.min_anchor_length - else: - raise Exception, 'Minimum anchor length must be 3 or greater' - opts += ' -m %s' % options.splice_mismatches - opts += ' -i %s' % options.min_intron_length - opts += ' -I %s' % options.max_intron_length - if float( options.junction_filter ) != 0.0: - opts += ' -F %s' % options.junction_filter - opts += ' -g %s' % options.max_multihits - # Custom junctions options. - if options.gene_model_annotations: - opts += ' -G %s' % options.gene_model_annotations - if options.raw_juncs: - opts += ' -j %s' % options.raw_juncs - if options.no_novel_juncs: - opts += ' --no-novel-juncs' - if options.library_type: - opts += ' --library-type %s' % options.library_type - if options.allow_indels: - # Max options do not work for Tophat v1.2.0, despite documentation to the contrary. 
- opts += ' --allow-indels' - #opts += ' --max-insertion-length %i --max-deletion-length %i' % ( int( options.max_insertion_length ), int( options.max_deletion_length ) ) - # need to warn user of this fact - sys.stdout.write( "Max insertion length and max deletion length options don't work in Tophat v1.2.0\n" ) - - # Search type options. - if options.coverage_search: - opts += ' --coverage-search --min-coverage-intron %s --max-coverage-intron %s' % ( options.min_coverage_intron, options.max_coverage_intron ) - else: - opts += ' --no-coverage-search' - if options.closure_search: - opts += ' --closure-search --min-closure-exon %s --min-closure-intron %s --max-closure-intron %s' % ( options.min_closure_exon, options.min_closure_intron, options.max_closure_intron ) - else: - opts += ' --no-closure-search' - if options.microexon_search: - opts += ' --microexon-search' - if options.single_paired == 'paired': - opts += ' --mate-std-dev %s' % options.mate_std_dev - if options.seg_mismatches: - opts += ' --segment-mismatches %d' % int( options.seg_mismatches ) - if options.seg_length: - opts += ' --segment-length %d' % int( options.seg_length ) - if options.min_segment_intron: - opts += ' --min-segment-intron %d' % int( options.min_segment_intron ) - if options.max_segment_intron: - opts += ' --max-segment-intron %d' % int( options.max_segment_intron ) - cmd = cmd % ( opts, index_path, reads ) - except Exception, e: - # Clean up temp dirs - if os.path.exists( tmp_index_dir ): - shutil.rmtree( tmp_index_dir ) - stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) ) - #print cmd - - # Run - try: - tmp_out = tempfile.NamedTemporaryFile().name - tmp_stdout = open( tmp_out, 'wb' ) - tmp_err = tempfile.NamedTemporaryFile().name - tmp_stderr = open( tmp_err, 'wb' ) - proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) - returncode = proc.wait() - tmp_stderr.close() - # get stderr, allowing for case where it's very large - tmp_stderr = open( tmp_err, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stdout.close() - tmp_stderr.close() - if returncode != 0: - raise Exception, stderr - - # Copy output files from tmp directory to specified files. - shutil.copyfile( os.path.join( "tophat_out", "junctions.bed" ), options.junctions_output_file ) - shutil.copyfile( os.path.join( "tophat_out", "accepted_hits.bam" ), options.accepted_hits_output_file ) - - # TODO: look for errors in program output. - except Exception, e: - stop_err( 'Error in tophat:\n' + str( e ) ) - - # Clean up temp dirs - if os.path.exists( tmp_index_dir ): - shutil.rmtree( tmp_index_dir ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/tophat_wrapper.xml --- a/tools/ngs_rna/tophat_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,680 +0,0 @@ - - Find splice junctions using RNA-seq data - tophat --version - - tophat - - - tophat_wrapper.py - ## Change this to accommodate the number of threads you have available. - --num-threads="4" - - ## Provide outputs. - --junctions-output=$junctions - --hits-output=$accepted_hits - - ## Handle reference file. 
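The wrapper above fills the 'tophat %s %s %s' template from an incrementally grown option string and runs it through the shell. An equivalent argument-list form, sketched here with illustrative names, sidesteps shell quoting entirely::

    def build_tophat_argv(opts, index_path, read_files):
        # opts is a flat list such as ['-p', '4', '-r', '200']
        return ['tophat'] + list(opts) + [index_path] + list(read_files)

    # subprocess.Popen(build_tophat_argv(['-p', '4'], 'hg18', ['reads_1.fastq']))
    # would run the same command without shell=True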
- #if $refGenomeSource.genomeSource == "history": - --own-file=$refGenomeSource.ownFile - #else: - --indexes-path="${ filter( lambda x: str( x[0] ) == str( $refGenomeSource.index ), $__app__.tool_data_tables[ 'tophat_indexes' ].get_fields() )[0][-1] }" - #end if - - ## Are reads single-end or paired? - --single-paired=$singlePaired.sPaired - - ## First input file always required. - --input1=$input1 - - ## Set params based on whether reads are single-end or paired. - #if $singlePaired.sPaired == "single": - --settings=$singlePaired.sParams.sSettingsType - #if $singlePaired.sParams.sSettingsType == "full": - -a $singlePaired.sParams.anchor_length - -m $singlePaired.sParams.splice_mismatches - -i $singlePaired.sParams.min_intron_length - -I $singlePaired.sParams.max_intron_length - -F $singlePaired.sParams.junction_filter - -g $singlePaired.sParams.max_multihits - --min-segment-intron $singlePaired.sParams.min_segment_intron - --max-segment-intron $singlePaired.sParams.max_segment_intron - --seg-mismatches=$singlePaired.sParams.seg_mismatches - --seg-length=$singlePaired.sParams.seg_length - --library-type=$singlePaired.sParams.library_type - - ## Indel search. - #if $singlePaired.sParams.indel_search.allow_indel_search == "Yes": - --allow-indels - --max-insertion-length $singlePaired.sParams.indel_search.max_insertion_length - --max-deletion-length $singlePaired.sParams.indel_search.max_deletion_length - #end if - - ## Supplying junctions parameters. - #if $singlePaired.sParams.own_junctions.use_junctions == "Yes": - #if $singlePaired.sParams.own_junctions.gene_model_ann.use_annotations == "Yes": - -G $singlePaired.sParams.own_junctions.gene_model_ann.gene_annotation_model - #end if - #if $singlePaired.sParams.own_junctions.raw_juncs.use_juncs == "Yes": - -j $singlePaired.sParams.own_junctions.raw_juncs.raw_juncs - #end if - ## TODO: No idea why a string cast is necessary, but it is: - #if str($singlePaired.sParams.own_junctions.no_novel_juncs) == "Yes": - --no-novel-juncs - #end if - #end if - - #if $singlePaired.sParams.closure_search.use_search == "Yes": - --closure-search - --min-closure-exon $singlePaired.sParams.closure_search.min_closure_exon - --min-closure-intron $singlePaired.sParams.closure_search.min_closure_intron - --max-closure-intron $singlePaired.sParams.closure_search.max_closure_intron - #else: - --no-closure-search - #end if - #if $singlePaired.sParams.coverage_search.use_search == "Yes": - --coverage-search - --min-coverage-intron $singlePaired.sParams.coverage_search.min_coverage_intron - --max-coverage-intron $singlePaired.sParams.coverage_search.max_coverage_intron - #else: - --no-coverage-search - #end if - ## TODO: No idea why the type conversion is necessary, but it seems to be. 
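For orientation, when the single-end "full" branch above is taken with TopHat's documented defaults, the template expands to a flat call along these lines (paths and values are illustrative, drawn from the defaults in the parameter list further below)::

    tophat_wrapper.py --num-threads="4" --junctions-output=junctions.bed
        --hits-output=accepted_hits.bam --input1=reads.fastq
        --single-paired=single --settings=full
        -a 8 -m 0 -i 70 -I 500000 -F 0.15 -g 40
        --min-segment-intron 50 --max-segment-intron 500000
        --seg-mismatches=2 --seg-length=25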
- #if str($singlePaired.sParams.microexon_search) == "Yes": - --microexon-search - #end if - #end if - #else: - --input2=$singlePaired.input2 - -r $singlePaired.mate_inner_distance - --settings=$singlePaired.pParams.pSettingsType - #if $singlePaired.pParams.pSettingsType == "full": - --mate-std-dev=$singlePaired.pParams.mate_std_dev - -a $singlePaired.pParams.anchor_length - -m $singlePaired.pParams.splice_mismatches - -i $singlePaired.pParams.min_intron_length - -I $singlePaired.pParams.max_intron_length - -F $singlePaired.pParams.junction_filter - -g $singlePaired.pParams.max_multihits - --min-segment-intron $singlePaired.pParams.min_segment_intron - --max-segment-intron $singlePaired.pParams.max_segment_intron - --seg-mismatches=$singlePaired.pParams.seg_mismatches - --seg-length=$singlePaired.pParams.seg_length - --library-type=$singlePaired.pParams.library_type - - ## Indel search. - #if $singlePaired.pParams.indel_search.allow_indel_search == "Yes": - --allow-indels - --max-insertion-length $singlePaired.pParams.indel_search.max_insertion_length - --max-deletion-length $singlePaired.pParams.indel_search.max_deletion_length - #end if - - ## Supplying junctions parameters. - #if $singlePaired.pParams.own_junctions.use_junctions == "Yes": - #if $singlePaired.pParams.own_junctions.gene_model_ann.use_annotations == "Yes": - -G $singlePaired.pParams.own_junctions.gene_model_ann.gene_annotation_model - #end if - #if $singlePaired.pParams.own_junctions.raw_juncs.use_juncs == "Yes": - -j $singlePaired.pParams.own_junctions.raw_juncs.raw_juncs - #end if - ## TODO: No idea why type cast is necessary, but it is: - #if str($singlePaired.pParams.own_junctions.no_novel_juncs) == "Yes": - --no-novel-juncs - #end if - #end if - - #if $singlePaired.pParams.closure_search.use_search == "Yes": - --closure-search - --min-closure-exon $singlePaired.pParams.closure_search.min_closure_exon - --min-closure-intron $singlePaired.pParams.closure_search.min_closure_intron - --max-closure-intron $singlePaired.pParams.closure_search.max_closure_intron - #else: - --no-closure-search - #end if - #if $singlePaired.pParams.coverage_search.use_search == "Yes": - --coverage-search - --min-coverage-intron $singlePaired.pParams.coverage_search.min_coverage_intron - --max-coverage-intron $singlePaired.pParams.coverage_search.max_coverage_intron - #else: - --no-coverage-search - #end if - ## TODO: No idea why the type conversion is necessary, but it seems to be. 
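The -r value forwarded in the paired-end branch above is the expected inner distance between mates, not the fragment length itself. A one-line sketch of the arithmetic, using the worked figures from the parameter list below::

    def inner_mate_distance(fragment_length, read_length):
        # e.g. 300bp fragments sequenced 50bp from each end: 300 - 2*50 = 200
        return fragment_length - 2 * read_length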
- #if str ($singlePaired.pParams.microexon_search) == "Yes": - --microexon-search - #end if - #end if - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ( - ( ( 'sParams' in singlePaired ) and ( 'indel_search' in singlePaired['sParams'] ) and - ( singlePaired['sParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) or - ( ( 'pParams' in singlePaired ) and ( 'indel_search' in singlePaired['pParams'] ) and - ( singlePaired['pParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) - ) - - - - - - - - - - - - - - - - - - ( - ( ( 'sParams' in singlePaired ) and ( 'indel_search' in singlePaired['sParams'] ) and - ( singlePaired['sParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) or - ( ( 'pParams' in singlePaired ) and ( 'indel_search' in singlePaired['pParams'] ) and - ( singlePaired['pParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) - ) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Tophat Overview** - -TopHat_ is a fast splice junction mapper for RNA-Seq reads. It aligns RNA-Seq reads to mammalian-sized genomes using the ultra high-throughput short read aligner Bowtie, and then analyzes the mapping results to identify splice junctions between exons. Please cite: Trapnell, C., Pachter, L. and Salzberg, S.L. TopHat: discovering splice junctions with RNA-Seq. Bioinformatics 25, 1105-1111 (2009). - -.. _Tophat: http://tophat.cbcb.umd.edu/ - ------- - -**Know what you are doing** - -.. class:: warningmark - -There is no such thing (yet) as an automated gearshift in splice junction identification. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. - -.. __: http://tophat.cbcb.umd.edu/manual.html - ------- - -**Input formats** - -Tophat accepts files in Sanger FASTQ format. Use the FASTQ Groomer to prepare your files. - ------- - -**Outputs** - -Tophat produces two output files: - -- junctions -- A UCSC BED_ track of junctions reported by TopHat. Each junction consists of two connected BED blocks, where each block is as long as the maximal overhang of any read spanning the junction. The score is the number of alignments spanning the junction. -- accepted_hits -- A list of read alignments in BAM_ format. - -.. _BED: http://genome.ucsc.edu/FAQ/FAQformat.html#format1 -.. _BAM: http://samtools.sourceforge.net/ - -Two other possible outputs, depending on the options you choose, are insertions and deletions, both of which are in BED format. - -------- - -**Tophat settings** - -All of the options have a default value. You can change any of them. Some of the options in Tophat have been implemented here. 
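To make the junctions output described above concrete, a single BED12 record might look like this (coordinates, name, and score invented for illustration)::

    chr1  99800  100300  JUNC00000001  12  +  99800  100300  255,0,0  2  60,55  0,445

The two block sizes (60 and 55) are the maximal overhangs on either side of the junction, the block starts (0 and 445) place the blocks at the junction's flanks, and the score (12) is the number of alignments spanning it.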
- ------- - -**Tophat parameter list** - -This is a list of implemented Tophat options:: - - -r This is the expected (mean) inner distance between mate pairs. For, example, for paired end runs with fragments - selected at 300bp, where each end is 50bp, you should set -r to be 200. There is no default, and this parameter - is required for paired end runs. - --mate-std-dev INT The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp. - -a/--min-anchor-length INT The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction. Note that individual spliced - alignments may span a junction with fewer than this many bases on one side. However, every junction involved in spliced alignments is supported by at least one - read with this many bases on each side. This must be at least 3 and the default is 8. - -m/--splice-mismatches INT The maximum number of mismatches that may appear in the "anchor" region of a spliced alignment. The default is 0. - -i/--min-intron-length INT The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart. The default is 70. - -I/--max-intron-length INT The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read. The default is 500000. - -F/--min-isoform-fraction 0.0-1.0 TopHat filters out junctions supported by too few alignments. Suppose a junction spanning two exons, is supported by S reads. Let the average depth of coverage of - exon A be D, and assume that it is higher than B. If S / D is less than the minimum isoform fraction, the junction is not reported. A value of zero disables the - filter. The default is 0.15. - -g/--max-multihits INT Instructs TopHat to allow up to this many alignments to the reference for a given read, and suppresses all alignments for reads with more than this many - alignments. The default is 40. - -G/--GTF [GTF 2.2 file] Supply TopHat with a list of gene model annotations. TopHat will use the exon records in this file to build a set of known splice junctions for each gene, and will attempt to align reads to these junctions even if they would not normally be covered by the initial mapping. - -j/--raw-juncs [juncs file] Supply TopHat with a list of raw junctions. Junctions are specified one per line, in a tab-delimited format. Records look like: [chrom] [left] [right] [+/-], left and right are zero-based coordinates, and specify the last character of the left sequenced to be spliced to the first character of the right sequence, inclusive. - -no-novel-juncs Only look for junctions indicated in the supplied GFF file. (ignored without -G) - --no-closure-search Disables the mate pair closure-based search for junctions. Currently, has no effect - closure search is off by default. - --closure-search Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (about or less than 50bp) - --no-coverage-search Disables the coverage based search for junctions. - --coverage-search Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity. - --microexon-search With this option, the pipeline will attempt to find alignments incident to microexons. 
Works only for reads 50bp or longer. - --butterfly-search TopHat will use a slower but potentially more sensitive algorithm to find junctions in addition to its standard search. Consider using this if you expect that your experiment produced a lot of reads from pre-mRNA, that fall within the introns of your transcripts. - --segment-mismatches Read segments are mapped independently, allowing up to this many mismatches in each segment alignment. The default is 2. - --segment-length Each read is cut up into segments, each at least this long. These segments are mapped independently. The default is 25. - --min-closure-exon During closure search for paired end reads, exonic hops in the potential splice graph must be at least this long. The default is 50. - --min-closure-intron The minimum intron length that may be found during closure search. The default is 50. - --max-closure-intron The maximum intron length that may be found during closure search. The default is 5000. - --min-coverage-intron The minimum intron length that may be found during coverage search. The default is 50. - --max-coverage-intron The maximum intron length that may be found during coverage search. The default is 20000. - --min-segment-intron The minimum intron length that may be found during split-segment search. The default is 50. - --max-segment-intron The maximum intron length that may be found during split-segment search. The default is 500000. - - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/trinity_all.xml --- a/tools/ngs_rna/trinity_all.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ - - - De novo assembly of RNA-Seq data - - trinity - - - Trinity.pl - - ## Additional parameters. - #if $additional_params.use_additional == "yes": - --min_contig_length $additional_params.min_contig_length - #end if - - ## Inputs. - #if $inputs.paired_or_single == "paired": - --left $inputs.left_input --right $inputs.right_input - #if $inputs.left_input.ext == 'fa': - --seqType fa - #else: - --seqType fq - #end if - #if $inputs.library_type != 'None': - --SS_lib_type $inputs.library_type - #end if - #else: - --single $inputs.input - #if $inputs.input.ext == 'fa': - --seqType fa - #else: - --seqType fq - #end if - #if $inputs.library_type != 'None': - --SS_lib_type $inputs.library_type - #end if - #end if - - ## CPU and butterfly options. - --CPU 4 --run_butterfly --bfly_opts "-V 10 --stderr" > $trinity_log 2>&1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Trinity is a de novo transcript assembler that uses RNA-seq data as input. This tool runs all Trinity_ commands--Inchworm, Chrysalis, and Butterfly--in a single pass. - - .. _Trinity: http://trinityrnaseq.sourceforge.net - - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_simulation/ngs_simulation.py --- a/tools/ngs_simulation/ngs_simulation.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,280 +0,0 @@ -#!/usr/bin/env python - -""" -Runs Ben's simulation. 
- -usage: %prog [options] - -i, --input=i: Input genome (FASTA format) - -g, --genome=g: If built-in, the genome being used - -l, --read_len=l: Read length - -c, --avg_coverage=c: Average coverage - -e, --error_rate=e: Error rate (0-1) - -n, --num_sims=n: Number of simulations to run - -p, --polymorphism=p: Frequency/ies for minor allele (comma-separate list of 0-1) - -d, --detection_thresh=d: Detection thresholds (comma-separate list of 0-1) - -p, --output_png=p: Plot output - -s, --summary_out=s: Whether or not to output a file with summary of all simulations - -m, --output_summary=m: File name for output summary of all simulations - -f, --new_file_path=f: Directory for summary output files - -""" -# removed output of all simulation results on request (not working) -# -r, --sim_results=r: Output all tabular simulation results (number of polymorphisms times number of detection thresholds) -# -o, --output=o: Base name for summary output for each run - -from rpy import * -import os -import random, sys, tempfile -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - # validate parameters - error = '' - try: - read_len = int( options.read_len ) - if read_len <= 0: - raise Exception, ' greater than 0' - except TypeError, e: - error = ': %s' % str( e ) - if error: - stop_err( 'Make sure your number of reads is an integer value%s' % error ) - error = '' - try: - avg_coverage = int( options.avg_coverage ) - if avg_coverage <= 0: - raise Exception, ' greater than 0' - except Exception, e: - error = ': %s' % str( e ) - if error: - stop_err( 'Make sure your average coverage is an integer value%s' % error ) - error = '' - try: - error_rate = float( options.error_rate ) - if error_rate >= 1.0: - error_rate = 10 ** ( -error_rate / 10.0 ) - elif error_rate < 0: - raise Exception, ' between 0 and 1' - except Exception, e: - error = ': %s' % str( e ) - if error: - stop_err( 'Make sure the error rate is a decimal value%s or the quality score is at least 1' % error ) - try: - num_sims = int( options.num_sims ) - except TypeError, e: - stop_err( 'Make sure the number of simulations is an integer value: %s' % str( e ) ) - if len( options.polymorphism ) > 0: - polymorphisms = [ float( p ) for p in options.polymorphism.split( ',' ) ] - else: - stop_err( 'Select at least one polymorphism value to use' ) - if len( options.detection_thresh ) > 0: - detection_threshes = [ float( dt ) for dt in options.detection_thresh.split( ',' ) ] - else: - stop_err( 'Select at least one detection threshold to use' ) - - # mutation dictionaries - hp_dict = { 'A':'G', 'G':'A', 'C':'T', 'T':'C', 'N':'N' } # heteroplasmy dictionary - mt_dict = { 'A':'C', 'C':'A', 'G':'T', 'T':'G', 'N':'N'} # misread dictionary - - # read fasta file to seq string - all_lines = open( options.input, 'rb' ).readlines() - seq = '' - for line in all_lines: - line = line.rstrip() - if line.startswith('>'): - pass - else: - seq += line.upper() - seq_len = len( seq ) - - # output file name template -# removed output of all simulation results on request (not working) -# if options.sim_results == "true": -# out_name_template = os.path.join( options.new_file_path, 'primary_output%s_' + options.output + '_visible_tabular' ) -# else: -# out_name_template = tempfile.NamedTemporaryFile().name + '_%s' - out_name_template = 
tempfile.NamedTemporaryFile().name + '_%s' - print 'out_name_template:', out_name_template - - # set up output files - outputs = {} - i = 1 - for p in polymorphisms: - outputs[ p ] = {} - for d in detection_threshes: - outputs[ p ][ d ] = out_name_template % i - i += 1 - - # run sims - for polymorphism in polymorphisms: - for detection_thresh in detection_threshes: - output = open( outputs[ polymorphism ][ detection_thresh ], 'wb' ) - output.write( 'FP\tFN\tGENOMESIZE=%s\n' % seq_len ) - sim_count = 0 - while sim_count < num_sims: - # randomly pick heteroplasmic base index - hbase = random.choice( range( 0, seq_len ) ) - #hbase = seq_len/2#random.randrange( 0, seq_len ) - # create 2D quasispecies list - qspec = map( lambda x: [], [0] * seq_len ) - # simulate read indices and assign to quasispecies - i = 0 - while i < ( avg_coverage * ( seq_len / read_len ) ): # number of reads (approximates coverage) - start = random.choice( range( 0, seq_len ) ) - #start = seq_len/2#random.randrange( 0, seq_len ) # assign read start - if random.random() < 0.5: # positive sense read - end = start + read_len # assign read end - if end > seq_len: # overshooting origin - read = range( start, seq_len ) + range( 0, ( end - seq_len ) ) - else: # regular read - read = range( start, end ) - else: # negative sense read - end = start - read_len # assign read end - if end < -1: # overshooting origin - read = range( start, -1, -1) + range( ( seq_len - 1 ), ( seq_len + end ), -1 ) - else: # regular read - read = range( start, end, -1 ) - # assign read to quasispecies list by index - for j in read: - if j == hbase and random.random() < polymorphism: # heteroplasmic base is variant with p = het - ref = hp_dict[ seq[ j ] ] - else: # ref is the verbatim reference nucleotide (all positions) - ref = seq[ j ] - if random.random() < error_rate: # base in read is misread with p = err - qspec[ j ].append( mt_dict[ ref ] ) - else: # otherwise we carry ref through to the end - qspec[ j ].append(ref) - # last but not least - i += 1 - bases, fpos, fneg = {}, 0, 0 # last two will be outputted to summary file later - for i, nuc in enumerate( seq ): - cov = len( qspec[ i ] ) - bases[ 'A' ] = qspec[ i ].count( 'A' ) - bases[ 'C' ] = qspec[ i ].count( 'C' ) - bases[ 'G' ] = qspec[ i ].count( 'G' ) - bases[ 'T' ] = qspec[ i ].count( 'T' ) - # calculate max NON-REF deviation - del bases[ nuc ] - maxdev = float( max( bases.values() ) ) / cov - # deal with non-het sites - if i != hbase: - if maxdev >= detection_thresh: # greater than detection threshold = false positive - fpos += 1 - # deal with het sites - if i == hbase: - hnuc = hp_dict[ nuc ] # let's recover het variant - if ( float( bases[ hnuc ] ) / cov ) < detection_thresh: # less than detection threshold = false negative - fneg += 1 - del bases[ hnuc ] # ignore het variant - maxdev = float( max( bases.values() ) ) / cov # check other non-ref bases at het site - if maxdev >= detection_thresh: # greater than detection threshold = false positive (possible) - fpos += 1 - # output error sums and genome size to summary file - output.write( '%d\t%d\n' % ( fpos, fneg ) ) - sim_count += 1 - # close output up - output.close() - - # Parameters (heteroplasmy, error threshold, colours) - r( ''' - het=c(%s) - err=c(%s) - grade = (0:32)/32 - hues = rev(gray(grade)) - ''' % ( ','.join( [ str( p ) for p in polymorphisms ] ), ','.join( [ str( d ) for d in detection_threshes ] ) ) ) - - # Suppress warnings - r( 'options(warn=-1)' ) - - # Create allsum (for FP) and allneg (for FN) objects - r( 'allsum <- 
data.frame()' ) - for polymorphism in polymorphisms: - for detection_thresh in detection_threshes: - output = outputs[ polymorphism ][ detection_thresh ] - cmd = ''' - ngsum = read.delim('%s', header=T) - ngsum$fprate <- ngsum$FP/%s - ngsum$hetcol <- %s - ngsum$errcol <- %s - allsum <- rbind(allsum, ngsum) - ''' % ( output, seq_len, polymorphism, detection_thresh ) - r( cmd ) - - if os.path.getsize( output ) == 0: - for p in outputs.keys(): - for d in outputs[ p ].keys(): - sys.stderr.write(outputs[ p ][ d ] + ' '+str( os.path.getsize( outputs[ p ][ d ] ) )+'\n') - - if options.summary_out == "true": - r( 'write.table(summary(ngsum), file="%s", quote=FALSE, sep="\t", row.names=FALSE)' % options.output_summary ) - - # Summary objects (these could be printed) - r( ''' - tr_pos <- tapply(allsum$fprate,list(allsum$hetcol,allsum$errcol), mean) - tr_neg <- tapply(allsum$FN,list(allsum$hetcol,allsum$errcol), mean) - cat('\nFalse Positive Rate Summary\n\t', file='%s', append=T, sep='\t') - write.table(format(tr_pos, digits=4), file='%s', append=T, quote=F, sep='\t') - cat('\nFalse Negative Rate Summary\n\t', file='%s', append=T, sep='\t') - write.table(format(tr_neg, digits=4), file='%s', append=T, quote=F, sep='\t') - ''' % tuple( [ options.output_summary ] * 4 ) ) - - # Setup graphs - #pdf(paste(prefix,'_jointgraph.pdf',sep=''), 15, 10) - r( ''' - png('%s', width=800, height=500, units='px', res=250) - layout(matrix(data=c(1,2,1,3,1,4), nrow=2, ncol=3), widths=c(4,6,2), heights=c(1,10,10)) - ''' % options.output_png ) - - # Main title - genome = '' - if options.genome: - genome = '%s: ' % options.genome - r( ''' - par(mar=c(0,0,0,0)) - plot(1, type='n', axes=F, xlab='', ylab='') - text(1,1,paste('%sVariation in False Positives and Negatives (', %s, ' simulations, coverage ', %s,')', sep=''), font=2, family='sans', cex=0.7) - ''' % ( genome, options.num_sims, options.avg_coverage ) ) - - # False positive boxplot - r( ''' - par(mar=c(5,4,2,2), las=1, cex=0.35) - boxplot(allsum$fprate ~ allsum$errcol, horizontal=T, ylim=rev(range(allsum$fprate)), cex.axis=0.85) - title(main='False Positives', xlab='false positive rate', ylab='') - ''' ) - - # False negative heatmap (note zlim command!) 
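The tapply calls above collapse the per-simulation results to one mean per (minor allele frequency, detection threshold) cell before plotting. The same reduction in plain Python, sketched over rows of (het, err, value) tuples::

    from collections import defaultdict

    def mean_by_cell(rows):
        # rows: iterable of (het, err, value)
        sums = defaultdict(lambda: [0.0, 0])
        for het, err, value in rows:
            sums[(het, err)][0] += value
            sums[(het, err)][1] += 1
        return dict((cell, total / count) for cell, (total, count) in sums.items())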
- num_polys = len( polymorphisms ) - num_dets = len( detection_threshes ) - r( ''' - par(mar=c(5,4,2,1), las=1, cex=0.35) - image(1:%s, 1:%s, tr_neg, zlim=c(0,1), col=hues, xlab='', ylab='', axes=F, border=1) - axis(1, at=1:%s, labels=rownames(tr_neg), lwd=1, cex.axis=0.85, axs='i') - axis(2, at=1:%s, labels=colnames(tr_neg), lwd=1, cex.axis=0.85) - title(main='False Negatives', xlab='minor allele frequency', ylab='detection threshold') - ''' % ( num_polys, num_dets, num_polys, num_dets ) ) - - # Scale alongside - r( ''' - par(mar=c(2,2,2,3), las=1) - image(1, grade, matrix(grade, ncol=length(grade), nrow=1), col=hues, xlab='', ylab='', xaxt='n', las=1, cex.axis=0.85) - title(main='Key', cex=0.35) - mtext('false negative rate', side=1, cex=0.35) - ''' ) - - # Close graphics - r( ''' - layout(1) - dev.off() - ''' ) - - # Tidy up -# r( 'rm(folder,prefix,sim,cov,het,err,grade,hues,i,j,ngsum)' ) - -if __name__ == "__main__" : __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_simulation/ngs_simulation.xml --- a/tools/ngs_simulation/ngs_simulation.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,217 +0,0 @@ - - - Illumina runs - - ngs_simulation.py - #if $in_type.input_type == "built-in" - --input="${ filter( lambda x: str( x[0] ) == str( $in_type.genome ), $__app__.tool_data_tables[ 'ngs_sim_fasta' ].get_fields() )[0][-1] }" - --genome=$in_type.genome - #else - --input=$in_type.input1 - #end if - --read_len=$read_len - --avg_coverage=$avg_coverage - --error_rate=$error_rate - --num_sims=$num_sims - --polymorphism=$polymorphism - --detection_thresh=$detection_thresh - --output_png=$output_png - --summary_out=$summary_out - --output_summary=$output_summary - --new_file_path=$__new_file_path__ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - summary_out == True - - - - - - - - - -**What it does** - -This tool simulates an Illumina run and provides plots of false positives and false negatives. It allows for a range of simulation parameters to be set. Note that this simulation sets only one (randomly chosen) position in the genome as polymorphic, according to the value specified. Superimposed on this are "sequencing errors", which are uniformly (and randomly) distributed. Polymorphisms are assigned using the detection threshold, so if the detection threshold is set to the same as the minor allele frequency, the expected false negative rate is 50%. - -**Parameter list** - -These are the parameters that should be set for the simulation:: - - Read length (which is the same for all reads) - Average Coverage - Frequency for Minor Allele - Sequencing Error Rate - Detection Threshold - Number of Simulations - -You also should choose to use either a built-in genome or supply your own FASTA file. - -**Output** - -There are one or two. The first is a png that contains two different plots and is always generated. The second is optional and is a text file with some summary information about the simulations that were run. Below are some example outputs for a 10-simulation run on phiX with the default settings:: - - Read length 76 - Average coverage 200 - Error rate/quality score 0.001 - Number of simulations 100 - Frequencies for minor allele 0.002 - 0.004 - Detection thresholds 0.003 - 0.005 - 0.007 - Include summary file Yes - -Plot output (png): - -.. 
image:: ./static/images/ngs_simulation.png - -Summary output (txt):: - - FP FN GENOMESIZE.5386 fprate hetcol errcol - Min. : 71.0 Min. :0.0 Mode:logical Min. :0.01318 Min. :0.004 Min. :0.007 - 1st Qu.:86.0 1st Qu.:1.0 NA's:10 1st Qu.:0.01597 1st Qu.:0.004 1st Qu.:0.007 - Median :92.5 Median :1.0 NA Median :0.01717 Median :0.004 Median :0.007 - Mean :93.6 Mean :0.9 NA Mean :0.01738 Mean :0.004 Mean :0.007 - 3rd Qu.:100.8 3rd Qu.:1.0 NA 3rd Qu.:0.01871 3rd Qu.:0.004 3rd Qu.:0.007 - Max. :123.0 Max. :1.0 NA Max. :0.02284 Max. :0.004 Max. :0.007 - - False Positive Rate Summary - 0.003 0.005 0.007 - 0.001 0.17711 0.10854 0.01673 - 0.009 0.18049 0.10791 0.01738 - - False Negative Rate Summary - 0.003 0.005 0.007 - 0.001 1.0 0.8 1.0 - 0.009 0.4 0.7 0.9 - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/peak_calling/ccat_2_wrapper.xml --- a/tools/peak_calling/ccat_2_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ - - Control-based ChIP-seq Analysis Tool - ccat_wrapper.py '$input_tag_file' '$input_control_file' '$chromInfo' - #if str( $options_type[ 'options_type_selector' ] ) == 'advanced': - '$input_advanced_config_file' - #else: - '${ options_type.input_config_file.fields.path }' - #end if - 'CCAT in Galaxy' - '$output_peak_file' '$output_region_file' '$output_top_file' '$output_log_file' - - CCAT - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #if str( $options_type['options_type_selector' ] ) == 'advanced': -fragmentSize ${options_type[ 'fragment_size' ]} -slidingWinSize ${options_type[ 'sliding_window_size' ]} -movingStep ${options_type[ 'moving_step' ]} -isStrandSensitiveMode ${options_type[ 'is_strand_sensitive_mode' ]} -minCount ${options_type[ 'min_count' ]} -outputNum ${options_type[ 'output_num' ]} -randomSeed ${options_type[ 'random_seed' ]} -minScore ${options_type[ 'min_score' ]} -bootstrapPass ${options_type[ 'bootstrap_pass' ]} -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows ChIP-seq peak/region calling using CCAT. - -View the original CCAT documentation: http://cmb.gis.a-star.edu.sg/ChIPSeq/paperCCAT.htm. - - diff -r c2a356708570 -r 33c067c3ae34 tools/peak_calling/ccat_wrapper.py --- a/tools/peak_calling/ccat_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -import sys, subprocess, tempfile, shutil, os.path - -CCAT_BINARY = "CCAT" - -def get_top_count( filename ): - for line in open( filename ): - if line.startswith( 'outputNum' ): - return int( line.split()[-1].strip() ) - -def stop_err( tmp_dir, exception ): - print >> sys.stderr, "Error running CCAT." 
- shutil.rmtree( tmp_dir ) #some error has occurred, provide info and remove possibly non-empty temp directory - raise exception - -def main(): - input_tag_file = sys.argv[1] - input_control_file = sys.argv[2] - chrom_info_file = sys.argv[3] - input_config_file = sys.argv[4] - project_name = sys.argv[5] - output_peak_file = sys.argv[6] - output_region_file = sys.argv[7] - output_top_file = sys.argv[8] - output_log_file = sys.argv[9] - - tmp_dir = tempfile.mkdtemp() - try: - proc = subprocess.Popen( args="%s %s > %s" % ( CCAT_BINARY, " ".join( map( lambda x: "'%s'" % x, [ input_tag_file, input_control_file, chrom_info_file, input_config_file, project_name ] ) ), output_log_file ), shell=True, cwd=tmp_dir ) - proc.wait() - if proc.returncode: - raise Exception( "Error code: %i" % proc.returncode ) - output_num = get_top_count( input_config_file ) - shutil.move( os.path.join( tmp_dir, "%s.significant.peak" % project_name ), output_peak_file ) - shutil.move( os.path.join( tmp_dir, "%s.significant.region" % project_name ), output_region_file ) - shutil.move( os.path.join( tmp_dir, "%s.top%i.peak" % ( project_name, output_num ) ), output_top_file ) - except Exception, e: - return stop_err( tmp_dir, e ) - os.rmdir( tmp_dir ) #clean up empty temp working directory - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/peak_calling/ccat_wrapper.xml --- a/tools/peak_calling/ccat_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ - - Control-based ChIP-seq Analysis Tool - ccat_wrapper.py '$input_tag_file' '$input_control_file' '$chromInfo' - #if str( $options_type[ 'options_type_selector' ] ) == 'advanced': - '$input_advanced_config_file' - #else: - '${ options_type.input_config_file.fields.path }' - #end if - 'CCAT in Galaxy' - '$output_peak_file' '$output_region_file' '$output_top_file' '$output_log_file' - - CCAT - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #if str( $options_type['options_type_selector' ] ) == 'advanced': -fragmentSize ${options_type[ 'fragment_size' ]} -slidingWinSize ${options_type[ 'sliding_window_size' ]} -movingStep ${options_type[ 'moving_step' ]} -isStrandSensitiveMode ${options_type[ 'is_strand_sensitive_mode' ]} -minCount ${options_type[ 'min_count' ]} -outputNum ${options_type[ 'output_num' ]} -randomSeed ${options_type[ 'random_seed' ]} -minScore ${options_type[ 'min_score' ]} -bootstrapPass ${options_type[ 'bootstrap_pass' ]} -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows ChIP-seq peak/region calling using CCAT. - -View the original CCAT documentation: http://cmb.gis.a-star.edu.sg/ChIPSeq/paperCCAT.htm. 
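ccat_wrapper.py above quotes each CCAT argument by hand with a map/lambda before handing the whole string to the shell. The standard library offers sturdier quoting; a sketch (pipes.quote on the Python 2 this codebase targets, shlex.quote on Python 3)::

    try:
        from shlex import quote   # Python 3.3+
    except ImportError:
        from pipes import quote   # Python 2

    def quoted_command(binary, args):
        # quoted_command('CCAT', ['tag file.bed', 'control.bed'])
        return ' '.join([binary] + [quote(str(a)) for a in args])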
- - diff -r c2a356708570 -r 33c067c3ae34 tools/peak_calling/macs_wrapper.py --- a/tools/peak_calling/macs_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,135 +0,0 @@ -import sys, subprocess, tempfile, shutil, glob, os, os.path, gzip -from galaxy import eggs -import pkg_resources -pkg_resources.require( "simplejson" ) -import simplejson - -CHUNK_SIZE = 1024 - -def gunzip_cat_glob_path( glob_path, target_filename, delete = False ): - out = open( target_filename, 'wb' ) - for filename in glob.glob( glob_path ): - fh = gzip.open( filename, 'rb' ) - while True: - data = fh.read( CHUNK_SIZE ) - if data: - out.write( data ) - else: - break - fh.close() - if delete: - os.unlink( filename ) - out.close() - -def xls_to_interval( xls_file, interval_file, header = None ): - out = open( interval_file, 'wb' ) - if header: - out.write( '#%s\n' % header ) - wrote_header = False - #From macs readme: Coordinates in XLS is 1-based which is different with BED format. - for line in open( xls_file ): - #keep all existing comment lines - if line.startswith( '#' ): - out.write( line ) - elif not wrote_header: - out.write( '#%s' % line ) - wrote_header = True - else: - fields = line.split( '\t' ) - if len( fields ) > 1: - fields[1] = str( int( fields[1] ) - 1 ) - out.write( '\t'.join( fields ) ) - out.close() - -def main(): - options = simplejson.load( open( sys.argv[1] ) ) - output_bed = sys.argv[2] - output_extra_html = sys.argv[3] - output_extra_path = sys.argv[4] - - experiment_name = '_'.join( options['experiment_name'].split() ) #save experiment name here, it will be used by macs for filenames (gzip of wig files will fail with spaces - macs doesn't properly escape them)..need to replace all whitespace, split makes this easier - cmdline = "macs -t %s" % ",".join( options['input_chipseq'] ) - if options['input_control']: - cmdline = "%s -c %s" % ( cmdline, ",".join( options['input_control'] ) ) - cmdline = "%s --format='%s' --name='%s' --gsize='%s' --tsize='%s' --bw='%s' --pvalue='%s' --mfold='%s' %s --lambdaset='%s' %s" % ( cmdline, options['format'], experiment_name, options['gsize'], options['tsize'], options['bw'], options['pvalue'], options['mfold'], options['nolambda'], options['lambdaset'], options['futurefdr'] ) - if 'wig' in options: - wigextend = int( options['wig']['wigextend'] ) - if wigextend >= 0: - wigextend = "--wigextend='%s'" % wigextend - else: - wigextend = '' - cmdline = "%s --wig %s --space='%s'" % ( cmdline, wigextend, options['wig']['space'] ) - if 'nomodel' in options: - cmdline = "%s --nomodel --shiftsize='%s'" % ( cmdline, options['nomodel'] ) - if 'diag' in options: - cmdline = "%s --diag --fe-min='%s' --fe-max='%s' --fe-step='%s'" % ( cmdline, options['diag']['fe-min'], options['diag']['fe-max'], options['diag']['fe-step'] ) - - tmp_dir = tempfile.mkdtemp() #macs makes very messy output, need to contain it into a temp dir, then provide to user - stderr_name = tempfile.NamedTemporaryFile().name # redirect stderr here, macs provides lots of info via stderr, make it into a report - proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir, stderr=open( stderr_name, 'wb' ) ) - proc.wait() - #We don't want to set tool run to error state if only warnings or info, e.g. mfold could be decreased to improve model, but let user view macs log - #Do not terminate if error code, allow dataset (e.g. 
log) creation and cleanup - if proc.returncode: - stderr_f = open( stderr_name ) - while True: - chunk = stderr_f.read( CHUNK_SIZE ) - if not chunk: - stderr_f.close() - break - sys.stderr.write( chunk ) - - #run R to create pdf from model script - if os.path.exists( os.path.join( tmp_dir, "%s_model.r" % experiment_name ) ): - cmdline = 'R --vanilla --slave < "%s_model.r" > "%s_model.r.log"' % ( experiment_name, experiment_name ) - proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir ) - proc.wait() - - - #move bed out to proper output file - created_bed_name = os.path.join( tmp_dir, "%s_peaks.bed" % experiment_name ) - if os.path.exists( created_bed_name ): - shutil.move( created_bed_name, output_bed ) - - #parse xls files to interval files as needed - if options['xls_to_interval']: - create_peak_xls_file = os.path.join( tmp_dir, '%s_peaks.xls' % experiment_name ) - if os.path.exists( create_peak_xls_file ): - xls_to_interval( create_peak_xls_file, options['xls_to_interval']['peaks_file'], header = 'peaks file' ) - create_peak_xls_file = os.path.join( tmp_dir, '%s_negative_peaks.xls' % experiment_name ) - if os.path.exists( create_peak_xls_file ): - xls_to_interval( create_peak_xls_file, options['xls_to_interval']['negative_peaks_file'], header = 'negative peaks file' ) - - #merge and move wig files as needed, delete gz'd files and remove emptied dirs - if 'wig' in options: - wig_base_dir = os.path.join( tmp_dir, "%s_MACS_wiggle" % experiment_name ) - if os.path.exists( wig_base_dir ): - #treatment - treatment_dir = os.path.join( wig_base_dir, "treat" ) - if os.path.exists( treatment_dir ): - gunzip_cat_glob_path( os.path.join( treatment_dir, "*.wig.gz" ), options['wig']['output_treatment_file'], delete = True ) - os.rmdir( treatment_dir ) - #control - if options['input_control']: - control_dir = os.path.join( wig_base_dir, "control" ) - if os.path.exists( control_dir ): - gunzip_cat_glob_path( os.path.join( control_dir, "*.wig.gz" ), options['wig']['output_control_file'], delete = True ) - os.rmdir( control_dir ) - os.rmdir( wig_base_dir ) - - #move all remaining files to extra files path of html file output to allow user download - out_html = open( output_extra_html, 'wb' ) - out_html.write( 'Additional output created by MACS (%s)
</title></head><body><h3>Additional Files:</h3><p><ul>\n' % experiment_name ) - os.mkdir( output_extra_path ) - for filename in sorted( os.listdir( tmp_dir ) ): - shutil.move( os.path.join( tmp_dir, filename ), os.path.join( output_extra_path, filename ) ) - out_html.write( '<li><a href="%s">%s</a></li>\n' % ( filename, filename ) ) - out_html.write( '</ul></p>\n' ) - out_html.write( '<h3>Messages from MACS:</h3>\n<p>%s</p>
\n' % open( stderr_name, 'rb' ).read() ) - out_html.write( '\n' ) - out_html.close() - - os.unlink( stderr_name ) - os.rmdir( tmp_dir ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/peak_calling/macs_wrapper.xml --- a/tools/peak_calling/macs_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,230 +0,0 @@ - - Model-based Analysis of ChIP-Seq - macs_wrapper.py $options_file $output_bed_file $output_extra_files $output_extra_files.files_path - - macs - macs - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - xls_to_interval is True - - - xls_to_interval is True - input_type['input_control_file1'] is not None - - - wig_type['wig_type_selector']=='wig' - - - wig_type['wig_type_selector'] == 'wig' - input_type['input_control_file1'] is not None - - - - - <% -import simplejson -%> -#set $__options = { 'experiment_name':str( $experiment_name ), 'gsize':int( float( str( $gsize ) ) ), 'tsize':str( $tsize ), 'bw':str( $bw ), 'pvalue':str( $pvalue ), 'mfold':str( $mfold ), 'nolambda':str( $nolambda ), 'lambdaset': str( $lambdaset ), 'futurefdr':str( $futurefdr ) } -#if str( $xls_to_interval ) == 'create': -#set $__options['xls_to_interval'] = { 'peaks_file': str( $output_xls_to_interval_peaks_file ), 'negative_peaks_file': str( $output_xls_to_interval_negative_peaks_file ) } -#else: -#set $__options['xls_to_interval'] = False -#end if -##treatment/tag input files and format -#set $__options['input_chipseq'] = [ str( $input_type['input_chipseq_file1'] ) ] -#if $input_type['input_type_selector'] == 'paired_end': -#set $_hole = __options['input_chipseq'].append( str( $input_type['input_chipseq_file2'] ) ) -#set $__options['format'] = 'ELANDMULTIPET' -#else: -#set $__options['format'] = $input_type['input_chipseq_file1'].extension.upper() -#end if -##control/input files -#set $__options['input_control'] = [] -#if str( $input_type['input_control_file1'] ) != 'None': -#set $_hole = __options['input_control'].append( str( $input_type['input_control_file1'] ) ) -#end if -#if $input_type['input_type_selector'] == 'paired_end' and str( $input_type['input_control_file2'] ) != 'None': -#set $_hole = __options['input_control'].append( str( $input_type['input_control_file2'] ) ) -#end if -##wig options -#if $wig_type['wig_type_selector'] == 'wig': -#set $__options['wig'] = {} -#set $__options['wig']['wigextend'] = str( $wig_type['wigextend'] ) -#set $__options['wig']['space'] = str( $wig_type['space'] ) -#set $__options['wig']['output_treatment_file'] = str( $output_treatment_wig_file ) -#if $input_type['input_control_file1'] is not None: -#set $__options['wig']['output_control_file'] = str( $output_control_wig_file ) -#end if -#end if -##model options -#if $nomodel_type['nomodel_type_selector'] == 'nomodel': -#set $__options['nomodel'] = str( $nomodel_type['shiftsize'] ) -#end if -##diag options -#if $diag_type['diag_type_selector'] == 'diag': -#set $__options['diag'] = { 'fe-min':str( $diag_type['fe-min'] ), 'fe-max':str( $diag_type['fe-max'] ), 'fe-step':str( $diag_type['fe-step'] ) } -#end if -${ simplejson.dumps( __options ) } - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows ChIP-seq peak calling using MACS. 
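The wrapper receives its settings as a JSON document written by the configfile template above; the consuming side mirrors macs_wrapper.py (simplejson as used in this codebase; the stdlib json module behaves the same)::

    import simplejson

    options = simplejson.load(open('options_file.json'))  # path is illustrative
    cmdline = 'macs -t %s' % ','.join(options['input_chipseq'])
    if options['input_control']:
        cmdline = '%s -c %s' % (cmdline, ','.join(options['input_control']))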
- -Depending upon selected options, 2 to 6 history items will be created; the first output will be a standard BED file and the last will be an HTML report containing links to download additional files generated by MACS. Up to two each of wig and interval files can be optionally created; the interval files are parsed from the xls output. - -View the original MACS documentation: http://liulab.dfci.harvard.edu/MACS/00README.html. - - diff -r c2a356708570 -r 33c067c3ae34 tools/peak_calling/sicer_wrapper.py --- a/tools/peak_calling/sicer_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,156 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg - -""" -A wrapper script for running SICER (spatial clustering approach for the identification of ChIP-enriched regions) region caller. -""" - -import sys, optparse, os, tempfile, subprocess, shutil - -CHUNK_SIZE = 2**20 #1mb - -VALID_BUILDS = [ 'mm8', 'mm9', 'hg18', 'hg19', 'dm2', 'dm3', 'sacCer1', 'pombe', 'rn4', 'tair8' ] #HACK! FIXME: allow using all specified builds, would currently require hacking SICER's "GenomeData.py" on the fly. - -def cleanup_before_exit( tmp_dir ): - if tmp_dir and os.path.exists( tmp_dir ): - shutil.rmtree( tmp_dir ) - - -def open_file_from_option( filename, mode = 'rb' ): - if filename: - return open( filename, mode = mode ) - return None - -def add_one_to_file_column( filename, column, split_char = "\t", startswith_skip = None ): - tmp_out = tempfile.TemporaryFile( mode='w+b' ) - tmp_in = open( filename ) - for line in tmp_in: - if startswith_skip and line.startswith( startswith_skip ): - tmp_out.write( line ) - else: - fields = line.rstrip( '\n\r' ).split( split_char ) - if len( fields ) <= column: - tmp_out.write( line ) - else: - fields[ column ] = str( int( fields[ column ] ) + 1 ) - tmp_out.write( "%s\n" % ( split_char.join( fields ) ) ) - tmp_in.close() - tmp_out.seek( 0 ) - tmp_in = open( filename, 'wb' ) - while True: - chunk = tmp_out.read( CHUNK_SIZE ) - if chunk: - tmp_in.write( chunk ) - else: - break - tmp_in.close() - tmp_out.close() - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - #stdout/err - parser.add_option( '', '--stdout', dest='stdout', action='store', type="string", default=None, help='If specified, the output of stdout will be written to this file.' ) - parser.add_option( '', '--stderr', dest='stderr', action='store', type="string", default=None, help='If specified, the output of stderr will be written to this file.' ) - parser.add_option( '', '--fix_off_by_one_errors', dest='fix_off_by_one_errors', action='store_true', default=False, help='If specified, fix off-by-one errors in output files' ) - #inputs - parser.add_option( '-b', '--bed_file', dest='bed_file', action='store', type="string", default=None, help='Input ChIP BED file.' ) - parser.add_option( '-c', '--control_file', dest='control_file', action='store', type="string", default=None, help='Input control BED file.' ) - parser.add_option( '-d', '--dbkey', dest='dbkey', action='store', type="string", default=None, help='Input dbkey.' ) - parser.add_option( '-r', '--redundancy_threshold', dest='redundancy_threshold', action='store', type="int", default=1, help='Redundancy Threshold: The number of copies of identical reads allowed in a library.' ) - parser.add_option( '-w', '--window_size', dest='window_size', action='store', type="int", default=200, help='Window size: resolution of SICER algorithm. 
For histone modifications, one can use 200 bp' ) - parser.add_option( '-f', '--fragment_size', dest='fragment_size', action='store', type="int", default=150, help='Fragment size: is for determination of the amount of shift from the beginning of a read to the center of the DNA fragment represented by the read. FRAGMENT_SIZE=150 means the shift is 75.' ) - parser.add_option( '-e', '--effective_genome_fraction', dest='effective_genome_fraction', action='store', type="float", default=0.74, help='Effective genome fraction: Effective Genome as fraction of the genome size. It depends on read length.' ) - parser.add_option( '-g', '--gap_size', dest='gap_size', action='store', type="int", default=600, help='Gap size: needs to be multiples of window size. Namely if the window size is 200, the gap size should be 0, 200, 400, 600, ... .' ) - parser.add_option( '-o', '--error_cut_off', dest='error_cut_off', action='store', type="string", default="0.1", help='Error Cut off: FDR or E-value' ) #read as string to construct names properly - #outputs - parser.add_option( '', '--redundancy_removed_test_bed_output_file', dest='redundancy_removed_test_bed_output_file', action='store', type="string", default=None, help='test-1-removed.bed: redundancy_removed test bed file' ) - parser.add_option( '', '--redundancy_removed_control_bed_output_file', dest='redundancy_removed_control_bed_output_file', action='store', type="string", default=None, help='control-1-removed.bed: redundancy_removed control bed file' ) - parser.add_option( '', '--summary_graph_output_file', dest='summary_graph_output_file', action='store', type="string", default=None, help='test-W200.graph: summary graph file for test-1-removed.bed with window size 200, in bedGraph format.' ) - parser.add_option( '', '--test_normalized_wig_output_file', dest='test_normalized_wig_output_file', action='store', type="string", default=None, help='test-W200-normalized.wig: the above file normalized by library size per million and converted into wig format. This file can be uploaded to the UCSC genome browser' ) - parser.add_option( '', '--score_island_output_file', dest='score_island_output_file', action='store', type="string", default=None, help='test-W200-G600.scoreisland: an intermediate file for debugging usage.' ) - parser.add_option( '', '--islands_summary_output_file', dest='islands_summary_output_file', action='store', type="string", default=None, help='test-W200-G600-islands-summary: summary of all candidate islands with their statistical significance.' ) - parser.add_option( '', '--significant_islands_summary_output_file', dest='significant_islands_summary_output_file', action='store', type="string", default=None, help='test-W200-G600-islands-summary-FDR.01: summary file of significant islands with requirement of FDR=0.01.' ) - parser.add_option( '', '--significant_islands_output_file', dest='significant_islands_output_file', action='store', type="string", default=None, help='test-W200-G600-FDR.01-island.bed: delineation of significant islands in "chrom start end read-count-from-redundancy_removed-test.bed" format' ) - parser.add_option( '', '--island_filtered_output_file', dest='island_filtered_output_file', action='store', type="string", default=None, help='test-W200-G600-FDR.01-islandfiltered.bed: library of raw redundancy_removed reads on significant islands.' 
) - parser.add_option( '', '--island_filtered_normalized_wig_output_file', dest='island_filtered_normalized_wig_output_file', action='store', type="string", default=None, help='test-W200-G600-FDR.01-islandfiltered-normalized.wig: wig file for the island-filtered redundancy_removed reads.' ) - (options, args) = parser.parse_args() - - #check if valid build - assert options.dbkey in VALID_BUILDS, ValueError( "The specified build ('%s') is not available for this tool." % options.dbkey ) - - #everything will occur in this temp directory - tmp_dir = tempfile.mkdtemp() - - #link input files into tmp_dir and build command line - bed_base_filename = 'input_bed_file' - bed_filename = '%s.bed' % bed_base_filename - os.symlink( options.bed_file, os.path.join( tmp_dir, bed_filename ) ) - if options.control_file is not None: - cmd = "SICER.sh" - else: - cmd = "SICER-rb.sh" - cmd = '%s "%s" "%s"' % ( cmd, tmp_dir, bed_filename ) - if options.control_file is not None: - control_base_filename = 'input_control_file' - control_filename = '%s.bed' % control_base_filename - os.symlink( options.control_file, os.path.join( tmp_dir, control_filename ) ) - cmd = '%s "%s"' % ( cmd, control_filename ) - cmd = '%s "%s" "%s" "%i" "%i" "%i" "%f" "%i" "%s"' % ( cmd, tmp_dir, options.dbkey, options.redundancy_threshold, options.window_size, options.fragment_size, options.effective_genome_fraction, options.gap_size, options.error_cut_off ) - - #set up stdout and stderr output options - stdout = open_file_from_option( options.stdout, mode = 'wb' ) - stderr = open_file_from_option( options.stderr, mode = 'wb' ) - #if no stderr file is specified, we'll use our own - if stderr is None: - stderr = tempfile.NamedTemporaryFile( dir=tmp_dir ) - stderr.close() - stderr = open( stderr.name, 'w+b' ) - - proc = subprocess.Popen( args=cmd, stdout=stdout, stderr=stderr, shell=True, cwd=tmp_dir ) - return_code = proc.wait() - - if return_code: - stderr_target = sys.stderr - else: - stderr_target = stdout #sys.stdout - stderr_target.write( "\nAdditionally, these warnings were reported:\n" ) - stderr.flush() - stderr.seek(0) - while True: - chunk = stderr.read( CHUNK_SIZE ) - if chunk: - stderr_target.write( chunk ) - else: - break - stderr.close() - - try: - #move files to where they belong - shutil.move( os.path.join( tmp_dir,'%s-%i-removed.bed' % ( bed_base_filename, options.redundancy_threshold ) ), options.redundancy_removed_test_bed_output_file ) - shutil.move( os.path.join( tmp_dir,'%s-W%i.graph' % ( bed_base_filename, options.window_size ) ), options.summary_graph_output_file ) - if options.fix_off_by_one_errors: add_one_to_file_column( options.summary_graph_output_file, 2 ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-normalized.wig' % ( bed_base_filename, options.window_size ) ), options.test_normalized_wig_output_file ) - if options.control_file is not None: - shutil.move( os.path.join( tmp_dir,'%s-%i-removed.bed' % ( control_base_filename, options.redundancy_threshold ) ), options.redundancy_removed_control_bed_output_file ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i.scoreisland' % ( bed_base_filename, options.window_size, options.gap_size ) ), options.score_island_output_file ) - if options.fix_off_by_one_errors: add_one_to_file_column( options.score_island_output_file, 2 ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-islands-summary' % ( bed_base_filename, options.window_size, options.gap_size ) ), options.islands_summary_output_file ) - if options.fix_off_by_one_errors: add_one_to_file_column( 
options.islands_summary_output_file, 2 ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-islands-summary-FDR%s' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.significant_islands_summary_output_file ) - if options.fix_off_by_one_errors: add_one_to_file_column( options.significant_islands_summary_output_file, 2 ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-FDR%s-island.bed' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.significant_islands_output_file ) - if options.fix_off_by_one_errors: add_one_to_file_column( options.significant_islands_output_file, 2 ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-FDR%s-islandfiltered.bed' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.island_filtered_output_file ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-FDR%s-islandfiltered-normalized.wig' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.island_filtered_normalized_wig_output_file ) - else: - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-E%s.scoreisland' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.score_island_output_file ) - if options.fix_off_by_one_errors: add_one_to_file_column( options.score_island_output_file, 2 ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-E%s-islandfiltered.bed' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.island_filtered_output_file ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-E%s-islandfiltered-normalized.wig' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.island_filtered_normalized_wig_output_file ) - except Exception, e: - raise e - finally: - cleanup_before_exit( tmp_dir ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/peak_calling/sicer_wrapper.xml --- a/tools/peak_calling/sicer_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,176 +0,0 @@ - - Statistical approach for the Identification of ChIP-Enriched Regions - sicer_wrapper.py - --bed_file '${input_bed_file}' - #if str( $input_control_file ) != 'None': - --control_file '${input_control_file}' - --significant_islands_output_file "${significant_islands_output_file}" - --islands_summary_output_file "${islands_summary_output_file}" - --significant_islands_summary_output_file "${significant_islands_summary_output_file}" - #end if - ${fix_off_by_one_errors} - --dbkey '${input_bed_file.dbkey}' - --redundancy_threshold '${redundancy_threshold}' - --window_size '${window_size}' - --fragment_size '${fragment_size}' - --effective_genome_fraction '${effective_genome_fraction}' - --gap_size '${gap_size}' - --error_cut_off '${error_cut_off}' - ##output files - --stdout "${output_log_file}" - --redundancy_removed_test_bed_output_file "${redundancy_removed_test_bed_output_file}" - --redundancy_removed_control_bed_output_file "${redundancy_removed_control_bed_output_file}" - --score_island_output_file "${score_island_output_file}" - --summary_graph_output_file "${summary_graph_output_file}" - --test_normalized_wig_output_file "${test_normalized_wig_output_file}" - --island_filtered_output_file "${island_filtered_output_file}" - --island_filtered_normalized_wig_output_file "${island_filtered_normalized_wig_output_file}" - - - SICER - - - - value.dbkey in [ 'mm8', 'mm9', 'hg18', 
'hg19', 'dm2', 'dm3', 'sacCer1', 'pombe', 'rn4', 'tair8' ] - - - - - - - - - - - - - - - - - - input_control_file is not None - - - - - input_control_file is not None - - - - - - input_control_file is not None - - - input_control_file is not None - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -SICER first and foremost is a filtering tool. Its main functions are:: - - 1. Delineation of the significantly ChIP-enriched regions, which can be used to associate with other genomic landmarks. - 2. Identification of reads on the ChIP-enriched regions, which can be used for profiling and other quantitative analysis. - -View the original SICER documentation: http://home.gwu.edu/~wpeng/Software.htm. - ------- - -.. class:: warningmark - - By default, SICER creates files that do not conform to standards (e.g. BED files are closed, not half-open). This could have implications for downstream analysis. - To force the output of SICER to be formatted properly to standard file formats, check the **"Fix off-by-one errors in output files"** option. - ------- - -**Citation** - -For the underlying tool, please cite `Zang C, Schones DE, Zeng C, Cui K, Zhao K, Peng W. A clustering approach for identification of enriched domains from histone modification ChIP-Seq data. Bioinformatics. 2009 Aug 1;25(15):1952-8. <http://www.ncbi.nlm.nih.gov/pubmed/19505939>`_ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/picard_AddOrReplaceReadGroups.xml --- a/tools/picard/picard_AddOrReplaceReadGroups.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,204 +0,0 @@ - - picard - - picard_wrapper.py - --input="$inputFile" - --rg-lb="$rglb" - --rg-pl="$rgpl" - --rg-pu="$rgpu" - --rg-sm="$rgsm" - --rg-id="$rgid" - --rg-opts=${readGroupOpts.rgOpts} - #if $readGroupOpts.rgOpts == "full" - --rg-cn="$readGroupOpts.rgcn" - --rg-ds="$readGroupOpts.rgds" - #end if - --output-format=$outputFormat - --output=$outFile - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/AddOrReplaceReadGroups.jar" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Add or Replace Read Groups in an input BAM or SAM file. - -**Read Groups are Important!** - -Many downstream analysis tools (such as GATK, for example) require BAM datasets to contain read groups. Even if you are not going to use GATK, setting read groups correctly from the start will simplify your life greatly. Below we provide an explanation of read groups fields taken from GATK FAQ webpage: - -.. csv-table:: - :header-rows: 1 - - Tag,Importance,Definition,Meaning - "ID","Required","Read group identifier. Each @RG line must have a unique ID. The value of ID is used in the RG tags of alignment records. Must be unique among all read groups in header section. Read group IDs may be modified when merging SAM files in order to handle collisions.","Ideally, this should be a globally unique identify across all sequencing data in the world, such as the Illumina flowcell + lane name and number. Will be referenced by each read with the RG:Z field, allowing tools to determine the read group information associated with each read, including the sample from which the read came. 
Also, a read group is effectively treated as a separate run of the NGS instrument in tools like base quality score recalibration (a GATK component) -- all reads within a read group are assumed to come from the same instrument run and to therefore share the same error model." - "SM","Required. As important as ID.","Sample. Use pool name where a pool is being sequenced.","The name of the sample sequenced in this read group. GATK tools treat all read groups with the same SM value as containing sequencing data for the same sample. Therefore it's critical that the SM field be correctly specified, especially when using multi-sample tools like the Unified Genotyper (a GATK component)." - "PL","Important. Not currently used in the GATK, but was in the past, and may return. The only way to know the sequencing technology used to generate the sequencing data.","Platform/technology used to produce the read. Valid values: ILLUMINA, SOLID, LS454, HELICOS and PACBIO.","It's a good idea to use this field." - "LB","Essential for MarkDuplicates","DNA preparation library identifier","MarkDuplicates uses the LB field to determine which read groups might contain molecular duplicates, in case the same DNA library was sequenced on multiple lanes." - -**Example of Read Group usage** - -Suppose we have a trio of samples: MOM, DAD, and KID. Each has two DNA libraries prepared, one with 400 bp inserts and another with 200 bp inserts. Each of these libraries is run on two lanes of an Illumina HiSeq, requiring 3 x 2 x 2 = 12 lanes of data. When the data come off the sequencer, we would create 12 BAM files, with the following @RG fields in the header:: - - Dad's data: - @RG ID:FLOWCELL1.LANE1 PL:illumina LB:LIB-DAD-1 SM:DAD PI:200 - @RG ID:FLOWCELL1.LANE2 PL:illumina LB:LIB-DAD-1 SM:DAD PI:200 - @RG ID:FLOWCELL1.LANE3 PL:illumina LB:LIB-DAD-2 SM:DAD PI:400 - @RG ID:FLOWCELL1.LANE4 PL:illumina LB:LIB-DAD-2 SM:DAD PI:400 - - Mom's data: - @RG ID:FLOWCELL1.LANE5 PL:illumina LB:LIB-MOM-1 SM:MOM PI:200 - @RG ID:FLOWCELL1.LANE6 PL:illumina LB:LIB-MOM-1 SM:MOM PI:200 - @RG ID:FLOWCELL1.LANE7 PL:illumina LB:LIB-MOM-2 SM:MOM PI:400 - @RG ID:FLOWCELL1.LANE8 PL:illumina LB:LIB-MOM-2 SM:MOM PI:400 - - Kid's data: - @RG ID:FLOWCELL2.LANE1 PL:illumina LB:LIB-KID-1 SM:KID PI:200 - @RG ID:FLOWCELL2.LANE2 PL:illumina LB:LIB-KID-1 SM:KID PI:200 - @RG ID:FLOWCELL2.LANE3 PL:illumina LB:LIB-KID-2 SM:KID PI:400 - @RG ID:FLOWCELL2.LANE4 PL:illumina LB:LIB-KID-2 SM:KID PI:400 - -Note the hierarchical relationship from read groups (unique for each lane) to libraries (each sequenced on two lanes) to samples (each spread across four lanes, two lanes per library). - -**Picard documentation** - -This is a Galaxy wrapper for AddOrReplaceReadGroups, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------- - -.. class:: infomark - -**Inputs, outputs, and parameters** - -Either a sam file or a bam file must be supplied. If a bam file is used, it must -be coordinate-sorted. Galaxy currently coordinate-sorts all bam files. - -The output file is either bam (the default) or sam, according to user selection, -and contains the same information as the input file except for the appropriate -additional (or modified) read group tags. Bam is recommended since it is smaller. - -From the Picard documentation: - -AddOrReplaceReadGroups REQUIRED parameters:: - - Option (Type) Description - - RGLB=String Read Group Library - RGPL=String Read Group platform (e.g.
illumina, solid) - RGPU=String Read Group platform unit (eg. run barcode) - RGSM=String Read Group sample name - RGID=String Read Group ID; Default value: null (empty) - -AddOrReplaceReadGroups OPTIONAL parameters:: - - Option (Type) Description - - RGCN=String Read Group sequencing center name; Default value: null (empty) - RGDS=String Read Group description Default value: null (empty) - -One parameter that Picard's AddOrReplaceReadGroups offers that is automatically -set by Galaxy is the SORT_ORDER, which is set to coordinate. - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/picard_BamIndexStats.xml --- a/tools/picard/picard_BamIndexStats.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,117 +0,0 @@ - - picard - - picard_wrapper.py - --input "$input_file" - --bai-file "$input_file.metadata.bam_index" - -t "$htmlfile" - -d "$htmlfile.files_path" - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/BamIndexStats.jar" - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Generate Bam Index Stats for a provided BAM file. - -**Picard documentation** - -This is a Galaxy wrapper for BamIndexStats, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------- - -.. class:: infomark - -**Inputs and outputs** - -The only input is the BAM file you wish to obtain statistics for, which is required. -Note that it must be coordinate-sorted. Galaxy currently coordinate-sorts all BAM files. - -This tool outputs an HTML file that contains links to the actual metrics results, as well -as a log file with info on the exact command run. - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - ------- - -**Example** - -Given a BAM file created from the following:: - - @HD VN:1.0 SO:coordinate - @SQ SN:chr1 LN:101 - @SQ SN:chr7 LN:404 - @SQ SN:chr8 LN:202 - @SQ SN:chr10 LN:303 - @SQ SN:chr14 LN:505 - @RG ID:0 SM:Hi,Mom! - @RG ID:1 SM:samplesample DS:ClearDescription - @PG ID:1 PN:Hey! 
VN:2.0 - @CO Just a generic comment to make the header longer - read1 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0 - read2 89 chr7 1 255 101M * 0 0 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0 - read3 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0 - read4 147 chr7 16 255 101M = 21 -96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0 - read5 99 chr7 21 255 101M = 16 96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0 - read6 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA I/15445666651/566666553+2/14/I/555512+3/)-'/-I-'*+))*''13+3)'//++''/'))/3+I*5++)I'2+I+/*I-II*)I-./1'1 RG:Z:0 - read7 163 chr7 302 255 10M1D10M5I76M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA I/15445666651/566666553+2/14/I/555512+3/)-'/-I-'*+))*''13+3)'//++''/'))/3+I*5++)I'2+I+/*I-II*)I-./1'1 RG:Z:0 - read8 165 * 0 0 * chr7 1 0 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA I/15445666651/566666553+2/14/I/555512+3/)-'/-I-'*+))*''13+3)'//++''/'))/3+I*5++)I'2+I+/*I-II*)I-./1'1 RG:Z:0 - -The following metrics file will be produced:: - - chr1 length= 101 Aligned= 0 Unaligned= 0 - chr7 length= 404 Aligned= 7 Unaligned= 0 - chr8 length= 202 Aligned= 0 Unaligned= 0 - chr10 length= 303 Aligned= 0 Unaligned= 0 - chr14 length= 505 Aligned= 0 Unaligned= 0 - NoCoordinateCount= 1 - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/picard_MarkDuplicates.xml --- a/tools/picard/picard_MarkDuplicates.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,150 +0,0 @@ - - - picard_wrapper.py - --input="$input_file" - --remove-dups="$remDups" - --read-regex="$readRegex" - --opt-dup-dist="$optDupeDist" - --output-format=$outputFormat - --output-txt=$outMetrics - #if str( $outputFormat ) == "sam" - #if str( $remDups ) == "true" - --output-sam=$outFileSamRemoved - #else - --output-sam=$outFileSamMarked - #end if - #else if str( $outputFormat ) == "bam" - #if str( $remDups ) == "true" - --output-sam=$outFileBamRemoved - #else - --output-sam=$outFileBamMarked - #end if - #end if - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/" - --picard-cmd="MarkDuplicates" - - - - - - - - - - - - - - - - - - - - - outputFormat is False - remDups is False - - - outputFormat is False - remDups is True - - - outputFormat is True - remDups is False - - - outputFormat is True - remDups is True - - - - - - - - - - - - - - - - - - - - -.. 
class:: infomark - -**Purpose** - -MarkDuplicates examines aligned records in the supplied sam or bam file to identify duplicate molecules. - -**Picard documentation** - -This is a Galaxy interface for MarkDuplicates, a part of Picard-tools_, which is closely related to SAMTools_. - - .. _Picard-tools: http://picard.sourceforge.net/index.shtml - .. _SAMTools: http://samtools.sourceforge.net/ - ------- - -**Input** - -Either a sam file or a bam file is required. If a bam file is used, it must be coordinate-sorted. - -**Outputs** - -This tool provides two outputs. The first contains the marked (or kept) records and is either bam (the default) or sam, according to user selection. Bam is recommended since it is smaller. The second output is the metrics file, which is a text file containing information about the duplicates. - -**MarkDuplicates parameters** - -The two main parameters to be concerned with are the flag for removing duplicates and the regular expression needed to identify reads. If it is set to remove duplicates, they will not be written to the output file; otherwise they will appear in the output but will be flagged appropriately. The read name regular expression is used to parse read names from the input sam file. Read names are parsed to extract three variables: tile/region, x coordinate, and y coordinate. These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. The regular expression should contain three capture groups for the three variables, in order (capture groups are enclosed in parentheses). Default value: [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*. - -One other parameter that can be set is the maximum offset between two duplicate clusters in order for them to be considered optical duplicates. Later versions of the Illumina pipeline that multiply pixel values by 10 should generally use 50-100 pixels; otherwise 5-10 is normal. The default is set to 100. - -One parameter that Picard's MarkDuplicates offers that is automatically set by Galaxy is the ASSUME_SORTED, which is set to true because Galaxy bam should always be coordinate-sorted. - -**Note on the use of regular expressions for read name parsing** - -The regular expression (regex) is used to parse the read names, so it's important to get it exactly right (so you probably don't want to edit this unless you know exactly what you're doing). The three parts of the read names identified are tile/region, x coordinate, and y coordinate, which are used in conjunction with the optical duplication rate to more accurately estimate library size. 
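-To make the parsing concrete, here is a minimal sketch (not part of the wrapper; the read name shown is made up for illustration) of how the default regular expression above pulls out the three values::
-
-    import re
-
-    # default READ_NAME_REGEX quoted above; the capture groups are tile, x, y
-    READ_NAME_REGEX = r'[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*'
-
-    match = re.match(READ_NAME_REGEX, 'MACHINE1:3:27:118:9722')
-    if match is not None:
-        tile, x, y = map(int, match.groups())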
- - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/picard_ReorderSam.xml --- a/tools/picard/picard_ReorderSam.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,165 +0,0 @@ - - picard - - picard_wrapper.py - --input=$inputFile - #if $source.indexSource == "built-in" - --ref="${ filter( lambda x: str( x[0] ) == str( $source.ref ), $__app__.tool_data_tables[ 'picard_indexes' ].get_fields() )[0][-1] }" - #else - --ref-file=$refFile - --species-name=$source.speciesName - --build-name=$source.buildName - --trunc-names=$source.truncateSeqNames - #end if - --allow-inc-dict-concord=$allowIncDictConcord - --allow-contig-len-discord=$allowContigLenDiscord - --output-format=$outputFormat - --output=$outFile - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/ReorderSam.jar" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Reorder SAM/BAM to match contig ordering in a particular reference file. Note that this is -not the same as sorting as done by the SortSam tool, which sorts by either coordinate -values or query name. The ordering in ReorderSam is based on exact name matching of -contigs/chromosomes. Reads that are mapped to a contig that is not in the new reference file are -not included in the output. - -**Picard documentation** - -This is a Galaxy wrapper for ReorderSam, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------- - -.. class:: infomark - -**Inputs, outputs, and parameters** - -For the file that needs to be reordered, either a sam file or a bam file must be supplied. -If a bam file is used, it must be coordinate-sorted. A reference file is also required, -so either a fasta file should be supplied or a built-in reference can be selected. - -The output contains the same reads as the input file but the reads have been rearranged so -they appear in the same order as the provided reference file. The tool will output either -bam (the default) or sam, according to user selection. Bam is recommended since it is smaller. - -The only extra parameters that can be set are flags for allowing incomplete dict concordance -and allowing contig length discordance. If incomplete dict concordance is allowed, only a -partial overlap of the bam contigs with the new reference sequence contigs is required. By -default it is off, requiring a corresponding contig in the new reference for each read contig. -If contig length discordance is allowed, contig names that are the same between a read and the -new reference contig are allowed even if they have different lengths. This is usually not a -good idea, unless you know exactly what you're doing. It's off by default. - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. 
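-To make the parameter mapping above concrete, picard_wrapper.py (deleted later in this same changeset) assembles the ReorderSam call essentially as follows; the file names here are illustrative::
-
-    cl = ['VALIDATION_STRINGENCY=LENIENT']
-    cl.append('INPUT=%s' % 'input.bam')            # the sam/bam to reorder
-    cl.append('OUTPUT=%s' % 'reordered.bam')
-    cl.append('REFERENCE=%s' % 'new_ref.fasta')    # contig order comes from here
-    if opts.allow_inc_dict_concord == 'true':
-        cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true')
-    if opts.allow_contig_len_discord == 'true':
-        cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true')
-    # runPic() then prepends 'java -Xmx4g -jar ReorderSam.jar' and executes the command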
- - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/picard_ReplaceSamHeader.xml --- a/tools/picard/picard_ReplaceSamHeader.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ - - picard - - picard_wrapper.py - --input "$inputFile" - -o $outFile - --header-file $headerFile - --output-format $outputFormat - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/ReplaceSamHeader.jar" - --tmpdir "${__new_file_path__}" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Replace Sam Header with the header from another sam file. The tool does not do any -significant validation, so it's up to the user to make sure that the elements in -the header are relevant and that the new header has all the required things. - -Replace the SAMFileHeader in a SAM file with the given header. Validation is -minimal. It is up to the user to ensure that all the elements referred to in the -SAMRecords are present in the new header. Sort order of the two input files must -be the same. - -**Picard documentation** - -This is a Galaxy wrapper for ReplaceSamHeader, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------- - -.. class:: infomark - -**Inputs and outputs** - -Either a sam file or a bam file is required as the file whose header will be replaced. -The header file is also required and can also be either sam or bam (it does not have -to be the same type as the other file). In both cases, if a bam file is used, it must -be coordinate-sorted. Galaxy currently coordinate-sorts all bam files. - -The tool will output either bam (the default) or sam. Bam is recommended since it is smaller. - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/picard_wrapper.py --- a/tools/picard/picard_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,734 +0,0 @@ -#!/usr/bin/env python -""" -Originally written by Kelly Vincent -pretty output and additional picard wrappers by Ross Lazarus for rgenetics -Runs all available wrapped Picard tools. -usage: picard_wrapper.py [options] -code Ross wrote licensed under the LGPL -see http://www.gnu.org/copyleft/lesser.html -""" - -import optparse, os, sys, subprocess, tempfile, shutil, time, logging - -galhtmlprefix = """ - - - - - - - - - -
-""" -galhtmlattr = """Galaxy tool %s run at %s
""" -galhtmlpostfix = """
\n""" - - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - - -class PicardBase(): - """ - simple base class with some utilities for Picard - adapted and merged with Kelly Vincent's code april 2011 Ross - lots of changes... - """ - - def __init__(self, opts=None,arg0=None): - """ common stuff needed at init for a picard tool - """ - assert opts <> None, 'PicardBase needs opts at init' - self.opts = opts - if self.opts.outdir == None: - self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir - assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in' - self.picname = self.baseName(opts.jar) - if self.picname.startswith('picard'): - self.picname = opts.picard_cmd # special case for some tools like replaceheader? - self.progname = self.baseName(arg0) - self.version = '0.002' - self.delme = [] # list of files to destroy - self.title = opts.title - self.inputfile = opts.input - try: - os.makedirs(opts.outdir) - except: - pass - try: - os.makedirs(opts.tmpdir) - except: - pass - self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname) - self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname) - self.setLogging(logfname=self.log_filename) - - def baseName(self,name=None): - return os.path.splitext(os.path.basename(name))[0] - - def setLogging(self,logfname="picard_wrapper.log"): - """setup a logger - """ - logging.basicConfig(level=logging.INFO, - filename=logfname, - filemode='a') - - - def readLarge(self,fname=None): - """ read a potentially huge file. - """ - try: - # get stderr, allowing for case where it's very large - tmp = open( fname, 'rb' ) - s = '' - buffsize = 1048576 - try: - while True: - more = tmp.read( buffsize ) - if len(more) > 0: - s += more - else: - break - except OverflowError: - pass - tmp.close() - except Exception, e: - stop_err( 'Error : %s' % str( e ) ) - return s - - def runCL(self,cl=None,output_dir=None): - """ construct and run a command line - we have galaxy's temp path as opt.temp_dir so don't really need isolation - sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts - """ - assert cl <> None, 'PicardBase runCL needs a command line as cl' - if output_dir == None: - output_dir = self.opts.outdir - if type(cl) == type([]): - cl = ' '.join(cl) - fd,templog = tempfile.mkstemp(dir=output_dir,suffix='rgtempRun.txt') - tlf = open(templog,'wb') - fd,temperr = tempfile.mkstemp(dir=output_dir,suffix='rgtempErr.txt') - tef = open(temperr,'wb') - process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir) - rval = process.wait() - tlf.close() - tef.close() - stderrs = self.readLarge(temperr) - stdouts = self.readLarge(templog) - if len(stderrs) > 0: - s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs) - else: - s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval) - logging.info(s) - os.unlink(templog) # always - os.unlink(temperr) # always - return s, stdouts # sometimes this is an output - - def runPic(self, jar, cl): - """ - cl should be everything after the jar file name in the command - """ - runme = ['java -Xmx%s' % self.opts.maxjheap] - runme.append('-jar %s' % jar) - runme += cl - s,stdout = self.runCL(cl=runme, output_dir=self.opts.outdir) - return stdout - - def 
samToBam(self,infile=None,outdir=None): - """ - use samtools view to convert sam to bam - """ - fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam') - cl = ['samtools view -h -b -S -o ',tempbam,infile] - tlog,stdouts = self.runCL(cl,outdir) - return tlog,tempbam - - #def bamToSam(self,infile=None,outdir=None): - # """ - # use samtools view to convert bam to sam - # """ - # fd,tempsam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.sam') - # cl = ['samtools view -h -o ',tempsam,infile] - # tlog,stdouts = self.runCL(cl,outdir) - # return tlog,tempsam - - def sortSam(self, infile=None,outfile=None,outdir=None): - """ - """ - print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir) - cl = ['samtools sort',infile,outfile] - tlog,stdouts = self.runCL(cl,outdir) - return tlog - - def cleanup(self): - for fname in self.delme: - try: - os.unlink(fname) - except: - pass - - def prettyPicout(self,transpose,maxrows): - """organize picard outpouts into a report html page - """ - res = [] - try: - r = open(self.metricsOut,'r').readlines() - except: - r = [] - if len(r) > 0: - res.append('Picard on line resources
\n') - if transpose: - res.append('Picard output (transposed to make it easier to see)
\n') - else: - res.append('Picard output
\n') - res.append('\n') - dat = [] - heads = [] - lastr = len(r) - 1 - # special case for estimate library complexity hist - thist = False - for i,row in enumerate(r): - if row.strip() > '': - srow = row.split('\t') - if row.startswith('#'): - heads.append(row.strip()) # want strings - else: - dat.append(srow) # want lists - if row.startswith('## HISTOGRAM'): - thist = True - if len(heads) > 0: - hres = ['' % (i % 2,x) for i,x in enumerate(heads)] - res += hres - heads = [] - if len(dat) > 0: - if transpose and not thist: - tdat = map(None,*dat) # transpose an arbitrary list of lists - tdat = ['\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)] - else: - tdat = ['\t'.join(x).strip() for x in dat] # back to strings :( - tdat = ['\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)] - res += tdat - dat = [] - res.append('
%s
%s%s 
%s
\n') - return res - - def fixPicardOutputs(self,transpose,maxloglines): - """ - picard produces long hard to read tab header files - make them available but present them transposed for readability - """ - logging.shutdown() - self.cleanup() # remove temp files stored in delme - rstyle="""""" - res = [rstyle,] - res.append(galhtmlprefix % self.progname) - res.append(galhtmlattr % (self.picname,timenow())) - flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')] - pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf'] - if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs - for p in pdflist: - imghref = '%s.jpg' % os.path.splitext(p)[0] # removes .pdf - res.append('
\n') - res.append('\n' % (p,imghref)) - res.append('
\n') - if len(flist) > 0: - res.append('The following output files were created (click the filename to view/download a copy):
') - res.append('\n') - for i,f in enumerate(flist): - fn = os.path.split(f)[-1] - res.append('\n' % (fn,fn)) - res.append('
%s

\n') - pres = self.prettyPicout(transpose,maxloglines) - if len(pres) > 0: - res += pres - l = open(self.log_filename,'r').readlines() - llen = len(l) - if llen > 0: - res.append('Picard Tool Run Log


\n') - rlog = ['
',]
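-            # keep the embedded log readable: when it exceeds maxloglines, show only
-            # the first and last min(50, maxloglines/2) lines with an elision marker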
-            if llen > maxloglines:
-                n = min(50,int(maxloglines/2))
-                rlog += l[:n]
-                rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines))
-                rlog += l[-n:]
-            else:
-                rlog += l
-            rlog.append('
') - if llen > maxloglines: - rlog.append('\n## WARNING - %d log lines truncated - %s contains entire output' % (llen - maxloglines, self.log_filename)) - res += rlog - else: - res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename) - res.append('
The freely available Picard software \n') - res.append( 'generated all outputs reported here running as a Galaxy tool') - res.append(galhtmlpostfix) - outf = open(self.opts.htmlout,'w') - outf.write(''.join(res)) - outf.write('\n') - outf.close() - - def makePicInterval(self,inbed=None,outf=None): - """ - picard wants bait and target files to have the same header length as the incoming bam/sam - a meaningful (ie accurate) representation will fail because of this - so this hack - it would be far better to be able to supply the original bed untouched - """ - assert inbed <> None - bed = open(inbed,'r').readlines() - thead = os.path.join(self.opts.outdir,'tempSamHead.txt') - if self.opts.datatype == 'sam': - cl = ['samtools view -H -S',self.opts.input,'>',thead] - else: - cl = ['samtools view -H',self.opts.input,'>',thead] - self.runCL(cl=cl,output_dir=self.opts.outdir) - head = open(thead,'r').readlines() - s = '## got %d rows of header\n' % (len(head)) - logging.info(s) - o = open(outf,'w') - o.write(''.join(head)) - o.write(''.join(bed)) - o.close() - return outf - - def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None): - """ - interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs! - Do the work of removing all the error sequences - pysam is cool - infile = pysam.Samfile( "-", "r" ) - outfile = pysam.Samfile( "-", "w", template = infile ) - for s in infile: outfile.write(s) - - errors from ValidateSameFile.jar look like - WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing - ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary. - ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041 - - """ - assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam - assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path' - removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2] - remDict = dict(zip(removeNames,range(len(removeNames)))) - infile = pysam.Samfile(insam,'rb') - info = 'found %d error sequences in picardErrors, %d unique' % (len(removeNames),len(remDict)) - if len(removeNames) > 0: - outfile = pysam.Samfile(newsam,'wb',template=infile) # template must be an open file - i = 0 - j = 0 - for row in infile: - dropme = remDict.get(row.qname,None) # keep if None - if not dropme: - outfile.write(row) - j += 1 - else: # discard - i += 1 - info = '%s\n%s' % (info, 'Discarded %d lines writing %d to %s from %s' % (i,j,newsam,insam)) - outfile.close() - infile.close() - else: # we really want a nullop or a simple pointer copy - infile.close() - if newsam: - shutil.copy(insam,newsam) - logging.info(info) - - - -def __main__(): - doFix = False # tools returning htmlfile don't need this - doTranspose = True # default - maxloglines = 100 # default - #Parse Command Line - op = optparse.OptionParser() - # All tools - op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' ) - op.add_option('-e', '--inputext', default=None) - op.add_option('-o', '--output', default=None) - op.add_option('-n', '--title', default="Pick a Picard Tool") - op.add_option('-t', '--htmlout', default=None) - op.add_option('-d', '--outdir', default=None) - op.add_option('-x', '--maxjheap', default='4g') - op.add_option('-b', '--bisulphite', default='false') - op.add_option('-s', 
'--sortorder', default='query') - op.add_option('','--tmpdir', default='/tmp') - op.add_option('-j','--jar',default='') - op.add_option('','--picard-cmd',default=None) - # Many tools - op.add_option( '', '--output-format', dest='output_format', help='Output format' ) - op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' ) - op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None ) - # CreateSequenceDictionary - op.add_option( '', '--ref-file', dest='ref_file', help='Fasta to use as reference', default=None ) - op.add_option( '', '--species-name', dest='species_name', help='Species name to use in creating dict file from fasta file' ) - op.add_option( '', '--build-name', dest='build_name', help='Name of genome assembly to use in creating dict file from fasta file' ) - op.add_option( '', '--trunc-names', dest='trunc_names', help='Truncate sequence names at first whitespace from fasta file' ) - # MarkDuplicates - op.add_option( '', '--remdups', default='true', help='Remove duplicates from output file' ) - op.add_option( '', '--optdupdist', default="100", help='Maximum pixels between two identical sequences in order to consider them optical duplicates.' ) - # CollectInsertSizeMetrics - op.add_option('', '--taillimit', default="0") - op.add_option('', '--histwidth', default="0") - op.add_option('', '--minpct', default="0.01") - # CollectAlignmentSummaryMetrics - op.add_option('', '--maxinsert', default="20") - op.add_option('', '--adaptors', action='append', type="string") - # FixMateInformation and validate - # CollectGcBiasMetrics - op.add_option('', '--windowsize', default='100') - op.add_option('', '--mingenomefrac', default='0.00001') - # AddOrReplaceReadGroups - op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' ) - op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' ) - op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' ) - op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. 
run barcode) ' ) - op.add_option( '', '--rg-sm', dest='rg_sample', help='Read Group sample name' ) - op.add_option( '', '--rg-id', dest='rg_id', help='Read Group ID' ) - op.add_option( '', '--rg-cn', dest='rg_seq_center', help='Read Group sequencing center name' ) - op.add_option( '', '--rg-ds', dest='rg_desc', help='Read Group description' ) - # ReorderSam - op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' ) - op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' ) - # ReplaceSamHeader - op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' ) - - op.add_option('','--assumesorted', default='true') - op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*") - #estimatelibrarycomplexity - op.add_option('','--minid', default="5") - op.add_option('','--maxdiff', default="0.03") - op.add_option('','--minmeanq', default="20") - #hsmetrics - op.add_option('','--baitbed', default=None) - op.add_option('','--targetbed', default=None) - #validate - op.add_option('','--ignoreflags', action='append', type="string") - op.add_option('','--maxerrors', default=None) - op.add_option('','--datatype', default=None) - op.add_option('','--bamout', default=None) - op.add_option('','--samout', default=None) - - opts, args = op.parse_args() - opts.sortme = opts.assumesorted == 'false' - assert opts.input <> None - # need to add - # instance that does all the work - pic = PicardBase(opts,sys.argv[0]) - - tmp_dir = opts.outdir - haveTempout = False # we use this where sam output is an option - - # set ref and dict files to use (create if necessary) - ref_file_name = opts.ref - if opts.ref_file <> None: - csd = 'CreateSequenceDictionary' - realjarpath = os.path.split(opts.jar)[0] - jarpath = os.path.join(realjarpath,'%s.jar' % csd) # for refseq - tmp_ref_fd, tmp_ref_name = tempfile.mkstemp( dir=opts.tmpdir , prefix = pic.picname) - ref_file_name = '%s.fasta' % tmp_ref_name - # build dict - dict_file_name = '%s.dict' % tmp_ref_name - os.symlink( opts.ref_file, ref_file_name ) - cl = ['REFERENCE=%s' % ref_file_name] - cl.append('OUTPUT=%s' % dict_file_name) - cl.append('URI=%s' % os.path.basename( opts.ref_file )) - cl.append('TRUNCATE_NAMES_AT_WHITESPACE=%s' % opts.trunc_names) - if opts.species_name: - cl.append('SPECIES=%s' % opts.species_name) - if opts.build_name: - cl.append('GENOME_ASSEMBLY=%s' % opts.build_name) - pic.delme.append(dict_file_name) - pic.delme.append(ref_file_name) - pic.delme.append(tmp_ref_name) - s = pic.runPic(jarpath, cl) - # run relevant command(s) - - # define temporary output - # if output is sam, it must have that extension, otherwise bam will be produced - # specify sam or bam file with extension - if opts.output_format == 'sam': - suff = '.sam' - else: - suff = '' - tmp_fd, tempout = tempfile.mkstemp( dir=opts.tmpdir, suffix=suff ) - - cl = ['VALIDATION_STRINGENCY=LENIENT',] - - if pic.picname == 'AddOrReplaceReadGroups': - # sort order to match Galaxy's default - cl.append('SORT_ORDER=coordinate') - # input - cl.append('INPUT=%s' % opts.input) - # outputs - cl.append('OUTPUT=%s' % tempout) - # required read groups - cl.append('RGLB="%s"' % opts.rg_library) - cl.append('RGPL="%s"' % opts.rg_platform) - cl.append('RGPU="%s"' % opts.rg_plat_unit) - cl.append('RGSM="%s"' % opts.rg_sample) - if opts.rg_id: - cl.append('RGID="%s"' % opts.rg_id) - # optional read 
groups - if opts.rg_seq_center: - cl.append('RGCN="%s"' % opts.rg_seq_center) - if opts.rg_desc: - cl.append('RGDS="%s"' % opts.rg_desc) - pic.runPic(opts.jar, cl) - haveTempout = True - - elif pic.picname == 'BamIndexStats': - tmp_fd, tmp_name = tempfile.mkstemp( dir=tmp_dir ) - tmp_bam_name = '%s.bam' % tmp_name - tmp_bai_name = '%s.bai' % tmp_bam_name - os.symlink( opts.input, tmp_bam_name ) - os.symlink( opts.bai_file, tmp_bai_name ) - cl.append('INPUT=%s' % ( tmp_bam_name )) - pic.delme.append(tmp_bam_name) - pic.delme.append(tmp_bai_name) - pic.delme.append(tmp_name) - s = pic.runPic( opts.jar, cl ) - f = open(pic.metricsOut,'a') - f.write(s) # got this on stdout from runCl - f.write('\n') - f.close() - doTranspose = False # but not transposed - - elif pic.picname == 'EstimateLibraryComplexity': - cl.append('I=%s' % opts.input) - cl.append('O=%s' % pic.metricsOut) - if float(opts.minid) > 0: - cl.append('MIN_IDENTICAL_BASES=%s' % opts.minid) - if float(opts.maxdiff) > 0.0: - cl.append('MAX_DIFF_RATE=%s' % opts.maxdiff) - if float(opts.minmeanq) > 0: - cl.append('MIN_MEAN_QUALITY=%s' % opts.minmeanq) - if opts.readregex > '': - cl.append('READ_NAME_REGEX="%s"' % opts.readregex) - if float(opts.optdupdist) > 0: - cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist) - pic.runPic(opts.jar,cl) - - elif pic.picname == 'CollectAlignmentSummaryMetrics': - # Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data. - # why? Dunno Seems to work without complaining if the .bai file is AWOL.... - fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name)) - try: - os.symlink(ref_file_name,fakefasta) - except: - s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy' - info = s - shutil.copy(ref_file_name,fakefasta) - pic.delme.append(fakefasta) - cl.append('ASSUME_SORTED=%s' % opts.assumesorted) - adaptorseqs = ''.join([' ADAPTER_SEQUENCE=%s' % x for x in opts.adaptors]) - cl.append(adaptorseqs) - cl.append('IS_BISULFITE_SEQUENCED=%s' % opts.bisulphite) - cl.append('MAX_INSERT_SIZE=%s' % opts.maxinsert) - cl.append('OUTPUT=%s' % pic.metricsOut) - cl.append('R=%s' % fakefasta) - cl.append('TMP_DIR=%s' % opts.tmpdir) - if not opts.assumesorted.lower() == 'true': # we need to sort input - fakeinput = '%s.sorted' % opts.input - s = pic.sortSam(opts.input, fakeinput, opts.outdir) - pic.delme.append(fakeinput) - cl.append('INPUT=%s' % fakeinput) - else: - cl.append('INPUT=%s' % os.path.abspath(opts.input)) - pic.runPic(opts.jar,cl) - - - elif pic.picname == 'CollectGcBiasMetrics': - assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name - # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data. - # why? Dunno - fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name)) - try: - os.symlink(ref_file_name,fakefasta) - except: - s = '## unable to symlink %s to %s - different devices? 
May need to replace with shutil.copy' - info = s - shutil.copy(ref_file_name,fakefasta) - pic.delme.append(fakefasta) - x = 'rgPicardGCBiasMetrics' - pdfname = '%s.pdf' % x - jpgname = '%s.jpg' % x - tempout = os.path.join(opts.outdir,'rgPicardGCBiasMetrics.out') - temppdf = os.path.join(opts.outdir,pdfname) - cl.append('R=%s' % fakefasta) - cl.append('WINDOW_SIZE=%s' % opts.windowsize) - cl.append('MINIMUM_GENOME_FRACTION=%s' % opts.mingenomefrac) - cl.append('INPUT=%s' % opts.input) - cl.append('OUTPUT=%s' % tempout) - cl.append('TMP_DIR=%s' % opts.tmpdir) - cl.append('CHART_OUTPUT=%s' % temppdf) - cl.append('SUMMARY_OUTPUT=%s' % pic.metricsOut) - pic.runPic(opts.jar,cl) - if os.path.isfile(temppdf): - cl2 = ['convert','-resize x400',temppdf,os.path.join(opts.outdir,jpgname)] # make the jpg for fixPicardOutputs to find - s,stdouts = pic.runCL(cl=cl2,output_dir=opts.outdir) - else: - s='### runGC: Unable to find pdf %s - please check the log for the causal problem\n' % temppdf - lf = open(pic.log_filename,'a') - lf.write(s) - lf.write('\n') - lf.close() - - elif pic.picname == 'CollectInsertSizeMetrics': - isPDF = 'InsertSizeHist.pdf' - pdfpath = os.path.join(opts.outdir,isPDF) - histpdf = 'InsertSizeHist.pdf' - cl.append('I=%s' % opts.input) - cl.append('O=%s' % pic.metricsOut) - cl.append('HISTOGRAM_FILE=%s' % histpdf) - if opts.taillimit <> '0': - cl.append('TAIL_LIMIT=%s' % opts.taillimit) - if opts.histwidth <> '0': - cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth) - if float( opts.minpct) > 0.0: - cl.append('MINIMUM_PCT=%s' % opts.minpct) - pic.runPic(opts.jar,cl) - if os.path.exists(pdfpath): # automake thumbnail - will be added to html - cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath] - s,stdouts = pic.runCL(cl=cl2,output_dir=opts.outdir) - else: - s = 'Unable to find expected pdf file %s
\n' % pdfpath - s += 'This always happens if single ended data was provided to this tool,\n' - s += 'so please double check that your input data really is paired-end NGS data.
\n' - s += 'If your input was paired data this may be a bug worth reporting to the galaxy-bugs list\n
' - stdouts = '' - logging.info(s) - if len(stdouts) > 0: - logging.info(stdouts) - - elif pic.picname == 'MarkDuplicates': - # assume sorted even if header says otherwise - cl.append('ASSUME_SORTED=%s' % (opts.assumesorted)) - # input - cl.append('INPUT=%s' % opts.input) - # outputs - cl.append('OUTPUT=%s' % opts.output) - cl.append('METRICS_FILE=%s' % pic.metricsOut ) - # remove or mark duplicates - cl.append('REMOVE_DUPLICATES=%s' % opts.remdups) - # the regular expression to be used to parse reads in incoming SAM file - cl.append('READ_NAME_REGEX="%s"' % opts.readregex) - # maximum offset between two duplicate clusters - cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist) - pic.runPic(opts.jar, cl) - - elif pic.picname == 'FixMateInformation': - cl.append('I=%s' % opts.input) - cl.append('O=%s' % tempout) - cl.append('SORT_ORDER=%s' % opts.sortorder) - pic.runPic(opts.jar,cl) - haveTempout = True - - elif pic.picname == 'ReorderSam': - # input - cl.append('INPUT=%s' % opts.input) - # output - cl.append('OUTPUT=%s' % tempout) - # reference - cl.append('REFERENCE=%s' % ref_file_name) - # incomplete dict concordance - if opts.allow_inc_dict_concord == 'true': - cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true') - # contig length discordance - if opts.allow_contig_len_discord == 'true': - cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true') - pic.runPic(opts.jar, cl) - haveTempout = True - - elif pic.picname == 'ReplaceSamHeader': - cl.append('INPUT=%s' % opts.input) - cl.append('OUTPUT=%s' % tempout) - cl.append('HEADER=%s' % opts.header_file) - pic.runPic(opts.jar, cl) - haveTempout = True - - elif pic.picname == 'CalculateHsMetrics': - maxloglines = 100 - baitfname = os.path.join(opts.outdir,'rgPicardHsMetrics.bait') - targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target') - baitf = pic.makePicInterval(opts.baitbed,baitfname) - if opts.targetbed == opts.baitbed: # same file sometimes - targetf = baitf - else: - targetf = pic.makePicInterval(opts.targetbed,targetfname) - cl.append('BAIT_INTERVALS=%s' % baitf) - cl.append('TARGET_INTERVALS=%s' % targetf) - cl.append('INPUT=%s' % os.path.abspath(opts.input)) - cl.append('OUTPUT=%s' % pic.metricsOut) - cl.append('TMP_DIR=%s' % opts.tmpdir) - pic.runPic(opts.jar,cl) - - elif pic.picname == 'ValidateSamFile': - import pysam - doTranspose = False - sortedfile = os.path.join(opts.outdir,'rgValidate.sorted') - stf = open(pic.log_filename,'w') - tlog = None - if opts.datatype == 'sam': # need to work with a bam - tlog,tempbam = pic.samToBam(opts.input,opts.outdir) - try: - tlog = pic.sortSam(tempbam,sortedfile,opts.outdir) - except: - print '## exception on sorting sam file %s' % opts.input - else: # is already bam - try: - tlog = pic.sortSam(opts.input,sortedfile,opts.outdir) - except: # bug - [bam_sort_core] not being ignored - TODO fixme - print '## exception on sorting bam file %s' % opts.input - if tlog: - print '##tlog=',tlog - stf.write(tlog) - stf.write('\n') - sortedfile = '%s.bam' % sortedfile # samtools does that - cl.append('O=%s' % pic.metricsOut) - cl.append('TMP_DIR=%s' % opts.tmpdir) - cl.append('I=%s' % sortedfile) - opts.maxerrors = '99999999' - cl.append('MAX_OUTPUT=%s' % opts.maxerrors) - if opts.ignoreflags[0] <> 'None': # picard error values to ignore - igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None'] - cl.append(' '.join(igs)) - if opts.bisulphite.lower() <> 'false': - cl.append('IS_BISULFITE_SEQUENCED=true') - if opts.ref <> None or opts.ref_file <> None: - cl.append('R=%s' % 
ref_file_name) - pic.runPic(opts.jar,cl) - if opts.datatype == 'sam': - pic.delme.append(tempbam) - newsam = opts.output - outformat = 'bam' - pe = open(pic.metricsOut,'r').readlines() - pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat) - pic.delme.append(sortedfile) # not wanted - stf.close() - pic.cleanup() - else: - print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname - sys.exit(1) - if haveTempout: - # Some Picard tools produced a potentially intermediate bam file. - # Either just move to final location or create sam - shutil.move(tempout, os.path.abspath(opts.output)) - - if opts.htmlout <> None or doFix: # return a pretty html page - pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines) - -if __name__=="__main__": __main__() - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/rgPicardASMetrics.xml --- a/tools/picard/rgPicardASMetrics.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,162 +0,0 @@ - - - picard_wrapper.py -i "$input_file" -d "$html_file.files_path" -t "$html_file" - --assumesorted "$sorted" -b "$bisulphite" --adaptors "$adaptors" --maxinsert "$maxinsert" -n "$out_prefix" - -j ${GALAXY_DATA_INDEX_DIR}/shared/jars/CollectAlignmentSummaryMetrics.jar -#if $genomeSource.refGenomeSource == "history": - --ref-file "$genomeSource.ownFile" -#else - --ref "${ filter( lambda x: str( x[0] ) == str( $genomeSource.index ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][-1] }" -#end if - - picard - - -.. class:: infomark - -**Summary** - -This Galaxy tool uses Picard to report high-level measures of alignment based on a provided sam or bam file. - -**Picard documentation** - -This is a Galaxy wrapper for CollectAlignmentSummaryMetrics, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ----- - -.. class:: infomark - -**Syntax** - -- **Input** - SAM/BAM format aligned short read data in your current history -- **Title** - the title to use for all output files from this job - use it for high level metadata -- **Reference Genome** - Galaxy (and Picard) needs to know which genomic reference was used to generate alignments within the input SAM/BAM dataset. Here you have three choices: - - - *Assigned data genome/build* - a genome specified for this dataset. If your SAM/BAM dataset has an assigned reference genome it will be displayed below this dropdown. If it does not -> use one of the following two options. - - *Select a different built-in genome* - this option will list all reference genomes presently cached at this instance of Galaxy. - - *Select a reference genome from history* - alternatively you can upload your own version of the reference genome into your history and use it with this option. This is however not advisable with large human-sized genomes. If your genome is large, contact the Galaxy team using the "Help" link at the top of the interface and provide exact details on where we can download the sequences you would like to use as the reference. We will then install them as a part of the locally cached genomic references. - -- **Assume Sorted** - saves sorting time - but only if true!
-- **Bisulphite data** - see Picard documentation http://picard.sourceforge.net/command-line-overview.shtml#CollectAlignmentSummaryMetrics -- **Maximum acceptable insertion length** - see Picard documentation at http://picard.sourceforge.net/command-line-overview.shtml#CollectAlignmentSummaryMetrics - ------ - -.. class:: infomark - -**Inputs, outputs, and parameters** - -The Picard documentation (reformatted for Galaxy) says: - -.. csv-table:: - :header-rows: 1 - - Option,Description - "INPUT=File","SAM or BAM file Required." - "OUTPUT=File","File to write insert size metrics to Required." - "REFERENCE_SEQUENCE=File","Reference sequence file Required." - "ASSUME_SORTED=Boolean","If true (default), unsorted SAM/BAM files will be considerd coordinate sorted " - "MAX_INSERT_SIZE=Integer","Paired end reads above this insert size will be considered chimeric along with inter-chromosomal pairs. Default value: 100000." - "ADAPTER_SEQUENCE=String","This option may be specified 0 or more times. " - "IS_BISULFITE_SEQUENCED=Boolean","Whether the SAM or BAM file consists of bisulfite sequenced reads. Default value: false. " - "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created." - -The output produced by the tool has the following columns:: - - 1. CATEGORY: One of either UNPAIRED (for a fragment run), FIRST_OF_PAIR when metrics are for only the first read in a paired run, SECOND_OF_PAIR when the metrics are for only the second read in a paired run or PAIR when the metrics are aggregeted for both first and second reads in a pair. - 2. TOTAL_READS: The total number of reads including all PF and non-PF reads. When CATEGORY equals PAIR this value will be 2x the number of clusters. - 3. PF_READS: The number of PF reads where PF is defined as passing Illumina's filter. - 4. PCT_PF_READS: The percentage of reads that are PF (PF_READS / TOTAL_READS) - 5. PF_NOISE_READS: The number of PF reads that are marked as noise reads. A noise read is one which is composed entirey of A bases and/or N bases. These reads are marked as they are usually artifactual and are of no use in downstream analysis. - 6. PF_READS_ALIGNED: The number of PF reads that were aligned to the reference sequence. This includes reads that aligned with low quality (i.e. their alignments are ambiguous). - 7. PCT_PF_READS_ALIGNED: The percentage of PF reads that aligned to the reference sequence. PF_READS_ALIGNED / PF_READS - 8. PF_HQ_ALIGNED_READS: The number of PF reads that were aligned to the reference sequence with a mapping quality of Q20 or higher signifying that the aligner estimates a 1/100 (or smaller) chance that the alignment is wrong. - 9. PF_HQ_ALIGNED_BASES: The number of bases aligned to the reference sequence in reads that were mapped at high quality. Will usually approximate PF_HQ_ALIGNED_READS * READ_LENGTH but may differ when either mixed read lengths are present or many reads are aligned with gaps. - 10. PF_HQ_ALIGNED_Q20_BASES: The subest of PF_HQ_ALIGNED_BASES where the base call quality was Q20 or higher. - 11. PF_HQ_MEDIAN_MISMATCHES: The median number of mismatches versus the reference sequence in reads that were aligned to the reference at high quality (i.e. PF_HQ_ALIGNED READS). - 12. PF_HQ_ERROR_RATE: The percentage of bases that mismatch the reference in PF HQ aligned reads. - 13. MEAN_READ_LENGTH: The mean read length of the set of reads examined. When looking at the data for a single lane with equal length reads this number is just the read length. 
-.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/rgPicardFixMate.xml --- a/tools/picard/rgPicardFixMate.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,107 +0,0 @@ - - for paired data - - picard_wrapper.py -i "$input_file" -o "$out_file" --tmpdir "${__new_file_path__}" -n "$out_prefix" - --output-format "$outputFormat" -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/FixMateInformation.jar" --sortorder "$sortOrder" - - picard - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Ensure that all mate-pair information is in sync between each read and its mate pair. - -**Picard documentation** - -This is a Galaxy wrapper for FixMateInformation, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - -.. class:: warningmark - -**Useful for paired data only** - -This tool is unlikely to do anything helpful for single-end sequence data. -Currently, Galaxy doesn't distinguish paired from single-ended SAM/BAM, so make sure -the data you choose are valid (paired-end) SAM or BAM data - unless you trust this -tool not to harm your data. - ------ - -.. class:: infomark - -**Syntax** - -- **Input** - paired-read SAM/BAM format aligned short read data in your current history -- **Sort order** - can be used to adjust the ordering of reads -- **Title** - the title to use for all output files from this job - use it for high level metadata -- **Output Format** - either SAM or compressed as BAM - ------ - -.. class:: infomark - -**Inputs, outputs, and parameters** - -.. csv-table:: - :header-rows: 1 - - Option,Description - "INPUT=File","The input file to fix. This option may be specified 0 or more times." - "OUTPUT=File","The output file to write to" - "SORT_ORDER=SortOrder","Optional sort order if the OUTPUT file should be sorted differently than the INPUT file. Default value: null. Possible values: {unsorted, queryname, coordinate}" - "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false"
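As intuition for what "in sync" means here, a toy Python sketch over the two tab-split SAM records of one pair (illustrative only; real data should go through Picard or a SAM-aware library)::

    def sync_mates(rec_a, rec_b):
        # Copy each read's RNAME (field 3) and POS (field 4) into its
        # mate's RNEXT (field 7) and PNEXT (field 8), per the SAM layout.
        for rec, mate in ((rec_a, rec_b), (rec_b, rec_a)):
            rec[6] = '=' if mate[2] == rec[2] else mate[2]
            rec[7] = mate[3]
        return rec_a, rec_b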
-.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/rgPicardGCBiasMetrics.xml --- a/tools/picard/rgPicardGCBiasMetrics.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,148 +0,0 @@ - - - picard_wrapper.py -i "$input_file" -d "$html_file.files_path" -t "$html_file" - --windowsize "$windowsize" --mingenomefrac "$mingenomefrac" -n "$out_prefix" --tmpdir "${__new_file_path__}" - -j ${GALAXY_DATA_INDEX_DIR}/shared/jars/CollectGcBiasMetrics.jar -#if $genomeSource.refGenomeSource == "history": - --ref-file "$genomeSource.ownFile" -#else: - --ref "${ filter( lambda x: str( x[0] ) == str( $genomeSource.index ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][-1] }" -#end if - - picard - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Summary** - -This Galaxy tool uses Picard to report detailed metrics about reads that fall within windows of a certain GC bin on the reference genome. - -**Picard documentation** - -This is a Galaxy wrapper for CollectGcBiasMetrics, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------ - -.. class:: infomark - -**Syntax** - -- **Input** - SAM/BAM format aligned short read data in your current history -- **Title** - the title to use for all output files from this job - use it for high level metadata -- **Reference Genome** - Galaxy (and Picard) needs to know which genomic reference was used to generate alignments within the input SAM/BAM dataset. Here you have three choices: - - - *Assigned data genome/build* - a genome specified for this dataset. If your SAM/BAM dataset has an assigned reference genome it will be displayed below this dropdown. If it does not, use one of the following two options. - - *Select a different built-in genome* - this option will list all reference genomes presently cached at this instance of Galaxy. - - *Select a reference genome from history* - alternatively you can upload your own version of the reference genome into your history and use it with this option. This is, however, not advisable with large human-sized genomes. If your genome is large, contact the Galaxy team using the "Help" link at the top of the interface and provide exact details on where we can download the sequences you would like to use as the reference. We will then install them as a part of the locally cached genomic references. - -- **Window Size** - see Picard documentation http://picard.sourceforge.net/command-line-overview.shtml#CollectGCBiasMetrics -- **Minimum Genome Fraction** - see Picard documentation at http://picard.sourceforge.net/command-line-overview.shtml#CollectGCBiasMetrics - ------ - -.. class:: infomark - -**Inputs, outputs, and parameters** - -The Picard documentation (reformatted for Galaxy) says: - -.. csv-table:: - :header-rows: 1 - - Option,Description - "REFERENCE_SEQUENCE=File","The reference sequence fasta file. Required." - "INPUT=File","The BAM or SAM file containing aligned reads. Required." - "OUTPUT=File","The text file to write the metrics table to. Required."
- "CHART_OUTPUT=File","The PDF file to render the chart to. Required." - "SUMMARY_OUTPUT=File","The text file to write summary metrics to. Default value: null." - "WINDOW_SIZE=Integer","The size of windows on the genome that are used to bin reads. Default value: 100." - "MINIMUM_GENOME_FRACTION=Double","For summary metrics, exclude GC windows that include less than this fraction of the genome. Default value: 1.0E-5." - "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false." - -The output produced by the tool has the following columns:: - - 1. GC: The G+C content of the reference sequence represented by this bin. Values are from 0% to 100% - 2. WINDOWS: The number of windows on the reference genome that have this G+C content. - 3. READ_STARTS: The number of reads who's start position is at the start of a window of this GC. - 4. MEAN_BASE_QUALITY: The mean quality (determined via the error rate) of all bases of all reads that are assigned to windows of this GC. - 5. NORMALIZED_COVERAGE: The ration of "coverage" in this GC bin vs. the mean coverage of all GC bins. A number of 1 represents mean coverage, a number less than one represents lower than mean coverage (e.g. 0.5 means half as much coverage as average) while a number greater than one represents higher than mean coverage (e.g. 3.1 means this GC bin has 3.1 times more reads per window than average). - 6. ERROR_BAR_WIDTH: The radius of error bars in this bin based on the number of observations made. For example if the normalized coverage is 0.75 and the error bar width is 0.1 then the error bars would be drawn from 0.65 to 0.85. - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/rgPicardHsMetrics.xml --- a/tools/picard/rgPicardHsMetrics.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,131 +0,0 @@ - - for targeted resequencing data - - - picard_wrapper.py -i "$input_file" -d "$html_file.files_path" -t "$html_file" --datatype "$input_file.ext" - --baitbed "$bait_bed" --targetbed "$target_bed" -n "$out_prefix" --tmpdir "${__new_file_path__}" - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/CalculateHsMetrics.jar" - - - picard - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Summary** - -Calculates a set of Hybrid Selection specific metrics from an aligned SAM or BAM file. - -**Picard documentation** - -This is a Galaxy wrapper for CollectAlignmentSummaryMetrics, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------ - -.. class:: infomark - -**Inputs, outputs, and parameters** - -Picard documentation says (reformatted for Galaxy): - -Calculates a set of Hybrid Selection specific metrics from an aligned SAM or BAM file. - -.. csv-table:: - :header-rows: 1 - - "Option", "Description" - "BAIT_INTERVALS=File","An interval list file that contains the locations of the baits used. Required." - "TARGET_INTERVALS=File","An interval list file that contains the locations of the targets. Required." - "INPUT=File","An aligned SAM or BAM file. Required." 
- "OUTPUT=File","The output file to write the metrics to. Required. Cannot be used in conjuction with option(s) METRICS_FILE (M)" - "METRICS_FILE=File","Legacy synonym for OUTPUT, should not be used. Required. Cannot be used in conjuction with option(s) OUTPUT (O)" - "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false" - -HsMetrics - - The set of metrics captured that are specific to a hybrid selection analysis. - -Output Column Definitions:: - - 1. BAIT_SET: The name of the bait set used in the hybrid selection. - 2. GENOME_SIZE: The number of bases in the reference genome used for alignment. - 3. BAIT_TERRITORY: The number of bases which have one or more baits on top of them. - 4. TARGET_TERRITORY: The unique number of target bases in the experiment where target is usually exons etc. - 5. BAIT_DESIGN_EFFICIENCY: Target terrirtoy / bait territory. 1 == perfectly efficient, 0.5 = half of baited bases are not target. - 6. TOTAL_READS: The total number of reads in the SAM or BAM file examine. - 7. PF_READS: The number of reads that pass the vendor's filter. - 8. PF_UNIQUE_READS: The number of PF reads that are not marked as duplicates. - 9. PCT_PF_READS: PF reads / total reads. The percent of reads passing filter. - 10. PCT_PF_UQ_READS: PF Unique Reads / Total Reads. - 11. PF_UQ_READS_ALIGNED: The number of PF unique reads that are aligned with mapping score > 0 to the reference genome. - 12. PCT_PF_UQ_READS_ALIGNED: PF Reads Aligned / PF Reads. - 13. PF_UQ_BASES_ALIGNED: The number of bases in the PF aligned reads that are mapped to a reference base. Accounts for clipping and gaps. - 14. ON_BAIT_BASES: The number of PF aligned bases that mapped to a baited region of the genome. - 15. NEAR_BAIT_BASES: The number of PF aligned bases that mapped to within a fixed interval of a baited region, but not on a baited region. - 16. OFF_BAIT_BASES: The number of PF aligned bases that mapped to neither on or near a bait. - 17. ON_TARGET_BASES: The number of PF aligned bases that mapped to a targetted region of the genome. - 18. PCT_SELECTED_BASES: On+Near Bait Bases / PF Bases Aligned. - 19. PCT_OFF_BAIT: The percentage of aligned PF bases that mapped neither on or near a bait. - 20. ON_BAIT_VS_SELECTED: The percentage of on+near bait bases that are on as opposed to near. - 21. MEAN_BAIT_COVERAGE: The mean coverage of all baits in the experiment. - 22. MEAN_TARGET_COVERAGE: The mean coverage of targets that recieved at least coverage depth = 2 at one base. - 23. PCT_USABLE_BASES_ON_BAIT: The number of aligned, de-duped, on-bait bases out of the PF bases available. - 24. PCT_USABLE_BASES_ON_TARGET: The number of aligned, de-duped, on-target bases out of the PF bases available. - 25. FOLD_ENRICHMENT: The fold by which the baited region has been amplified above genomic background. - 26. ZERO_CVG_TARGETS_PCT: The number of targets that did not reach coverage=2 over any base. - 27. FOLD_80_BASE_PENALTY: The fold over-coverage necessary to raise 80% of bases in "non-zero-cvg" targets to the mean coverage level in those targets. - 28. PCT_TARGET_BASES_2X: The percentage of ALL target bases acheiving 2X or greater coverage. - 29. PCT_TARGET_BASES_10X: The percentage of ALL target bases acheiving 10X or greater coverage. - 30. PCT_TARGET_BASES_20X: The percentage of ALL target bases acheiving 20X or greater coverage. - 31. PCT_TARGET_BASES_30X: The percentage of ALL target bases acheiving 30X or greater coverage. - 32. 
- 32. HS_LIBRARY_SIZE: The estimated number of unique molecules in the selected part of the library. - 33. HS_PENALTY_10X: The "hybrid selection penalty" incurred to get 80% of target bases to 10X. This metric should be interpreted as: if I have a design with 10 megabases of target, and want to get 10X coverage I need to sequence until PF_ALIGNED_BASES = 10^6 * 10 * HS_PENALTY_10X. - 34. HS_PENALTY_20X: The "hybrid selection penalty" incurred to get 80% of target bases to 20X. This metric should be interpreted as: if I have a design with 10 megabases of target, and want to get 20X coverage I need to sequence until PF_ALIGNED_BASES = 10^6 * 20 * HS_PENALTY_20X. - 35. HS_PENALTY_30X: The "hybrid selection penalty" incurred to get 80% of target bases to 30X. This metric should be interpreted as: if I have a design with 10 megabases of target, and want to get 30X coverage I need to sequence until PF_ALIGNED_BASES = 10^6 * 30 * HS_PENALTY_30X. - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/rgPicardInsertSize.xml --- a/tools/picard/rgPicardInsertSize.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ - - for PAIRED data - picard - - picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --taillimit "$tailLimit" - --histwidth "$histWidth" --minpct "$minPct" - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file" - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Reads a SAM or BAM file and describes the distribution -of insert size (excluding duplicates) with metrics and a histogram plot. - -**Picard documentation** - -This is a Galaxy wrapper for CollectInsertSizeMetrics, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - -.. class:: warningmark - -**Useful for paired data only** - -This tool works for paired data only and can be expected to fail for single end data. - ------ - -.. class:: infomark - -**Inputs, outputs, and parameters** - -Picard documentation says (reformatted for Galaxy): - -.. csv-table:: - :header-rows: 1 - - Option,Description - "INPUT=File","SAM or BAM file Required." - "OUTPUT=File","File to write insert size metrics to Required." - "HISTOGRAM_FILE=File","File to write insert size histogram chart to Required." - "TAIL_LIMIT=Integer","When calculating mean and stdev stop when the bins in the tail of the distribution contain fewer than mode/TAIL_LIMIT items. This also limits how much data goes into each data category of the histogram." - "HISTOGRAM_WIDTH=Integer","Explicitly sets the histogram width, overriding the TAIL_LIMIT option. Also, when calculating mean and stdev, only bins LE HISTOGRAM_WIDTH will be included. " - "MINIMUM_PCT=Float","When generating the histogram, discard any data categories (out of FR, TANDEM, RF) that have fewer than this percentage of overall reads. (Range: 0 to 1) Default value: 0.01." - "STOP_AFTER=Integer","Stop after processing N reads, mainly for debugging. Default value: 0." - "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false."
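As a rough illustration of the TAIL_LIMIT behaviour described in the table above (a sketch of the idea, not Picard's exact algorithm)::

    from collections import Counter

    def core_insert_sizes(sizes, tail_limit=10):
        # Drop histogram bins holding fewer than mode/TAIL_LIMIT items,
        # then keep only observations inside the surviving range; mean and
        # stdev would be computed over what remains.
        hist = Counter(sizes)
        cutoff = max(hist.values()) / float(tail_limit)
        kept = [size for size, count in hist.items() if count >= cutoff]
        low, high = min(kept), max(kept)
        return [s for s in sizes if low <= s <= high]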
- "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false." - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/rgPicardLibComplexity.xml --- a/tools/picard/rgPicardLibComplexity.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,122 +0,0 @@ - - - picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --minid "$minIDbases" - --maxdiff "$maxDiff" --minmeanq "$minMeanQ" --readregex "$readRegex" --optdupdist "$optDupeDist" - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/EstimateLibraryComplexity.jar" -d "$html_file.files_path" -t "$html_file" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Attempts to estimate library complexity from sequence alone. -Does so by sorting all reads by the first N bases (5 by default) of each read and then -comparing reads with the first N bases identical to each other for duplicates. Reads are considered to be -duplicates if they match each other with no gaps and an overall mismatch rate less than or equal to MAX_DIFF_RATE (0.03 by default). - -Reads of poor quality are filtered out so as to provide a more accurate estimate. -The filtering removes reads with any no-calls in the first N bases or with a mean base quality lower than -MIN_MEAN_QUALITY across either the first or second read. - -The algorithm attempts to detect optical duplicates separately from PCR duplicates and excludes these in the -calculation of library size. Also, since there is no alignment to screen out technical reads one -further filter is applied on the data. After examining all reads a histogram is built of -[#reads in duplicate set -> #of duplicate sets]; all bins that contain exactly one duplicate set are -then removed from the histogram as outliers before library size is estimated. - -**Picard documentation** - -This is a Galaxy wrapper for EstimateLibraryComplexity, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------ - -.. class:: infomark - -**Inputs, outputs, and parameters** - -Picard documentation says (reformatted for Galaxy): - -.. csv-table:: - :header-rows: 1 - - Option Description - "INPUT=File","One or more files to combine and estimate library complexity from. Reads can be mapped or unmapped. This option may be specified 0 or more times." - "OUTPUT=File","Output file to writes per-library metrics to. Required." - "MIN_IDENTICAL_BASES=Integer","The minimum number of bases at the starts of reads that must be identical for reads to be grouped together for duplicate detection. In effect total_reads / 4^max_id_bases reads will be compared at a time, so lower numbers will produce more accurate results but consume exponentially more memory and CPU. Default value: 5." - "MAX_DIFF_RATE=Double","The maximum rate of differences between two reads to call them identical. Default value: 0.03. " - "MIN_MEAN_QUALITY=Integer","The minimum mean quality of the bases in a read pair for the read to be analyzed. 
-.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - -.. class:: infomark - -**Note on the Regular Expression** - -(from the Picard docs) -This tool requires a valid regular expression to parse out the read names in the incoming SAM or BAM file. -These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. -The regular expression should contain three capture groups for the three variables, in order. -Default value: [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*. - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/rgPicardMarkDups.xml --- a/tools/picard/rgPicardMarkDups.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,130 +0,0 @@ - - - picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" -o "$out_file" - --remdups "$remDups" --assumesorted "$assumeSorted" --readregex "$readRegex" --optdupdist "$optDupeDist" - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/MarkDuplicates.jar" -d "$html_file.files_path" -t "$html_file" -e "$input_file.ext" - - picard - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Marks all duplicate reads in a provided SAM or BAM file and either removes them or flags them. - -**Picard documentation** - -This is a Galaxy wrapper for MarkDuplicates, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------ - -.. class:: infomark - -**Inputs, outputs, and parameters** - -Picard documentation says (reformatted for Galaxy): - -.. csv-table:: Mark Duplicates docs - :header-rows: 1 - - Option,Description - "INPUT=File","The input SAM or BAM file to analyze. Must be coordinate sorted. Required." - "OUTPUT=File","The output file to write marked records to Required." - "METRICS_FILE=File","File to write duplication metrics to Required."
- "REMOVE_DUPLICATES=Boolean","If true do not write duplicates to the output file instead of writing them with appropriate flags set. Default value: false." - "ASSUME_SORTED=Boolean","If true, assume that the input file is coordinate sorted, even if the header says otherwise. Default value: false." - "MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=Integer","This option is obsolete. ReadEnds will always be spilled to disk. Default value: 50000." - "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=Integer","Maximum number of file handles to keep open when spilling read ends to disk." - "READ_NAME_REGEX=String","Regular expression that can be used to parse read names in the incoming SAM file. Read names are parsed to extract three variables: tile/region, x coordinate and y coordinate. " - "OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer","The maximum offset between two duplicte clusters in order to consider them optical duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels) unless using later versions of the Illumina pipeline that multiply pixel values by 10, in which case 50-100 is more normal. Default value: 100" - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. -.. class:: infomark - -**Note on the Regular Expression** - -(from the Picard docs) -This tool requires a valid regular expression to parse out the read names in the incoming SAM or BAM file. These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. The regular expression should contain three capture groups for the three variables, in order. Default value: [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+). - -Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules. All records are then written to the output file with the duplicate records flagged unless the remove duplicates option is selected. In some cases you may want to do this, but please only do this if you really understand what you are doing. - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/bar_chart.py --- a/tools/plotting/bar_chart.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,144 +0,0 @@ -#!/usr/bin/env python - - -""" -histogram_gnuplot.py <ylabel> <yrange_min> <yrange_max> <grath_file> -a generic histogram builder based on gnuplot backend - - data_file - tab delimited file with data - xtic_column - column containing labels for x ticks [integer, 0 means no ticks] - column_list - comma separated list of columns to plot - title - title for the entire histrogram - ylabel - y axis label - yrange_max - minimal value at the y axis (integer) - yrange_max - maximal value at the y_axis (integer) - to set yrange to autoscaling assign 0 to yrange_min and yrange_max - graph_file - file to write histogram image to - img_size - as X,Y pair in pixels (e.g., 800,600 or 600,800 etc.) 
- -import Gnuplot, Gnuplot.funcutils -import sys, string, tempfile, os - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(tmpFileName): - skipped_lines_count = 0 - skipped_lines_index = [] - gf = open(tmpFileName, 'w') - - - try: - in_file = open( sys.argv[1], 'r' ) - xtic = int( sys.argv[2] ) - col_list = string.split( sys.argv[3],"," ) - title = 'set title "' + sys.argv[4] + '"' - ylabel = 'set ylabel "' + sys.argv[5] + '"' - ymin = sys.argv[6] - ymax = sys.argv[7] - img_file = sys.argv[8] - img_size = sys.argv[9] - except: - stop_err("Check arguments\n") - - try: - int( col_list[0] ) - except: - stop_err('You forgot to set columns for plotting\n') - - - for i, line in enumerate( in_file ): - valid = True - line = line.rstrip('\r\n') - if line and not line.startswith( '#' ): - row = [] - try: - fields = line.split( '\t' ) - for col in col_list: - row.append( str( float( fields[int( col )-1] ) ) ) - - except: - valid = False - skipped_lines_count += 1 - skipped_lines_index.append(i) - - else: - valid = False - skipped_lines_count += 1 - skipped_lines_index.append(i) - - if valid and xtic > 0: - row.append( fields[xtic-1] ) - elif valid and xtic == 0: - row.append( str( i ) ) - - if valid: - gf.write( '\t'.join( row ) ) - gf.write( '\n' ) - - if skipped_lines_count < i: - - #prepare 'using' clause of plot statement - - g_plot_command = ' ' - - #set the first column - if xtic > 0: - g_plot_command = "'%s' using 1:xticlabels(%s) ti 'Column %s', " % ( tmpFileName, str( len( row ) ), col_list[0] ) - else: - g_plot_command = "'%s' using 1 ti 'Column %s', " % ( tmpFileName, col_list[0] ) - - #set subsequent columns - - for i in range(1,len(col_list)): - g_plot_command += "'%s' using %s t 'Column %s', " % ( tmpFileName, str( i+1 ), col_list[i] ) - - g_plot_command = g_plot_command.rstrip( ', ' ) - - yrange = 'set yrange [' + ymin + ":" + ymax + ']' - - try: - g = Gnuplot.Gnuplot() - g('reset') - g('set boxwidth 0.9 absolute') - g('set style fill solid 1.00 border -1') - g('set style histogram clustered gap 5 title offset character 0, 0, 0') - g('set xtics border in scale 1,0.5 nomirror rotate by 90 offset character 0, 0, 0') - g('set key invert reverse Left outside') - if xtic == 0: g('unset xtics') - g(title) - g(ylabel) - g_term = 'set terminal png tiny size ' + img_size - g(g_term) - g_out = 'set output "' + img_file + '"' - if ymin != ymax: - g(yrange) - g(g_out) - g('set style data histograms') - g.plot(g_plot_command) - except: - stop_err("Gnuplot error: Data cannot be plotted") - else: - sys.stderr.write('Column(s) %s of your dataset do not contain valid numeric data' %sys.argv[3] ) - - if skipped_lines_count > 0: - sys.stdout.write('\nWARNING. Your dataset contains %d invalid line(s) starting with line #%d. 
These lines were skipped while building the graph.\n' % ( skipped_lines_count, skipped_lines_index[0]+1 ) ) - - -if __name__ == "__main__": - # The tempfile initialization is here because while inside the main() it seems to create a condition - # when the file is removed before gnuplot has a chance of accessing it - gp_data_file = tempfile.NamedTemporaryFile('w') - Gnuplot.gp.GnuplotOpts.default_term = 'png' - main(gp_data_file.name) - diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/bar_chart.xml --- a/tools/plotting/bar_chart.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ -<tool id="barchart_gnuplot" name="Bar chart"> - <description>for multiple columns</description> - <command interpreter="python"> - #if $xtic.userSpecified == "Yes" #bar_chart.py $input $xtic.xticColumn $colList "$title" "$ylabel" $ymin $ymax $out_file1 "$pdf_size" - #else #bar_chart.py $input 0 $colList "$title" "$ylabel" $ymin $ymax $out_file1 "$pdf_size" - #end if - </command> - <inputs> - <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? See TIP below"/> - <conditional name="xtic"> - <param name="userSpecified" type="select" label="Use X Tick labels?" help="see example below"> - <option value="Yes">Yes</option> - <option value="No">No</option> - </param> - <when value="Yes"> - <param name="xticColumn" type="data_column" data_ref="input" numerical="False" label="Use this column for X Tick labels" /> - </when> - <when value="No"> - </when> - </conditional> - <param name="colList" label="Numerical columns" type="data_column" numerical="True" multiple="True" data_ref="input" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" /> - <param name="title" type="text" size="30" value="Bar Chart" label="Plot title"/> - <param name="ylabel" type="text" size="30" value="V1" label="Label for Y axis"/> - <param name="ymin" type="integer" size="4" value="0" label="Minimal value on Y axis" help="set to 0 for autoscaling"/> - <param name="ymax" type="integer" size="4" value="0" label="Maximal value on Y axis" help="set to 0 for autoscaling"/> - <param name="pdf_size" type="select" label="Choose chart size (pixels)"> - <option value="800,600">Normal: 800 by 600</option> - <option value="640,480">Small: 640 by 480</option> - <option value="1480,800">Large: 1480 by 800</option> - <option value="600,800">Normal Flipped: 600 by 800</option> - <option value="480,640">Small Flipped: 480 by 640</option> - <option value="800,1480">Large Flipped: 800 by 1480</option> - </param> - </inputs> - <outputs> - <data format="png" name="out_file1" /> - </outputs> - <requirements> - <requirement type="python-module">Gnuplot</requirement> - <requirement type="python-module">Numeric</requirement> - </requirements> - <help> - -**What it does** - -This tool builds a bar chart on one or more columns. Suppose you have dataset like this one:: - - Gene1 10 15 - Gene2 20 14 - Gene3 67 45 - Gene4 55 12 - -Graphing columns 2 and 3 while using column 1 for X Tick Labels will produce the following plot: - -.. 
image:: ./static/images/bar_chart.png - :height: 324 - :width: 540 - -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/boxplot.xml --- a/tools/plotting/boxplot.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,102 +0,0 @@ -<tool id="qual_stats_boxplot" name="Boxplot" version="1.0.0"> - <description>of quality statistics</description> - <command>gnuplot < '$gnuplot_commands' 2>&1 || echo "Error running gnuplot." >&2</command> - <requirements> - <requirement type="binary" version="gnuplot 4.2 patchlevel 2">gnuplot</requirement> - </requirements> - <inputs> - <param name="input_file" type="data" format="tabular" label="Quality Statistics File"/> - <param name="title" type="text" value="Box plot in Galaxy" label="Title for plot" size="50"/> - <param name="graph_size" type="text" value="2048,768" label="Dimensions of Graph"/> - <param name="xlabel" type="text" value="X Axis Label" label="X axis label" size="50"/> - <param name="ylabel" type="text" value="Score Value" label="Y axis label" size="50"/> - <param name="xcol" type="data_column" data_ref="input_file" label="Column for X axis position" default_value="1" help="A unique number; c1 if plotting output of FASTQ summary"/> - <param name="q1col" type="data_column" data_ref="input_file" label="Column for Q1" default_value="7" help="c7 if plotting output of FASTQ summary"/> - <param name="medcol" type="data_column" data_ref="input_file" label="Column for Median" default_value="8" help="c8 if plotting output of FASTQ summary"/> - <param name="q3col" type="data_column" data_ref="input_file" label="Column for Q3" default_value="9" help="c9 if plotting output of FASTQ summary"/> - <param name="lwcol" type="data_column" data_ref="input_file" label="Column for left whisker" default_value="11" help="c11 if plotting output of FASTQ summary"/> - <param name="rwcol" type="data_column" data_ref="input_file" label="Column for right whisker" default_value="12" help="c12 if plotting output of FASTQ summary"/> - <conditional name="use_outliers"> - <param name="use_outliers_type" type="select" label="Plot Outliers"> - <option value="use_outliers" selected="true">Plot Outliers</option> - <option value="dont_use_outliers">Don't Plot Outliers</option> - </param> - <when value="use_outliers"> - <param name="outliercol" type="data_column" data_ref="input_file" label="Column for Outliers" default_value="13" help="c13 if plotting output of FASTQ summary"/> - </when> - <when value="dont_use_outliers"> - </when> - </conditional> - </inputs> - <configfiles> - <configfile name="gnuplot_commands"> -set output '$output_file' -set term png size ${graph_size} -set boxwidth 0.8 -set key right tmargin -set xlabel "${xlabel}" -set ylabel "${ylabel}" -set title "${title}" -set xtics 1 -set ytics 1 -set grid ytics -set offsets 1, 1, 1, 1 -plot '${input_file}' using ${xcol}:${q1col}:${lwcol}:${rwcol}:${q3col} with candlesticks lt 1 lw 1 title 'Quartiles' whiskerbars, \ - '' using ${xcol}:${medcol}:${medcol}:${medcol}:${medcol} with candlesticks lt -1 lw 2 title 'Medians'\ -#if str( $use_outliers['use_outliers_type'] ) == 'use_outliers': -, "< python -c \"for xval, yvals in [ ( fields[${xcol} - 1], fields[${use_outliers['outliercol']} - 1].split( ',' ) ) for fields in [ line.rstrip( '\\n\\r' ).split( '\\t' ) for line in open( '${input_file}' ) if not line.startswith( '#' ) ] if len( fields ) > max( ${xcol} - 1, ${use_outliers['outliercol']} - 1 ) ]: print '\\n'.join( [ '%s\\t%s' % ( xval, yval ) for yval in yvals if yval ] 
)\"" using 1:2 with points pt 29 title 'Outliers' -#end if - </configfile> - </configfiles> - <outputs> - <data name="output_file" format="png" /> - </outputs> - <tests> - <test> - <param name="input_file" value="fastq_stats_1_out.tabular" ftype="tabular" /> - <param name="title" value="Boxplot of Summary Statistics for Sanger Reads" /> - <param name="graph_size" value="2048,768" /> - <param name="xlabel" value="Read Column" /> - <param name="ylabel" value="Quality Score Value" /> - <param name="xcol" value="1" /> - <param name="q1col" value="7" /> - <param name="medcol" value="8" /> - <param name="q3col" value="9" /> - <param name="lwcol" value="11" /> - <param name="rwcol" value="12" /> - <param name="use_outliers_type" value="use_outliers" /> - <param name="outliercol" value="13" /> - <output name="output_file" file="boxplot_summary_statistics_out.png" /> - </test> - </tests> - <help> - -**What it does** - -Creates a boxplot graph. Its main purpose is to display a distribution of quality scores produced by *NGS: QC and maniupulation -> FASTQ Summary Statistics* tool. - -.. class:: warningmark - -**TIP:** If you want to display a distribution of quality scores produced by *NGS: QC and maniupulation -> FASTQ Summary Statistics* and the column assignments within the tool's interface are not automatically set (they will all read "c1" in that case) set columns manually to the following values:: - - Column for X axis c1 - Column for Q1 c7 - Column for Median c8 - Column for Q3 c9 - Column for left whisker c11 - Column for right whisker c12 - Column for Outliers c13 - ------ - -**Output Example** - -* Black horizontal lines are medians -* Rectangular red boxes show the Inter-quartile Range (IQR) (top value is Q3, bottom value is Q1) -* Whiskers show outliers at max. 1.5*IQR - -.. image:: ./static/images/solid_qual.png - - - </help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/histogram.py --- a/tools/plotting/histogram.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ -#!/usr/bin/env python -#Greg Von Kuster - -import sys -from rpy import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - - # Handle input params - in_fname = sys.argv[1] - out_fname = sys.argv[2] - try: - column = int( sys.argv[3] ) - 1 - except: - stop_err( "Column not specified, your query does not contain a column of numerical data." 
) - title = sys.argv[4] - xlab = sys.argv[5] - breaks = int( sys.argv[6] ) - if breaks == 0: - breaks = "Sturges" - if sys.argv[7] == "true": - density = True - else: density = False - if len( sys.argv ) >= 9 and sys.argv[8] == "true": - frequency = True - else: frequency = False - - matrix = [] - skipped_lines = 0 - first_invalid_line = 0 - invalid_value = '' - i = 0 - for i, line in enumerate( file( in_fname ) ): - valid = True - line = line.rstrip('\r\n') - # Skip comments - if line and not line.startswith( '#' ): - # Extract values and convert to floats - row = [] - try: - fields = line.split( "\t" ) - val = fields[column] - if val.lower() == "na": - row.append( float( "nan" ) ) - except: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i+1 - else: - try: - row.append( float( val ) ) - except ValueError: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i+1 - invalid_value = fields[column] - else: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i+1 - - if valid: - matrix += row - - if skipped_lines < i: - try: - a = r.array( matrix ) - r.pdf( out_fname, 8, 8 ) - histogram = r.hist( a, probability=not frequency, main=title, xlab=xlab, breaks=breaks ) - if density: - density = r.density( a ) - if frequency: - scale_factor = len( matrix ) * ( histogram['mids'][1] - histogram['mids'][0] ) #uniform bandwidth taken from first 2 midpoints - density[ 'y' ] = map( lambda x: x * scale_factor, density[ 'y' ] ) - r.lines( density ) - r.dev_off() - except Exception, exc: - stop_err( "%s" %str( exc ) ) - else: - if i == 0: - stop_err("Input dataset is empty.") - else: - stop_err( "All values in column %s are non-numeric." %sys.argv[3] ) - - print "Histogram of column %s. " %sys.argv[3] - if skipped_lines > 0: - print "Skipped %d invalid lines starting with line #%d, '%s'." % ( skipped_lines, first_invalid_line, invalid_value ) - - r.quit( save="no" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/histogram2.xml --- a/tools/plotting/histogram2.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ -<tool id="histogram_rpy" name="Histogram" version="1.0.3"> - <description>of a numeric column</description> - <command interpreter="python">histogram.py $input $out_file1 $numerical_column "$title" "$xlab" $breaks $density $frequency</command> - <inputs> - <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? 
See TIP below"/> - <param name="numerical_column" type="data_column" data_ref="input" numerical="True" label="Numerical column for x axis" /> - <param name="breaks" type="integer" size="4" value="0" label="Number of breaks (bars)"/> - <param name="title" type="text" size="30" value="Histogram" label="Plot title"/> - <param name="xlab" type="text" size="30" value="V1" label="Label for x axis"/> - <param name="density" type="boolean" checked="yes" label="Include smoothed density"/> - <param name="frequency" type="boolean" checked="no" label="Plot as frequency (counts)"/> - </inputs> - <outputs> - <data format="pdf" name="out_file1" /> - </outputs> - <tests> - <test> - <param name="input" value="histogram_in1.tabular" ftype="tabular"/> - <param name="numerical_column" value="2"/> - <param name="breaks" value="0"/> - <param name="title" value="Histogram"/> - <param name="xlab" value="V1"/> - <param name="density" value="true"/> - <param name="frequency" value="false"/> - <output name="out_file1" file="histogram_out1.pdf"/> - </test> - </tests> - <requirements> - <requirement type="python-module">rpy</requirement> - </requirements> - <help> - -.. class:: infomark - -**TIP:** To remove comment lines that do not begin with a *#* character, use *Text Manipulation->Remove beginning* - - .. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool computes a histogram of the numerical values in a column of a dataset. - -- All invalid, blank and comment lines in the dataset are skipped. The number of skipped lines is displayed in the resulting history item. -- **Column for x axis** - only numerical columns are possible. -- **Number of breaks(bars)** - breakpoints between histogram cells. Value of '0' will determine breaks automatically. -- **Plot title** - the histogram title. -- **Label for x axis** - the label of the x axis for the histogram. -- **Include smoothed density** - if checked, the resulting graph will join the given corresponding points with line segments. - ------ - -**Example** - -- Input file:: - - 1 68 4.1 - 2 71 4.6 - 3 62 3.8 - 4 75 4.4 - 5 58 3.2 - 6 60 3.1 - 7 67 3.8 - 8 68 4.1 - 9 71 4.3 - 10 69 3.7 - -- Create a histogram on column 2 of the above dataset. - -.. 
image:: ./static/images/histogram2.png - -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/plot_filter.py --- a/tools/plotting/plot_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ - -def validate(incoming): - """Validator for the plotting program""" - - bins = incoming.get("bins","") - col = incoming.get("col","") - - if not bins or not col: - raise Exception, "You need to specify a number for bins and columns" - - try: - bins = int(bins) - col = int(col) - except: - raise Exception, "Parameters are not valid numbers, columns:%s, bins:%s" % (col, bins) - - if not 1<bins<100: - raise Exception, "The number of bins %s must be a number between 1 and 100" % bins - diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/plotter.py --- a/tools/plotting/plotter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ -#!/usr/bin/env python - -# python histogram input_file output_file column bins -import sys, os -import matplotlib; matplotlib.use('Agg') - -from pylab import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -if __name__ == '__main__': - # parse the arguments - - if len(sys.argv) != 6: - stop_err('Usage: python histogram.py input_file column bins output_file style') - sys.exit() - - mode = sys.argv[5] - HIST = mode == 'hist' - try: - col = int(float(sys.argv[2])) - if HIST: - bin = int(float(sys.argv[3])) - else: - # hack, this parameter is the plotting style for scatter plots - if sys.argv[3] == 'P': - style = 'o' - elif sys.argv[3] == 'LP': - style = 'o-' - else: - style = '-' - - except: - msg = 'Parameter were not numbers %s, %s' % (sys.argv[3], sys.argv[4]) - stop_err(msg) - - # validate arguments - inp_file = sys.argv[1] - out_file = sys.argv[4] - - if HIST: - print "Histogram on column %s (%s bins)" % (col, bin) - else: - print "Scatterplot on column %s" % (col) - - xcol= col -1 - # read the file - values = [] - try: - count = 0 - for line in file(inp_file): - count += 1 - line = line.strip() - if line and line[0] != '#': - values.append(float(line.split()[xcol])) - except Exception, e: - stop_err('%s' % e) - stop_err("Non numerical data at line %d, column %d" % (count, col) ) - - # plot the data - - if HIST: - n, bins, patches = hist(values, bins=bin, normed=0) - else: - plot(values, style) - - xlabel('values') - ylabel('counts') - - if HIST: - title('Histogram of values over column %s (%s bins)' % (col, len(bins)) ) - else: - title('Scatterplot over column %s' % col ) - grid(True) - - # the plotter detects types by file extension - png_out = out_file + '.png' # force it to png - savefig(png_out) - - # shuffle it back and clean up - data = file(png_out, 'rb').read() - fp = open(out_file, 'wb') - fp.write(data) - fp.close() - os.remove(png_out) diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/r_wrapper.sh --- a/tools/plotting/r_wrapper.sh Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -#!/bin/sh - -### Run R providing the R script in $1 as standard input and passing -### the remaining arguments on the command line - -# Function that writes a message to stderr and exits -function fail -{ - echo "$@" >&2 - exit 1 -} - -# Ensure R executable is found -which R > /dev/null || fail "'R' is required by this tool but was not found on path" - -# Extract first argument -infile=$1; shift - -# Ensure the file exists -test -f $infile || fail "R input file '$infile' does not 
exist" - -# Invoke R passing file named by first argument to stdin -R --vanilla --slave $* < $infile diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/scatterplot.py --- a/tools/plotting/scatterplot.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ -#!/usr/bin/env python -#Greg Von Kuster - -import sys -from rpy import * - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - - in_fname = sys.argv[1] - out_fname = sys.argv[2] - try: - columns = int( sys.argv[3] ) - 1, int( sys.argv[4] ) - 1 - except: - stop_err( "Columns not specified, your query does not contain a column of numerical data." ) - title = sys.argv[5] - xlab = sys.argv[6] - ylab = sys.argv[7] - - matrix = [] - skipped_lines = 0 - first_invalid_line = 0 - invalid_value = '' - invalid_column = 0 - i = 0 - for i, line in enumerate( file( in_fname ) ): - valid = True - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - row = [] - fields = line.split( "\t" ) - for column in columns: - try: - val = fields[column] - if val.lower() == "na": - row.append( float( "nan" ) ) - else: - row.append( float( fields[column] ) ) - except: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i + 1 - try: - invalid_value = fields[column] - except: - invalid_value = '' - invalid_column = column + 1 - break - else: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i+1 - - if valid: - matrix.append( row ) - - if skipped_lines < i: - try: - r.pdf( out_fname, 8, 8 ) - r.plot( array( matrix ), type="p", main=title, xlab=xlab, ylab=ylab, col="blue", pch=19 ) - r.dev_off() - except Exception, exc: - stop_err( "%s" %str( exc ) ) - else: - stop_err( "All values in both columns %s and %s are non-numeric or empty." % ( sys.argv[3], sys.argv[4] ) ) - - print "Scatter plot on columns %s, %s. " % ( sys.argv[3], sys.argv[4] ) - if skipped_lines > 0: - print "Skipped %d lines starting with line #%d, value '%s' in column %d is not numeric." % ( skipped_lines, first_invalid_line, invalid_value, invalid_column ) - - r.quit( save="no" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/scatterplot.xml --- a/tools/plotting/scatterplot.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -<tool id="scatterplot_rpy" name="Scatterplot"> - <description>of two numeric columns</description> - <command interpreter="python">scatterplot.py $input $out_file1 $col1 $col2 "$title" "$xlab" "$ylab"</command> - <inputs> - <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? See TIP below"/> - <param name="col1" type="data_column" data_ref="input" numerical="True" label="Numerical column for x axis" /> - <param name="col2" type="data_column" data_ref="input" numerical="True" label="Numerical column for y axis" /> - <param name="title" size="30" type="text" value="Scatterplot" label="Plot title"/> - <param name="xlab" size="30" type="text" value="V1" label="Label for x axis"/> - <param name="ylab" size="30" type="text" value="V2" label="Label for y axis"/> - </inputs> - <outputs> - <data format="pdf" name="out_file1" /> - </outputs> - <requirements> - <requirement type="python-module">rpy</requirement> - </requirements> - <!-- TODO: uncomment the following test when we have tools.update_state() working for - multiple dependents with the same dependency. 
- <tests> - <test> - <param name="input" value="scatterplot_in1.tabular" ftype="tabular"/> - <param name="col1" value="2"/> - <param name="col2" value="3"/> - <param name="title" value="Scatterplot"/> - <param name="xlab" value="V1"/> - <param name="ylab" value="V2"/> - <output name="out_file1" file="scatterplot_out1.pdf" /> - </test> - </tests> - --> - <help> - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool creates a simple scatter plot between two variables containing numeric values of a selected dataset. - -- All invalid, blank and comment lines in the dataset are skipped. The number of skipped lines is displayed in the resulting history item. - -- **Plot title** The scatterplot title -- **Label for x axis** and **Label for y axis** The labels for x and y axis of the scatterplot. - ------ - -**Example** - -- Input file:: - - 1 68 4.1 - 2 71 4.6 - 3 62 3.8 - 4 75 4.4 - 5 58 3.2 - 6 60 3.1 - 7 67 3.8 - 8 68 4.1 - 9 71 4.3 - 10 69 3.7 - -- Create a simple scatterplot between the variables in column 2 and column 3 of the above dataset. - -.. image:: ./static/images/scatterplot.png - -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/xy_plot.xml --- a/tools/plotting/xy_plot.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,148 +0,0 @@ -<tool id="XY_Plot_1" name="Plotting tool" version="1.0.1"> - <description>for multiple series and graph types</description> - <command interpreter="bash">r_wrapper.sh $script_file</command> - - <inputs> - <param name="main" type="text" value="" size="30" label="Plot Title"/> - <param name="xlab" type="text" value="" size="30" label="Label for x axis"/> - <param name="ylab" type="text" value="" size="30" label="Label for y axis"/> - <repeat name="series" title="Series"> - <param name="input" type="data" format="tabular" label="Dataset"/> - <param name="xcol" type="data_column" data_ref="input" label="Column for x axis"/> - <param name="ycol" type="data_column" data_ref="input" label="Column for y axis"/> - <conditional name="series_type"> - <param name="type" type="select" label="Series Type"> - <option value="line" selected="true">Line</option> - <option value="points">Points</option> - </param> - <when value="line"> - <param name="lty" type="select" label="Line Type"> - <option value="1">Solid</option> - <option value="2">Dashed</option> - <option value="3">Dotted</option> - </param> - <param name="col" type="select" label="Line Color"> - <option value="1">Black</option> - <option value="2">Red</option> - <option value="3">Green</option> - <option value="4">Blue</option> - <option value="5">Cyan</option> - <option value="6">Magenta</option> - <option value="7">Yellow</option> - <option value="8">Gray</option> - </param> - <param name="lwd" type="float" label="Line Width" value="1.0"/> - </when> - <when value="points"> - <param name="pch" type="select" label="Point Type"> - <option value="1">Circle (hollow)</option> - <option value="2">Triangle (hollow)</option> - <option value="3">Cross</option> - <option value="4">Diamond (hollow)</option> - <option value="15">Square (filled)</option> - <option value="16">Circle (filled)</option> - <option value="17">Triangle (filled)</option> - </param> - <param name="col" type="select" label="Point Color"> - <option value="1">Black</option> - <option value="2">Red</option> - <option value="3">Green</option> - <option value="4">Blue</option> - <option value="5">Cyan</option> - 
<option value="6">Magenta</option> - <option value="7">Yellow</option> - <option value="8">Gray</option> - </param> - <param name="cex" type="float" label="Point Scale" value="1.0"/> - </when> - </conditional> - </repeat> - </inputs> - - <configfiles> - <configfile name="script_file"> - ## Setup R error handling to go to stderr - options( show.error.messages=F, - error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) - ## Determine range of all series in the plot - xrange = c( NULL, NULL ) - yrange = c( NULL, NULL ) - #for $i, $s in enumerate( $series ) - s${i} = read.table( "${s.input.file_name}" ) - x${i} = s${i}[,${s.xcol}] - y${i} = s${i}[,${s.ycol}] - xrange = range( x${i}, xrange ) - yrange = range( y${i}, yrange ) - #end for - ## Open output PDF file - pdf( "${out_file1}" ) - ## Dummy plot for axis / labels - plot( NULL, type="n", xlim=xrange, ylim=yrange, main="${main}", xlab="${xlab}", ylab="${ylab}" ) - ## Plot each series - #for $i, $s in enumerate( $series ) - #if $s.series_type['type'] == "line" - lines( x${i}, y${i}, lty=${s.series_type.lty}, lwd=${s.series_type.lwd}, col=${s.series_type.col} ) - #elif $s.series_type.type == "points" - points( x${i}, y${i}, pch=${s.series_type.pch}, cex=${s.series_type.cex}, col=${s.series_type.col} ) - #end if - #end for - ## Close the PDF file - devname = dev.off() - </configfile> - </configfiles> - - <outputs> - <data format="pdf" name="out_file1" /> - </outputs> - - <tests> - <test> - <param name="main" value="Example XY Plot"/> - <param name="xlab" value="Column 1"/> - <param name="ylab" value="Column 2"/> - <param name="input" value="2.tabular" ftype="tabular"/> - <param name="xcol" value="1"/> - <param name="ycol" value="2"/> - <param name="type" value="line"/> - <param name="lty" value="2"/> - <param name="col" value="2"/> - <param name="lwd" value="1.0"/> - <output name="out_file1" file="XY_Plot_1_out.pdf"/> - </test> - </tests> -<help> -.. class:: infomark - -This tool allows you to plot values contained in columns of a dataset against each other and also allows you to have different series corresponding to the same or different datasets in one plot. - ------ - -.. class:: warningmark - -This tool throws an error if the columns selected for plotting are absent or are not numeric and also if the lengths of these columns differ. - ------ - -**Example** - -Input file:: - - 1 68 4.1 - 2 71 4.6 - 3 62 3.8 - 4 75 4.4 - 5 58 3.2 - 6 60 3.1 - 7 67 3.8 - 8 68 4.1 - 9 71 4.3 - 10 69 3.7 - -Create a two series XY plot on the above data: - -- Series 1: Red Dashed-Line plot between columns 1 and 2 -- Series 2: Blue Circular-Point plot between columns 3 and 2 - -.. 
image:: ./static/images/xy_example.jpg -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/best_regression_subsets.py --- a/tools/regVariation/best_regression_subsets.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ -#!/usr/bin/env python - -from galaxy import eggs - -import sys, string -from rpy import * -import numpy - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -infile = sys.argv[1] -y_col = int(sys.argv[2])-1 -x_cols = sys.argv[3].split(',') -outfile = sys.argv[4] -outfile2 = sys.argv[5] -print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) -fout = open(outfile,'w') - -for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - -if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." ) - -y_vals = [] -x_vals = [] - -for k,col in enumerate(x_cols): - x_cols[k] = int(col)-1 - x_vals.append([]) - -NA = 'NA' -for ind,line in enumerate( file( infile )): - if line and not line.startswith( '#' ): - try: - fields = line.split("\t") - try: - yval = float(fields[y_col]) - except Exception, ey: - yval = r('NA') - y_vals.append(yval) - for k,col in enumerate(x_cols): - try: - xval = float(fields[col]) - except Exception, ex: - xval = r('NA') - x_vals[k].append(xval) - except: - pass - -response_term = "" - -x_vals1 = numpy.asarray(x_vals).transpose() - -dat= r.list(x=array(x_vals1), y=y_vals) - -r.library("leaps") - -set_default_mode(NO_CONVERSION) -try: - leaps = r.regsubsets(r("y ~ x"), data= r.na_exclude(dat)) -except RException, rex: - stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.") -set_default_mode(BASIC_CONVERSION) - -summary = r.summary(leaps) -tot = len(x_vals) -pattern = "[" -for i in range(tot): - pattern = pattern + 'c' + str(int(x_cols[int(i)]) + 1) + ' ' -pattern = pattern.strip() + ']' -print >>fout, "#Vars\t%s\tR-sq\tAdj. R-sq\tC-p\tbic" %(pattern) -for ind,item in enumerate(summary['outmat']): - print >>fout, "%s\t%s\t%s\t%s\t%s\t%s" %(str(item).count('*'), item, summary['rsq'][ind], summary['adjr2'][ind], summary['cp'][ind], summary['bic'][ind]) - - -r.pdf( outfile2, 8, 8 ) -r.plot(leaps, scale="Cp", main="Best subsets using Cp Criterion") -r.plot(leaps, scale="r2", main="Best subsets using R-sq Criterion") -r.plot(leaps, scale="adjr2", main="Best subsets using Adjusted R-sq Criterion") -r.plot(leaps, scale="bic", main="Best subsets using bic Criterion") - -r.dev_off() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/best_regression_subsets.xml --- a/tools/regVariation/best_regression_subsets.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,66 +0,0 @@ -<tool id="BestSubsetsRegression1" name="Perform Best-subsets Regression"> - <description> </description> - <command interpreter="python"> - best_regression_subsets.py - $input1 - $response_col - $predictor_cols - $out_file1 - $out_file2 - 1>/dev/null - 2>/dev/null - </command> - <inputs> - <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? 
See TIP below."/>
- <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
- <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" >
- <validator type="no_options" message="Please select at least one column."/>
- </param>
- </inputs>
- <outputs>
- <data format="input" name="out_file1" metadata_source="input1" />
- <data format="pdf" name="out_file2" />
- </outputs>
- <requirements>
- <requirement type="python-module">rpy</requirement>
- </requirements>
- <tests>
- <!-- Testing this tool will not be possible because this tool produces a pdf output file.
- -->
- </tests>
- <help>
-
-.. class:: infomark
-
-**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters*
-
------
-
-.. class:: infomark
-
-**What it does**
-
-This tool uses the 'regsubsets' function from the R package 'leaps' for regression subset selection. It outputs two files: one containing a table with the best subsets and the corresponding summary statistics, and the other containing the graphical representation of the results.
-
------
-
-.. class:: warningmark
-
-**Note**
-
-- This tool currently treats all predictor and response variables as continuous variables.
-
-- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis.
-
-- The 6 columns in the output are described below:
-
- - Column 1 (Vars): denotes the number of variables in the model
- - Column 2 ([c2 c3 c4...]): represents a list of the user-selected predictor variables (full model). An asterisk denotes the presence of the corresponding predictor variable in the selected model.
- - Column 3 (R-sq): the fraction of variance explained by the model
- - Column 4 (Adj. R-sq): the above R-squared statistic, adjusted to penalize a higher number of predictors (p)
- - Column 5 (Cp): Mallows' Cp statistic
- - Column 6 (bic): Bayesian Information Criterion.
-
-
- </help>
-</tool>
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/categorize_elements_satisfying_criteria.pl
--- a/tools/regVariation/categorize_elements_satisfying_criteria.pl Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,172 +0,0 @@
-#!/usr/bin/perl -w
-
-# The program takes as input a set of categories, such that each category contains many elements.
-# It also takes a table relating elements with criteria, such that each element is assigned a number
-# representing the number of times the element satisfies a certain criterion.
-# The first input is a TABULAR format file, such that the left column represents the names of categories and
-# all other columns represent the names of elements.
-# The second input is a TABULAR format file relating elements with criteria, such that the first line
-# represents the names of criteria and the left column represents the names of elements.
-# The output is a TABULAR format file relating categories with criteria, such that each category is
-# assigned a number representing the total number of times its elements satisfy a certain criterion.
-# Each category is assigned as many numbers as criteria.
- -use strict; -use warnings; - -#variables to handle information of the categories input file -my @categoryElementsArray = (); -my @categoriesArray = (); -my $categoryMemberNames; -my $categoryName; -my %categoryMembersHash = (); -my $memberNumber = 0; -my $totalMembersNumber = 0; -my $totalCategoriesNumber = 0; -my @categoryCountersTwoDimArray = (); -my $lineCounter1 = 0; - -#variables to handle information of the criteria and elements data input file -my $elementLine; -my @elementDataArray = (); -my $elementName; -my @criteriaArray = (); -my $criteriaNumber = 0; -my $totalCriteriaNumber = 0; -my $lineCounter2 = 0; - -#variable representing the row and column indices used to store results into a two-dimensional array -my $row = 0; -my $column = 0; - -# check to make sure having correct files -my $usage = "usage: categorize_motifs_significance.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] \n"; -die $usage unless @ARGV == 3; - -#get the categories input file -my $categories_inputFile = $ARGV[0]; - -#get the criteria and data input file -my $elements_data_inputFile = $ARGV[1]; - -#get the output file -my $categorized_data_outputFile = $ARGV[2]; - -#open the input and output files -open (INPUT1, "<", $categories_inputFile) || die("Could not open file $categories_inputFile \n"); -open (INPUT2, "<", $elements_data_inputFile ) || die("Could not open file $elements_data_inputFile \n"); -open (OUTPUT, ">", $categorized_data_outputFile) || die("Could not open file $categorized_data_outputFile \n"); - -#store the first input file into an array -my @categoriesData = <INPUT1>; - -#reset the value of $lineCounter1 to 0 -$lineCounter1 = 0; - -#iterate through the first input file to get the names of categories and their corresponding elements -foreach $categoryMemberNames (@categoriesData){ - chomp ($categoryMemberNames); - - @categoryElementsArray = split(/\t/, $categoryMemberNames); - - #store the name of the current category into an array - $categoriesArray [$lineCounter1] = $categoryElementsArray[0]; - - #store the name of the current category into a two-dimensional array - $categoryCountersTwoDimArray [$lineCounter1] [0] = $categoryElementsArray[0]; - - #get the total number of elements in the current category - $totalMembersNumber = @categoryElementsArray; - - #store the names of categories and their corresponding elements into a hash - for ($memberNumber = 1; $memberNumber < $totalMembersNumber; $memberNumber++) { - - $categoryMembersHash{$categoryElementsArray[$memberNumber]} = $categoriesArray[$lineCounter1]; - } - - $lineCounter1++; -} - -#store the second input file into an array -my @elementsData = <INPUT2>; - -#reset the value of $lineCounter2 to 0 -$lineCounter2 = 0; - -#iterate through the second input file in order to count the number of elements -#in each category that satisfy each criterion -foreach $elementLine (@elementsData){ - chomp ($elementLine); - - $lineCounter2++; - - @elementDataArray = split(/\t/, $elementLine); - - #if at the first line, get the total number of criteria and the total - #number of catergories and initialize the two-dimensional array - if ($lineCounter2 == 1){ - @criteriaArray = @elementDataArray; - $totalCriteriaNumber = @elementDataArray; - - $totalCategoriesNumber = @categoriesArray; - - #initialize the two-dimensional array - for ($row = 0; $row < $totalCategoriesNumber; $row++) { - - for ($column = 1; $column <= $totalCriteriaNumber; $column++) { - - $categoryCountersTwoDimArray [$row][$column] = 0; - } - } - } - else{ - #get the element data - 
$elementName = $elementDataArray[0];
-
- #do the counting and store the result in the two-dimensional array
- for ($criteriaNumber = 0; $criteriaNumber < $totalCriteriaNumber; $criteriaNumber++) {
-
- if ($elementDataArray[$criteriaNumber + 1] > 0){
-
- $categoryName = $categoryMembersHash{$elementName};
-
- my ($categoryIndex) = grep $categoriesArray[$_] eq $categoryName, 0 .. $#categoriesArray;
-
- $categoryCountersTwoDimArray [$categoryIndex] [$criteriaNumber + 1] = $categoryCountersTwoDimArray [$categoryIndex] [$criteriaNumber + 1] + $elementDataArray[$criteriaNumber + 1];
- }
- }
- }
-}
-
-print OUTPUT "\t";
-
-#store the criteria names into the output file
-for ($column = 1; $column <= $totalCriteriaNumber; $column++) {
-
- if ($column < $totalCriteriaNumber){
- print OUTPUT $criteriaArray[$column - 1] . "\t";
- }
- else{
- print OUTPUT $criteriaArray[$column - 1] . "\n";
- }
-}
-
-#store the category names and their corresponding number of elements satisfying criteria into the output file
-for ($row = 0; $row < $totalCategoriesNumber; $row++) {
-
- for ($column = 0; $column <= $totalCriteriaNumber; $column++) {
-
- if ($column < $totalCriteriaNumber){
- print OUTPUT $categoryCountersTwoDimArray [$row][$column] . "\t";
- }
- else{
- print OUTPUT $categoryCountersTwoDimArray [$row][$column] . "\n";
- }
- }
-}
-
-#close the input and output files
-close(OUTPUT);
-close(INPUT2);
-close(INPUT1);
-
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/categorize_elements_satisfying_criteria.xml
--- a/tools/regVariation/categorize_elements_satisfying_criteria.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,78 +0,0 @@
-<tool id="categorize_elements_satisfying_criteria" name="Categorize Elements" version="1.0.0">
- <description>satisfying criteria</description>
-
- <command interpreter="perl">
- categorize_elements_satisfying_criteria.pl $inputFile1 $inputFile2 $outputFile1
- </command>
-
- <inputs>
- <param format="tabular" name="inputFile1" type="data" label="Select file containing categories and their elements"/>
- <param format="tabular" name="inputFile2" type="data" label="Select file containing criteria and elements data"/>
- </inputs>
-
- <outputs>
- <data format="tabular" name="outputFile1"/>
- </outputs>
-
- <tests>
- <test>
- <param name="inputFile1" value="categories.tabular" ftype="tabular" />
- <param name="inputFile2" value="criteria_elements_data.tabular" ftype="tabular" />
- <output name="outputFile1" file="categorized_elements.tabular" />
- </test>
- </tests>
-
-
- <help>
-
-.. class:: infomark
-
-**What it does**
-
-The program takes as input a set of categories, such that each category contains many elements. It also takes a table relating elements with criteria, such that each element is assigned a number representing the number of times the element satisfies a certain criterion.
-
-- The first input is a TABULAR format file, such that the left column represents the names of categories and all other columns represent the names of elements in each category.
-- The second input is a TABULAR format file relating elements with criteria, such that the first line represents the names of criteria and the left column represents the names of elements.
-- The output is a TABULAR format file relating categories with criteria, such that each category is assigned a number representing the total number of times its elements satisfy a certain criterion. Each category is assigned as many numbers as criteria (a minimal sketch of this counting logic appears below).
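-
-A minimal sketch of the counting logic, assuming the element-to-category mapping and the per-element counts for one criterion have already been read from the two inputs (hypothetical values taken from the example below; this illustrates the idea, not the tool's exact code)::
-
-    my %category_of = (
-        deletionHoptspot1 => 'Deletion_Hotspots',
-        deletionHoptspot2 => 'Deletion_Hotspots',
-        insertionHotspot1 => 'Insertion_Hotspots',
-    );
-    # counts for one criterion (one column of the second input)
-    my %count_for = (deletionHoptspot1 => 1, deletionHoptspot2 => 1, insertionHotspot1 => 0);
-    my %category_total;
-    for my $element (keys %count_for) {
-        $category_total{ $category_of{$element} } += $count_for{$element};
-    }
-    print "$_\t$category_total{$_}\n" for sort keys %category_total;    # Deletion_Hotspots 2, Insertion_Hotspots 0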
- - -**Example** - -Let the first input file be a group of motif categories as follows:: - - Deletion_Hotspots deletionHoptspot1 deletionHoptspot2 deletionHoptspot3 - Dna_Pol_Pause_Frameshift dnaPolPauseFrameshift1 dnaPolPauseFrameshift2 dnaPolPauseFrameshift3 dnaPolPauseFrameshift4 - Indel_Hotspots indelHotspot1 - Insertion_Hotspots insertionHotspot1 insertionHotspot2 - Topoisomerase_Cleavage_Sites topoisomeraseCleavageSite1 topoisomeraseCleavageSite2 topoisomeraseCleavageSite3 - - -And let the second input file represent the number of times each motif occurs in a certain window size of indel flanking regions, as follows:: - - 10bp 20bp 40bp - deletionHoptspot1 1 1 2 - deletionHoptspot2 1 1 1 - deletionHoptspot3 0 0 0 - dnaPolPauseFrameshift1 1 1 1 - dnaPolPauseFrameshift2 0 2 1 - dnaPolPauseFrameshift3 0 0 0 - dnaPolPauseFrameshift4 0 1 2 - indelHotspot1 0 0 0 - insertionHotspot1 0 0 1 - insertionHotspot2 1 1 1 - topoisomeraseCleavageSite1 1 1 1 - topoisomeraseCleavageSite2 1 2 1 - topoisomeraseCleavageSite3 0 0 2 - -Running the program will give the total number of times the motifs of each category occur in every window size of indel flanking regions:: - - 10bp 20bp 40bp - Deletion_Hotspots 2 2 3 - Dna_Pol_Pause_Frameshift 1 4 4 - Indel_Hotspots 0 0 0 - Insertion_Hotspots 1 1 2 - Topoisomerase_Cleavage_Sites 2 3 4 - - </help> - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/compute_motif_frequencies_for_all_motifs.pl --- a/tools/regVariation/compute_motif_frequencies_for_all_motifs.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,153 +0,0 @@ -#!/usr/bin/perl -w - -# a program to compute the frequencies of each motif at a window size, determined by the user, in both -# upstream and downstream sequences flanking indels in all chromosomes. -# the first input is a TABULAR format file containing the motif names and sequences, such that the file -# consists of two columns: the left column represents the motif names and the right column represents -# the motif sequence, one line per motif. -# the second input is a TABULAR format file containing the windows into which both upstream and downstream -# sequences flanking indels have been divided. -# the fourth input is an integer number representing the number of windows to be considered in both -# upstream and downstream flanking sequences. -# the output is a TABULAR format file consisting of three columns: the left column represents the motif -# name, the middle column represents the motif frequency in the window of the upstream sequence flanking -# an indel, and the the right column represents the motif frequency in the window of the downstream -# sequence flanking an indel, one line per indel. -# The total number of lines in the output file = number of motifs x number of indels. 
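-
-# A minimal illustration (with hypothetical values) of the counting rule used
-# below: a motif is counted once per considered window in which its sequence
-# occurs, matched case-insensitively.
-{
-    my @considered_windows = ('GAGTT', 'acgga', 'ttgag');
-    my $motif_seq = 'GAG';
-    my $hits = grep { m/$motif_seq/i } @considered_windows;    # 2 windows match
-}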
- -use strict; -use warnings; - -#variable to handle the window information -my $window = ""; -my $windowNumber = 0; -my $totalWindowsNumber = 0; -my $upstreamAndDownstreamFlankingSequencesWindows = ""; - -#variable to handle the motif information -my $motif = ""; -my $motifName = ""; -my $motifSequence = ""; -my $motifNumber = 0; -my $totalMotifsNumber = 0; -my $upstreamMotifFrequencyCounter = 0; -my $downstreamMotifFrequencyCounter = 0; - -#arrays to sotre window and motif data -my @windowsArray = (); -my @motifNamesArray = (); -my @motifSequencesArray = (); - -#variable to handle the indel information -my $indelIndex = 0; - -#variable to store line counter value -my $lineCounter = 0; - -# check to make sure having correct files -my $usage = "usage: compute_motif_frequencies_for_all_motifs.pl [TABULAR.in] [TABULAR.in] [windowSize] [TABULAR.out] \n"; -die $usage unless @ARGV == 4; - -#get the input arguments -my $motifsInputFile = $ARGV[0]; -my $indelFlankingSequencesWindowsInputFile = $ARGV[1]; -my $numberOfConsideredWindows = $ARGV[2]; -my $motifFrequenciesOutputFile = $ARGV[3]; - -#open the input files -open (INPUT1, "<", $motifsInputFile) || die("Could not open file $motifsInputFile \n"); -open (INPUT2, "<", $indelFlankingSequencesWindowsInputFile) || die("Could not open file indelFlankingSequencesWindowsInputFile \n"); -open (OUTPUT, ">", $motifFrequenciesOutputFile) || die("Could not open file $motifFrequenciesOutputFile \n"); - -#store the motifs input file in the array @motifsData -my @motifsData = <INPUT1>; - -#iterated through the motifs (lines) of the motifs input file -foreach $motif (@motifsData){ - chomp ($motif); - #print ($motif . "\n"); - - #split the motif data into its name and its sequence - my @motifNameAndSequenceArray = split(/\t/, $motif); - - #store the name of the motif into the array @motifNamesArray - push @motifNamesArray, $motifNameAndSequenceArray[0]; - - #store the sequence of the motif into the array @motifSequencesArray - push @motifSequencesArray, $motifNameAndSequenceArray[1]; -} - -#compute the size of the motif names array -$totalMotifsNumber = @motifNamesArray; - - -#store the first output file containing the windows of both upstream and downstream flanking sequences in the array @windowsData -my @windowsData = <INPUT2>; - -#check if the number of considered window entered by the user is 0 or negative, if so make it equal to 1 -if ($numberOfConsideredWindows <= 0){ - $numberOfConsideredWindows = 1; -} - -#iterated through the motif sequences to check their occurrences in the considered windows -#and store the count of their occurrences in the corresponding ouput file -for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){ - - #get the motif name - $motifName = $motifNamesArray[$motifNumber]; - - #get the motif sequence - $motifSequence = $motifSequencesArray[$motifNumber]; - - #iterated through the lines of the second input file. 
Each line represents - #the windows of the upstream and downstream flanking sequences of an indel - foreach $upstreamAndDownstreamFlankingSequencesWindows (@windowsData){ - - chomp ($upstreamAndDownstreamFlankingSequencesWindows); - $lineCounter++; - - #split both upstream and downstream flanking sequences into their windows - my @windowsArray = split(/\t/, $upstreamAndDownstreamFlankingSequencesWindows); - - if ($lineCounter == 1){ - $totalWindowsNumber = @windowsArray; - $indelIndex = ($totalWindowsNumber - 1)/2; - } - - #reset the motif frequency counters - $upstreamMotifFrequencyCounter = 0; - $downstreamMotifFrequencyCounter = 0; - - #iterate through the considered windows of the upstream flanking sequence and increment the motif frequency counter - for ($windowNumber = $indelIndex - 1; $windowNumber > $indelIndex - $numberOfConsideredWindows - 1; $windowNumber--){ - - #get the window - $window = $windowsArray[$windowNumber]; - - #if the motif is found in the window, then increment its corresponding counter - if ($window =~ m/$motifSequence/i){ - $upstreamMotifFrequencyCounter++; - } - } - - #iterate through the considered windows of the upstream flanking sequence and increment the motif frequency counter - for ($windowNumber = $indelIndex + 1; $windowNumber < $indelIndex + $numberOfConsideredWindows + 1; $windowNumber++){ - - #get the window - $window = $windowsArray[$windowNumber]; - - #if the motif is found in the window, then increment its corresponding counter - if ($window =~ m/$motifSequence/i){ - $downstreamMotifFrequencyCounter++; - } - } - - #store the result into the output file of the motif - print OUTPUT $motifName . "\t" . $upstreamMotifFrequencyCounter . "\t" . $downstreamMotifFrequencyCounter . "\n"; - } -} - -#close the input and output files -close(OUTPUT); -close(INPUT2); -close(INPUT1); \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/compute_motif_frequencies_for_all_motifs.xml --- a/tools/regVariation/compute_motif_frequencies_for_all_motifs.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ -<tool id="compute_motif_frequencies_for_all_motifs" name="Compute Motif Frequencies For All Motifs" version="1.0.0"> - <description>motif by motif</description> - - <command interpreter="perl"> - compute_motif_frequencies_for_all_motifs.pl $inputFile1 $inputFile2 $inputWindowSize3 $outputFile1 - </command> - - <inputs> - <param format="tabular" name="inputFile1" type="data" label="Select the motifs file"/> - <param format="tabular" name="inputFile2" type="data" label="Select the indel flanking sequences windows file"/> - <param type="integer" name="inputWindowSize3" size="6" value="0" label="What is the number of 10bp windows in which the motif frequencies will be computed?" help="'0' = one window only"/> - </inputs> - - <outputs> - <data format="tabular" name="outputFile1"/> - </outputs> - - <tests> - <test> - <param name="inputFile1" value="motifs2.tabular" /> - <param name="inputFile2" value="flankingSequencesWindows10_2.tabular" /> - <param name="inputWindowSize3" value="0" /> - <output name="outputFile1" file="motifFrequencies_every_indels0.tabular" /> - </test> - - <test> - <param name="inputFile1" value="motifs2.tabular" /> - <param name="inputFile2" value="flankingSequencesWindows10_2.tabular" /> - <param name="inputWindowSize3" value="4" /> - <output name="outputFile1" file="motifFrequencies_every_indels4.tabular" /> - </test> - </tests> - - <help> - -.. 
class:: infomark
-
-**What it does**
-
-This program computes the frequencies of each motif at a window size, determined by the user, in both upstream and downstream sequences flanking indels in all chromosomes.
-
-- The first input is a TABULAR format file containing the motif names and sequences, one line per motif, such that the file consists of two columns:
-
- - The left column represents the motif names
- - The right column represents the motif sequence, as follows::
-
- dnaPolPauseFrameshift1 GAG
- dnaPolPauseFrameshift2 ACG
- xSites1 CCG
-
-- The second input is a TABULAR format file representing the windows of both upstream and downstream flanking sequences. It consists of multiple left columns representing the windows of the upstream flanking sequences, followed by one column representing the indels, then followed by multiple right columns representing the windows of the downstream flanking sequences, as follows::
-
- cgaggtcagg agatcgagac catcctggct aacatggtga aatcccgtct ctactaaaaa indel aaatttatat ttataaacaa ttttaataca cctatgttta ttatacattt
- GCCAGTTTAT GGTCTAACAA GGAGAGAAAC AGGGGGCTGA AGGGGTTTCT TAACCTCCAG indel TTCCGGGCTC TGTCCCTAAC CCCCAGCTAG GTAAGTGGCA AAGCACTTCT
- CAGTGGGACC AAGCACTGAA CCACTTTGGG GAGAATCTCA CACTGGGGCC CTCTGACACC indel tatatatttt tttttttttt tttttttttt tttttttttg agatggtgtc
- AGAGCAGCAG CACCCACTTT TGCAGTGTGT GACGTTGGTG GAGCCATCGA AGTCTGTGCT indel GAGCCCTCCC CAGTGCTCCG AGGAGCTGCT GTTCCCCCTG GAGCTCAGAA
-
-- The third input is an integer number representing the number of windows to be considered, starting from the indel and moving leftward for the upstream flanking sequence, and starting from the indel and moving rightward for the downstream flanking sequence.
-
-- The output is a TABULAR format file consisting of three columns:
-
- - The left column represents the motif name
- - The middle column represents the motif frequency in the specified windows of the upstream sequence flanking an indel
- - The right column represents the motif frequency in the specified windows of the downstream sequence flanking an indel
-
- There is one line per indel in the output file, such that the total number of lines in the output file = number of motifs x number of indels.
-
-Note: The number of windows entered by the user must be a positive integer >= 1. If a negative integer or 0 is entered by the user, the program will consider it as 1.
-
- </help>
-
-</tool>
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/compute_motifs_frequency.pl
--- a/tools/regVariation/compute_motifs_frequency.pl Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,252 +0,0 @@
-#!/usr/bin/perl -w
-
-# a program to compute the frequency of each motif at each window in both upstream and downstream sequences flanking indels
-# in a chromosome/genome.
-# the first input is a TABULAR format file containing the motif names and sequences, such that the file consists of two
-# columns: the left column represents the motif names and the right column represents the motif sequence, one line per motif.
-# the second input is a TABULAR format file containing the upstream and downstream sequences flanking indels, one line per indel.
-# the third input is an integer number representing the window size according to which the upstream and downstream sequences
-# flanking each indel will be divided.
-# the first output is a TABULAR format file containing the windows into which both upstream and downstream sequences flanking
-# indels are divided.
-# the second output is a TABULAR format file containing the motifs and their corresponding frequencies at each window in both -# upstream and downstream sequences flanking indels, one line per motif. - -use strict; -use warnings; - -#variable to handle the falnking sequences information -my $sequence = ""; -my $upstreamFlankingSequence = ""; -my $downstreamFlankingSequence = ""; -my $discardedSequenceLength = 0; -my $lengthOfDownstreamFlankingSequenceAfterTrimming = 0; - -#variable to handle the window information -my $window = ""; -my $windowStartIndex = 0; -my $windowNumber = 0; -my $totalWindowsNumber = 0; -my $totalNumberOfWindowsInUpstreamSequence = 0; -my $totalNumberOfWindowsInDownstreamSequence = 0; -my $totalWindowsNumberInBothFlankingSequences = 0; -my $totalWindowsNumberInMotifCountersTwoDimArray = 0; -my $upstreamAndDownstreamFlankingSequencesWindows = ""; - -#variable to handle the motif information -my $motif = ""; -my $motifSequence = ""; -my $motifNumber = 0; -my $totalMotifsNumber = 0; - -#arrays to sotre window and motif data -my @windowsArray = (); -my @motifNamesArray = (); -my @motifSequencesArray = (); -my @motifCountersTwoDimArray = (); - -#variables to store line counter values -my $lineCounter1 = 0; -my $lineCounter2 = 0; - -# check to make sure having correct files -my $usage = "usage: compute_motifs_frequency.pl [TABULAR.in] [TABULAR.in] [windowSize] [TABULAR.out] [TABULAR.out]\n"; -die $usage unless @ARGV == 5; - -#get the input and output arguments -my $motifsInputFile = $ARGV[0]; -my $indelFlankingSequencesInputFile = $ARGV[1]; -my $windowSize = $ARGV[2]; -my $indelFlankingSequencesWindowsOutputFile = $ARGV[3]; -my $motifFrequenciesOutputFile = $ARGV[4]; - -#open the input and output files -open (INPUT1, "<", $motifsInputFile) || die("Could not open file $motifsInputFile \n"); -open (INPUT2, "<", $indelFlankingSequencesInputFile) || die("Could not open file $indelFlankingSequencesInputFile \n"); -open (OUTPUT1, ">", $indelFlankingSequencesWindowsOutputFile) || die("Could not open file $indelFlankingSequencesWindowsOutputFile \n"); -open (OUTPUT2, ">", $motifFrequenciesOutputFile) || die("Could not open file $motifFrequenciesOutputFile \n"); - -#store the motifs input file in the array @motifsData -my @motifsData = <INPUT1>; - -#iterated through the motifs (lines) of the motifs input file -foreach $motif (@motifsData){ - chomp ($motif); - #print ($motif . 
"\n"); - - #split the motif data into its name and its sequence - my @motifNameAndSequenceArray = split(/\t/, $motif); - - #store the name of the motif into the array @motifNamesArray - push @motifNamesArray, $motifNameAndSequenceArray[0]; - - #store the sequence of the motif into the array @motifSequencesArray - push @motifSequencesArray, $motifNameAndSequenceArray[1]; -} - -#compute the size of the motif names array -$totalMotifsNumber = @motifNamesArray; - -#store the input file in the array @sequencesData -my @sequencesData = <INPUT2>; - -#iterated through the sequences of the second input file in order to create windwos file -foreach $sequence (@sequencesData){ - chomp ($sequence); - $lineCounter1++; - - my @indelAndSequenceArray = split(/\t/, $sequence); - - #get the upstream falnking sequence - $upstreamFlankingSequence = $indelAndSequenceArray[3]; - - #if the window size is 0, then the whole upstream will be one window only - if ($windowSize == 0){ - $totalNumberOfWindowsInUpstreamSequence = 1; - $windowSize = length ($upstreamFlankingSequence); - } - else{ - #compute the total number of windows into which the upstream flanking sequence will be divided - $totalNumberOfWindowsInUpstreamSequence = length ($upstreamFlankingSequence) / $windowSize; - - #compute the length of the subsequence to be discared from the upstream flanking sequence if any - $discardedSequenceLength = length ($upstreamFlankingSequence) % $windowSize; - - #check if the sequence could be split into windows of equal sizes - if ($discardedSequenceLength != 0) { - #trim the upstream flanking sequence - $upstreamFlankingSequence = substr($upstreamFlankingSequence, $discardedSequenceLength); - } - } - - #split the upstream flanking sequence into windows - for ($windowNumber = 0; $windowNumber < $totalNumberOfWindowsInUpstreamSequence; $windowNumber++){ - $windowStartIndex = $windowNumber * $windowSize; - print OUTPUT1 (substr($upstreamFlankingSequence, $windowStartIndex, $windowSize) . "\t"); - } - - #add a column representing the indel - print OUTPUT1 ("indel" . "\t"); - - #get the downstream falnking sequence - $downstreamFlankingSequence = $indelAndSequenceArray[4]; - - #if the window size is 0, then the whole upstream will be one window only - if ($windowSize == 0){ - $totalNumberOfWindowsInDownstreamSequence = 1; - $windowSize = length ($downstreamFlankingSequence); - } - else{ - #compute the total number of windows into which the downstream flanking sequence will be divided - $totalNumberOfWindowsInDownstreamSequence = length ($downstreamFlankingSequence) / $windowSize; - - #compute the length of the subsequence to be discared from the upstream flanking sequence if any - $discardedSequenceLength = length ($downstreamFlankingSequence) % $windowSize; - - #check if the sequence could be split into windows of equal sizes - if ($discardedSequenceLength != 0) { - #compute the length of the sequence to be discarded - $lengthOfDownstreamFlankingSequenceAfterTrimming = length ($downstreamFlankingSequence) - $discardedSequenceLength; - - #trim the downstream flanking sequence - $downstreamFlankingSequence = substr($downstreamFlankingSequence, 0, $lengthOfDownstreamFlankingSequenceAfterTrimming); - } - } - - #split the downstream flanking sequence into windows - for ($windowNumber = 0; $windowNumber < $totalNumberOfWindowsInDownstreamSequence; $windowNumber++){ - $windowStartIndex = $windowNumber * $windowSize; - print OUTPUT1 (substr($downstreamFlankingSequence, $windowStartIndex, $windowSize) . 
"\t"); - } - - print OUTPUT1 ("\n"); -} - -#compute the total number of windows on both upstream and downstream sequences flanking the indel -$totalWindowsNumberInBothFlankingSequences = $totalNumberOfWindowsInUpstreamSequence + $totalNumberOfWindowsInDownstreamSequence; - -#add an additional cell to store the name of the motif and another one for the indel itself -$totalWindowsNumberInMotifCountersTwoDimArray = $totalWindowsNumberInBothFlankingSequences + 1 + 1; - -#initialize the two dimensional array $motifCountersTwoDimArray. the first column will be initialized with motif names -for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){ - - for ($windowNumber = 0; $windowNumber < $totalWindowsNumberInMotifCountersTwoDimArray; $windowNumber++){ - - if ($windowNumber == 0){ - $motifCountersTwoDimArray [$motifNumber] [0] = $motifNamesArray[$motifNumber]; - } - elsif ($windowNumber == $totalNumberOfWindowsInUpstreamSequence + 1){ - $motifCountersTwoDimArray [$motifNumber] [$windowNumber] = "indel"; - } - else{ - $motifCountersTwoDimArray [$motifNumber] [$windowNumber] = 0; - } - } -} - -close(OUTPUT1); - -#open the file the contains the windows of the upstream and downstream flanking sequences, which is actually the first output file -open (INPUT3, "<", $indelFlankingSequencesWindowsOutputFile) || die("Could not open file $indelFlankingSequencesWindowsOutputFile \n"); - -#store the first output file containing the windows of both upstream and downstream flanking sequences in the array @windowsData -my @windowsData = <INPUT3>; - -#iterated through the lines of the first output file. Each line represents -#the windows of the upstream and downstream flanking sequences of an indel -foreach $upstreamAndDownstreamFlankingSequencesWindows (@windowsData){ - - chomp ($upstreamAndDownstreamFlankingSequencesWindows); - $lineCounter2++; - - #split both upstream and downstream flanking sequences into their windows - my @windowsArray = split(/\t/, $upstreamAndDownstreamFlankingSequencesWindows); - - $totalWindowsNumber = @windowsArray; - - #iterate through the windows to search for matched motifs and increment their corresponding counters accordingly - WINDOWS: - for ($windowNumber = 0; $windowNumber < $totalWindowsNumber; $windowNumber++){ - - #get the window - $window = $windowsArray[$windowNumber]; - - #if the window is the one that contains the indel, then skip the indel window - if ($window eq "indel") { - next WINDOWS; - } - else{ #iterated through the motif sequences to check their occurrences in the winodw - #and increment their corresponding counters accordingly - - for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){ - #get the motif sequence - $motifSequence = $motifSequencesArray[$motifNumber]; - - #if the motif is found in the window, then increment its corresponding counter - if ($window =~ m/$motifSequence/i){ - $motifCountersTwoDimArray [$motifNumber] [$windowNumber + 1]++; - } - } - } - } -} - -#store the motif counters values in the second output file -for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){ - - for ($windowNumber = 0; $windowNumber <= $totalWindowsNumber; $windowNumber++){ - - print OUTPUT2 $motifCountersTwoDimArray [$motifNumber] [$windowNumber] . "\t"; - #print ($motifCountersTwoDimArray [$motifNumber] [$windowNumber] . 
" "); - } - print OUTPUT2 "\n"; - #print ("\n"); -} - -#close the input and output files -close(OUTPUT2); -close(OUTPUT1); -close(INPUT3); -close(INPUT2); -close(INPUT1); \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/compute_motifs_frequency.xml --- a/tools/regVariation/compute_motifs_frequency.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,109 +0,0 @@ -<tool id="compute_motifs_frequency" name="Compute Motif Frequencies" version="1.0.0"> - <description>in indel flanking regions</description> - - - <command interpreter="perl"> - compute_motifs_frequency.pl $inputFile1 $inputFile2 $inputNumber3 $outputFile1 $outputFile2 - </command> - - - <inputs> - - <param format="tabular" name="inputFile1" type="data" label="Select motifs file"/> - - <param format="tabular" name="inputFile2" type="data" label="Select indel flanking regions file from your history"/> - - <param type="integer" name="inputNumber3" size="5" value="0" label="What is the size of each window?" help="'0' = all the upstream flanking sequence will be one window only, and the same for the downstream flanking sequence."/> - - </inputs> - - - <outputs> - <data format="tabular" name="outputFile1"/> - <data format="tabular" name="outputFile2"/> - </outputs> - - <tests> - <test> - <param name="inputFile1" value="motifs1.tabular" /> - <param name="inputFile2" value="indelsFlankingSequences1.tabular" /> - <param name="inputNumber3" value="0" /> - <output name="outputFile1" file="flankingSequencesWindows0.tabular" /> - <output name="outputFile2" file="motifFrequencies0.tabular" /> - </test> - - <test> - <param name="inputFile1" value="motifs1.tabular" /> - <param name="inputFile2" value="indelsFlankingSequences1.tabular" /> - <param name="inputNumber3" value="10" /> - <output name="outputFile1" file="flankingSequencesWindows10.tabular" /> - <output name="outputFile2" file="motifFrequencies10.tabular" /> - </test> - </tests> - - - <help> - -.. class:: infomark - -**What it does** - -This program computes the frequency of motifs in the flanking regions of indels found in a chromosome or a genome. -Each indel has an upstream flanking sequence and a downstream flanking one. Each of the upstream and downstream flanking -sequences will be divided into a certain number of windows according to the window size input by the user. -The frequency of a motif in a certain window in one of the two flanking sequences is the total sum of occurrences of -that motif in that window of that flanking sequence over all indels. The indel flanking regions file will be taken -from your history or it will be uploaded, whereas the motifs file should be uploaded. 
-
-- The first input file is the motifs file, a tabular file consisting of two columns:
-
- - the first column represents the motif name
- - the second column represents the motif sequence, as follows::
-
- dnaPolPauseFrameshift1 GAG
- dnaPolPauseFrameshift2 ACG
- xSites1 CCG
-
-- The second input file is the indel flanking regions file, a tabular file consisting of five columns:
-
- - the first column represents the indel start coordinate
- - the second column represents the indel end coordinate
- - the third column represents the indel length
- - the fourth column represents the upstream flanking sequence
- - the fifth column represents the downstream flanking sequence, as follows::
-
- 16694766 16694768 3 GTGGGTCCTGCCCAGCCTCTGCCTCAGAGGGAAGAGTAGAGAACTGGG AGAGCAGGTCCTTAGGGAGCCCGAGGAAGTCCCTGACGCCAGCTGTTCTCGCGGACGAA
- 25169542 25169545 4 caagcccacaagccttcagaccatagcaCGGGCTCCAGAGGTGTGAGG CAGGTCAGGTGCTTTAGAAGTCAAAAACTCTCAGTAAGGCAAATCACCCCCTATCTCCT
- 41929580 41929585 6 ggctgtcgtatggaatctggggctcaggactctgtcccatttctctaa accattctgcTTCAACCCAGACACTGACTGTTTTCCAAATTTACTTGTTTGTTTGTTTT
-
-
------
-
-.. class:: warningmark
-
-**Notes**
-
-- The lengths of the upstream flanking sequences must be equal for all indels.
-- The lengths of the downstream flanking sequences must be equal for all indels.
-- If the length L of the upstream flanking sequence is not an integer multiple of the window size S, that is, if L = m*S + r, where m is the quotient and r is the remainder, then the upstream flanking sequence will be divided into m windows only, starting from the indel, and the remaining r bases will not be considered. The same rule applies to the downstream flanking sequence.
-
------
-
-The **output** of this program is two files:
-
-- The first output file is a tabular file and represents the windows of both upstream and downstream flanking sequences. It consists of multiple left columns representing the windows of the upstream flanking sequence, followed by one column representing the indels, then followed by multiple right columns representing the windows of the downstream flanking sequence, as follows::
-
- cgaggtcagg agatcgagac catcctggct aacatggtga aatcccgtct ctactaaaaa indel aaatttatat ttataaacaa ttttaataca cctatgttta ttatacattt
- GCCAGTTTAT GGTCTAACAA GGAGAGAAAC AGGGGGCTGA AGGGGTTTCT TAACCTCCAG indel TTCCGGGCTC TGTCCCTAAC CCCCAGCTAG GTAAGTGGCA AAGCACTTCT
- CAGTGGGACC AAGCACTGAA CCACTTTGGG GAGAATCTCA CACTGGGGCC CTCTGACACC indel tatatatttt tttttttttt tttttttttt tttttttttg agatggtgtc
- AGAGCAGCAG CACCCACTTT TGCAGTGTGT GACGTTGGTG GAGCCATCGA AGTCTGTGCT indel GAGCCCTCCC CAGTGCTCCG AGGAGCTGCT GTTCCCCCTG GAGCTCAGAA
-
-- The second output file is a tabular file and represents the motif frequencies in every window of every flanking sequence. The first column on the left represents the names of motifs. The other columns represent the frequencies of motifs in the windows that correspond to the ones in the first output file, as follows::
-
- dnaPolPauseFrameshift1 2 3 1 0 1 2 indel 0 2 2 1 3
- dnaPolPauseFrameshift2 2 3 1 0 1 2 indel 0 2 2 1 3
- xSites1 3 2 0 1 1 2 indel 1 1 3 2 3
-
- </help>
-
-</tool>
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/compute_q_values.pl
--- a/tools/regVariation/compute_q_values.pl Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,95 +0,0 @@
-# A program to compute the q-values based on the p-values of multiple simultaneous tests.
-# The q-valules are computed using a specific R package created by John Storey called "qvalue". -# The input is a TABULAR format file consisting of one column only that represents the p-values -# of multiple simultaneous tests, one line for every p-value. -# The first output is a TABULAR format file consisting of one column only that represents the q-values -# corresponding to p-values, one line for every q-value. -# the second output is a TABULAR format file consisting of three pages: the first page represents -# the p-values histogram, the second page represents the q-values histogram, and the third page represents -# the four Q-plots as introduced in the "qvalue" package manual. - -use strict; -use warnings; -use IO::Handle; -use File::Temp qw/ tempfile tempdir /; -my $tdir = tempdir( CLEANUP => 0 ); - -# check to make sure having correct input and output files -my $usage = "usage: compute_q_values.pl [TABULAR.in] [lambda] [pi0_method] [fdr_level] [robust] [TABULAR.out] [PDF.out] \n"; -die $usage unless @ARGV == 7; - -#get the input arguments -my $p_valuesInputFile = $ARGV[0]; -my $lambdaValue = $ARGV[1]; -my $pi0_method = $ARGV[2]; -my $fdr_level = $ARGV[3]; -my $robustValue = $ARGV[4]; -my $q_valuesOutputFile = $ARGV[5]; -my $p_q_values_histograms_QPlotsFile = $ARGV[6]; - -if($lambdaValue =~ /sequence/){ - $lambdaValue = "seq(0, 0.95, 0.05)"; -} - -#open the input files -open (INPUT, "<", $p_valuesInputFile) || die("Could not open file $p_valuesInputFile \n"); -open (OUTPUT1, ">", $q_valuesOutputFile) || die("Could not open file $q_valuesOutputFile \n"); -open (OUTPUT2, ">", $p_q_values_histograms_QPlotsFile) || die("Could not open file $p_q_values_histograms_QPlotsFile \n"); -#open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); - -#save all error messages into the error file $errorFile using the error file handle ERROR -#STDERR -> fdopen( \*ERROR, "w" ) or die ("Could not direct errors to the error file error.txt \n"); - -#warn "Hello Error File \n"; - -#variable to store the name of the R script file -my $r_script; - -# R script to implement the calcualtion of q-values based on multiple simultaneous tests p-values -# construct an R script file and save it in a temp directory -chdir $tdir; -$r_script = "q_values_computation.r"; - -open(Rcmd,">", $r_script) or die "Cannot open $r_script \n\n"; -print Rcmd " - #options(show.error.messages = FALSE); - - #load necessary packages - suppressPackageStartupMessages(library(tcltk)); - library(qvalue); - - #read the p-values of the multiple simultaneous tests from the input file $p_valuesInputFile - p <- scan(\"$p_valuesInputFile\", quiet = TRUE); - - #compute the q-values that correspond to the p-values of the multiple simultaneous tests - qobj <- qvalue(p, pi0.meth = \"$pi0_method\", lambda = $lambdaValue, fdr.level = $fdr_level, robust = $robustValue); - #qobj <- qvalue(p, pi0.meth = \"smoother\", lambda = seq(0, 0.95, 0.05), fdr.level = 0.05); - #qobj <- qvalue(p, pi0.meth = \"bootstrap\", fdr.level = 0.05); - - #draw the p-values histogram, the q-values histogram, and the four Q-plots - # and save them on multiple pages of the output file $p_q_values_histograms_QPlotsFile - pdf(file = \"$p_q_values_histograms_QPlotsFile\", width = 6.25, height = 6, family = \"Times\", pointsize = 12, onefile = TRUE) - hist(qobj\$pvalues); - #dev.off(); - - hist(qobj\$qvalues); - #dev.off(); - - qplot(qobj); - dev.off(); - - #save the q-values in the output file $q_valuesOutputFile - qobj\$pi0 <- signif(qobj\$pi0,digits=6) - 
qwrite(qobj, filename=\"$q_valuesOutputFile\");
-
- #options(show.error.messages = TRUE);
- #eof\n";
-close Rcmd;
-
-system("R --no-restore --no-save --no-readline < $r_script > $r_script.out");
-
-#close the input, output, and error files
-#close(ERROR);
-close(OUTPUT2);
-close(OUTPUT1);
-close(INPUT);
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/compute_q_values.xml
--- a/tools/regVariation/compute_q_values.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,155 +0,0 @@
-<tool id="compute_q_values" name="Compute q-values" version="1.0.1">
- <description>based on the p-values of multiple simultaneous tests</description>
-
- <command interpreter="perl">
- compute_q_values.pl $inputFile1 $inputLambda2 $inputPI0_method3 $inputFDR_level4 $inputRobust5 $outputFile1 $outputFile2
- </command>
-
- <inputs>
- <param format="tabular" name="inputFile1" type="data" label="Select the p-values file"/>
-
- <param type="text" name="inputLambda2" size="100" value="sequence_from_0_to_0.95_increment_0.05" label="What is the lambda value?" help="Either choose the default sequence or one decimal value between 0 and 1"/>
-
- <param name="inputPI0_method3" type="select" label="Choose the pi0 estimation method:">
- <option value="smoother">smoother</option>
- <option value="bootstrap">bootstrap</option>
- </param>
-
- <param type="float" name="inputFDR_level4" size="5" value="" label="What is the FDR level?" help="The FDR level must be between 0 and 1"/>
-
- <param name="inputRobust5" type="select" label="Do you want to make the estimate more robust:" help="Choose TRUE for small p-values">
- <option value="FALSE">FALSE</option>
- <option value="TRUE">TRUE</option>
- </param>
- </inputs>
-
- <outputs>
- <data format="tabular" name="outputFile1"/>
- <data format="pdf" name="outputFile2"/>
- </outputs>
-
- <tests>
- <test>
- <param name="inputFile1" value="p_values.tabular" ftype="tabular" />
- <param name="inputLambda2" value="sequence_from_0_to_0.95_increment_0.05" />
- <param name="inputPI0_method3" value="smoother" />
- <param name="inputFDR_level4" value="0.05" />
- <param name="inputRobust5" value="FALSE" />
- <output name="outputFile1" file="q_values.tabular" />
- <output name="outputFile2" file="p_q_hists_Q_plots.pdf" />
- </test>
- </tests>
-
- <help>
-
-.. class:: infomark
-
-**What it does**
-
-This program computes the q-values based on the p-values of multiple simultaneous tests. The q-values are computed using a specific R package, created by John Storey and Alan Dabney, called "qvalue". The program takes five inputs:
-
-- The first input is a TABULAR format file consisting of one column only that represents the p-values of multiple simultaneous tests, one line for every p-value.
-- The second input is the lambda parameter. The user can choose either the default: seq(0, 0.95, 0.05) or a decimal number between 0.0 and 1.0.
-- The third input is the pi0 estimation method, which is either "smoother" or "bootstrap".
-- The fourth input is the FDR (false discovery rate) level, which is a decimal number between 0.0 and 1.0.
-- The fifth input is either TRUE or FALSE for the estimate robustness (a simplified sketch of the quantity being computed follows this list).
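-
-The quantity being computed can be sketched in a few lines of Perl (a simplification that takes pi0 = 1, which reduces Storey's q-value to the Benjamini-Hochberg step-up value; the actual tool estimates pi0 with the "qvalue" R package)::
-
-    my @p = sort { $a <=> $b } (0.0001, 0.013, 0.14, 0.43);    # sorted p-values
-    my $m = scalar @p;
-    my @q;
-    my $running_min = 1;
-    for (my $i = $m - 1; $i >= 0; $i--) {    # walk from the largest p-value down
-        my $bh = $p[$i] * $m / ($i + 1);     # p * m / rank
-        $running_min = $bh if $bh < $running_min;
-        $q[$i] = $running_min;               # q-values are monotone in p
-    }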
- -The program gives two outputs: - -- The first output is a TABULAR format file consisting of three columns: - - - the left column represents the p-values of multiple simultaneous tests, one line for every p-value - - the middle column represents the q-values corresponding to the p-values - - the third column represent the significance values, either 1 for significant or 0 for non-significant - -- The second output is a PDF format file consisting of three pages: - - - the first page represents the p-values histogram - - the second page represents the q-values histogram - - the third page represents the four Q-plots as introduced in the "qvalue" package manual. - - -**Example** - -Let us have the first input file of p-values as follows:: - - 0.140627492 - 0.432249886 - 0.122120877 - 0.142010182 - 0.012909858 - 0.000142807 - 0.039841941 - 0.035173303 - 0.011340057 - 1.01E-05 - 0.212738282 - 0.091256284 - 0.547375415 - 0.189589833 - 6.18E-12 - 0.001235875 - 1.10E-05 - 9.75E-07 - 2.13E-18 - 2.54E-16 - 1.20E-19 - 9.76E-14 - 0.359181534 - 0.03661672 - 0.400459987 - 0.387436466 - 0.342075061 - 0.904129283 - 0.031152635 - -Running the program will give the following output:: - - pi0: 0.140311054 - - FDR level: 0.05 - - p-value q-value significant - 0.1406275 0.02889212 1 - 0.4322499 0.06514199 0 - 0.1221209 0.02760624 1 - 0.1420102 0.02889212 1 - 0.01290986 0.00437754 1 - 0.000142807 6.46E-05 1 - 0.03984194 0.01013235 1 - 0.0351733 0.009932946 1 - 0.01134006 0.004194811 1 - 1.01E-05 5.59E-06 1 - 0.2127383 0.03934711 1 - 0.09125628 0.02184257 1 - 0.5473754 0.07954578 0 - 0.1895898 0.03673547 1 - 6.18E-12 5.03E-12 1 - 0.001235875 0.00050288 1 - 1.10E-05 5.59E-06 1 - 9.75E-07 6.61E-07 1 - 2.13E-18 4.33E-18 1 - 2.54E-16 3.45E-16 1 - 1.20E-19 4.88E-19 1 - 9.76E-14 9.93E-14 1 - 0.3591815 0.06089654 0 - 0.03661672 0.009932946 1 - 0.40046 0.0626723 0 - 0.3874365 0.0626723 0 - 0.3420751 0.06051785 0 - 0.9041293 0.1268593 0 - 0.03115264 0.009750824 1 - - -.. image:: ./static/operation_icons/p_hist.png - - -.. image:: ./static/operation_icons/q_hist.png - - -.. image:: ./static/operation_icons/Q_plots.png - - - </help> - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/delete_overlapping_indels.pl --- a/tools/regVariation/delete_overlapping_indels.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,94 +0,0 @@ -#!/usr/bin/perl -w - -# This program detects overlapping indels in a chromosome and keeps all non-overlapping indels. As for overlapping indels, -# the first encountered one is kept and all others are removed. It requires three inputs: -# The first input is a TABULAR format file containing coordinates of indels in blocks extracted from multi-alignment. -# The second input is an integer number representing the number of the column where indel start coordinates are stored in the input file. -# The third input is an integer number representing the number of the column where indel end coordinates are stored in the input file. -# The output is a TABULAR format file containing all non-overlapping indels in the input file, and the first encountered indel of overlapping ones. -# Note: The number of the first column is 1. 
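-
-# A minimal illustration of the pairwise test applied below, using two of the
-# overlapping insertions from the help example (inclusive coordinates; the
-# loop below checks whether either endpoint of the second indel falls inside
-# the first):
-{
-    my ($s1, $e1) = (14508610, 14508612);        # kept (first encountered)
-    my ($s2, $e2) = (14508610, 14508616);        # overlaps, so it is removed
-    my $overlap = ($s1 <= $e2 && $s2 <= $e1);    # true for this pair
-}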
- -use strict; -use warnings; - -#varaibles to handle information related to indels -my $indel1 = ""; -my $indel2 = ""; -my @indelArray1 = (); -my @indelArray2 = (); -my $lineCounter1 = 0; -my $lineCounter2 = 0; -my $totalNumberofNonOverlappingIndels = 0; - -# check to make sure having correct files -my $usage = "usage: delete_overlapping_indels.pl [TABULAR.in] [indelStartColumn] [indelEndColumn] [TABULAR.out]\n"; -die $usage unless @ARGV == 4; - -my $inputFile = $ARGV[0]; -my $indelStartColumn = $ARGV[1] - 1; -my $indelEndColumn = $ARGV[2] - 1; -my $outputFile = $ARGV[3]; - -#verifie column numbers -if ($indelStartColumn < 0 ){ - die ("The indel start column number is invalid \n"); -} -if ($indelEndColumn < 0 ){ - die ("The indel end column number is invalid \n"); -} - -#open the input and output files -open (INPUT, "<", $inputFile) || die ("Could not open file $inputFile \n"); -open (OUTPUT, ">", $outputFile) || die ("Could not open file $outputFile \n"); - -#store the input file in the array @rawData -my @indelsRawData = <INPUT>; - -#iterated through the indels of the input file -INDEL1: -foreach $indel1 (@indelsRawData){ - chomp ($indel1); - $lineCounter1++; - - #get the first indel - @indelArray1 = split(/\t/, $indel1); - - #our purpose is to detect overlapping indels and to store one copy of them only in the output file - #all other non-overlapping indels will stored in the output file also - - $lineCounter2 = 0; - - #iterated through the indels of the input file - INDEL2: - foreach $indel2 (@indelsRawData){ - chomp ($indel2); - $lineCounter2++; - - if ($lineCounter2 > $lineCounter1){ - #get the second indel - @indelArray2 = split(/\t/, $indel2); - - #check if the two indels are overlapping - if (($indelArray2[$indelEndColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelEndColumn] <= $indelArray1[$indelEndColumn]) || ($indelArray2[$indelStartColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelStartColumn] <= $indelArray1[$indelEndColumn])){ - #print ("There is an overlap between" . "\n" . $indel1 . "\n" . $indel2 . "\n"); - #print("The two overlapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n\n"); - - #break out of the loop and go back to the outerloop - next INDEL1; - } - else{ - #print("The two non-overlaapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n"); - } - } - } - - print OUTPUT $indel1 . "\n"; - $totalNumberofNonOverlappingIndels++; -} - -#print("The total number of indels is: " . $lineCounter1 . "\n"); -#print("The total number of non-overlapping indels is: " . $totalNumberofNonOverlappingIndels . 
"\n"); - -#close the input and output files -close(OUTPUT); -close(INPUT); \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/delete_overlapping_indels.xml --- a/tools/regVariation/delete_overlapping_indels.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,66 +0,0 @@ -<tool id="delete_overlapping_indels" name="Delete Overlapping Indels" version="1.0.0"> - <description>from a chromosome indels file</description> - - <command interpreter="perl"> - delete_overlapping_indels.pl $inputFile1 $inputIndelStartColumnNumber2 $inputIndelEndColumnNumber3 $outputFile1 - </command> - - <inputs> - <param format="tabular" name="inputFile1" type="data" label="Select indels file"/> - <param type="data_column" name="inputIndelStartColumnNumber2" data_ref="inputFile1" accept_default="true" label="Choose the indel start coordinate column number" /> - <param type="data_column" name="inputIndelEndColumnNumber3" data_ref="inputFile1" accept_default="true" label="Choose the the indel end coordinate column number" /> - </inputs> - - <outputs> - <data format="tabular" name="outputFile1"/> - </outputs> - - <tests> - <test> - <param name="inputFile1" value="indels1.tabular" /> - <param name="inputIndelStartColumnNumber2" value="5" /> - <param name="inputIndelEndColumnNumber3" value="6" /> - <output name="outputFile1" file="non_overlapping_indels1.tabular" /> - </test> - </tests> - - <help> - -.. class:: infomark - -**What it does** - -This program detects overlapping indels in a chromosome and keeps all non-overlapping indels. As for overlapping indels, the first encountered one is kept and all others are removed. -It requires three inputs: - -- The first input is a TABULAR format file containing coordinates of indels in blocks extracted from multi-alignment. -- The second input is an integer number representing the number of the column where indel start coordinates are stored in the input file. -- The third input is an integer number representing the number of the column where indel end coordinates are stored in the input file. -- The output is a TABULAR format file containing all non-overlapping indels in the input file, and the first encountered indel of overlapping ones. - -Note: The number of the first column is 1. - - -**Example** - -Let us have the following insertions in the human genome. 
The start and end coordinates of insertions are on columns 5 and 6 respectively:: - - 3 hg18.chr22_insert 3 hg18.chr22 14508610 14508612 3924 - panTro2.chr2b 132518950 132518951 3910 + rheMac2.chr17 14311798 14311799 3896 + - 7 hg18.chr22_insert 13 hg18.chr22 14513678 14513690 348 - panTro2.chr2b 132517876 132517877 321 + rheMac2.chr17 14274462 14274463 337 + - 7 hg18.chr22_insert 6 hg18.chr22 14513688 14513699 348 - panTro2.chr2b 132517879 132517880 321 + rheMac2.chr17 14274465 14274466 337 + - 25 hg18.chr22_insert 9 hg18.chr22 14529501 14529509 385 - panTro2.chr22 14528775 14528776 376 - rheMac2.chr9 42869449 42869450 375 - - 36 hg18.chr22_insert 4 hg18.chr22 14566316 14566319 540 - panTro2.chr2b 132492077 132492078 533 + rheMac2.chr10 59230438 59230439 533 - - 40 hg18.chr22_insert 7 hg18.chr22 14508610 14508616 2337 - panTro2.chr2b 132487750 132487751 2313 + rheMac2.chr10 59128305 59128306 2332 + - 41 hg18.chr22_insert 4 hg18.chr22 14571556 14571559 2483 - panTro2.chr2b 132485878 132485879 2481 + rheMac2.chr10 59126094 59126095 2508 + - -By removing the overlapping indels which, we get:: - - 3 hg18.chr22_insert 3 hg18.chr22 14508610 14508612 3924 - panTro2.chr2b 132518950 132518951 3910 + rheMac2.chr17 14311798 14311799 3896 + - 7 hg18.chr22_insert 13 hg18.chr22 14513678 14513690 348 - panTro2.chr2b 132517876 132517877 321 + rheMac2.chr17 14274462 14274463 337 + - 25 hg18.chr22_insert 9 hg18.chr22 14529501 14529509 385 - panTro2.chr22 14528775 14528776 376 - rheMac2.chr9 42869449 42869450 375 - - 36 hg18.chr22_insert 4 hg18.chr22 14566316 14566319 540 - panTro2.chr2b 132492077 132492078 533 + rheMac2.chr10 59230438 59230439 533 - - 41 hg18.chr22_insert 4 hg18.chr22 14571556 14571559 2483 - panTro2.chr2b 132485878 132485879 2481 + rheMac2.chr10 59126094 59126095 2508 + - - </help> - -</tool> \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/draw_stacked_barplots.pl --- a/tools/regVariation/draw_stacked_barplots.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -#!/usr/bin/perl -w - -# This program draws, in a pdf file, a stacked bars plot for different categories of data and for -# different criteria. For each criterion a stacked bar is drawn, such that the height of each stacked -# sub-bar represents the number of elements in each category satisfying that criterion. -# The input consists of a TABULAR format file, where the left column represents the names of categories -# and the other columns are headed by the names of criteria, such that each data value in the file -# represents the number of elements in a certain category satisfying a certain criterion. -# The output is a PDF file containing a stacked bars plot representing the number of elements in each -# category satisfying each criterion. The drawing is done using R code. 
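-
-# The script below follows a generate-and-run pattern: it writes an R script
-# to a file, then invokes R on it. A minimal sketch of that pattern, with a
-# hypothetical file name and a toy matrix (the real barplot() call is built
-# from the input table):
-{
-    open(DEMO, ">", "demo_plot.r") || die("Could not open file demo_plot.r \n");
-    print DEMO "pdf(file = \"demo.pdf\"); barplot(matrix(1:4, 2)); dev.off();\n";
-    close(DEMO);
-    system("R --no-restore --no-save --no-readline < demo_plot.r > demo_plot.r.out");
-}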
- - -use strict; -use warnings; - -my $criterion; -my @criteriaArray = (); -my $criteriaNumber = 0; -my $lineCounter = 0; - -#variable to store the names of R script file -my $r_script; - -# check to make sure having correct files -my $usage = "usage: draw_stacked_bar_plot.pl [TABULAR.in] [PDF.out] \n"; -die $usage unless @ARGV == 2; - -my $categoriesInputFile = $ARGV[0]; - -my $categories_criteria_bars_plot_outputFile = $ARGV[1]; - -#open the input file -open (INPUT, "<", $categoriesInputFile) || die("Could not open file $categoriesInputFile \n"); -open (OUTPUT, ">", $categories_criteria_bars_plot_outputFile) || die("Could not open file $categories_criteria_bars_plot_outputFile \n"); - -# R script to implement the drawing of a stacked bar plot representing thes significant motifs in each category of motifs -#construct an R script file -$r_script = "motif_significance_bar_plot.r"; -open(Rcmd,">", $r_script) or die "Cannot open $r_script \n\n"; -print Rcmd " - #store the table content of the first file into a matrix - categoriesTable <- read.table(\"$categoriesInputFile\", header = TRUE); - categoriesMatrix <- as.matrix(categoriesTable); - - - #compute the sum of elements in the column with the maximum sum in each matrix - columnSumsVector <- colSums(categoriesMatrix); - maxColumn <- max (columnSumsVector); - - if (maxColumn %% 10 != 0){ - maxColumn <- maxColumn + 10; - } - - plotHeight = maxColumn/8; - criteriaVector <- names(categoriesTable); - - pdf(file = \"$categories_criteria_bars_plot_outputFile\", width = length(criteriaVector), height = plotHeight, family = \"Times\", pointsize = 12, onefile = TRUE); - - - - #draw the first barplot - barplot(categoriesMatrix, ylab = \"No. of elements in each category\", xlab = \"Criteria\", ylim = range(0, maxColumn), col = \"black\", density = c(10, 20, 30, 40, 50, 60, 70, 80), angle = c(45, 90, 135), names.arg = criteriaVector); - - #draw the legend - legendX = 0.2; - legendY = maxColumn; - - legend (legendX, legendY, legend = rownames(categoriesMatrix), density = c(10, 20, 30, 40, 50, 60, 70, 80), angle = c(45, 90, 135)); - - dev.off(); - - #eof\n"; -close Rcmd; -system("R --no-restore --no-save --no-readline < $r_script > $r_script.out"); - -#close the input files -close(OUTPUT); -close(INPUT); diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/draw_stacked_barplots.xml --- a/tools/regVariation/draw_stacked_barplots.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -<tool id="draw_stacked_barplots" name="Draw Stacked Bar Plots" version="1.0.0"> - <description>for different categories and different criteria</description> - - <command interpreter="perl"> - draw_stacked_barplots.pl $inputFile1 $outputFile1 - </command> - - <inputs> - <param format="tabular" name="inputFile1" type="data" label="Select the input file"/> - </inputs> - - <outputs> - <data format="pdf" name="outputFile1"/> - </outputs> - - <tests> - <test> - <param name="inputFile1" value="categories_criteria.tabular" /> - <output name="outputFile1" file="stacked_barplot.pdf" /> - </test> - </tests> - - <help> - -.. class:: infomark - -**What it does** - -This program draws, in a pdf file, a stacked bars plot for different categories of data and for different criteria. For each criterion a stacked bar is -drawn, such that the height of each stacked sub-bar represents the number of elements in each category satisfying that criterion. 
- -- The input consists of a TABULAR format file, where the left column represents the names of categories and the other columns are headed by the names of criteria, such that each data value in the file represents the number of elements in a certain category satisfying a certain criterion. - -- The output is a PDF file containing a stacked bar plot representing the number of elements in each category satisfying each criterion. The drawing is done using R code. - -**Example** - -Suppose the input file represents the number of significant motifs in each motif category for each window size:: - - 10bp 20bp 40bp 80bp 160bp 320bp 640bp 1280bp - Deletion_Hotspots 2 3 4 4 5 6 7 7 - Dna_Pol_Pause/Frameshift_Hotspots 8 10 14 17 18 15 19 20 - Indel_Hotspots 1 1 1 2 1 0 0 0 - Insertion_Hotspots 0 0 1 2 2 2 2 5 - Topoisomerase_Cleavage_Sites 2 3 5 4 3 3 4 4 - Translin_Targets 0 0 2 2 3 3 3 2 - VDJ_Recombination_Signals 0 0 1 1 1 2 2 2 - X-like_Sites 4 4 4 5 6 7 7 10 - - -Running the program will give the following output:: - - The stacked bar plot representing the data in the input file. - -.. image:: ./static/operation_icons/stacked_bars_plot.png - - </help> - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/featureCounter.py --- a/tools/regVariation/featureCounter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,148 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda -""" -Calculate count and coverage of one query on another, and append the Coverage and counts to -the last four columns as bases covered, percent coverage, number of completely present features, number of partially present/overlapping features. - -usage: %prog bed_file_1 bed_file_2 out_file - -1, --cols1=N,N,N,N: Columns for chr, start, end, strand in first file - -2, --cols2=N,N,N,N: Columns for chr, start, end, strand in second file -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals.io import * -from bx.cookbook import doc_optparse -from bx.intervals.operations import quicksect -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def counter(node, start, end): - global full, partial - if node.start <= start and node.maxend > start: - if node.end >= end or (node.start == start and end > node.end > start): - full += 1 - elif end > node.end > start: - partial += 1 - if node.left and node.left.maxend > start: - counter(node.left, start, end) - if node.right: - counter(node.right, start, end) - elif start < node.start < end: - if node.end <= end: - full += 1 - else: - partial += 1 - if node.left and node.left.maxend > start: - counter(node.left, start, end) - if node.right: - counter(node.right, start, end) - else: - if node.left: - counter(node.left, start, end) - -def count_coverage( readers, comments=True ): - primary = readers[0] - secondary = readers[1] - secondary_copy = readers[2] - - rightTree = quicksect.IntervalTree() - for item in secondary: - if type( item ) is GenomicInterval: - rightTree.insert( item, secondary.linenum, item.fields ) - - bitsets = secondary_copy.binned_bitsets() - - global full, partial - - for interval in primary: - if type( interval ) is Header: - yield interval - if type( interval ) is Comment and comments: - yield interval - elif type( interval ) == GenomicInterval: - chrom = interval.chrom - start = int(interval.start) - end = 
int(interval.end) - full = 0 - partial = 0 - if chrom not in bitsets: - bases_covered = 0 - percent = 0.0 - full = 0 - partial = 0 - else: - bases_covered = bitsets[ chrom ].count_range( start, end-start ) - if (end - start) == 0: - percent = 0 - else: - percent = float(bases_covered) / float(end - start) - if bases_covered: - root = rightTree.chroms[chrom] #root node for the chrom tree - counter(root, start, end) - interval.fields.append(str(bases_covered)) - interval.fields.append(str(percent)) - interval.fields.append(str(full)) - interval.fields.append(str(partial)) - yield interval - -def main(): - options, args = doc_optparse.parse( __doc__ ) - - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) - in1_fname, in2_fname, out_fname = args - except: - stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." ) - - g1 = NiceReaderWrapper( fileinput.FileInput( in1_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - g2_copy = NiceReaderWrapper( fileinput.FileInput( in2_fname ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - - - out_file = open( out_fname, "w" ) - - try: - for line in count_coverage([g1,g2,g2_copy]): - if type( line ) is GenomicInterval: - out_file.write( "%s\n" % "\t".join( line.fields ) ) - else: - out_file.write( "%s\n" % line ) - except ParseError, exc: - out_file.close() - fail( str( exc ) ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) - if g2.skipped > 0: - print skipped( g2, filedesc=" of 2nd dataset" ) - elif g2_copy.skipped > 0: - print skipped( g2_copy, filedesc=" of 2nd dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/featureCounter.xml --- a/tools/regVariation/featureCounter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ -<tool id="featureCoverage1" name="Feature coverage" version="2.0.0"> - <description></description> - <command interpreter="python">featureCounter.py $input1 $input2 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol}</command> - <inputs> - <param format="interval" name="input1" type="data" help="First dataset"> - <label>What portion of</label> - </param> - <param format="interval" name="input2" type="data" help="Second dataset"> - <label>is covered by</label> - </param> - </inputs> - <outputs> - <data format="interval" name="output" metadata_source="input1" /> - </outputs> - - <tests> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2.bed" /> - <output name="output" file="6_feature_coverage.bed" /> - </test> - <test> - <param name="input1" value="chrY1.bed" /> - <param name="input2" value="chrY2.bed" /> - <output name="output" file="chrY_Coverage.bed" /> - </test> - </tests> - <help> - -.. 
class:: infomark - -**What it does** - -This tool finds the coverage of intervals in the first dataset on intervals in the second dataset. The coverage and count are appended as 4 new columns in the resulting dataset. - ------ - -**Example** - -- If **First dataset** consists of the following windows:: - - chrX 1 10001 seg 0 - - chrX 10001 20001 seg 0 - - chrX 20001 30001 seg 0 - - chrX 30001 40001 seg 0 - - -- and **Second dataset** consists of the following exons:: - - chrX 5000 6000 seg2 0 - - chrX 5500 7000 seg2 0 - - chrX 9000 22000 seg2 0 - - chrX 24000 34000 seg2 0 - - chrX 36000 38000 seg2 0 - - -- the **Result** is the coverage of exons of the second dataset in each of the windows contained in the first dataset:: - - chrX 1 10001 seg 0 - 3001 0.3001 2 1 - chrX 10001 20001 seg 0 - 10000 1.0 1 0 - chrX 20001 30001 seg 0 - 8000 0.8 0 2 - chrX 30001 40001 seg 0 - 5999 0.5999 1 1 - -- To clarify, the following line of output ( added columns are indexed by a, b, c and d ):: - - a b c d - chrX 1 10001 seg 0 - 3001 0.3001 2 1 - - implies that 2 exons (c) fall fully in this window (chrX:1-10001), 1 exon (d) partially overlaps this window, and these 3 exons cover 30.01% (b) of the window size, spanning 3001 nucleotides (a). - - * a: number of nucleotides in this window covered by the features in (c) and (d) - features overlapping with each other will be merged to calculate (a) - * b: fraction of window size covered by features in (c) and (d) - features overlapping with each other will be merged to calculate (b) - * c: number of features in the 2nd dataset that fall **completely** within this window - * d: number of features in the 2nd dataset that **partially** overlap this window - -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/getIndelRates_3way.py --- a/tools/regVariation/getIndelRates_3way.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,249 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda - -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) - -import sys, os, tempfile -import traceback -import fileinput -from warnings import warn - -from galaxy.tools.util.galaxyops import * -from bx.intervals.io import * - -from bx.intervals.operations import quicksect - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def counter(node, start, end, sort_col): - global full, blk_len, blk_list - if node.start < start: - if node.right: - counter(node.right, start, end, sort_col) - elif start <= node.start <= end and start <= node.end <= end: - full += 1 - if node.other[0] not in blk_list: - blk_list.append(node.other[0]) - blk_len += int(node.other[sort_col+2]) - if node.left and node.left.maxend > start: - counter(node.left, start, end, sort_col) - if node.right: - counter(node.right, start, end, sort_col) - elif node.start > end: - if node.left: - counter(node.left, start, end, sort_col) - - -infile = sys.argv[1] -fout = open(sys.argv[2],'w') -int_file = sys.argv[3] -if int_file != "None": #User has specified an interval file - try: - fint = open(int_file, 'r') - dbkey_i = sys.argv[4] - chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] ) - except: - stop_err("Unable to open input Interval file") - -def main(): - - for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... 
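-    # The loop above sniffs out the first non-empty, non-comment line (giving
-    # up after 30 lines) so that the column count can be validated up front:
-    # 'Fetch Indels from 3-way alignments' writes 18 tab-separated fields per
-    # record, which is exactly what the check below enforces.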
- - if len( elems ) != 18: - stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." ) - - for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - elems = line.split('\t') - try: - assert int(elems[0]) - assert len(elems) == 18 - if int_file != "None": - if dbkey_i not in elems[3] and dbkey_i not in elems[8] and dbkey_i not in elems[13]: - stop_err("The species build corresponding to your interval file is not present in the Indel file.") - if dbkey_i in elems[3]: - sort_col = 4 - elif dbkey_i in elems[8]: - sort_col = 9 - elif dbkey_i in elems[13]: - sort_col = 14 - else: - species = [] - species.append( elems[3].split('.')[0] ) - species.append( elems[8].split('.')[0] ) - species.append( elems[13].split('.')[0] ) - sort_col = 0 #Based on block numbers - break - except: - continue - - - fin = open(infile, 'r') - skipped = 0 - - if int_file == "None": - sorted_infile = tempfile.NamedTemporaryFile() - cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile - try: - os.system(cmdline) - except: - stop_err("Encountered error while sorting the input file.") - print >>fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" %(species[0],species[1],species[2],species[0],species[1],species[2]) - prev_bnum = -1 - sorted_infile.seek(0) - for line in sorted_infile.readlines(): - line = line.rstrip('\r\n') - elems = line.split('\t') - try: - assert int(elems[0]) - assert len(elems) == 18 - new_bnum = int(elems[0]) - if new_bnum != prev_bnum: - if prev_bnum != -1: - irate = [] - drate = [] - for i,elem in enumerate(inserts): - try: - irate.append(str("%.2e" %(inserts[i]/blen[i]))) - except: - irate.append('0') - try: - drate.append(str("%.2e" %(deletes[i]/blen[i]))) - except: - drate.append('0') - print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate)) - inserts = [0.0, 0.0, 0.0] - deletes = [0.0, 0.0, 0.0] - blen = [] - blen.append( int(elems[6]) ) - blen.append( int(elems[11]) ) - blen.append( int(elems[16]) ) - line_sp = elems[1].split('.')[0] - sp_ind = species.index(line_sp) - if elems[1].endswith('insert'): - inserts[sp_ind] += 1 - elif elems[1].endswith('delete'): - deletes[sp_ind] += 1 - prev_bnum = new_bnum - except Exception, ei: - #print >>sys.stderr, ei - continue - irate = [] - drate = [] - for i,elem in enumerate(inserts): - try: - irate.append(str("%.2e" %(inserts[i]/blen[i]))) - except: - irate.append('0') - try: - drate.append(str("%.2e" %(deletes[i]/blen[i]))) - except: - drate.append('0') - print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate)) - sys.exit() - - - inf = open(infile, 'r') - start_met = False - end_met = False - sp_file = tempfile.NamedTemporaryFile() - for n, line in enumerate(inf): - line = line.rstrip('\r\n') - elems = line.split('\t') - try: - assert int(elems[0]) - assert len(elems) == 18 - if dbkey_i not in elems[1]: - if not(start_met): - continue - else: - sp_end = n - break - else: - print >>sp_file, line - if not(start_met): - start_met = True - sp_start = n - except: - continue - - try: - assert sp_end - except: - sp_end = n+1 - - sp_file.seek(0) - win = NiceReaderWrapper( fileinput.FileInput( int_file ), - chrom_col=chr_col_i, - start_col=start_col_i, - end_col=end_col_i, - strand_col=strand_col_i, - fix_strand=True) - - indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ), - chrom_col=1, - start_col=sort_col, - 
end_col=sort_col+1, - strand_col=-1, - fix_strand=True) - - indelTree = quicksect.IntervalTree() - for item in indel: - if type( item ) is GenomicInterval: - indelTree.insert( item, indel.linenum, item.fields ) - result=[] - - global full, blk_len, blk_list - for interval in win: - if type( interval ) is Header: - pass - if type( interval ) is Comment: - pass - elif type( interval ) == GenomicInterval: - chrom = interval.chrom - start = int(interval.start) - end = int(interval.end) - if start > end: - warn( "Interval start after end!" ) - ins_chr = "%s.%s_insert" %(dbkey_i,chrom) - del_chr = "%s.%s_delete" %(dbkey_i,chrom) - irate = 0 - drate = 0 - if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms: - pass - else: - if ins_chr in indelTree.chroms: - full = 0.0 - blk_len = 0 - blk_list = [] - root = indelTree.chroms[ins_chr] #root node for the chrom insertion tree - counter(root, start, end, sort_col) - if blk_len: - irate = full/blk_len - - if del_chr in indelTree.chroms: - full = 0.0 - blk_len = 0 - blk_list = [] - root = indelTree.chroms[del_chr] #root node for the chrom deletion tree - counter(root, start, end, sort_col) - if blk_len: - drate = full/blk_len - - interval.fields.append(str("%.2e" %irate)) - interval.fields.append(str("%.2e" %drate)) - print >>fout, "\t".join(interval.fields) - fout.flush() - -if __name__ == "__main__": - main() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/getIndelRates_3way.xml --- a/tools/regVariation/getIndelRates_3way.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -<tool id="indelRates_3way" name="Estimate Indel Rates" version="1.0.0"> - <description> for 3-way alignments</description> - <command interpreter="python"> - getIndelRates_3way.py $input1 $out_file1 - #if $region.type == "align" - "None" - #else - $region.input2 $input2.dbkey $input2.metadata.chromCol,$input2.metadata.startCol,$input2.metadata.endCol,$input2.metadata.strandCol - #end if - </command> - <inputs> - <page> - <param format="tabular" name="input1" type="data" label="Select dataset containing Indels"/> - - <conditional name="region"> - <param name="type" type="select" label="Estimate rates corresponding to" multiple="false"> - <option value="win" selected="True">Intervals in your history</option> - <option value="align">Alignment block</option> - </param> - <when value="win"> - <param format="interval" name="input2" type="data" label="Choose intervals"> - <validator type="unspecified_build" /> - </param> - </when> - <when value="align" /> - </conditional> - - </page> - </inputs> - <outputs> - <data format="tabular" name="out_file1" metadata_source="input1"/> - </outputs> - - <tests> - <test> - <param name="input1" value="indels_3way.tabular"/> - <param name="type" value="align"/> - <output name="out_file1" file="indelrates_3way.tabular"/> - </test> - </tests> - - <help> - -.. class:: infomark - -**What it does** - -This tool estimates the insertion and deletion rates for alignments in a window of specified size. Rates are computed over the total adjusted lengths (adjusted by disregarding masked bases) of all the alignment blocks from the indel file that fall within that window. - ------ - -.. class:: warningmark - -**Note** - -This tool only works on the output of the 'Fetch Indels from 3-way alignments' tool.
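-
-Each reported rate is an event count divided by the total adjusted length,
-with empty blocks falling back to a rate of zero. A minimal Python sketch of
-that per-window calculation (a hypothetical helper, not part of this tool)::
-
-    def indel_rate(event_count, adjusted_length):
-        # Events per aligned base; mirrors the script's try/except
-        # fallback to '0' when the adjusted length is zero.
-        try:
-            return "%.2e" % (event_count / float(adjusted_length))
-        except ZeroDivisionError:
-            return "0"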
- -</help> - - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/getIndels.py --- a/tools/regVariation/getIndels.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ -#!/usr/bin/env python - -""" -Estimate INDELs for pair-wise alignments. - -usage: %prog maf_input out_file1 out_file2 -""" - -from __future__ import division -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -try: - pkg_resources.require("numpy") -except: - pass -import psyco_full -import sys -from bx.cookbook import doc_optparse -from galaxy.tools.exception_handling import * -import bx.align.maf - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - # Parsing Command Line here - options, args = doc_optparse.parse( __doc__ ) - - try: - inp_file, out_file1 = args - except: - print >> sys.stderr, "Tool initialization error." - sys.exit() - - try: - fin = open(inp_file,'r') - except: - print >> sys.stderr, "Unable to open input file" - sys.exit() - try: - fout1 = open(out_file1,'w') - #fout2 = open(out_file2,'w') - except: - print >> sys.stderr, "Unable to open output file" - sys.exit() - - try: - maf_reader = bx.align.maf.Reader( open(inp_file, 'r') ) - except: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() - maf_count = 0 - - print >>fout1, "#Block\tSource\tSeq1_Start\tSeq1_End\tSeq2_Start\tSeq2_End\tIndel_length" - for block_ind, block in enumerate(maf_reader): - if len(block.components) < 2: - continue - seq1 = block.components[0].text - src1 = block.components[0].src - start1 = block.components[0].start - if len(block.components) == 2: - seq2 = block.components[1].text - src2 = block.components[1].src - start2 = block.components[1].start - #for pos in range(len(seq1)): - nt_pos1 = start1-1 #position of the nucleotide (without counting gaps) - nt_pos2 = start2-1 - pos = 0 #character column position - gaplen1 = 0 - gaplen2 = 0 - prev_pos_gap1 = 0 - prev_pos_gap2 = 0 - while pos < len(seq1): - if prev_pos_gap1 == 0: - gaplen1 = 0 - if prev_pos_gap2 == 0: - gaplen2 = 0 - - if seq1[pos] == '-': - if seq2[pos] != '-': - nt_pos2 += 1 - gaplen1 += 1 - prev_pos_gap1 = 1 - #write 2 - if prev_pos_gap2 == 1: - prev_pos_gap2 = 0 - print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1,nt_pos1+1,nt_pos2-1,nt_pos2-1+gaplen2,gaplen2) - if pos == len(seq1)-1: - print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1,nt_pos1+1,nt_pos2+1-gaplen1,nt_pos2+1,gaplen1) - else: - prev_pos_gap1 = 0 - prev_pos_gap2 = 0 - """ - if prev_pos_gap1 == 1: - prev_pos_gap1 = 0 - print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,gaplen1) - elif prev_pos_gap2 == 1: - prev_pos_gap2 = 0 - print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos2-1,nt_pos2,gaplen2) - """ - else: - nt_pos1 += 1 - if seq2[pos] != '-': - nt_pos2 += 1 - #write both - if prev_pos_gap1 == 1: - prev_pos_gap1 = 0 - print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2-gaplen1,nt_pos2,gaplen1) - elif prev_pos_gap2 == 1: - prev_pos_gap2 = 0 - print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1-gaplen2,nt_pos1,nt_pos2-1,nt_pos2,gaplen2) - else: - gaplen2 += 1 - prev_pos_gap2 = 1 - #write 1 - if prev_pos_gap1 == 1: - prev_pos_gap1 = 0 - print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2,nt_pos2+gaplen1,gaplen1) - if pos == len(seq1)-1: - print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" 
%(block_ind+1,src2,nt_pos1+1-gaplen2,nt_pos1+1,nt_pos2,nt_pos2+1,gaplen2) - pos += 1 -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/getIndels_2way.xml --- a/tools/regVariation/getIndels_2way.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -<tool id="getIndels_2way" name="Fetch Indels"> - <description> from pairwise alignments</description> - <command interpreter="python"> - getIndels.py $input1 $out_file1 - </command> - <inputs> - <page> - <param format="maf" name="input1" type="data" label="Select data"/> - </page> - </inputs> - <outputs> - <data format="tabular" name="out_file1" metadata_source="input1"/> - </outputs> - <requirements> - <requirement type="python-module">numpy</requirement> - </requirements> - <tests> - <test> - <param name="input1" value="6.maf"/> - <output name="out_file1" file="6_indels.tabular"/> - </test> - </tests> - <help> - -.. class:: infomark - -**What it does** - -This tool estimates the number of indels for every alignment block of the MAF file. - ------ - -.. class:: warningmark - -**Note** - -Any block/s not containing exactly 2 species will be omitted. - ------ - -**Example** - -- For the following alignment block:: - - a score=7233.0 - s hg18.chr1 100 35 + 247249719 AT--GACTGAGGACTTAGTTTAAGATGTTCCTACT - s rheMac2.chr11 200 31 + 134511895 ATAAG-CGGACGACTTAGTTTAAGATGTTCC---- - -- running this tool will return:: - - #Block Source Seq1_Start Seq1_End Seq2_Start Seq2_End Indel_length - 1 hg18.chr1 101 102 202 204 2 - 1 rheMac2.chr11 103 104 204 205 1 - 1 rheMac2.chr11 129 133 229 230 4 - -</help> - - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/getIndels_3way.xml --- a/tools/regVariation/getIndels_3way.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -<tool id="indels_3way" name="Fetch Indels" version="1.0.3"> - <description> from 3-way alignments</description> - <command interpreter="perl"> - parseMAF_smallIndels.pl $input1 $out_file1 $outgroup - </command> - <inputs> - <page> - <param format="maf" name="input1" type="data" label="Select data"/> - <param name="outgroup" type="select" label="Select outgroup species"> - <options> - <filter type="data_meta" ref="input1" key="species" /> - </options> - </param> - </page> - </inputs> - <outputs> - <data format="tabular" name="out_file1" metadata_source="input1"/> - <!--<data format="tabular" name="out_file2" metadata_source="input1"/>--> - </outputs> - <tests> - <test> - <param name="input1" value="3way.maf"/> - <param name="outgroup" value="canFam2"/> - <output name="out_file1" file="indels_3way.tabular"/> - </test> - </tests> - <help> - -.. class:: infomark - -**What it does** - -This tool consists of the first module from the computational pipeline to identify indels as described in Kvikstad et al., 2007. Note that the generated output does not include subsequent filtering steps. - -Deletions in a particular species are identified as one or more consecutive gap columns within an alignment block, given that the orthologous positions in the other two species contain nucleotides of -equal length. -Similarly, insertions in a particular species are identified as one or more consecutive nucleotide columns within an alignment block, given that the orthologous positions in the other two -species contain gaps. - -*Kvikstad E. M. et al. (2007). A Macaques-Eye View of Human Insertions and Deletions: Differences in Mechanisms. 
PLoS Computational Biology 3(9):e176* - ------ - -.. class:: warningmark - -**Note** - -Any block/s not containing exactly 3 sequences will be omitted. - - </help> - - -</tool> \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/linear_regression.py --- a/tools/regVariation/linear_regression.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,147 +0,0 @@ -#!/usr/bin/env python - -from galaxy import eggs -import sys, string -from rpy import * -import numpy - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -infile = sys.argv[1] -y_col = int(sys.argv[2])-1 -x_cols = sys.argv[3].split(',') -outfile = sys.argv[4] -outfile2 = sys.argv[5] - -print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) -fout = open(outfile,'w') -elems = [] -for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - -if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." ) - -y_vals = [] -x_vals = [] - -for k,col in enumerate(x_cols): - x_cols[k] = int(col)-1 - x_vals.append([]) - -NA = 'NA' -for ind,line in enumerate( file( infile )): - if line and not line.startswith( '#' ): - try: - fields = line.split("\t") - try: - yval = float(fields[y_col]) - except: - yval = r('NA') - y_vals.append(yval) - for k,col in enumerate(x_cols): - try: - xval = float(fields[col]) - except: - xval = r('NA') - x_vals[k].append(xval) - except: - pass - -x_vals1 = numpy.asarray(x_vals).transpose() - -dat= r.list(x=array(x_vals1), y=y_vals) - -set_default_mode(NO_CONVERSION) -try: - linear_model = r.lm(r("y ~ x"), data = r.na_exclude(dat)) -except RException, rex: - stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.") -set_default_mode(BASIC_CONVERSION) - -coeffs=linear_model.as_py()['coefficients'] -yintercept= coeffs['(Intercept)'] -summary = r.summary(linear_model) - -co = summary.get('coefficients', 'NA') -""" -if len(co) != len(x_vals)+1: - stop_err("Stopped performing linear regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.") -""" - -try: - yintercept = r.round(float(yintercept), digits=10) - pvaly = r.round(float(co[0][3]), digits=10) -except: - pass - -print >>fout, "Y-intercept\t%s" %(yintercept) -print >>fout, "p-value (Y-intercept)\t%s" %(pvaly) - -if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable - try: - slope = r.round(float(coeffs['x']), digits=10) - except: - slope = 'NA' - try: - pval = r.round(float(co[1][3]), digits=10) - except: - pval = 'NA' - print >>fout, "Slope (c%d)\t%s" %(x_cols[0]+1,slope) - print >>fout, "p-value (c%d)\t%s" %(x_cols[0]+1,pval) -else: #Multiple regression case with >1 predictors - ind=1 - while ind < len(coeffs.keys()): - try: - slope = r.round(float(coeffs['x'+str(ind)]), digits=10) - except: - slope = 'NA' - print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,slope) - try: - pval = r.round(float(co[ind][3]), digits=10) - except: - pval = 'NA' - print >>fout, "p-value (c%d)\t%s" %(x_cols[ind-1]+1,pval) - ind+=1 - -rsq = summary.get('r.squared','NA') -adjrsq = summary.get('adj.r.squared','NA') -fstat = summary.get('fstatistic','NA') -sigma = summary.get('sigma','NA') - -try: - rsq = 
r.round(float(rsq), digits=5) - adjrsq = r.round(float(adjrsq), digits=5) - fval = r.round(fstat['value'], digits=5) - fstat['value'] = str(fval) - sigma = r.round(float(sigma), digits=10) -except: - pass - -print >>fout, "R-squared\t%s" %(rsq) -print >>fout, "Adjusted R-squared\t%s" %(adjrsq) -print >>fout, "F-statistic\t%s" %(fstat) -print >>fout, "Sigma\t%s" %(sigma) - -r.pdf( outfile2, 8, 8 ) -if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable - sub_title = "Slope = %s; Y-int = %s" %(slope,yintercept) - try: - r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression") - r.abline(a=yintercept, b=slope, col="red") - except: - pass -else: - r.pairs(dat, main="Scatterplot Matrix", col="blue") -try: - r.plot(linear_model) -except: - pass -r.dev_off() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/linear_regression.xml --- a/tools/regVariation/linear_regression.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -<tool id="LinearRegression1" name="Perform Linear Regression" version="1.0.1"> - <description> </description> - <command interpreter="python"> - linear_regression.py - $input1 - $response_col - $predictor_cols - $out_file1 - $out_file2 - 1>/dev/null - </command> - <inputs> - <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/> - <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" numerical="True"/> - <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" numerical="True" multiple="true" > - <validator type="no_options" message="Please select at least one column."/> - </param> - </inputs> - <outputs> - <data format="input" name="out_file1" metadata_source="input1" /> - <data format="pdf" name="out_file2" /> - </outputs> - <requirements> - <requirement type="python-module">rpy</requirement> - </requirements> - <tests> - <test> - <param name="input1" value="regr_inp.tabular"/> - <param name="response_col" value="3"/> - <param name="predictor_cols" value="1,2"/> - <output name="out_file1" file="regr_out.tabular"/> - <output name="out_file2" file="regr_out.pdf"/> - </test> - </tests> - <help> - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters* - ------ - -.. class:: infomark - -**What it does** - -This tool uses the 'lm' function from R statistical package to perform linear regression on the input data. It outputs two files, one containing the summary statistics of the performed regression, and the other containing diagnostic plots to check whether model assumptions are satisfied. - -*R Development Core Team (2009). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL http://www.R-project.org.* - ------ - -.. class:: warningmark - -**Note** - -- This tool currently treats all predictor and response variables as continuous numeric variables. Running the tool on categorical variables might result in incorrect results. - -- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. 
- -- The summary statistics in the output are described below: - - - sigma: the square root of the estimated variance of the random error (standard error of the residuals) - R-squared: the fraction of variance explained by the model - Adjusted R-squared: the above R-squared statistic adjusted, penalizing for the number of predictors (p) - p-value: p-value for the t-test of the null hypothesis that the corresponding slope is equal to zero against the two-sided alternative. - - - </help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/maf_cpg_filter.py --- a/tools/regVariation/maf_cpg_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda -#Adapted from bx/scripts/maf_mask_cpg.py -""" -Mask out potential CpG sites from a maf. Restricted or inclusive definition -of CpG sites can be used. The total fraction masked is printed to stderr. - -usage: %prog < input > output restricted - -m, --mask=N: Index of the mask character to use (0, i.e. '#', is default) -""" - -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -try: - pkg_resources.require( "numpy" ) -except: - pass -import bx.align -import bx.align.maf -from bx.cookbook import doc_optparse -import sys -import bx.align.sitemask.cpg - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - options, args = doc_optparse.parse( __doc__ ) - try: - inp_file, out_file, sitetype, definition = args - if options.mask: - mask = int(options.mask) - else: - mask = 0 - except: - print >> sys.stderr, "Tool initialization error." - sys.exit() - - reader = bx.align.maf.Reader( open(inp_file, 'r') ) - writer = bx.align.maf.Writer( open(out_file,'w') ) - - mask_chr_dict = {0:'#', 1:'$', 2:'^', 3:'*', 4:'?', 5:'N'} - mask = mask_chr_dict[mask] - - if sitetype == "CpG": - if int(definition) == 1: - cpgfilter = bx.align.sitemask.cpg.Restricted( mask=mask ) - defn = "CpG-Restricted" - else: - cpgfilter = bx.align.sitemask.cpg.Inclusive( mask=mask ) - defn = "CpG-Inclusive" - else: - cpgfilter = bx.align.sitemask.cpg.nonCpG( mask=mask ) - defn = "non-CpG" - cpgfilter.run( reader, writer.write ) - - print "%2.2f percent bases masked; Mask character = %s, Definition = %s" %(float(cpgfilter.masked)/float(cpgfilter.total) * 100, mask, defn) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/maf_cpg_filter.xml --- a/tools/regVariation/maf_cpg_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ -<tool id="cpgFilter" name="Mask CpG/non-CpG sites" version="1.0.0"> - <description> from MAF file</description> - <command interpreter="python"> - maf_cpg_filter.py - $input - $out_file1 - $masksite.type - #if $masksite.type == "CpG": - $masksite.definition - #else: - "NA" - #end if - -m $mask_char - </command> - <inputs> - <page> - <param format="maf" name="input" type="data" label="Select data"/> - <param name="mask_char" size="5" type="select" label="Mask character"> - <option value="0" selected="true">#</option> - <option value="1">$</option> - <option value="2">^</option> - <option value="3">*</option> - <option value="4">?</option> - <option value="5">N</option> - </param> - <conditional name="masksite"> - <param name="type" size="5" type="select" label="Sites to be masked"> - <option value="CpG" selected="true">CpG sites</option> - <option value="nonCpG">non-CpG sites</option> - </param> - <when value="CpG"> - <param name="definition" size="5" 
type="select" label="Definition"> - <option value="0" selected="true">Inclusive</option> - <option value="1">Restricted</option> - </param> - </when> - <when value="nonCpG" /> - </conditional> - </page> - </inputs> - <outputs> - <data format="maf" name="out_file1" metadata_source="input"/> - </outputs> - <requirements> - <requirement type="python-module">numpy</requirement> - </requirements> - <tests> - <test> - <param name="input" value="6.maf"/> - <param name="mask_char" value="0"/> - <param name="type" value="CpG" /> - <param name="definition" value="0" /> - <output name="out_file1" file="6_mask_cpg.maf"/> - </test> - <test> - <param name="input" value="6.maf"/> - <param name="mask_char" value="0"/> - <param name="type" value="nonCpG" /> - <output name="out_file1" file="6_mask_noncpg.maf"/> - </test> - </tests> - <help> - -.. class:: infomark - -**What it does** - -This tool takes a MAF file as input and masks CpG sites in every alignment block of the MAF file. - ------ - -.. class:: warningmark - -**Note** - -*Inclusive definition* defines CpG sites as those sites that are CG in at least one of the species. - -*Restricted definition* considers sites to be CpG if they are CG in at least one of the species, however, sites that are part of overlapping CpGs are excluded. - -For more information on CpG site definitions, please refer this article_. - -.. _article: http://mbe.oxfordjournals.org/cgi/content/full/23/3/565 - - </help> - - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/microsatellite_birthdeath.pl --- a/tools/regVariation/microsatellite_birthdeath.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3984 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use warnings; -use Term::ANSIColor; -use Pod::Checker; -use File::Basename; -use IO::Handle; -use Cwd; -use File::Path qw(make_path remove_tree); -use File::Temp qw/ tempfile tempdir /; -my $tdir = tempdir( CLEANUP => 0 ); -chdir $tdir; -my $dir = getcwd; -#print "current dit=$dir\n"; - -use vars qw (%treesToReject %template $printer $interr_poscord $interrcord $no_of_interruptionscord $stringfile @tags -$infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species -$gapcord %thresholdhash $tree_decipherer @sp_ident %revHash %sameHash %treesToIgnore %alternate @exactspecies @exacttags); -use FileHandle; -use IO::Handle; # 5.004 or higher - -#my @ar = ("/Users/ydk/work/rhesus_microsat/results/galay/chr22_5sp.maf.txt", "/Users/ydk/work/rhesus_microsat/results/galay/dataset_11.dat", -#"/Users/ydk/work/rhesus_microsat/results/galay/chr22_5spec.maf.summ","hg18,panTro2,ponAbe2,rheMac2,calJac1","((((hg18, panTro2), ponAbe2), rheMac2), calJac1)","9,10,12,12", -#"10","0.8"); -my @ar = @ARGV; -my ($maf, $orth, $summout, $species_set, $tree_definition, $thresholds, $FLANK_SUPPORT, $SIMILARITY_THRESH) = @ar; -$SIMILARITY_THRESH=$SIMILARITY_THRESH/100; -######################### -$SIMILARITY_THRESH = $SIMILARITY_THRESH/100; -my $EDGE_DISTANCE = 10; -my $COMPLEXITY_SUPPORT = 20; -load_thresholds("9_10_12_12"); -######################### - -my $complexity=int($COMPLEXITY_SUPPORT * (1/40)); - -#print "complexity=$complexity\n"; -#<STDIN>; - -#$printer = 1; - -my $rando = int(rand(1000)); -my $localdate = `date`; -$localdate =~ /([0-9]+):([0-9]+):([0-9]+)/; -my $info = $rando.$1.$2.$3; - -#--------------------------------------------------------------------------- -# GETTING INPUT INFORMATION AND OPENING INPUT AND OUTPUT FILES - - -my @thresharr = (0, 
split(/,/,$thresholds)); -my $randno=int(rand(100000)); -my $megamatch = $randno.".megamatch.net.axt"; #"/gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr1.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt"; -my $megamatchlck = $megamatch.".lck"; -unlink $megamatchlck; - -#my $selected= $orth; -#my $eventfile = $orth; -#$selected = $selected."_SELECTED"; -#$selected = $selected."_".$SIMILARITY_THRESH; -#my $runtime = $selected.".runtime"; - -my $inputtags = "H:C:O:R:M"; -$inputtags = $ARGV[3] if exists $ARGV[3] && $ARGV[3] =~ /[A-Z]:[A-Z]/; - -my @all_tags = split(/:/, $inputtags); -my $inputsp = "hg18:panTro2:ponAbe2:rheMac2:calJac1"; -$inputsp = $ARGV[4] if exists $ARGV[4] && $ARGV[3] =~ /[0-9]+:/; -@sp_ident = split(/:/,$inputsp); -my $junkfile = $orth."_junk"; - -my $sh = load_sameHash(1); -my $rh = load_revHash(1); - -#print "inputs are : \n"; foreach(@ARGV){print $_,"\n";} -#open (SELECT, ">$selected") or die "Cannot open selected file: $selected: $!"; -open (SUMMARY, ">$summout") or die "Cannot open summout file: $summout: $!"; -#open (RUN, ">$runtime") or die "Cannot open orth file: $runtime: $!"; -#my $ctlfile = "baseml\.ctl"; #$ARGV[4]; -#my $treefile = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/"; #1 THIS IS THE THE TREE UNDER CONSIDERATION, IN NEWICK -my %registeredTrees = (); -my @removalReasons = -("microsatellite is compound", -"complex structure", -"if no. if micros is more than no. of species", -"if more than one micro per species ", -"if microsat contains N", -"different motif than required ", -"more than zero interruptions", -"microsat could not form key ", -"orthologous microsats of different motif size ", -"orthologous microsats of different motifs ", -"microsats belong to different alignment blocks altogether", -"microsat near edge", -"microsat in low complexity region", -"microsat flanks dont align well", -"phylogeny not informative"); -my %allowedhash=(); -#--------------------------------------------------------------------------- -# WORKING ON MAKING THE MEGAMATCH FILE -my $chromt=int(rand(10000)); -my $p_chr=$chromt; - -$tree_definition=~s/,/, /g; -$tree_definition =~ s/, +/, /g; -my @exactspeciesset_unarranged = split(/,/,$species_set); -my $largesttree = "$tree_definition;"; -$tree_definition=~s/[\)\(, ]/\t/g; - -my @treespecies=split(/\t+/,$tree_definition); - -foreach my $spec (@treespecies){ - foreach my $espec (@exactspeciesset_unarranged){ - push @exactspecies, $spec if $spec eq $espec; - } -} -#print "exactspecies=@exactspecies\n"; -my $focalspec = $exactspecies[0]; -my $arranged_species_set=join(".",@exactspecies); -@exacttags=@exactspecies; -foreach my $extag (@exacttags){ - $extag =~ s/hg18/H/g; - $extag =~ s/panTro2/C/g; - $extag =~ s/ponAbe2/O/g; - $extag =~ s/rheMac2/R/g; - $extag =~ s/calJac1/M/g; -} -my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt"); -#print "sending to maftoAxt_multispecies: $maf, $tree_definition, $chr_name, $species_set .. 
focalspec=$focalspec \n"; -maftoAxt_multispecies($maf, $tree_definition, $chr_name, $species_set); -my @filterseqfiles= ($chr_name); - $largesttree =~ s/hg18/H/g; - $largesttree =~ s/panTro2/C/g; - $largesttree =~ s/ponAbe2/O/g; - $largesttree =~ s/rheMac2/R/g; - $largesttree =~ s/calJac1/M/g; -#--------------------------------------------------------------------------- - -my ($lagestnodes, $largestbranches) = get_nodes($largesttree); -shift (@$lagestnodes); -my @extendedtitle=(); - -my $title = (); -my $parttitle = (); -my @titlearr = (); -my @firsttitle=($focalspec."chrom", $focalspec."start", $focalspec."end", $focalspec."motif", $focalspec."motifsize", $focalspec."threshold"); - -my @finames= qw(chr start end motif motifsize microsat mutation mutation.position mutation.from mutation.to insertion.details deletion.details); - -my @fititle=(); - -foreach my $spec (split(",",$species_set)){ - push @fititle, $spec; - foreach my $name (@finames){ - push @fititle, $spec.".".$name; - } -} - - -my @othertitle=qw(somechr somestart somened event source); - -my @fnames = (); -push @fnames, qw(insertions_num deletions_num motinsertions_num motinsertionsf_num motdeletions_num motdeletionsf_num noninsertions_num nondeletions_num) ; -push @fnames, qw(binsertions_num bdeletions_num bmotinsertions_num bmotinsertionsf_num bmotdeletions_num bmotdeletionsf_num bnoninsertions_num bnondeletions_num) ; -push @fnames, qw(dinsertions_num ddeletions_num dmotinsertions_num dmotinsertionsf_num dmotdeletions_num dmotdeletionsf_num dnoninsertions_num dnondeletions_num) ; -push @fnames, qw(ninsertions_num ndeletions_num nmotinsertions_num nmotinsertionsf_num nmotdeletions_num nmotdeletionsf_num nnoninsertions_num nnondeletions_num) ; -push @fnames, qw(substitutions_num bsubstitutions_num dsubstitutions_num nsubstitutions_num indels_num subs_num); - -my @fullnames = (); - -foreach my $lnode (@$lagestnodes){ - my @pair = @$lnode; - my @nodemutarr = (); - for my $p (@pair){ - # print "p = $p\n"; - $p =~ s/[\(\), ]+//g; - $p =~ s/H/hg18/g; - $p =~ s/C/panTro2/g; - $p =~ s/O/ponAbe2/g; - $p =~ s/R/rheMac2/g; - $p =~ s/M/calJac1/g; - foreach my $n (@fnames) { push @fullnames, $p.".".$n;} - } -} -print SUMMARY "#",join("\t", @firsttitle, @fititle, @othertitle); - -print SUMMARY "\t",join("\t", @fullnames); -#$title = $title."\t".join("\t", @fullnames); - -print SUMMARY "\t",join("\t", @fnames); -#$title= $title."\t".join("\t", @fnames); - -print SUMMARY "\t","tree","\t", "cleancase", "\n"; -#$title= $title."\t"."tree"."\t"."cleancase". "\n"; - -#print $title; #<STDIN>; - -#print "all_tags = @all_tags\n"; - -for my $no (3 ... $#all_tags+1){ -# print "no=$no\n"; #<STDIN>; - @tags = @all_tags[0 ... 
$no-1]; - #print "tags = = @tags\n" if $printer == 1; - %template=(); - my @nextcounter = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - #next if scalar(@tags) < 4; - - #print "now doing tags = @tags, no = $no\n"; - open (ORTH, "<$orth") or die "Cannot open orth file: $orth: $!"; - -# print SUMMARY join "\t", qw (species chr start end branch motif microsat mutation position from to insertion deletion); - - - ##################### T E M P O R A R Y ##################### - my @finaltitle=(); - my @singletitle = qw (species chr start end motif motifsize microsat strand microsatsize col10 col11 col12 col13); - my $endtitle = (); - foreach my $tag (@tags){ - my @tempsingle = (); - - foreach my $single (@singletitle){ - push @tempsingle, $tag.$single; - } - @finaltitle = (@finaltitle, @tempsingle); - } - -# print SUMMARY join("\t",@finaltitle),"\n"; - - ############################################################# - - #--------------------------------------------------------------------------- - # GET THE TREE FROM TREE FILE - my $tree = (); - $tree = "((H, C), O)" if $no == 3; - $tree = "(((H, C), O), R)" if $no == 4; - $tree = "((((H, C), O), R), M)" if $no == 5; -# $tree=~s/;$//g; -# print "our tree = $tree\n"; - #--------------------------------------------------------------------------- - # LOADING HASH CONTAINING ALL POSSIBLE TREES: - $tree_decipherer = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/tree_analysis_".join("",@tags).".txt"; - load_allPossibleTrees($tree_decipherer, \%template, \%alternate); - - #--------------------------------------------------------------------------- - # LOADING THE TREES TO REJECT FOR BIRTH ANALYSIS - %treesToReject=(); - %treesToIgnore=(); - load_treesToReject(@tags); - load_treesToIgnore(@tags); - #--------------------------------------------------------------------------- - # LOADING INPUT DATA INTO HASHES AND ARRAYS - - - #1 THIS IS THE POINT WHERE WE CAN FILTER OUT LARGE MICROSAT CLUSTERS - #2 AS WELL AS MULTIPLE-ALIGNMENT-BLOCKS-SPANNING MICROSATS (KIND OF - #3 IMPLICIT IN THE FIRST PART OF THE SENTENCE ITSELF IN MOST CASES). 
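-	# The read loop below applies, in order, the filters tallied in
-	# @nextcounter/@removalReasons: compound microsats, cryptic (unresolvably
-	# complex) entries, clusters with more members than species, more than one
-	# entry for a single species, microsats containing N, interrupted
-	# microsats, clusters mixing motif sizes or motifs, and clusters whose
-	# members come from different alignment blocks.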
- - my %orths=(); - my $counterm = 0; - my $loaded = 0; - my %seen = (); - my @allowedchrs = (); -# print "no = $no\n"; #<STDIN>; - - while (my $line = <ORTH>){ - #print "line=$line\n"; - $line =~ s/>hg18/>H/g; - $line =~ s/>panTro2/>C/g; - $line =~ s/>ponAbe2/>O/g; - $line =~ s/>rheMac2/>R/g; - $line =~ s/>calJac1/>M/g; - my @micros = split(/>/,$line); # LOADING ALL THE MICROSAT ENTRIES FROM THE CLUSTER INTO @micros - #print "micros=",printarr(@micros),"\n"; #<STDIN>; - shift @micros; # EMPTYING THE FIRST, EMTPY ELEMENT OF THE ARRAY - $no_of_species = adjustCoordinates($micros[0]); - next if $no_of_species != $no; - $counterm++; - #------------------------------------------------ - $nextcounter[0]++ if $line =~ /compound/; - next if $line =~ /compound/; # GETTING RID OF COMPOUND MICROSATS - #------------------------------------------------ - #next if $line =~ /[A-Za-z]>[a-zA-Z]/; - #------------------------------------------------ - chomp $line; - my $match_count = ($line =~ s/>/>/g); # COUNTING THE NUMBER OF MICROSAT ENTRIES IN THE CLUSTER - #print "number of species = $match_count\n"; - my $stopper = 0; - foreach my $mic (@micros){ - my @local = split(/\t/,$mic); - if ($local[$typecord] =~ /\./ || exists($local[$no_of_interruptionscord+2])) {$stopper = 1; $nextcounter[1]++; - last; } - # REMOVING CLUSTERS WITH THE CYRPTIC, (UNRESOLVABLY COMPLEX) MICROSAT ENTRIES IN THEM - } - next if $stopper ==1; - #------------------------------------------------ - $nextcounter[2]++ if (scalar(@micros) >$no_of_species); - - next if (scalar(@micros) >$no_of_species); #1 REMOVING MICROSAT CLUSTERS WITH MORE NUMBER OF MICROSAT ENTRIES THAN THE NUMBER OF SPECIES IN THE DATASET. - #2 THIS IS SO BECAUSE SUCH CLUSTERS IMPLY THAT IN AT LEAST ONE SPECIES, THERE IS MORE THAN ONE MICROSAT ENTRY - #3 IN THE CLUSTER. THUS, HERE WE ARE GETTING RID OF MICROSATS CLUSTERS THAT INCLUDE MULTUPLE, NEIGHBORING - #4 MICROSATS, AND STICK TO CLEAN MICROSATS THAT DO NOT HAVE ANY MICROSATS IN NEIGHBORHOOD. - #5 THIS 'NEIGHBORHOOD-RANGE' HAD BEEN DECIDED PREVIOUSLY IN OUR CODE multiSpecies_orthFinder4.pl - my $nexter = 0; - foreach my $tag (@tags){ - my $tagcount = ($line =~ s/>$tag\t/>$tag\t/g); - if ($tagcount > 1) { $nexter =1; #print colored ['red'],"multiple entires per species : $tagcount of $tag\n" if $printer == 1; - next; - } - } - - if ($nexter == 1){ - $nextcounter[3]++; - next; - } - #------------------------------------------------ - foreach my $mic (@micros){ #1 REMOVING MICROSATELLITES WITH ANY 'N's IN THEM - my @local = split(/\t/,$mic); - if ($local[$microsatcord] =~ /N/) {$stopper =1; $nextcounter[4]++; - last;} - } - next if $stopper ==1; - #print "till here 1\n"; #<STDIN>; - #------------------------------------------------ - my @micros_copy = @micros; - - my $tempmicro = shift(@micros_copy); #1 CURRENTLY OBTAINING INFORMATION FOR THE FIRST - #2 MICROSAT IN THE CLUSTER. 
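-	# The first cluster member is parsed below: clusters whose first microsat
-	# carries one or more interruptions are discarded, and a bracketed
-	# compound motif (e.g. "[ATG]...") is reduced to its first motif for the
-	# motif comparisons that follow.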
- my @tempfields = split(/\t/,$tempmicro); - my $prevtype = $tempfields[$typecord]; - my $tempmotif = $tempfields[$motifcord]; - - my $tempfirstmotif = (); - if (scalar(@tempfields) > $microsatcord + 2){ - if ($tempfields[$no_of_interruptionscord] >= 1) { #1 DISCARDING MICROSATS WITH MORE THAN ZERO INTERRUPTIONS - #2 IN THE FIRST MICROSAT OF THE CLUSTER - $nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1; - } - } - if ($nexter == 1){ - $nextcounter[6]++; - next; - } #1 DONE OBTAINING INFORMATION REGARDING - #2 THE FIRST MICROSAT FROM THE CLUSTER - - if ($tempmotif =~ /^\[/){ - $tempmotif =~ s/^\[//g; - $tempmotif =~ /([a-zA-Z]+)\].*/; - $tempfirstmotif = $1; #1 OBTAINING THE FIRTS MOTIF OF MICROSAT - } - else {$tempfirstmotif = $tempmotif;} - my $prevmotif = $tempfirstmotif; - - my $key = (); - if ($tempmicro =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { - $key = join("\t",$1, $2, $4, $5); - } - else{ - # print "counld not form a key \n" if $printer == 1; - $nextcounter[7]++; - next; - } - #----------------- #1 NOW, AFTER OBTAINING INFORMATION ABOUT - #2 THE FIRST MICROSAT IN THE CLUSTER, THE - #3 FOLLOWING LOOP GOES THROUGH THE OTHER MICROSATS - #4 TO SEE IF THEY SHARE THE REQUIRED FEATURES (BELOW) - - foreach my $micro (@micros_copy){ - my @fields = split(/\t/,$micro); - #----------------- - if (scalar(@fields) > $microsatcord + 2){ #1 DISCARDING MICROSATS WITH MORE THAN ONE INTERRUPTIONS - if ($fields[$no_of_interruptionscord] >= 1) {$nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1; - $nextcounter[6]++; - last; } - } - #----------------- - if (($prevtype ne "0") && ($prevtype ne $fields[$typecord])) { - $nexter =1; #print colored ['yellow'],"microsat of different type \n" if $printer == 1; - $nextcounter[8]++; - last; } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG - #----------------- #2 TO DIFFERENT TYPES (MONOS, DIS, TRIS ETC.) - $prevtype = $fields[$typecord]; - - my $motif = $fields[$motifcord]; - my $firstmotif = (); - - if ($motif =~ /^\[/){ - $motif =~ s/^\[//g; - $motif =~ /([a-zA-Z]+)\].*/; - $firstmotif = $1; - } - else {$firstmotif = $motif;} - - my $motifpattern = $firstmotif.$firstmotif; - my $prevmotifpattern = $prevmotif.$prevmotif; - - if (($prevmotif ne "0")&&(($motifpattern !~ /$prevmotif/i)||($prevmotifpattern !~ /$firstmotif/i)) ) { - $nexter =1; #print colored ['green'],"different motifs used \n$line\n" if $printer == 1; - $nextcounter[9]++; - last; - } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG - #2 TO DIFFERENT MOTIFS - my $prevmotif = $firstmotif; - #----------------- - - for my $t (0 ... 
$#tags){ #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSAT ENTRIES BELONG - #2 DIFFERENT ALIGNMENT BLOCKS - if ($micro =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { - my $key2 = join("\t",$1, $2, $4, $5); - if ($key2 ne $key){ -# print "microsats belong to diffferent alignment blocks altogether\n" if $printer == 1; - $nextcounter[10]++; - $nexter = 1; last; - } - } - else{ - # print "counld not form a key \n" if $printer == 1; - $nexter = 1; last; - } - } - - } - ##################### - if ($nexter == 1){ - # print "nexting\n" if $printer == 1; - next; - } - else{ -# print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n$key:\n$line\nvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n" if $printer == 1; - push (@{$orths{$key}},$line); - $loaded++; - if ($line =~ /($focalspec)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) { -# print "$line\n" if $printer == 1; #if $line =~ /Contig/; -# print "################ ################\n" if $printer == 1; - push @allowedchrs, $2 if !exists $allowedhash{$2}; - $allowedhash{$2} = 1; - my $key = join("\t",$1, $2, $3, $4); - #print "print the shit: $key\n" if $printer == 1; - $seen{$key} = 1; - } - else { #print "Key could not be formed in SPUT for ($org) ($title) ([0-9]+) ([0-9]+)\n"; - } - } - } - close ORTH; - -# print "now studying where we lost microsatellites: @nextcounter\n"; - for my $reason (0 ... $#nextcounter){ -# print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n"; - } -# print "\ntotal number of keys formed = ", scalar(keys %orths), " = \n"; -# print "done filtering .. counterm = $counterm and loaded = $loaded\n"; - - #---------------------------------------------------------------------------------------------------------------- - # NOW GENERATING THE ALIGNMENT FILE WITH RELELEVENT ALIGNMENTS STORED ONLY. - - while (1){ - if (-e $megamatchlck){ -# print "waiting to write into $megamatchlck\n"; - sleep 10; - } - else{ - open (MEGAMLCK, ">$megamatchlck") or die "Cannot open megamatchlck file $megamatchlck: $!"; - open (MEGAM, ">$megamatch") or die "Cannot open megamatch file $megamatch: $!"; - last; - } - } - - foreach my $seqfile (@filterseqfiles){ - my $fullpath = $seqfile; - -# print "opening file: $fullpath\n"; - open (MATCH, "<$fullpath") or die "Cannot open MATCH file $fullpath: $!"; - my $matchlines = 0; - - while (my $line = <MATCH>) { - if ($line =~ /($focalspec)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) { - my $key = join("\t",$1, $2, $3, $4); - if (exists $seen{$key}){ - while (1){ - $matchlines++; - print MEGAM $line; - $line = <MATCH>; - print MEGAM "\n" if $line !~/[0-9a-zA-Z]/; - last if $line !~/[0-9a-zA-Z]/; - } - } - } - } -# print "matchlines = $matchlines\n"; - close MATCH; - } - close MEGAMLCK; - - unlink $megamatchlck; - close MEGAM; - undef %seen; - #---------------------------------------------------------------------------------------------------------------- - - #--------------------------------------------------------------------------- - # NOW, AFTER FILTERING MANY MICROSATS, AND LOADING THE FILTERED ONES INTO - # THE HASH %orths , WE GO THROUGH THE ALIGNMENT FILE, AND STUDY THE - # FLANKING SEQUENCES OF ALL THESE MICROSATS, TO FILTER THEM FURTHER - #$printer = 1; - - my $microreadcounter=0; - my $contigsentered=0; - my $contignotrightcounter=0; - my $keynotformedcounter=0; - my $keynotfoundcounter= 0; - my $dotcounter = 0; - - open (BO, "<$megamatch") or die "Cannot open alignment file: $megamatch: $!"; - - while (my $line = <BO>){ -# print "." 
if $dotcounter % 100 ==0; -# print "\n" if $dotcounter % 5000 ==0; -# print "dotcounter = $dotcounter\n " if $printer == 1; - next if $line !~ /^[0-9]+/; - $dotcounter++; -# print colored ['green'], "~" x 60, "\n" if $printer == 1; -# print colored ['green'], $line;# if $printer == 1; - chomp $line; - my @fields2 = split(/\t/,$line); - my $key2 = (); - my $alignment_no = (); #1 TEMPORARY - if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { - $key2 = join("\t",$1, $2, $4, $5); - $alignment_no=$1; - } - else {print "seq line $line incompatible\n"; $keynotformedcounter++; next;} - - $no_of_species = adjustCoordinates($line); - $contignotrightcounter++ if $no_of_species != $no; - # print "contignotrightcounter=$contignotrightcounter\n"; - # print "no_of_species=$no_of_species\n"; - # print "no=$no\n"; - - next if $no_of_species != $no; - - # print "key = $key2\n" if $printer == 1; - my @clusters = (); #1 EXTRACTING MICROSATS CORRESPONDING TO THIS - #2 ALIGNMENT BLOCK - if (exists($orths{$key2})){ - @clusters = @{$orths{$key2}}; - $contigsentered++; - delete $orths{$key2}; - } - else{ - #print "orth does not exist\n"; - $keynotfoundcounter++; - next; - } - - my %sequences=(); #1 WILL STORE SEQUENCES IN THE CURRENT ALIGNMENT BLOCK - my $humseq = (); - foreach my $tag (@tags){ #1 READING THE ALIGNMENT FILE AND CAPTURING SEQUENCES - my $seq = <BO>; #2 OF ALL SPECIES. - chomp $seq; - $sequences{$tag} = " ".$seq; - #print "sequences = $sequences{$tag}\n" if $printer == 1; - $humseq = $seq if $tag =~ /H/; - } - - - foreach my $cluster (@clusters){ #1 NOW, GOING THROUGH THE CLUSTER OF MICROSATS - #print "x" x 60, "\n" if $printer == 1; - #print colored ['red'],"cluster = $cluster\n"; - $largesttree =~ s/hg18/H/g; - $largesttree =~ s/panTro2/C/g; - $largesttree =~ s/ponAbe2/O/g; - $largesttree =~ s/rheMac2/R/g; - $largesttree =~ s/calJac1/M/g; - - $microreadcounter++; - my @micros = split(/>/,$cluster); - shift @micros; - - my $edge_microsat=0; #1 THIS WILL HAVE VALUE "1" IF MICROSAT IS FOUND - #2 TO BE TOO CLOSE TO THE EDGES OF ALIGNMENT BLOCK - - my @starts= (); my %start_hash=(); #1 STORES THE START AND END COORDINATES OF MICROSATELLITES - my @ends = (); my %end_hash=(); #2 SO THAT LATER, WE WILL BE ABLE TO FIND THE EXTREME - #3 COORDINATE VALUES OF THE ORTHOLOGOUS MIROSATELLITES. - - my %microhash=(); - my %microsathash=(); - my %nonmicrosathash=(); - my $motif=(); #1 BASIC MOTIF OF THE MICROSATELLITE.. THERE'S ONLY 1 - #print "tags=@tags\n"; - for my $i (0 ... $#tags){ #1 FINDING THE MICROSAT, AND THE ALIGNMENT SEQUENCE - #2 CORRESPONDING TO THE PARTICULAR SPECIES (AS PER - #3 THE VARIABLE $TAG; - my $tag = $tags[$i]; - # print $seq; - my $locus="NULL"; #1 THIS WILL STORE THE MICROSAT OF THIS SPECIES. - #2 IF THERE IS NO MICROSAT, IT WILL REMAIN "NULL" - - foreach my $micro (@micros){ - # print "micro=$micro, tag=$tag\n"; - if ($micro =~ /^$tag/){ #1 MICROSAT OF THIS SPECIES FOUND.. 
- $locus = $micro;
- my @fields = split(/\t/,$micro);
- $motif = $fields[$motifcord];
- $microsathash{$tag}=$fields[$microsatcord];
- # print "fields=@fields, and startcord=$startcord = $fields[$startcord]\n";
- push(@starts, $fields[$startcord]);
- push(@ends, $fields[$endcord]);
- $start_hash{$tag}=$fields[$startcord];
- $end_hash{$tag}=$fields[$endcord];
- last;
- }
- else{$microsathash{$tag}="NULL"}
- }
- $microhash{$tag}=$locus;
-
- }
-
-
-
- my $extreme_start = smallest_number(@starts); #1 THESE TWO ARE THE EXTREME COORDINATES OF THE
- my $extreme_end = largest_number(@ends); #2 MICROSAT CLUSTER ACROSS ALL THE SPECIES IN
- #3 WHICH IT IS FOUND TO BE ORTHOLOGOUS.
-
- #print "starts=@starts... ends=@ends\n";
-
- my %up_flanks = (); #1 CONTAINS UPSTREAM FLANKING REGIONS FOR EACH SPECIES
- my %down_flanks = (); #1 CONTAINS DOWNSTREAM FLANKING REGIONS FOR EACH SPECIES
-
- my %up_largeflanks = ();
- my %down_largeflanks = ();
-
- my %locusandflanks = ();
- my %locusandlargeflanks = ();
-
- my %up_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_start AND THE
- #2 ACTUAL START OF MICROSATELLITE IN THE SPECIES
- my %down_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_end AND THE
- #2 ACTUAL END OF MICROSATELLITE IN THE SPECIES
-
- my %alignment=(); #1 CONTAINS ACTUAL ALIGNMENT SEQUENCE BETWEEN THE TWO
- #2 EXTREME VALUES.
-
- my %microsatstarts=(); #1 WITHIN EACH ALIGNMENT, IF THERE EXISTS A MICROSATELLITE
- #2 THIS HASH CONTAINS THE START SITE OF THE MICROSATELLITE
- #3 WITHIN THE ALIGNMENT
- next if !defined $extreme_start;
- next if !defined $extreme_end;
- next if $extreme_start > length($sequences{$tags[0]});
- next if $extreme_start < 0;
- next if $extreme_end > length($sequences{$tags[0]});
-
- for my $i (0 ... $#tags){ #1 NOW THAT WE HAVE GATHERED INFORMATION REGARDING
- #2 SEQUENCE ALIGNMENT AND MICROSATELLITE COORDINATES
- #3 AS WELL AS THE EXTREME COORDINATES OF THE
- #4 MICROSAT CLUSTER, WE WILL PROCEED TO EXTRACT THE
- #5 FLANKING SEQUENCE OF ALL ORGS, AND STUDY IT IN
- #6 MORE DETAIL.
- my $tag = $tags[$i];
- # print "tag=$tag.. seqlength = ",length($sequences{$tag})," extreme_start=$extreme_start and extreme_end=$extreme_end\n";
- my $upstream_gaps = (substr($sequences{$tag}, 0, $extreme_start) =~ s/\-/-/g); #1 NOW MEASURING THE NUMBER OF GAPS IN THE UPSTREAM
- #2 AND DOWNSTREAM SEQUENCES OF THE MICROSATs IN THIS
- #3 CLUSTER. 
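- # Note on the counting idiom used above and below: a global substitution
- # that replaces "-" with "-" returns the number of replacements made, so
- # it counts gap characters in a window without changing the sequence.
- # Illustrative example (not part of the original script):
- # my $window = substr("AC--G-T", 0, 6);   # "AC--G-"
- # my $gaps   = ($window =~ s/\-/-/g);     # $gaps == 3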
- - my $downstream_gaps = (substr($sequences{$tag}, $extreme_end) =~ s/\-/-/g); - if (($extreme_start - $upstream_gaps )< $EDGE_DISTANCE || (length($sequences{$tag}) - $extreme_end - $downstream_gaps) < $EDGE_DISTANCE){ - $edge_microsat=1; - - last; - } - else{ - $up_flanks{$tag} = substr($sequences{$tag}, $extreme_start - $FLANK_SUPPORT, $FLANK_SUPPORT); - $down_flanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $FLANK_SUPPORT); - - $up_largeflanks{$tag} = substr($sequences{$tag}, $extreme_start - $COMPLEXITY_SUPPORT, $COMPLEXITY_SUPPORT); - $down_largeflanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $COMPLEXITY_SUPPORT); - - - $alignment{$tag} = substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1); - $locusandflanks{$tag} = $up_flanks{$tag}."[".$alignment{$tag}."]".$down_flanks{$tag}; - $locusandlargeflanks{$tag} = $up_largeflanks{$tag}."[".$alignment{$tag}."]".$down_largeflanks{$tag}; - - if ($microhash{$tag} ne "NULL"){ - $up_internal_flanks{$tag} = substr($sequences{$tag}, $extreme_start , $start_hash{$tag}-$extreme_start); - $down_internal_flanks{$tag} = substr($sequences{$tag}, $end_hash{$tag} , $extreme_end-$end_hash{$tag}); - $microsatstarts{$tag}=$start_hash{$tag}-$extreme_start; -# print "tag = $tag, internal flanks = $up_internal_flanks{$tag} and $down_internal_flanks{$tag} and start = $microsatstarts{$tag}\n" if $printer == 1; - } - else{ - $nonmicrosathash{$tag}=substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1); - - } - # print "up flank for species $tag = $up_flanks{$tag} \ndown flank for species $tag = $down_flanks{$tag} \n" if $printer == 1; - - } - - } - $nextcounter[11]++ if $edge_microsat==1; - next if $edge_microsat==1; - - - my $low_complexity = 0; #1 VALUE WILL BE 1 IF ANY OF THE FLANKING REGIONS - #2 IS FOUND TO BE OF LOW COMPLEXITY, BY USING THE - #3 FUNCTION sub test_complexity - - - for my $i (0 ... $#tags){ -# print "i = $tags[$i]\n" if $printer == 1; - if (test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW" || test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW"){ -# print "i = $i, low complexity regions: $up_largeflanks{$tags[$i]}: ",test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT), " and $down_largeflanks{$tags[$i]} = ",test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT),"\n" if $printer == 1; - $low_complexity =1; last; - } - } - - $nextcounter[12]++ if $low_complexity==1; - next if $low_complexity == 1; - - - my $sequence_dissimilarity = 0; #1 THIS VALYE WILL BE 1 IF THE SEQUENCE SIMILARITY - #2 BETWEEN ANY OF THE SPECIES AGAINST THE HUMAN - #3 FLANKING SEQUENCES IS BELOW A CERTAIN THRESHOLD - #4 AS DESCRIBED IN FUNCTION sub sequence_similarity - my %donepair = (); - for my $i (0 ... $#tags){ - # print "i = $tags[$i]\n" if $printer == 1; -# next if $i == 0; - # print colored ['magenta'],"THIS IS UP\n" if $printer == 1; - - for my $b (0 ... 
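- # The pairwise comparison below visits each unordered species pair once:
- # the key "i_b" always puts the smaller index first, and %donepair
- # memoizes pairs already scored, so for three tags only 0_1, 0_2 and 1_2
- # are computed. Sketch of the key scheme in isolation (illustrative
- # only, not part of the original script):
- # my %donepair;
- # for my $i (0 .. 2) {
- #     for my $b (0 .. 2) {
- #         next if $b == $i;
- #         my $pair = $i < $b ? $i."_".$b : $b."_".$i;
- #         next if exists $donepair{$pair};
- #         $donepair{$pair} = 1;    # compare the flanks of pair (i, b) here
- #     }
- # }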
$#tags){ - next if $b == $i; - my $pair = (); - $pair = $i."_".$b if $i < $b; - $pair = $b."_".$i if $b < $i; - next if exists $donepair{$pair}; - my ($up_similarity,$upnucdiffs, $upindeldiffs) = sequence_similarity($up_flanks{$tags[$i]}, $up_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info); - my ($down_similarity,$downnucdiffs, $downindeldiffs) = sequence_similarity($down_flanks{$tags[$i]}, $down_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info); - $donepair{$pair} = $up_similarity."_".$down_similarity; - -# print RUN "$up_similarity $upnucdiffs $upindeldiffs $down_similarity $downnucdiffs $downindeldiffs\n"; - - if ( $up_similarity < $SIMILARITY_THRESH || $down_similarity < $SIMILARITY_THRESH){ - $sequence_dissimilarity =1; - last; - } - } - } - $nextcounter[13]++ if $sequence_dissimilarity==1; - - next if $sequence_dissimilarity == 1; - my ($simplified_microsat, $Hchrom, $Hstart, $Hend, $locusmotif, $locusmotifsize) = summarize_microsat($cluster, $humseq); - # print "simplified_microsat=$simplified_microsat\n"; <STDIN>; - my ($tree_analysis, $alternative_trees, $conformation) = treeStudy($simplified_microsat); - - if (exists $treesToReject{$tree_analysis}){ - $nextcounter[14]++; - next; - } - -# my $adjuster=(); -# if ($no_of_species == 4){ -# my @sields = split(/\t/,$simplified_microsat); -# my $somend = pop(@sields); -# my $somestart = pop(@sields); -# my $somechr = pop(@sields); -# $adjuster = "NA\t" x 13 ; -# $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend; -# } -# if ($no_of_species == 3){ -# my @sields = split(/\t/,$simplified_microsat); -# my $somend = pop(@sields); -# my $somestart = pop(@sields); -# my $somechr = pop(@sields); -# $adjuster = "NA\t" x 26 ; -# $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend; -# } -# - $registeredTrees{$tree_analysis} = 1 if !exists $registeredTrees{$tree_analysis}; - $registeredTrees{$tree_analysis}++ if exists $registeredTrees{$tree_analysis}; - - if (exists $treesToIgnore{$tree_analysis}){ - my @appendarr = (); - - print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; - #print "SUMMARY ",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; -# print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","NOEVENT", "\t\t", $cluster,"\n"; - - foreach my $lnode (@$lagestnodes){ - my @pair = @$lnode; - my @nodemutarr = (); - for my $p (@pair){ - my @mutinfoarray1 = (); - for (1 ... 38){ - push (@mutinfoarray1, "NA") - } - print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; - } - - } - for (1 ... 
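- # For ignored trees the SUMMARY row is padded with 38 "NA" fields per
- # node (plus one final block of 38 and two "NULL" fields below), which
- # appears to mirror the width of the per-node mutation summary that
- # summarizeMutations emits for fully analyzed loci, keeping every output
- # row at the same column count.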
38){ - push (@appendarr, "NA") - } - print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n"; - # print "SUMMARY ",join ("\t", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>; - next; - } - - my ($mutations_array, $nodes, $branches_hash, $alivehash, $primaryalignment) = peel_onion($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts); - - if ($mutations_array eq "NULL"){ - my @appendarr = (); - - print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; - # print "SUMMARY ", $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; -# print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","EVENT", "\t\t", $cluster,"\n"; - - foreach my $lnode (@$lagestnodes){ - my @pair = @$lnode; - my @nodemutarr = (); - for my $p (@pair){ - my @mutinfoarray1 = (); - for (1 ... 38){ - push (@mutinfoarray1, "NA") - } - print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; - # print join ("\t", "SUMMARY", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; - } - - } - for (1 ... 38){ - push (@appendarr, "NA") - } - print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n"; - # print join ("\t","SUMMARY", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>; - next; - } - - -# print "sent: \n" if $printer == 1; -# print "nodes = @$nodes, branches array:\n" if $mutations_array ne "NULL" && $printer == 1; - - my ($newstatus, $newmutations_array, $newnodes, $newbranches_hash, $newalivehash, $finalalignment) = fillAlignmentGaps($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts); -# print "newmutations_array returned = \n",join("\n",@$newmutations_array),"\n" if $newmutations_array ne "NULL" && $printer == 1; - my @finalmutations_array= (); - @finalmutations_array = selectMutationArray($mutations_array, $newmutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array ne "NULL"; - @finalmutations_array = selectMutationArray($mutations_array, $mutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array eq "NULL"; - - my ($besttree, $treescore) = selectBetterTree($tree_analysis, $alternate{$tree_analysis}, \@finalmutations_array); - my $cleancase = "UNCLEAN"; - - $cleancase = checkCleanCase($besttree, $finalalignment) if $treescore > 0 && $finalalignment ne "NULL" && $finalalignment =~ /\!/; - $cleancase = checkCleanCase($besttree, $primaryalignment) if $treescore > 0 && $finalalignment eq "NULL" && $primaryalignment =~ /\!/ && $primaryalignment ne "NULL"; - $cleancase = "CLEAN" if $finalalignment eq "NULL" && $primaryalignment !~ /\!/ && $primaryalignment ne "NULL"; - $cleancase = "CLEAN" if $finalalignment ne "NULL" && $finalalignment !~ /\!/ ; - $besttree = "NULL" if $treescore <= 0; - print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; - # print "SUMMARY ", $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; - -# print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","EVENT", 
"\t\t", $cluster,"\n"; - - my @mutinfoarray =(); - - foreach my $lnode (@$lagestnodes){ - my @pair = @$lnode; - my $joint = "(".join(", ",@pair).")"; - my @nodemutarr = (); - - for my $p (@pair){ - foreach my $mut (@finalmutations_array){ - $mut =~ /node=([A-Z, \(\)]+)/; - push @nodemutarr, $mut if $p eq $1; - } -# print "from pair @pair, p=$p\n"; - @mutinfoarray = summarizeMutations(\@nodemutarr, $besttree); - print SUMMARY join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t"; - # print "SUMMARY ",join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t"; - } - - } - - @mutinfoarray = summarizeMutations(\@finalmutations_array, $besttree); - print SUMMARY join ("\t", @mutinfoarray ),"\t"; - print SUMMARY $cleancase, "\n"; - # print "SUMMARY ",join ("\t", @mutinfoarray,$cleancase ),"\n"; #<STDIN>; - # print "summarized\n"; <STDIN>; - - my %indelcatch = (); - my %substcatch = (); - my %typecatch = (); - my %nodescatch = (); - my $mutconcat = join("\t", @finalmutations_array)."\n"; - my %indelposcatch = (); - my %subsposcatch = (); - - foreach my $fmut ( @finalmutations_array){ -# next if $fmut !~ /indeltype=[a-zA-Z]+/; - #print RUN $fmut, "\n"; - $fmut =~ /node=([a-zA-Z, \(\)]+)/; - my $lnode = $1; - $nodescatch{$1}=1; - - if ($fmut =~ /type=substitution/){ - # print "fmut=$fmut\n"; - $fmut =~ /from=([a-zA-Z\-]+)\tto=([a-zA-Z\-]+)/; - my $from=$1; - # print "from=$from\n"; - my $to=$2; - # print "to=$to\n"; - push @{$substcatch{$lnode}} , ("from:".$from." to:".$to); - $fmut =~ /position=([0-9]+)/; - push @{$subsposcatch{$lnode}}, $1; - } - - if ($fmut =~ /insertion=[a-zA-Z\-]+/){ - $fmut =~ /insertion=([a-zA-Z\-]+)/; - push @{$indelcatch{$lnode}} , $1; - $fmut =~ /indeltype=([a-zA-Z]+)/; - push @{$typecatch{$lnode}}, $1; - $fmut =~ /position=([0-9]+)/; - push @{$indelposcatch{$lnode}}, $1; - } - if ($fmut =~ /deletion=[a-zA-Z\-]+/){ - $fmut =~ /deletion=([a-zA-Z\-]+)/; - push @{$indelcatch{$lnode}} , $1; - $fmut =~ /indeltype=([a-zA-Z]+)/; - push @{$typecatch{$lnode}}, $1; - $fmut =~ /position=([0-9]+)/; - push @{$indelposcatch{$lnode}}, $1; - } - } - - # print $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t" if $printer == 1; - # print join ("<\t>", @mutinfoarray),"\n" if $printer == 1; - # print "where mutinfoarray = @mutinfoarray\n" if $printer == 1; - # #print RUN "."; - - # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; - # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; - - # print colored ['red'],"finalmutations_array=\n" if $printer == 1; - foreach (@finalmutations_array) { -# print colored ['red'], "$_\n" if $_ =~ /type=substitution/ && $printer == 1 ; -# print colored ['yellow'], "$_\n" if $_ !~ /type=substitution/ && $printer == 1 ; - - }# if $line =~ /cal/;# && $line =~ /chr4/; - -# print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; -# print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; -# print "tree analysis = $tree_analysis\n" if $printer == 1; - - # my $mutations = "@$mutations_array"; - - - next; - for my $keys (@$nodes) {foreach my $key (@$keys){ - #print "key = $key, => $branches_hash->{$key}\n"; - } - # print "x" x 50, "\n"; - } - my ($birth_steps, $death_steps) = decipher_history($mutations_array,join("",@tags),$nodes,$branches_hash,$tree_analysis,$conformation, $alivehash, $simplified_microsat); - } - } - close BO; -# 
print "now studying where we lost microsatellites:"; -# print "x" x 60,"\n"; - for my $reason (0 ... $#nextcounter){ -# print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n"; - } -# print "x" x 60,"\n"; -# print "In total we read $microreadcounter microsatellites after reading through $contigsentered contigs\n"; -# print " we lost $keynotformedcounter contigs as they did not form the key, \n"; -# print "$contignotrightcounter contigs as they were not of the right species configuration\n"; -# print "$keynotfoundcounter contigs as they did not contain the microsats\n"; -# print "... In total we went through a file that had $dotcounter contigs...\n"; -# print join ("\n","remaining orth keys = ", (keys %orths),""); - - -# print "now printing counted trees: \n"; - if (scalar(keys %registeredTrees) > 0){ - foreach my $keyb ( sort (keys %registeredTrees) ) - { -# print "$keyb : $registeredTrees{$keyb}\n"; - } - } - - -} - -my @summarizarr = ("+C=+C +R.+C -HCOR,+C", -"+H=+H +R.+H -HCOR,+H", -"-C=-C -R.-C +HCOR,-C", -"-H=-H -R.-H +HCOR,-H", -"+HC=+HC", -"-HC=-HC", -"+O=+O -HCOR,+O", -"-O=-O +HCOR,-O", -"+HCO=+HCO", -"-HCO=-HCO", -"+R=+R +R.+C +R.+H", -"-R=-R -R.-C -R.-H"); - -foreach my $line (@summarizarr){ - next if $line !~ /[A-Za-z0-9]/; -# print $line; - chomp $line; - my @fields = split(/=/,$line); -# print "title = $fields[0]\n"; - my @parts=split(/ +/, $fields[1]); - my %partshash = (); - foreach my $part (@parts){$partshash{$part}=1;} - my $count=0; - foreach my $key ( sort keys %registeredTrees ){ - next if !exists $partshash{$key}; -# print "now adding $registeredTrees{$key} from $key\n"; - $count+=$registeredTrees{$key}; - } -# print "$fields[0] : $count\n"; -} -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -sub largest_number{ - my $counter = 0; - my($max) = shift(@_); - foreach my $temp (@_) { - - #print "finding largest array: $maxcounter \n"; - if($temp > $max){ - $max = $temp; - } - } - return($max); -} - -sub smallest_number{ - my $counter = 0; - my($min) = shift(@_); - foreach my $temp (@_) { - #print "finding largest array: $maxcounter \n"; - if($temp < $min){ - $min = $temp; - } - } - return($min); -} -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -sub baseml_parser{ - my $outputfile = $_[0]; - open(BOUT,"<$outputfile") or die "Cannot open output of upstream baseml $outputfile: $!"; - my @info = (); - my @branchields = (); - my @distanceields = (); - my @bout = <BOUT>; - #print colored ['red'], @bout ,"\n"; - for my $b (0 ... 
$#bout){ - my $bine=$bout[$b]; - #print colored ['yellow'], "sentence = ",$bine; - if ($bine =~ /TREE/){ - $bine=$bout[$b++]; - $bine=$bout[$b++]; - $bine=$bout[$b++]; - #print "FOUND",$bine; - chomp $bine; - $bine =~ s/^\s+//g; - @branchields = split(/\s+/,$bine); - $bine=$bout[$b++]; - chomp $bine; - $bine =~ s/^\s+//g; - @distanceields = split(/\s+/,$bine); - #print "LASTING..............\n"; - last; - } - else{ - } - } - - close BOUT; -# print "branchfields = @branchields and distanceields = @distanceields\n" if $printer == 1; - my %distance_hash=(); - for my $d (0 ... $#branchields){ - $distance_hash{$branchields[$d]} = $distanceields[$d]; - } - - $info[0] = $distance_hash{"9..1"} + $distance_hash{"9..2"}; - $info[1] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+ $distance_hash{"8..3"}; - $info[2] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"7..4"}; - $info[3] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"6..7"}+$distance_hash{"6..5"}; - -# print "\nsending back: @info\n" if $printer == 1; - - return join("\t",@info); - -} - - -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -sub test_complexity{ - my $printer = 0; - my $sequence = $_[0]; - my $COMPLEXITY_SUPPORT = $_[1]; - my $complexity=int($COMPLEXITY_SUPPORT * (1/40)); #1 THIS IS AN ARBITRARY THRESHOLD SET FOR LOW COMPLEXITY. - #2 THE INSPIRATION WAS WEB MILLER'S MAIL SENT ON - #3 19 Apr 2008 WHERE HE CLASSED AS HIGH COMPLEXITY - #4 REGION, IF 40 BP OF SEQUENCE HAS AT LEAST 3 OF - #5 EACH NUCLEOTIDE. HENCE, I NORMALIZE THIS PARAMETER - #6 FOR THE ACTUAL LENGTH OF $FLANK_SUPPORT SET BY - #7 THE USER. - #8 WEB MILLER SENT THE MAIL TO YDK104@PSU.EDU - - - - my $As = ($sequence=~ s/A/A/gi); - my $Ts = ($sequence=~ s/T/T/gi); - my $Gs = ($sequence=~ s/G/G/gi); - my $Cs = ($sequence=~ s/C/C/gi); - #print "seq = $sequence, As=$As, Ts=$Ts, Gs=$Gs, Cs=$Cs\n" if $printer == 1; - - my $ans = (); - - return "HIGH" if $As >= $complexity && $Ts >= $complexity && $Cs >= $complexity && $Gs >= $complexity; - - my @nts = ("A","T","G","C","-"); - - my $lowcomplex = 0; - - foreach my $nt (@nts){ - $lowcomplex =1 if $sequence =~ /(($nt\-*){10,})/i; -# print "caught with a mono of $nt : $1 in $sequence\n" if $sequence =~ /(($nt\-*){10,})/i; - $lowcomplex =1 if $sequence =~ /(($nt[A-Za-z]){10,})/i; - $lowcomplex =1 if $sequence =~ /(([A-Za-z]$nt){10,})/i; -# print "caught with a di with $nt : $2 in $sequence\n" if $sequence =~ /(($nt[A-Za-z]){10,})/i || $sequence =~ /(([A-Za-z]$nt){10,})/i; - my $nont = ($sequence=~ s/$nt/$nt/gi); - - } -# print "leaving for now.. 
$sequence\n" if $printer == 1 && $lowcomplex == 0; - #<STDIN>; - return "HIGH" if $lowcomplex == 0; - return "LOW" ; -} -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -sub sequence_similarity{ - my $printer = 0; - my @seq1 = split(/\s*/, $_[0]); - my @seq2 = split(/\s*/, $_[1]); - my $similarity_thresh = $_[2]; - my $info = $_[3]; -# print "input = @_\n" if $printer == 1; - my $seq1str = $_[0]; - my $seq2str = $_[1]; - $seq1str=~s/\-//g; $seq2str=~s/\-//g; - my $similarity=0; - - my $nucdiffs=0; - my $nucsims=0; - my $indeldiffs=0; - - for my $i (0...$#seq1){ - $similarity++ if $seq1[$i] =~ /$seq2[$i]/i ; #|| $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i ; - $nucsims++ if $seq1[$i] =~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i); - $nucdiffs++ if $seq1[$i] !~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i); - $indeldiffs++ if $seq1[$i] !~ /$seq2[$i]/i && $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i; - } - my $sim = $similarity/length($_[0]); - return ( $sim, $nucdiffs, $indeldiffs ); #<= $similarity_thresh; -} -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- - -sub load_treesToReject{ - my @rejectlist = (); - my $alltags = join("",@_); - @rejectlist = qw (-HCOR +HCOR) if $alltags eq "HCORM"; - @rejectlist = qw ( -HCO|+R +HCO|-R) if $alltags eq "HCOR"; - @rejectlist = qw ( -HC|+O +HC|-O) if $alltags eq "HCO"; - - %treesToReject=(); - $treesToReject{$_} = $_ foreach (@rejectlist); - #print "loaded to reject for $alltags; ", $treesToReject{$_},"\n" foreach (@rejectlist); #<STDIN>; -} -#-------------------------------------------------------------------------------------------------------- -sub load_treesToIgnore{ - my @rejectlist = (); - my $alltags = join("",@_); - @rejectlist = qw (-HCOR +HCOR +HCORM -HCORM) if $alltags eq "HCORM"; - @rejectlist = qw ( -HCO|+R +HCO|-R +HCOR -HCOR) if $alltags eq "HCOR"; - @rejectlist = qw ( -HC|+O +HC|-O +HCO -HCO) if $alltags eq "HCO"; - - %treesToIgnore=(); - $treesToIgnore{$_} = $_ foreach (@rejectlist); - #print "loaded ", $treesToIgnore{$_},"\n" foreach (@rejectlist); -} -#-------------------------------------------------------------------------------------------------------- -sub load_thresholds{ - my @threshold_array=split(/[,_]/,$_[0]); - unshift @threshold_array, "0"; - for my $size (1 ... 4){ - $thresholdhash{$size}=$threshold_array[$size]; - } -} -#-------------------------------------------------------------------------------------------------------- -sub load_allPossibleTrees{ - #1 THIS FILE STORES ALL POSSIBLE SCENARIOS OF MICROSATELLITE - #2 BIRTH AND DEATH EVENTS ON A 5-PRIMATE TREE OF H,C,O,R,M - #3 IN FORM OF A TEXT FILE. THIS WILL BE USED AS A TEMPLET - #4 TO COMPARE EACH MICROSATELLITE CLUSTER TO UNDERSTAND THE - #5 EVOLUTION OF EACH LOCUS. 
WE WILL THEN DISCARD SOME - #6 MICROSATS ACCRODING TO THEIR EVOLUTIONARY BEHAVIOUR ON - #7 THE TREE. MOST PROBABLY WE WILL REMOVE THOSE MICROSATS - #8 THAT ARE NOT SUFFICIENTLY INFORMATIVE, LIKE IN CASE OF - #9 AN OUTGROUP MICROSATELLITE BEING DIFFERENT FRON ALL OTHER - #10 SPECIES IN THE TREE. - my $tree_list = $_[0]; -# print "file to be loaded: $tree_list\n"; - - my @trarr = (); - @trarr = ("#H C O CONCLUSION ALTERNATE", -"+ + + +HCO NA", -"+ _ _ +H NA", -"_ + _ +C NA", -"_ _ + -HC|+O NA", -"+ _ + -C +H", -"_ + + -H +C", -"+ + _ +HC|-O NA", -"_ _ _ -HCO NA") if $tree_list =~ /_HCO\.txt/; - @trarr = ("#H C O R CONCLUSION ALTERNATE", -"_ _ _ _ -HCOR NA", -"+ + + + +HCOR NA", -"+ + + _ +HCO|-R +H.+C.+O", -"+ + _ _ +HC +H.+C;-O", -"+ _ _ _ +H +HC,-C", -"_ + _ _ +C +HC,-H", -"_ _ + _ +O -HC|-H.-C", -"_ _ + + -HC -H.-C", -"+ _ _ + +H|-C.-O +HC,-C", -"_ + _ + +C -H.-O", -"_ + + _ -H +C.+O", -"_ _ _ + -HCO|+R NA", -"+ _ + _ +H.+O|-C NA", -"_ + + + -H -HC,+C", -"+ _ + + -C -HC,+H", -"+ + _ + -O +HC") if $tree_list =~ /_HCOR\.txt/; - - @trarr = ("#H C O R M CONCLUSION ALTERNATE", -"+ + + + _ +HCOR NA", -"+ + + _ + -R +HCO;+HC.+O;+H.+C.+O", -"+ + _ + + -O -HCO,+HC|-HCO,+HC;-HCO,(+H.+C)", -"+ _ + + + -C -HC,+H;+HCO,(+H.+O)", -"_ + + + + -H -HC,+C;-HCO,(+C.+O)", -"_ _ _ _ + -HCOR NA", -"_ _ _ + _ +R -HC.-O;-H.-C.-O", -"_ _ + _ _ +O +HCO,-HC;+HCO,(-H.-C)", -"_ + _ _ _ +C +HC,-H;+HCO,(-H.-O)", -"+ _ _ _ _ +H +HC,-C;+HCO,(-C.-O)", -"+ + + _ _ +HCO +H.+C.+O", -"+ + _ + _ -O +R.+HC|-HCO,+HC;+H.+C.+R|-HCO,(+H.+C)", -"+ _ + + _ -C -HC,+H;+H.+O.+R|-HCO,(+H.+O)", -"_ + + + _ -H -HC,+C;+C.+O.+R|-HCO,(+C.+O)", -"_ _ _ + + -HCO -HC.-O;-H.-C.-O", -"_ _ + _ + +O +HCO,-HC;+HCO,(-H.-C)", -"_ + _ _ + +C +HC,-H;+HCO,(-H.-O)", -"+ _ _ _ + +H -HC,+H;+HCO,(-C.-O)", -"+ + _ _ + +HC -R.-O|+HCO,-O|+H.+C;-HCO,+HC;-HCO,(+H.+C)", -"+ _ + _ + -R.-C|+HCO,-C|+H.+O NA", -"_ + + _ + -R.-H|+HCO,-H|+C.+O NA", -"_ _ + + _ -HC +R.+O|-HCO,+O|+HCO,-HC", -"_ + _ + _ +R.+C|-HCO,+C|-HC,+C +HCO,(-H.-O)", -"+ _ _ + _ +R.+H|-C.-O +HCO,(-C.-O)", -"+ _ _ + + -O.-C|-HCO,+H +R.+H;-HCO,(+R.+H)", -"_ + _ + + -O.-H|-HCO,+C +R.+C;-HCO,(+R.+C)", -"_ + + _ _ +HCO,-H|+O.+C NA", -"+ _ + _ _ +HCO,-C|+O.+H NA", -"_ _ + + + -HC -H.-C|-HCO,+O", -"+ + _ _ _ +HC +H.+C|+HCO,-O|-HCO,+HC;-HCO,(+H.+C)", -"+ + + + + +HCORM NA") if $tree_list =~ /_HCORM\.txt/; - - - my $template_p = $_[1]; - my $alternate_p = $_[2]; - #1 THIS IS THE HASH IN WHICH INFORMATION FROM THE ABOVE FILE - #2 GETS STORED, USING THE WHILE LOOP BELOW. HERE, THE KEY - #3 OF EACH ROW IS THE EVOLUTIONARY CONFIGURATION OF A LOCUS - #4 ON THE PRIMATE TREE, BASED ON PRESENCE/ABSENCE OF A MICROSAT - #5 AT THAT LOCUS, LIKE SAY "+ + + _ _" .. EACH COLUMN BELONGS - #6 TO ONE SPECIES; HERE THE COLUMN NAMES ARE "H C O R M". - #7 THE VALUE FOR EACH ENTRY IS THE MEANING OF THE ABOVE - #8 CONFIGURATION (I.E., CONFIGURAION OF THE KEY. HERE, THE - #9 VALUE WILL BE +HCO, SIGNIFYING A BIRTH IN HUMAN-CHIMP-ORANG - #10 COMMON ANCESTOR. THIS HASH HAS BEEN LOADED HERE TO BE USED - #11 LATER BY THE SUBROUTINE sub treeStudy{} THAT STUDIES - #12 EVOLUTIONARY CONFIGURAION OF EACH MICROSAT LOCUS, AS - #13 MENTIONED ABOVE. 
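- # Worked example of the loading below, using the HCO table above and
- # assuming the presence/absence columns are space-separated while a tab
- # separates them from the CONCLUSION and ALTERNATE columns: the row
- # "+ _ _<tab>+H<tab>NA" loads as
- #   $template_p->{"+ _ _"}[0] = "+H";   # conclusion: birth on the human branch
- #   $template_p->{"+ _ _"}[1] = 0;      # usage counter, incremented by treeStudy
- #   $alternate_p->{"+H"} = "NA";        # no alternate scenario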
- my @keys_array=(); - foreach my $line (@trarr){ - next if $line =~ /^#/; - chomp $line; - my @fields = split("\t", $line); - push @keys_array, $fields[0]; -# print "loading: $fields[0]\n"; - $template_p->{$fields[0]}[0] = $fields[1]; - $template_p->{$fields[0]}[1] = 0; - $alternate_p->{$fields[1]} = $fields[2]; - - } -# print "loaded the trees with keys: @keys_array\n"; - return $template_p, \@keys_array, $alternate_p; -} - -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -sub checkCleanCase{ - my $printer = 0; - my $tree = $_[0]; - my $finalalignment = $_[1]; - - #print "IN checkCleanCase: @_\n"; - #<STDIN>; - my @indivspecies = $tree =~ /[A-Z]/g; - $finalalignment =~ s/\./_/g; - my @captured = $finalalignment =~ /[A-Za-z, \(\):]+\![:A-Za-z, \(\)]/g; - - my $unclean = 0; - - foreach my $sp (@indivspecies){ - foreach my $cap (@captured){ - $cap =~ s/:[A-Za-z\-]+//g; - my @sps = $cap =~ /[A-Z]+/g; - my $spsc = join("", @sps); -# print "checking whether imp species $sp is present in $cap i.e, in $spsc\n " if $printer == 1; - if ($spsc =~ /$sp/){ -# print "foind : $sp\n"; - $unclean = 1; last; - } - } - last if $unclean == 1; - } - #<STDIN>; - return "CLEAN" if $unclean == 0; - return "UNCLEAN"; -} - -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- - - -sub adjustCoordinates{ - my $line = $_[0]; - my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g; - $infocord = 2 + (4*$no_of_species) - 1; - $typecord = 2 + (4*$no_of_species) + 1 - 1; - $motifcord = 2 + (4*$no_of_species) + 2 - 1; - $gapcord = $motifcord+1; - $startcord = $gapcord+1; - $strandcord = $startcord+1; - $endcord = $strandcord + 1; - $microsatcord = $endcord + 1; - $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; - $interr_poscord = $microsatcord + 3; - $no_of_interruptionscord = $microsatcord + 4; - $interrcord = $microsatcord + 2; - #print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $printer == 1; - return $no_of_species; -} - - -sub printhash{ - my $alivehash = $_[0]; - my @tags = @$_[1]; -# print "print hash\n"; - foreach my $tag (@tags){ -# print "$tag=",$alivehash->{$tag},"\n" if exists $alivehash->{$tag}; - } - - return "\n" -} -sub peel_onion{ - my $printer = 0; -# print "received: @_\n" ; #<STDIN>; - $printer = 0; - my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_; -# print "in peel onion.. tree = $tree \n" if $printer == 1; - my %sequence_hash=(); - - -# for my $i (0 ... 
$#sequences){ $sequence_hash{$species[$i]}=$sequences->[$i]; } - - - my %node_sequences=(); - - my %node_alignments = (); #NEW, Nov 28 2008 - my @tags=(); - my @locus_sequences=(); - my %alivehash=(); - foreach my $tag (@$tagarray) { - #print "adding: $tag\n"; - push(@tags, $tag); - $node_sequences{$tag}=join ".",split(/\s*/,$microsathash->{$tag}) if $microsathash->{$tag} ne "NULL"; - $alivehash{$tag}= $tag if $microsathash->{$tag} ne "NULL"; - $node_sequences{$tag}=join ".",split(/\s*/,$nonmicrosathash->{$tag}) if $microsathash->{$tag} eq "NULL"; - $node_alignments{$tag}=join ".",split(/\s*/,$alignment->{$tag}) ; - push @locus_sequences, $node_sequences{$tag}; - #print "adding to node_seq: $tag = ",$node_alignments{$tag},"\n"; - } - - - - my ($nodes_arr, $branches_hash) = get_nodes($tree); - my @nodes=@$nodes_arr; -# print "recieved nodes = " if $printer == 1; -# foreach my $key (@nodes) {print "@$key " if $printer == 1;} - -# print "\n" if $printer == 1; - - #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS - foreach my $keys (@nodes){ - my @pair = @$keys; - my $joint = "(".join(", ",@pair).")"; - my $copykey = join "", @pair; - $copykey =~ s/[\W ]+//g; -# print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1; - my $livestatus = 1; - foreach my $copy (split(/\s*/,$copykey)){ - $livestatus = 0 if !exists $alivehash{$copy}; - } - $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1; -# print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1; - } - - @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT. - - my @mutations_array=(); - - my $joint = (); - foreach my $node (@nodes){ - my @pair = @$node; -# print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1; - $joint = "(".join(", ",@pair).")"; - my @pair_sequences=(); - - foreach my $tag (@pair){ -# print "$tag: $node_alignments{$tag}\n" if $printer == 1; - print $node_alignments{$tag},"\n" if $printer == 1; - push @pair_sequences, $node_alignments{$tag}; - } -# print "ppeel onion joint = $joint , pair_sequences=>@pair_sequences< , pair=>@pair<\n" if $printer == 1; - - my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint); - $node_alignments{$joint}=$compared; - push( @mutations_array,split(/:/,$substitutions_list)); -# print "newly added to node_sequences: $node_alignments{$joint} and list of mutations =\n", join("\n",@mutations_array),"\n" if $printer == 1; - } - -# print "now sending for analyze_mutations: mutation_array=@mutations_array, nodes=@nodes, branches_hash=$branches_hash, alignment=$alignment, tags=@tags, alivehash=\%alivehash, node_sequences=\%node_sequences, microsatstarts=$microsatstarts, motif=$motif\n" if $printer == 1; - ## <STDIN> if $printer == 1; - - my $analayzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif); - -# print "returning: ", $analayzed_mutations, \@nodes, $branches_hash,"\n" if scalar @mutations_array > 0 && $printer == 1; -# print "returning: NULL, NULL, NULL " if scalar @mutations_array == 0 && $printer == 1; -# print "final node alignment = $node_alignments{$joint}\n" if $printer == 1; - - # <STDIN> if $printer == 1; - - - return ($analayzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint}) if scalar @mutations_array > 0; - return 
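- # The liveness bookkeeping above propagates bottom-up: an internal node
- # such as "(H, C)" is marked alive only if every leaf tag inside it had
- # a non-NULL microsat. Illustrative trace for tags H, C, O when only the
- # chimp locus is absent:
- #   %alivehash = (H => "H", O => "O");   # C missing
- #   "(H, C)"      -> not alive, since C is dead
- #   "((H, C), O)" -> not alive, for the same reason
- # Either return below hands back five values: the mutation analysis (or
- # "NULL"), the node list, the branch hash, the liveness hash, and the
- # reconstructed root alignment (or "NULL").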
("NULL",\@nodes,$branches_hash, \%alivehash, "NULL") if scalar @mutations_array == 0; -} - -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# - -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# - -sub get_nodes{ - my $printer = 0; - - my $tree=$_[0]; - #$tree =~ s/ +//g; - $tree =~ s/\t+//g; - $tree=~s/;//g; - print "tree=$tree\n" if $printer == 1; - my @nodes = (); - my @onions=($tree); - my %branches=(); - foreach my $bite (@onions){ - $bite=~ s/^\(|\)$//g; - chomp $bite; -# print "tree = $bite \n"; -# <STDIN>; - $bite=~ /([ ,\(\)A-Z]+)\,\s*([ ,\(\)A-Z]+)/; - #$tree =~ /(\(\(\(H, C\), O\), R\))\, (M)/; - my @raw_nodes = ($1, $2); - print "raw nodes = $1 and $2\n" if $printer == 1; - push(@nodes, [@raw_nodes]); - foreach my $node (@raw_nodes) {push (@onions, $node) if $node =~ /,/;} - foreach my $node (@raw_nodes) {$branches{$node}="(".$bite.")"; print "adding to branches: $node = ($bite)\n" if $printer == 1;} - print "onions = @onions\n" if $printer == 1;<STDIN> if $printer == 1; - } - $printer = 0; - return \@nodes, \%branches; -} - - -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -sub analyze_mutations{ - my ($mutations_array, $nodes, $branches_hash, $alignment, $tags, $alivehash, $node_sequences, $microsatstarts, $motif) = @_; - my $locuslength = length($alignment->{$tags->[0]}); - my $printer = 0; - - -# print " IN analyzed_mutations....\n" if $printer == 1; # \n mutations array = @$mutations_array, \nAND locuslength = $locuslength\n" if $printer == 1; - my %mutation_hash=(); - my %froms_megahash=(); - my %tos_megahash=(); - my %position_hash=(); - my @solutions_array=(); - foreach my $mutation (@$mutations_array){ -# print "loadin mutation: $mutation\n" if $printer == 1; - my %localhash= $mutation =~ /([\S ]+)=([\S ]+)/g; - $mutation_hash{$localhash{"position"}} = {%localhash}; - push @{$position_hash{$localhash{"position"}}},$localhash{"node"}; -# print "feeding position hash with $localhash{position}: $position_hash{$localhash{position}}[0]\n" if $printer == 1; - $froms_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"from"}; - $tos_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"to"}; -# print "just a trial: $mutation_hash{$localhash{position}}{position}\n" if $printer == 1; -# print "loadin in tos_megahash: $localhash{position} {$localhash{node} = $localhash{to}\n" if $printer == 1; -# print "loadin in from: $localhash{position} {$localhash{node} = $localhash{from}\n" if $printer == 1; - } - -# print "now going through each position in loculength:\n" if $printer == 1; - ## <STDIN> if $printer == 1; - - for my $pos (0 ... 
$locuslength-1){ -# print "at position: $pos\n" if $printer == 1; - - if (exists($mutation_hash{$pos})){ - my @local_nodes=@{$position_hash{$pos}}; -# print "found mutation: @{$position_hash{$pos}} : @local_nodes\n" if $printer == 1; - - foreach my $local_node (@local_nodes){ -# print "at local node: $local_node ... from state = $froms_megahash{$pos}{$local_node}\n" if $printer == 1; - my $open_insertion=(); - my $open_deletion=(); - my $open_to_substitution=(); - my $open_from_substitution=(); - if ($froms_megahash{$pos}{$local_node} eq "-"){ - # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; - # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}} && $printer == 1; - #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; - $open_insertion=$tos_megahash{$pos}{$local_node}; - for my $posnext ($pos+1 ... $locuslength-1){ -# print "in first if .... studying posnext: $posnext\n" if $printer == 1; - last if !exists ($froms_megahash{$posnext}{$local_node}); -# print "for posnext: $posnext, there exists $froms_megahash{$posnext}{$local_node}.. already, open_insertion = $open_insertion.. checking is $froms_megahash{$posnext}{$local_node} matters\n" if $printer == 1; - $open_insertion = $open_insertion.$tos_megahash{$posnext}{$local_node} if $froms_megahash{$posnext}{$local_node} eq "-"; -# print "now open_insertion=$open_insertion\n" if $printer == 1; - delete $mutation_hash{$posnext} if $froms_megahash{$posnext}{$local_node} eq "-"; - } - print "1 Feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion="),"\n" if $printer == 1; - push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion=")); - } - elsif ($tos_megahash{$pos}{$local_node} eq "-"){ - # print "here exists a microsatellite to $local_node from $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; - # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; - #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; - $open_deletion=$froms_megahash{$pos}{$local_node}; - for my $posnext ($pos+1 ... 
$locuslength-1){ - print "in 1st elsif studying posnext: $posnext\n" if $printer == 1; - print "nexting as nextpos does not exist\n" if !exists ($tos_megahash{$posnext}{$local_node}) && $printer == 1; - last if !exists ($tos_megahash{$posnext}{$local_node}); - print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1; - $open_deletion = $open_deletion.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} eq "-"; - delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} eq "-"; - } - print "2 Feeding in:", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion"), "\n" if $printer == 1; - push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion")); - } - elsif ($tos_megahash{$pos}{$local_node} ne "-"){ - # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; - # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; - #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; - # print "microsatstart = $microsatstarts->{$local_node} \n" if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node} && $printer == 1; - next if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node}; - $open_to_substitution=$tos_megahash{$pos}{$local_node}; - $open_from_substitution=$froms_megahash{$pos}{$local_node}; - print "open from substitution: $open_from_substitution \n" if $printer == 1; - for my $posnext ($pos+1 ... $locuslength-1){ - #print "in last elsif studying posnext: $posnext\n"; - last if !exists ($tos_megahash{$posnext}{$local_node}); - print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1; - $open_to_substitution = $open_to_substitution.$tos_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-"; - $open_from_substitution = $open_from_substitution.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-"; - delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} ne "-" && $froms_megahash{$posnext}{$local_node} ; - } - print "open from substitution: $open_from_substitution \n" if $printer == 1; - - #IS THE STRETCH OF SUBSTITUTION MICROSATELLITE-LIKE? - my @motif_parts=split(/\s*/,$motif); - #GENERATING THE FLEXIBLE LEFT END - my $left_query=(); - for my $k (1 ... $#motif_parts) { - $left_query= $motif_parts[$k]."|)"; - $left_query="(".$left_query; - } - $left_query=$left_query."?"; - - print "left_quewry = $left_query\n" if $printer == 1; - #GENERATING THE FLEXIBLE RIGHT END - my $right_query=(); - for my $k (0 ... 
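- # The "flexible end" queries are intended to tolerate a partial motif
- # copy at either edge of a substituted stretch: for motif AGC the
- # intended pattern is roughly ^(GC|C|)?(AGC)+(A|AG|)?$, so GCAGCAGCA
- # still counts as motif-like. As written, each loop iteration
- # overwrites $left_query/$right_query instead of appending, so only the
- # last alternative survives; the $motifcomb substring test further down
- # is what effectively decides motif-likeness.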
($#motif_parts-1)) { - $right_query= "(|".$motif_parts[$k]; - $right_query=$right_query.")"; - } - $right_query=$right_query."?"; - print "right_query = $right_query\n" if $printer == 1; - print "Hence, searching for: ^$left_query($motif)+$right_query\$\n" if $printer == 1; - - my $motifcomb=$motif x 50; - print "motifcomb = $motifcomb\n" if $printer == 1; - if ( ($motifcomb =~/$open_to_substitution/i) && (length ($open_to_substitution) >= length($motif)) ){ - print "sequence microsat-like\n" if $printer == 1; - my $all_microsat_like = 0; - print "3 feeding in: ", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution"), "\n" if $printer == 1; - push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution")); - print "4 feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion="), "\n" if $printer == 1; - push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion=")); - - } - else{ - print "5 feeding in: ", join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion="), "\n" if $printer == 1; - push (@solutions_array, join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion=")); - } - #IS THE FROM-SEQUENCE MICROSATELLITE-LIKE? - - } - #<STDIN> if $printer ==1; - } - #<STDIN> if $printer ==1; - } - } - - print "\n", "#" x 50, "\n" if $printer == 1; - foreach my $tag (@$tags){ - print "$tag: $alignment->{$tag}\n" if $printer == 1; - } - print "\n", "#" x 50, "\n" if $printer == 1; - - print "returning SOLUTIONS ARRAY : \n",join("\n", @solutions_array),"\n" if $printer == 1; - #print "end\n"; - #<STDIN> if - return \@solutions_array; -} -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# - -sub base_by_base_simple{ - my $printer = 0; - my ($motif, $locus, $no, $pair0, $pair1, $joint) = @_; - my @seq_array=(); - print "IN SUBROUTUNE base_by_base_simple.. 
information received = @_\n" if $printer == 1; - print "pair0 = $pair0 and pair1 = $pair1\n" if $printer == 1; - - my @example=split(/\./,$locus->[0]); - print "example, for length = @example\n" if $printer == 1; - for my $i (0...$no-1){push(@seq_array, [split(/\./,$locus->[$i])]); print "for $i, from $locus->[$i], seq_array = >@{$seq_array[$i]}<\n" if $printer == 1;} - - my @compared_sequence=(); - my @substitutions_list; - for my $i (0...scalar(@example)-1){ - - #print "i = $i\n" if $printer == 1; - #print "comparing $seq_array[0][$i] and $seq_array[1][$i] \n" ;#if $printer == 1; - if ($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] !~ /!/){ - - my $resolution= resolve_base($seq_array[0][$i],$seq_array[1][$i], $pair1 ,"keep" ); - # print "ancestral = $resolution\n" if $printer == 1; - - if ($resolution =~ /$seq_array[1][$i]/i && $resolution !~ /!/){ - push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution ); - } - elsif ( $resolution !~ /!/){ - push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution); - } - push @compared_sequence,$resolution; - } - elsif ($seq_array[0][$i] !~ /!/ && $seq_array[1][$i] =~ /!/){ - - my $resolution= resolve_base($seq_array[1][$i],$seq_array[0][$i], $pair0, "invert" ); - # print "ancestral = $resolution\n" if $printer == 1; - - if ($resolution =~ /$seq_array[0][$i]/i && $resolution !~ /!/){ - push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution); - } - elsif ( $resolution !~ /!/){ - push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution); - } - push @compared_sequence,$resolution; - } - elsif($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] =~ /!/){ - push @compared_sequence, add_bases($seq_array[0][$i],$seq_array[1][$i], $pair0, $pair1, $joint ); - } - else{ - if($seq_array[0][$i] !~ /^$seq_array[1][$i]$/i){ - push @compared_sequence, $pair0.":".$seq_array[0][$i]."!".$pair1.":".$seq_array[1][$i]; - } - else{ - # print "perfect match\n" if $printer == 1; - push @compared_sequence, $seq_array[0][$i]; - } - } - } - print "returning: comared = @compared_sequence \nand substitutions list =\n", join("\n",@substitutions_list),"\n" if $printer == 1; - return join(".",@compared_sequence), join(":", @substitutions_list) if scalar (@substitutions_list) > 0; - return join(".",@compared_sequence), "" if scalar (@substitutions_list) == 0; -} - - -sub resolve_base{ - my $printer = 0; - - print "IN SUBROUTUNE resolve_base.. information received = @_\n" if $printer == 1; - my ($optional, $single, $singlesp, $arg) = @_; - my @options=split(/!/,$optional); - foreach my $option(@options) { - $option=~s/[A-Z\(\) ,]+://g; - if ($option =~ /$single/i){ - print "option = $option , returning single: $single\n" if $printer == 1; - return $single; - } - } - print "returning ",$optional."!".$singlesp.":".$single. "\n" if $arg eq "keep" && $printer == 1; - print "returning ",$singlesp.":".$single."!".$optional. "\n" if $arg eq "invert" && $printer == 1; - return $optional."!".$singlesp.":".$single if $arg eq "keep"; - return $singlesp.":".$single."!".$optional if $arg eq "invert"; - -} - -sub same_length{ - my $printer = 0; - my @locus = @_; - my $temp = shift @locus; - $temp=~s/-|,//g; - foreach my $l (@locus){ - $l=~s/-|,//g; - return 0 if length($l) != length($temp); - $temp = $l; - } - return 1; -} -sub treeStudy{ - my $printer = 0; -# print "template DEFINED.. 
received: @_\n" if defined %template; -# print "only received = @_" if !defined %template; - my $stopper = 0; - if (!defined %template){ - $stopper = 1; - %template=(); - print "tree decipherer = $tree_decipherer\n" if $printer == 1; - my ( $template_ref, $keys_array)=load_allPossibleTrees($tree_decipherer, \%template); - print "return = $template_ref and @{$keys_array}\n" if $printer == 1; - foreach my $key (@$keys_array){ - print "addding : $template_ref->{$key} for $key\n" if $printer == 1; - $template{$key} = $template_ref->{$key}; - } - } - - for my $templet ( keys %template ) { - # print "$templet => @{$template{$templet}}\n"; - } - <STDIN> if !defined %template; - - my $strict = 0; - - my $H = 0; - my $Hchr = 1; - my $Hstart = 2; - my $Hend = 3; - my $Hmotif = 4; - my $Hmotiflen = 5; - my $Hmicro = 6; - my $Hstrand = 7; - my $Hmicrolen = 8; - my $Hinterpos = 9; - my $Hrelativepos = 10; - my $Hinter = 11; - my $Hinterlen = 12; - - my $C = 13; - my $Cchr = 14; - my $Cstart = 15; - my $Cend = 16; - my $Cmotif = 17; - my $Cmotiflen = 18; - my $Cmicro = 19; - my $Cstrand = 20; - my $Cmicrolen = 21; - my $Cinterpos = 22; - my $Crelativepos = 23; - my $Cinter = 24; - my $Cinterlen = 25; - - my $O = 26; - my $Ochr = 27; - my $Ostart = 28; - my $Oend = 29; - my $Omotif = 30; - my $Omotiflen = 31; - my $Omicro = 32; - my $Ostrand = 33; - my $Omicrolen = 34; - my $Ointerpos = 35; - my $Orelativepos = 36; - my $Ointer = 37; - my $Ointerlen = 38; - - my $R = 39; - my $Rchr = 40; - my $Rstart = 41; - my $Rend = 42; - my $Rmotif = 43; - my $Rmotiflen = 44; - my $Rmicro = 45; - my $Rstrand = 46; - my $Rmicrolen = 47; - my $Rinterpos = 48; - my $Rrelativepos = 49; - my $Rinter = 50; - my $Rinterlen = 51; - - my $Mchr = 52; - my $Mstart = 53; - my $Mend = 54; - my $M = 55; - my $Mmotif = 56; - my $Mmotiflen = 57; - my $Mmicro = 58; - my $Mstrand = 59; - my $Mmicrolen = 60; - my $Minterpos = 61; - my $Mrelativepos = 62; - my $Minter = 63; - my $Minterlen = 64; - - #-------------------------------------------------------------------------------# - my @analysis=(); - - - my %speciesOrder = (); - $speciesOrder{"H"} = 0; - $speciesOrder{"C"} = 1; - $speciesOrder{"O"} = 2; - $speciesOrder{"R"} = 3; - $speciesOrder{"M"} = 4; - #-------------------------------------------------------------------------------# - - my $line = $_[0]; - chomp $line; - - my @f = split(/\t/,$line); - print "received array : @f.. recieved tags = @tags\n" if $printer == 1; - - # collect all motifs - my @motifs=(); - @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif], $f[$Mmotif]) if $tags[$#tags] =~ /M/; - @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]) if $tags[$#tags] =~ /R/; - @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif]) if $tags[$#tags] =~ /O/; -# print "motifs in the array = $f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]\n" if $tags[$#tags] =~ /R/;; - print "motifs = @motifs\n" if $printer == 1; - my @translation = (); - foreach my $motif (@motifs){ - push(@translation, "_") if $motif eq "NA"; - push(@translation, "+") if $motif ne "NA"; - } - my $translate = join(" ", @translation); -# print "translate = >$translate< and analysis = $template{$translate}[0].. 
on the other hand, ",$template{"- - +"}[0],"\n"; - my @analyses = split(/\|/,$template{$translate}[0]); - - print "motifs = @motifs, analyses = @analyses\n" if $printer == 1; - - if (scalar(@analyses) == 1) { - #print "analysis = $analyses[0]\n"; - if ($analyses[0] !~ /,|\./ ){ - if ($analyses[0] =~ /\+/){ - my $analysis = $analyses[0]; - $analysis =~ s/\+|\-//g; - my @species = split(/\s*/,$analysis); - my @currentMotifs = (); - foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1;} - print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; - $template{$translate}[1]++ if $strict == 1 && consistency(@currentMotifs) ne "NULL"; - $template{$translate}[1]++ if $strict == 0; - print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; - } - else{ - my $analysis = $analyses[0]; - $analysis =~ s/\+|\-//g; - my @species = split(/\s*/,$analysis); - my @currentMotifs = (); - my @complementarySpecies = (); - my $allSpecies = join("",@tags); - foreach my $specie (@species){ $allSpecies =~ s/$specie//g; } - foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1;;} - print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; - print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; - } - } - - elsif ($analyses[0] =~ /,/) { - my @events = split(/,/,$analyses[0]); - print "events = @events \n " if $printer == 1; - if ($events[0] =~ /\+/){ - my $analysis1 = $events[0]; - $analysis1 =~ s/\+|\-//g; - my $analysis2 = $events[1]; - $analysis2 =~ s/\+|\-//g; - my @nSpecies = split(/\s*/,$analysis2); - print "original anslysis = $analysis1 " if $printer == 1; - foreach my $specie (@nSpecies){ $analysis1=~ s/$specie//g;} - print "processed anslysis = $analysis1 \n" if $printer == 1; - my @currentMotifs = (); - foreach my $specie (split(/\s*/,$analysis1)){push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } - print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; - print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; - } - else{ - my $analysis1 = $events[0]; - $analysis1 =~ s/\+|\-//g; - my $analysis2 = $events[1]; - $analysis2 =~ s/\+|\-//g; - my @pSpecies = split(/\s*/,$analysis2); - my @currentMotifs = (); - foreach my $specie (@pSpecies){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } - print "current motifs = @currentMotifs and consistency? 
", (consistency(@currentMotifs))," \n" if $printer == 1; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; - print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; - - } - - } - elsif ($analyses[0] =~ /\./) { - my @events = split(/\./,$analyses[0]); - foreach my $event (@events){ - print "event = $event \n" if $printer == 1; - if ($event =~ /\+/){ - my $analysis = $event; - $analysis =~ s/\+|\-//g; - my @species = split(/\s*/,$analysis); - my @currentMotifs = (); - foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } - #print consistency(@currentMotifs),"<- \n"; - print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; - print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; - } - else{ - my $analysis = $event; - $analysis =~ s/\+|\-//g; - my @species = split(/\s*/,$analysis); - my @currentMotifs = (); - my @complementarySpecies = (); - my $allSpecies = join("",@tags); - foreach my $specie (@species){ $allSpecies =~ s/$specie//g; } - foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } - #print consistency(@currentMotifs),"<- \n"; - print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; - print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; - } - } - - } - } - else{ - my $finalanalysis = (); - $template{$translate}[1]++; - foreach my $analysis (@analyses){ ;} - } - # test if motifs where microsats are present, as indeed of same the motif composition - - - - for my $templet ( keys %template ) { - # print "now returning: $templet\n"; - if (@{ $template{$templet} }[1] > 0){ - print "returning in the end: $templet and $translate\n" if $printer == 1; - $template{$templet}[1] = 0; - return (@{$template{$templet}}[0], $translate); - } - } - undef %template; - print "sending NULL\n" if $printer == 1; - return ("NULL", $translate); - -} - - -sub consistency{ - my @motifs = @_; - print "in consistency \n" if $printer == 1; - print "motifs sent = >",join("|",@motifs),"< \n" if $printer == 1; - return $motifs[0] if scalar(@motifs) == 1; - my $prevmotif = shift(@motifs); - my $stopper = 0; - for my $i (0 ... $#motifs){ - next if $motifs[$i] eq "NA"; - my $templet = $motifs[$i].$motifs[$i]; - if ($templet !~ /$prevmotif/i){ - $stopper = 1; last; - } - } - return $prevmotif if $stopper == 0; - return "NULL" if $stopper == 1; -} -sub summarize_microsat{ - my $printer = 0; - my $line = $_[0]; - my $humseq = $_[1]; - - my @gaps = $line =~ /[0-9]+\t[0-9]+\t[\+\-]/g; - my @starts = $line =~ /[0-9]+\t[\+\-]/g; - my @ends = $line =~ /[\+\-]\t[0-9]+/g; - print "starts = @starts\tends = @ends\n" if $printer == 1; - for my $i (0 ... $#gaps) {$gaps[$i] =~ s/\t[0-9]+\t[\+\-]//g;} - for my $i (0 ... $#starts) {$starts[$i] =~ s/\t[\+\-]//g;} - for my $i (0 ... 
$#ends) {$ends[$i] =~ s/[\+\-]\t//g;} - - my $minstart = array_smallest_number(@starts); - my $maxend = array_largest_number(@ends); - - my $humupstream_st = substr($humseq, 0, $minstart); - my $humupstream_en = substr($humseq, 0, $maxend); - my $no_of_gaps_to_start = 0; - my $no_of_gaps_to_end = 0; - $no_of_gaps_to_start = ($humupstream_st =~ s/\-/x/g) if $humupstream_st=~/\-/; - $no_of_gaps_to_end = ($humupstream_en =~ s/\-/x/g) if $humupstream_en=~/\-/; - - my $locusmotif = (); - - - print "IN SUB SUMMARIZE_MICROSAT $line\n" if $printer == 1; - #return "NULL" if $line =~ /compound/; - my $Hstart = "NA"; - my $Hend = "NA"; - chomp $line; - my $match_count = ($line =~ s/>/>/g); - #print "number of species = $match_count\n"; - my @micros = split(/>/,$line); - shift @micros; - my $stopper = 0; - - - foreach my $mic (@micros){ - my @local = split(/\t/,$mic); - if ($local[$microsatcord] =~ /N/) {$stopper =1; last;} - } - return "NULL" if $stopper ==1; - - #------------------------------------------------------ - - my @arranged = (); - for my $arr (0 ... $#exacttags) {$arranged[$arr] = '0';} - - foreach my $micro (@micros){ - for my $i (0 ... $#exacttags){ - if ($micro =~ /^$exacttags[$i]/){ - $arranged[$i] = $micro; - last; - } - } - } -# print "arranged = @arranged \n" ; <STDIN>;; - - my @endstatement = (); - my $turn = 0; - my $species_counter = 0; - # print scalar(@arranged),"\n"; - - my $species_no=0; - - my $orthHchr = 0; - - foreach my $micro (@arranged) { - $micro =~ s/\t\t/\t \t/g; - $micro =~ s/\t,/\t ,/g; - $micro =~ s/,\t/, \t/g; - print "------------------------------------------------------------------------------------------\n" if $printer == 1; - chomp $micro; - if ($micro eq '0'){ - push(@endstatement, join("\t",$exacttags[$species_counter],"NA","NA","NA","NA",0 ,"NA", "NA", 0,"NA","NA","NA", "NA" )); - $species_counter++; - print join("|","ENDSTATEMENT:",@endstatement),"\n" if $printer == 1; - next; - } - # print $micro,"\n"; - print "micro = $micro \n" if $printer == 1; - my @fields = split(/\t/,$micro); - my $microcopy = $fields[$microsatcord]; - $microcopy =~ s/\[|\]|-//g; - my $microsatlength = length($microcopy); - print "microsat = $fields[$microsatcord] and microsatlength = $microsatlength\n" if $printer == 1; -# print "sp_ident = @sp_ident.. 
species_no=$species_no\n"; - $micro =~ /$sp_ident[$species_no]\s(\S+)\s([0-9]+)\s([0-9]+)/; - - - my $sp_chr=$1; - my $sp_start=$2 + $fields[$startcord] - $fields[$gapcord]; - my $sp_end= $sp_start + $microsatlength - 1; - - $species_no++; - - $micro =~ /$focalspec\s(\S+)\s([0-9]+)\s([0-9]+)/; - $orthHchr=$1; - $Hstart=$2+$minstart-$no_of_gaps_to_start; - $Hend=$2+$maxend-$no_of_gaps_to_end; - - print "Hstart = $Hstart = $fields[4] + $fields[$startcord] - $fields[$gapcord]\n" if $printer == 1; - - my $motif = $fields[$motifcord]; - my $firstmotif = (); - my $strand = $fields[$strandcord]; - # print "strand = $strand\n"; - - - if ($motif =~ /^\[/){ - $motif =~ s/^\[//g; - $motif =~ /([a-zA-Z]+)\].*/; - $firstmotif = $1; - } - - else {$firstmotif = $motif;} - print "firstmotif =$firstmotif : \n" if $printer == 1; - $firstmotif = allCaps($firstmotif); - - if (exists $revHash{$firstmotif} && $turn == 0) { - $turn=1 if $species_counter==0; - $firstmotif = $revHash{$firstmotif}; - } - - elsif (exists $revHash{$firstmotif} && $turn == 1) {$firstmotif = $revHash{$firstmotif}; $turn = 1;} - print "changed firstmotif =$firstmotif\n" if $printer == 1; - # <STDIN>; - $locusmotif = $firstmotif; - - if (scalar(@fields) > $microsatcord + 2){ - print "fields = @fields ... interr_poscord=$interr_poscord=$fields[$interr_poscord] .. interrcord=$interrcord=$fields[$interrcord]\n" if $printer == 1; - - my @interposes = (); - @interposes = split(",",$fields[$interr_poscord]) if $fields[$interr_poscord] =~ /,/; - $interposes[0] = $fields[$interr_poscord] if $fields[$interr_poscord] !~ /,/ ; - print "interposes=@interposes\n" if $printer == 1; - my @relativeposes = (); - my @interruptions = (); - @interruptions = split(",",$fields[$interrcord]) if $fields[$interrcord] =~ /,/; - $interruptions[0] = $fields[$interrcord] if $fields[$interrcord] !~ /,/; - my @interlens = (); - - - for my $i (0 ... 
$#interposes){ - - my $interpos = $interposes[$i]; - my $nexter = 0; - my $interruption = $interruptions[$i]; - my $interlen = length($interruption); - push (@interlens, $interlen); - - - my $relativepos = (100 * $interpos) / $microsatlength; - print "relativepos = $relativepos ,interpos=$interpos, interruption=$interruption, interlen=$interlen \n" if $printer == 1; - $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50; - print "--> = $relativepos\n" if $printer == 1; - $interruption = "IND" if length($interruption) < 1; - - if ($turn == 1){ - $fields[$microsatcord] = switch_micro($fields[$microsatcord]); - $interruption = switch_nucl($interruption) unless $interruption eq "IND"; - $interpos = ($microsatlength - $interpos) - $interlen + 2; - print "turn interpos = $interpos for $fields[$microsatcord]\n" if $printer == 1; - $relativepos = (100 * $interpos) / $microsatlength; - $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50; - - - $strand = '+' if $strand eq '-'; - $strand = '-' if $strand eq '+'; - } - print "final relativepos = $relativepos\n" if $printer == 1; - push(@relativeposes, $relativepos); - } - push(@endstatement,join("\t",($exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,join(",",@interposes),join(",",@relativeposes),join(",",@interruptions), join(",",@interlens)))); - } - - else{ - push(@endstatement, join("\t",$exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,"NA","NA","NA", "NA")); - } - - $species_counter++; - } - - $locusmotif = $sameHash{$locusmotif} if exists $sameHash{$locusmotif}; - $locusmotif = $revHash{$locusmotif} if exists $revHash{$locusmotif}; - - my $endst = join("\t", @endstatement, $orthHchr, $Hstart, $Hend); - print join("\t", @endstatement, $orthHchr, $Hstart, $Hend), "\n" if $printer == 1; - - - return (join("\t", @endstatement, $orthHchr, $Hstart, $Hend), $orthHchr, $Hstart, $Hend, $locusmotif, length($locusmotif)); - -} - -sub switch_nucl{ - my @strand = split(/\s*/,$_[0]); - for my $i (0 ... $#strand){ - if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;} - if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;} - if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;} - if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;} - } - return join("",@strand); -} - - -sub switch_micro{ - my $micro = reverse($_[0]); - my @strand = split(/\s*/,$micro); - for my $i (0 ... 
$#strand){ - if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;} - if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;} - if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;} - if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;} - if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;} - if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;} - } - return join("",@strand); -} -sub decipher_history{ - my $printer = 0; - my ($mutations_array, $tags_string, $nodes, $branches_hash, $tree_analysis, $confirmation_string, $alivehash) = @_; - my %mutations_hash=(); - foreach my $mutation (@$mutations_array){ - print "mutation = $mutation\n" if $printer == 1; - my %local = $mutation =~ /([\S ]+)=([\S ]+)/g; - push @{$mutations_hash{$local{"node"}}},$mutation; - print "just for confirmation: $local{node} pushed as: $mutation\n" if $printer == 1; - } - my @nodes; - my @birth_steps=(); - my @death_steps=(); - - my @tags=split(/\s*/,$tags_string); - my @confirmation=split(/\s+/,$confirmation_string); - my %info=(); - - for my $i (0 ... $#tags){ - $info{$tags[$i]}=$confirmation[$i]; - print "feeding info: $tags[$i] = $info{$tags[$i]}\n" if $printer == 1; - } - - for my $keys (@$nodes) { - foreach my $key (@$keys){ -# print "current key = $key\n"; - my $copykey = $key; - $copykey =~ s/[\W ]+//g; - my @copykeys=split(/\s*/,$copykey); - my $states=(); - foreach my $copy (@copykeys){ - $states=$states.$info{$copy}; - } - - print "reduced key = $copykey and state = $states\n" if $printer == 1; - - if (exists $mutations_hash{$key}) { - - if ($states=~/\+/){ - push @birth_steps, @{$mutations_hash{$key}}; - $birth_steps[$#birth_steps] =~ s/\S+=//g; - delete $mutations_hash{$key}; - } - else{ - push @death_steps, @{$mutations_hash{$key}}; - $death_steps[$#death_steps] =~ s/\S+=//g; - delete $mutations_hash{$key}; - } - } - } - } - print "conformation = $confirmation_string\n" if $printer == 1; - push (@birth_steps, "NULL") if scalar(@birth_steps) == 0; - push (@death_steps, "NULL") if scalar(@death_steps) == 0; - print "birth steps = ",join("\n",@birth_steps)," and death steps = ",join("\n",@death_steps),"\n" if $printer == 1; - return \@birth_steps, \@death_steps; -} - -sub fillAlignmentGaps{ - my $printer = 0; - print "received: @_\n" if $printer == 1; - my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_; - print "in fillAlignmentGaps.. tree = $tree \n" if $printer == 1; - my %sequence_hash=(); - - my @phases = (); - my $concat = $motif.$motif; - my $motifsize = length($motif); - - for my $i (1 ... 
$motifsize){ - push @phases, substr($concat, $i, $motifsize); - } - - my $concatalignment = (); - foreach my $tag (@tags){ - $concatalignment = $concatalignment.$alignment->{$tag}; - } -# print "returningg NULL","NULL","NULL", "NULL\n" if $concatalignment !~ /-/; - return 0, "NULL","NULL","NULL", "NULL","NULL" if $concatalignment !~ /-/; - - - - my %node_sequences_temp=(); - my %node_alignments_temp =(); #NEW, Nov 28 2008 - - my @tags=(); - my @locus_sequences=(); - my %alivehash=(); - -# print "IN fillAlignmentGaps\n";# <STDIN>; - my %fillrecord = (); - - my $change = 0; - foreach my $tag (@$tagarray) { - #print "adding: $tag\n"; - push(@tags, $tag); - if (exists $microsathash->{$tag}){ - my $micro = $microsathash->{$tag}; - my $orig_micro = $micro; - ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); - $change = 1 if uc($micro) ne uc($orig_micro); - $node_sequences_temp{$tag}=$micro if $microsathash->{$tag} ne "NULL"; - } - if (exists $nonmicrosathash->{$tag}){ - my $micro = $nonmicrosathash->{$tag}; - my $orig_micro = $micro; - ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); - $change = 1 if uc($micro) ne uc($orig_micro); - $node_sequences_temp{$tag}=$micro if $nonmicrosathash->{$tag} ne "NULL"; - } - - if (exists $alignment->{$tag}){ - my $micro = $alignment->{$tag}; - my $orig_micro = $micro; - ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); - $change = 1 if uc($micro) ne uc($orig_micro); - $node_alignments_temp{$tag}=$micro if $alignment->{$tag} ne "NULL"; - } - - #print "adding to node_sequences: $tag = ",$node_sequences_temp{$tag},"\n" if $printer == 1; - #print "adding to node_alignments: $tag = ",$node_alignments_temp{$tag},"\n" if $printer == 1; - } - - - my %node_sequences=(); - my %node_alignments =(); #NEW, Nov 28 2008 - foreach my $tag (@$tagarray) { - $node_sequences{$tag} = join ".",split(/\s*/,$node_sequences_temp{$tag}); - $node_alignments{$tag} = join ".",split(/\s*/,$node_alignments_temp{$tag}); - } - - print "\n", "#" x 50, "\n" if $printer == 1; - foreach my $tag (@tags){ - print "$tag: $alignment->{$tag} = $node_alignments{$tag}\n" if $printer == 1; - } - print "\n", "#" x 50, "\n" if $printer == 1; -# print "change = $change\n"; - #<STDIN> if $concatalignment=~/\-/; - -# <STDIN> if $printer == 1 && $concatalignment =~ /\-/; - - return 0, "NULL","NULL","NULL", "NULL", "NULL" if $change == 0; - - my ($nodes_arr, $branches_hash) = get_nodes($tree); - my @nodes=@$nodes_arr; - print "recieved nodes = @nodes\n" if $printer == 1; - - - #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS - foreach my $keys (@nodes){ - my @pair = @$keys; - my $joint = "(".join(", ",@pair).")"; - my $copykey = join "", @pair; - $copykey =~ s/[\W ]+//g; - print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1; - my $livestatus = 1; - foreach my $copy (split(/\s*/,$copykey)){ - $livestatus = 0 if !exists $alivehash{$copy}; - } - $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1; - print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1; - } - - - - @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT. 
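-
- # The loop below performs a post-order, leaves-to-root reduction over the
- # species tree: each node pair is collapsed into a joint label such as
- # "(H, C)", whose reconstructed alignment is stored back into
- # %node_alignments so that later iterations can treat the joint node like
- # a leaf. A minimal sketch of the same pattern (illustrative only;
- # combine() is a hypothetical stand-in for base_by_base_simple() below):
- #
- #   my %aln = (H => "A.C", C => "A.T");
- #   foreach my $node ([qw(H C)]) {
- #       my $joint = "(" . join(", ", @$node) . ")";
- #       $aln{$joint} = combine(map { $aln{$_} } @$node);
- #   }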
-
- my @mutations_array=();
-
- my $joint = ();
- foreach my $node (@nodes){
- my @pair = @$node;
- print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1;
- $joint = "(".join(", ",@pair).")";
- print "joint = $joint \n" if $printer == 1;
- my @pair_sequences=();
-
- foreach my $tag (@pair){
- print "tag = $tag: " if $printer == 1;
- print $node_alignments{$tag},"\n" if $printer == 1;
- push @pair_sequences, $node_alignments{$tag};
- }
-# print "fillgap\n";
- my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint);
- $node_alignments{$joint}=$compared;
- push( @mutations_array,split(/:/,$substitutions_list));
- print "newly added to node_sequences: $node_alignments{$joint} and list of mutations = @mutations_array\n" if $printer == 1;
- }
- print "now sending for analyze_mutations: mutation_array=@mutations_array, nodes=@nodes, branches_hash=$branches_hash, alignment=$alignment, tags=@tags, alivehash=%alivehash, node_sequences=\%node_sequences, microsatstarts=$microsatstarts, motif=$motif\n" if $printer == 1;
-# <STDIN> if $printer == 1;
-
- my $analyzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif);
-
-# print "returning: ", $analyzed_mutations, \@nodes,"\n" if scalar @mutations_array > 0;
-# print "returning: NULL, NULL, NULL " if scalar @mutations_array == 0 && $printer == 1;
- print "final node alignment after filling for $joint= " if $printer == 1;
- print "$node_alignments{$joint}\n" if $printer == 1;
-
-
- return 1, $analyzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint} if scalar @mutations_array > 0 ;
- return 1, "NULL","NULL","NULL", "NULL", "NULL" if scalar @mutations_array == 0;
-}
-
-
-
-sub add_mutation{
- my $printer = 0;
- print "IN SUBROUTINE add_mutation.. information received = @_\n" if $printer == 1;
- my ($i , $bite, $to, $from) = @_;
- print "bite = $bite.. all received info = ",join("^", @_),"\n" if $printer == 1;
- print "to=$to\n" if $printer == 1;
- print "its split = ",join(" and ",split(/!/,$to)),"\n" if $printer == 1;
- my @toields = split "!",$to;
- print "toields = @toields\n" if $printer == 1;
- my @mutations=();
-
- foreach my $toield (@toields){
- my @toinfo=split(":",$toield);
- print " at toinfo=@toinfo \n" if $printer == 1;
- next if $toinfo[1] =~ /$from/i;
- my @mutation = @toinfo; # the next above already skipped entries matching $from
- print "adding to mutation list: ", join(",", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion="),"\n" if $printer == 1;
- push (@mutations, join("\t", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion="));
- }
- return @mutations;
-}
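-
-# add_mutation() above emits tab-separated key=value records of the form
-# "node=(H, C)\ttype=substitution\tposition=7\tfrom=a\tto=g\tinsertion=\tdeletion="
-# (position/from/to values here are made up for illustration), and
-# summarizeMutations() later recovers the fields by splitting on tabs and "=".
-# A minimal standalone sketch of that round-trip; parse_mutation_record_sketch
-# is a hypothetical helper, not called anywhere in this tool:
-sub parse_mutation_record_sketch{
-	my %fields = ();
-	foreach my $pair (split(/\t/, $_[0])){
-		my ($key, $value) = split(/=/, $pair, 2);
-		$fields{$key} = defined $value ? $value : "";
-	}
-	return %fields; # e.g. $fields{"type"} eq "substitution"
-}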
-
-sub add_bases{
-
- my $printer = 0;
- print "IN SUBROUTINE add_bases.. information received = @_\n" if $printer == 1;
- my ($optional0, $optional1, $pair0, $pair1,$joint) = @_;
- my $total_list=();
-
- my @total_list0=split(/!/,$optional0);
- my @total_list1=split(/!/,$optional1);
- my @all_list=();
- my %total_hash0=();
- foreach my $entry (@total_list0) {
- $entry = uc $entry;
- $entry =~ /(\S+):(\S+)/;
- $total_hash0{$2}=$1;
- push @all_list, $2;
- }
-
- my %total_hash1=();
- foreach my $entry (@total_list1) {
- $entry = uc $entry;
- $entry =~ /(\S+):(\S+)/;
- $total_hash1{$2}=$1;
- push @all_list, $2;
- }
-
- my %alphabetical_hash=();
- my @return_options=();
-
- for my $i (0 ... $#all_list){
- my $alph = $all_list[$i];
- if (exists $total_hash0{$alph} && exists $total_hash1{$alph}){
- push(@return_options, $joint.":".$alph);
- delete $total_hash0{$alph}; delete $total_hash1{$alph};
- }
- if (exists $total_hash0{$alph} && !exists $total_hash1{$alph}){
- push(@return_options, $pair0.":".$alph);
- delete $total_hash0{$alph};
- }
- if (!exists $total_hash0{$alph} && exists $total_hash1{$alph}){
- push(@return_options, $pair1.":".$alph);
- delete $total_hash1{$alph};
- }
-
- }
-
- print "returning ",join "!",@return_options,"\n" if $printer == 1;
- return join "!",@return_options;
-
-}
-
-
-sub fillgaps{
-# print "IN fillgaps: @_\n";
- my ($micro, $phasesinput) = @_;
- #print "in microsathash ,,.. micro = $micro\n";
- return $micro if $micro !~ /\-/;
- my $orig_micro = $micro;
- my @phases = @$phasesinput;
-
- my %tested_patterns = ();
-
- foreach my $phase (@phases){
- # print "considering phase: $phase\n";
- my @phase_prefixes = ();
- my @prephase_left_contexts = ();
- my @prephase_right_contexts = ();
- my @pregapsize = ();
- my @prepostfilins = ();
-
- my @phase_suffixes;
- my @suffphase_left_contexts;
- my @suffphase_right_contexts;
- my @suffgapsize;
- my @suffpostfilins;
-
- my @postfilins = ();
- my $motifsize = length($phases[0]);
-
- my $change = 0;
-
- for my $u (0 ... $motifsize-1){
- my $concat = $phase.$phase.$phase.$phase;
- my @concatarr = split(/\s*/, $concat);
- my $l = 0;
- while ($l < $u){
- shift @concatarr;
- $l++;
- }
- $concat = join ("", @concatarr);
-
- for my $t (0 ... $motifsize-1){
- for my $k (1 ... 
$motifsize-1){ - push @phase_prefixes, substr($concat, $motifsize+$t, $k); - push @prephase_left_contexts, substr ($concat, $t, $motifsize); - push @prephase_right_contexts, substr ($concat, $motifsize+$t+$k+($motifsize-$k), 1); - push @pregapsize, $k; - push @prepostfilins, substr($concat, $motifsize+$t+$k, ($motifsize-$k)); - # print "reading: $concat, t=$t, k=$k prefix: $prephase_left_contexts[$#prephase_left_contexts] $phase_prefixes[$#phase_prefixes] -x$pregapsize[$#pregapsize] $prephase_right_contexts[$#prephase_right_contexts]\n"; - # print "phase_prefixes = $phase_prefixes[$#phase_prefixes]\n"; - # print "prephase_left_contexts = $prephase_left_contexts[$#prephase_left_contexts]\n"; - # print "prephase_right_contexts = $prephase_right_contexts[$#prephase_right_contexts]\n"; - # print "pregapsize = $pregapsize[$#pregapsize]\n"; - # print "prepostfilins = $prepostfilins[$#prepostfilins]\n"; - } - } - } - - # print "looking if $micro =~ /($phase\-{$motifsize})/i || $micro =~ /^(\-{$motifsize,}$phase)/i\n"; - if ($micro =~ /($phase\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,}$phase)/i){ - # print "micro: $micro needs further gap removal: $1\n"; - while ($micro =~ /$phase(\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,})$phase/i){ - # print "micro: $micro needs further gap removal: $1\n"; - - # print "phase being considered = $phase\n"; - my $num = (); - $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi if $micro =~ /$phase\-{$motifsize,}/i; - $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi if $micro =~ /\-{$motifsize,}$phase/i; - # print "num = $num\n"; - $change = 1 if $num == 1; - } - } - - elsif ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){ - while ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){ - # print "checking lengths of $1 and $3 for $micro... \n"; - my $num = (); - if (length($1) >= length($3)){ - # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, >= , $3 \n"; - $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ; - } - if (length($1) < length($3)){ - # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, < , $3 \n"; - $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ; - } - # print "micro changed to $micro\n"; - } - } - elsif ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){ - while ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){ - # print "$micro matches ([A-Z]+)\-{$motifsize}(($phase)+) = 1=$1, - , 3=$3 \n"; - my $num = 0; - $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ; - } - } - elsif ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){ - while ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){ - # print "$micro matches (($phase)+)\-{$motifsize,}([A-Z]+) = 1=$1, - , 3=$3 \n"; - my $num = 0; - $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ; - } - } - - # print "$orig_micro to $micro\n"; - - #s <STDIN>; - - for my $h (0 ... 
$#phase_prefixes){ - # print "searching using prefix : $prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]\n"; - my $pattern = $prephase_left_contexts[$h].$phase_prefixes[$h].$pregapsize[$h].$prephase_right_contexts[$h]; - # print "returning orig_micro = $orig_micro, micro = $micro \n" if exists $tested_patterns{$pattern}; - if ($micro =~ /$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/i){ - return $orig_micro if exists $tested_patterns{$pattern}; - while ($micro =~ /($prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h])/i){ - $tested_patterns{$pattern} = $pattern; - # print "micro: $micro needs further gap removal: $1\n"; - - # print "prefix being considered = $phase_prefixes[$h]\n"; - my $num = (); - $num = ($micro =~ s/$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/$prephase_left_contexts[$h]$phase_prefixes[$h]$prepostfilins[$h]$prephase_right_contexts[$h]/gi) ; - # print "num = $num, micro = $micro\n"; - $change = 1 if $num == 1; - - return $orig_micro if $num > 1; - } - } - - } - } - return $orig_micro if length($micro) != length($orig_micro); - return $micro; -} - -sub selectMutationArray{ - my $printer =0; - - my $oldmutspt = $_[0]; - my $newmutspt = $_[1]; - my $tagstringpt = $_[2]; - my $alivehashpt = $_[3]; - my $alignmentpt = $_[4]; - my $motif = $_[5]; - - my @alivehasharr=(); - - my @tags = @$tagstringpt; - my $alignmentln = length($alignmentpt->{$tags[0]}); - - foreach my $key (keys %$alivehashpt) { push @alivehasharr, $key; print "we have alive: $key\n" if $printer == 1;} - - my %newside = (); - my %oldside = (); - my %newmuts = (); - - my %commons = (); - my %olds = (); - foreach my $old (@$oldmutspt){ - $olds{$old} = 1; - } - foreach my $new (@$newmutspt){ - $commons{$new} = 1 if exists $olds{$new};; - } - - - foreach my $pos ( 0 ... $alignmentln){ - #print "pos = $pos\n" if $printer == 1; - my $newyes = 0; - foreach my $mut (@$newmutspt){ - $newmuts{$mut} = 1; - chomp $mut; - $newyes++; - $mut =~ s/=\t/= \t/g; - $mut =~ s/=$/= /g; - - $mut =~ /node=([A-Z\(\), ]+)\stype=([a-zA-Z ]+)\sposition=([0-9 ]+)\sfrom=([a-zA-Z\- ]+)\sto=([a-zA-Z\- ]+)\sinsertion=([a-zA-Z\- ]+)\sdeletion=([a-zA-Z\- ]+)/; - my $node = $1; - next if $3 != $pos; - print "new mut = $mut\n" if $printer == 1; - print "node = $node, pos = $3 ... 
and alivehasharr = >@alivehasharr<\n" if $printer == 1; - my $alivenode = 0; - foreach my $key (@alivehasharr){ - $alivenode = 1 if $key =~ /$node/; - } - # next if $alivenode == 0; - my $indel_type = " "; - if ($2 eq "insertion" || $2 eq "deletion"){ - my $thisindel = (); - $thisindel = $6 if $2 eq "insertion"; - $thisindel = $7 if $2 eq "deletion"; - - $indel_type = "i".checkIndelType($node, $thisindel, $motif,$alignmentpt,$3, $2) if $2 eq "insertion"; - $indel_type = "d".checkIndelType($node, $thisindel, $motif,$alignmentpt, $3, $2) if $2 eq "deletion"; - $indel_type = $indel_type."f" if $indel_type =~ /mot/ && length($thisindel) >= length($motif); - } - print "indeltype = $indel_type\n" if $printer == 1; - my $added = 0; - - if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){ - print "we have a preexisting one for $pos\n" if $printer == 1; - my @preexisting = @{$newside{$pos}}; - foreach my $pre (@preexisting){ - print "looking at $pre\n" if $printer == 1; - next if $pre !~ /node=$node/; - next if $pre !~ /indeltype=([a-z]+)/; - my $currtype = $1; - - if ($currtype =~ /inon/ && $indel_type =~ /dmot/){ - delete $newside{$pos}; - push @{$newside{$pos}}, $pre; - $added = 1; - } - if ($currtype =~ /dnon/ && $indel_type =~ /imot/){ - delete $newside{$pos}; - push @{$newside{$pos}}, $pre; - $added = 1; - } - if ($currtype =~ /dmot/ && $indel_type =~ /inon/){ - delete $newside{$pos}; - push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; - $added = 1; - } - if ($currtype =~ /imot/ && $indel_type =~ /dnon/){ - delete $newside{$pos}; - push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; - $added = 1; - } - } - } - - print "added = $added\n" if $printer == 1; - push @{$newside{$pos}}, $mut."\tindeltype=$indel_type" if $added == 0; - - print "for new pos,: $pos we have: @{$newside{$pos}}\n " if $printer == 1; - } - } - - foreach my $pos ( 0 ... 
$alignmentln){ - my $oldyes = 0; - foreach my $mut (@$oldmutspt){ - chomp $mut; - $oldyes++; - $mut =~ s/=\t/= \t/g; - $mut =~ s/=$/= /g; - $mut =~ /node=([A-Z\(\), ]+)\ttype=([a-zA-Z ]+)\tposition=([0-9 ]+)\tfrom=([a-zA-Z\- ]+)\tto=([a-zA-Z\- ]+)\tinsertion=([a-zA-Z\- ]+)\tdeletion=([a-zA-Z\- ]+)/; - my $node = $1; - next if $3 != $pos; - print "old mut = $mut\n" if $printer == 1; - my $alivenode = 0; - foreach my $key (@alivehasharr){ - $alivenode = 1 if $key =~ /$node/; - } - #next if $alivenode == 0; - my $indel_type = " "; - if ($2 eq "insertion" || $2 eq "deletion"){ - $indel_type = "i".checkIndelType($node, $6, $motif,$alignmentpt, $3, $2) if $2 eq "insertion"; - $indel_type = "d".checkIndelType($node, $7, $motif,$alignmentpt, $3, $2) if $2 eq "deletion"; - next if $indel_type =~/non/; - } - else{ next;} - - my $imp=0; - $imp = 1 if $indel_type =~ /dmot/ && $alivenode == 0; - $imp = 1 if $indel_type =~ /imot/ && $alivenode == 1; - - - if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){ - my @preexisting = @{$newside{$pos}}; - print "we have a preexisting one for $pos: @preexisting\n" if $printer == 1; - next if $imp == 0; - - if (scalar(@preexisting) == 1){ - my $foundmut = $preexisting[0]; - $foundmut=~ /node=([A-Z, \(\)]+)/; - next if $1 eq $node; - - if (exists $oldside{$pos} || exists $commons{$foundmut}){ - print "not replacing, but just adding\n" if $printer == 1; - push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; - push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; - next; - } - - delete $newside{$pos}; - push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; - push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; - print "now new one is : @{$newside{$pos}}\n" if $printer == 1; - } - - print "for pos: $pos: @{$newside{$pos}}\n" if $printer == 1; - next; - } - - - my @news = @{$newside{$pos}} if exists $newside{$pos}; - - print "mut = $mut and news = @news\n" if $printer == 1; - push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; - push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; - } - } - - print "in the end, our collected mutations = \n" if $printer == 1; - my @returnarr = (); - foreach my $key (keys %newside) {push @returnarr,@{$newside{$key}};} - print join("\n", @returnarr),"\n" if $printer == 1; - #<STDIN>; - return @returnarr; - -} - - -sub checkIndelType{ - my $printer = 0; - my $node = $_[0]; - my $indel = $_[1]; - my $motif = $_[2]; - my $alignmentpt = $_[3]; - my $posit = $_[4]; - my $type = $_[5]; - my @phases =(); - my %prephases = (); - my %postphases = (); - #print "motif = $motif\n"; - print "IN checkIndelType ... received: @_\n" if $printer == 1; - my $concat = $motif.$motif.$motif.$motif; - my $motiflength = length($motif); - - if ($motiflength > length ($indel)){ - return "non" if $motif !~ /$indel/i; - return checkIndelType_ComplexAnalysis($node, $indel, $motif, $alignmentpt, $posit, $type); - } - - my $firstpass = 0; - for my $y (0 ... $motiflength-1){ - my $phase = substr($concat, $motiflength+$y, $motiflength); - push @phases, $phase; - $firstpass = 1 if $indel =~ /$phase/i; - for my $k (0 ... length($motif)-1){ - print "at: motiflength=$motiflength , y=$y , k=$k.. 
for pre: $motiflength+$y-$k and post: $motiflength+$y-$k+$motiflength in $concat\n" if $printer == 1; - my $pre = substr($concat, $motiflength+$y-$k, $k ); - my $post = substr($concat, $motiflength+$y+$motiflength, $k); - print "adding to phases : $phase - $pre and $post\n" if $printer == 1; - push @{$prephases{$phase}} , $pre; - push @{$postphases{$phase}} , $post; - } - - } - print "firstpass 1= $firstpass\n" if $printer == 1; - return "non" if $firstpass ==0; - $firstpass =0; - - foreach my $phase (@phases){ - my @pres = @{$prephases{$phase}}; - my @posts = @{$postphases{$phase}}; - - foreach my $pre (@pres){ - foreach my $post (@posts){ - - $firstpass = 1 if $indel =~ /($pre)?($phase)+($post)?/i && length($indel) > (3 * length($motif)); - $firstpass = 1 if $indel =~ /^($pre)?($phase)+($post)?$/i && length($indel) < (3 * length($motif)); - print "matched here : ($pre)?($phase)+($post)?\n" if $printer == 1; - last if $firstpass == 1; - } - last if $firstpass == 1; - } - last if $firstpass == 1; - } - - print "firstpass 2= $firstpass\n" if $printer == 1; - return "non" if $firstpass ==0; - return "mot" if $firstpass ==1; -} - - -sub checkIndelType_ComplexAnalysis{ - my $printer = 0; - my $node = $_[0]; - my $indel = $_[1]; - my $motif = $_[2]; - my $alignmentpt = $_[3]; - my $pos = $_[4]; - my $type = $_[5]; - my @speciesinvolved = $node =~ /[A-Z]+/g; - - my @seqs = (); - my $residualseq = length($motif) - length($indel); - print "IN COMPLEX ANALYSIS ... received: @_ .... speciesinvolved = @speciesinvolved\n" if $printer == 1; - print "we have position = $pos, sseq = $alignmentpt->{$speciesinvolved[0]}\n" if $printer == 1; - print "residualseq = $residualseq\n" if $printer == 1; - print "pos=$pos... got: @_\n" if $printer == 1; - foreach my $sp (@speciesinvolved){ - my $spseq = $alignmentpt->{$sp}; - #print "orig spseq = $spseq\n"; - my $subseq = (); - - if ($type eq "deletion"){ - my @indelparts = split(/\s*/,$indel); - my @seqparts = split(/\s*/,$spseq); - - for my $p ($pos ... $pos+length($indel)-1){ - $seqparts[$p] = shift @indelparts; - } - $spseq = join("",@seqparts); - } - #print "mod spseq = $spseq\n"; - # $spseq=~ s/\-//g if $type !~ /deletion/; - - print "substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq)\n" if $pos > 0 && $pos < (length($spseq) - length($motif)) && $printer == 1; - print "substr($spseq, 0, length($indel)+$residualseq)\n" if $pos == 0 && $printer == 1; - print "substr($spseq, $pos - $residualseq, length($indel)+$residualseq)\n" if $pos >= (length($spseq) - length($motif)) && $printer == 1; - - $subseq = substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq) if $pos > 0 && $pos < (length($spseq) - length($motif)) ; - $subseq = substr($spseq, 0, length($indel)+$residualseq) if $pos == 0; - $subseq = substr($spseq, $pos - $residualseq, length($indel)+$residualseq) if $pos >= (length($spseq) - length($motif)) ; - print "spseq = $spseq . subseq=$subseq . type = $type\n" if $printer == 1; - #<STDIN> if $subseq !~ /[a-z\-]/i; - $subseq =~ s/\-/$indel/g if $type =~ /insertion/; - push @seqs, $subseq; - print "seqs = @seqs\n" if $printer == 1; - } - return "non" if checkIfSeqsIdentical(@seqs) eq "NO"; - - print "checking for $seqs[0] \n" if $printer == 1; - - my @phases =(); - my %prephases = (); - my %postphases = (); - my $concat = $motif.$motif.$motif.$motif; - my $motiflength = length($motif); - - my $firstpass = 0; - - for my $y (0 ... 
$motiflength-1){ - my $phase = substr($concat, $motiflength+$y, $motiflength); - push @phases, $phase; - $firstpass = 1 if $seqs[0] =~ /$phase/i; - for my $k (0 ... length($motif)-1){ - my $pre = substr($concat, $motiflength+$y-$k, $k ); - my $post = substr($concat, $motiflength+$y+$motiflength, $k); - print "adding to phases : $phase - $pre and $post\n" if $printer == 1; - push @{$prephases{$phase}} , $pre; - push @{$postphases{$phase}} , $post; - } - - } - print "firstpass 1= $firstpass.. also, res-d = ",(length($seqs[0]))%(length($motif)),"\n" if $printer == 1; - return "non" if $firstpass ==0; - $firstpass =0; - foreach my $phase (@phases){ - - $firstpass = 1 if $seqs[0] =~ /^($phase)+$/i && ((length($seqs[0]))%(length($motif))) == 0; - - if (((length($seqs[0]))%(length($motif))) != 0){ - my @pres = @{$prephases{$phase}}; - my @posts = @{$postphases{$phase}}; - foreach my $pre (@pres){ - foreach my $post (@posts){ - next if $pre !~ /\S/ && $post !~ /\S/; - $firstpass = 1 if ($seqs[0] =~ /^($pre)($phase)+($post)$/i || $seqs[0] =~ /^($pre)($phase)+$/i || $seqs[0] =~ /^($phase)+($post)$/i); - print "caught with $pre $phase $post\n" if $printer == 1; - last if $firstpass == 1; - } - last if $firstpass == 1; - } - } - - last if $firstpass == 1; - } - - #print "indel = $indel.. motif = $motif.. firstpass 2= mot\n" if $firstpass ==1; - #print "indel = $indel.. motif = $motif.. firstpass 2= non\n" if $firstpass ==0; - #<STDIN>;# if $firstpass ==1; - return "non" if $firstpass ==0; - return "mot" if $firstpass ==1; - -} - -sub checkIfSeqsIdentical{ - my @seqs = @_; - my $identical = 1; - - for my $j (1 ... $#seqs){ - $identical = 0 if uc($seqs[0]) ne uc($seqs[$j]); - } - return "NO" if $identical == 0; - return "YES" if $identical == 1; - -} - -sub summarizeMutations{ - my $mutspt = $_[0]; - my @muts = @$mutspt; - my $tree = $_[1]; - - my @returnarr = (); - - for (1 ... 
38){ - push @returnarr, "NA"; - } - push @returnarr, "NULL"; - return @returnarr if $tree eq "NULL" || scalar(@muts) < 1; - - - my @bspecies = (); - my @dspecies = (); - my $treecopy = $tree; - $treecopy =~ s/[\(\)]//g; - my @treeparts = split(/[\.,]+/, $treecopy); - - for my $part (@treeparts){ - if ($part =~ /\+/){ - $part =~ s/\+//g; - #my @sp = split(/\s*/, $part); - #foreach my $p (@sp) {push @bspecies, $p;} - push @bspecies, $part; - } - if ($part =~ /\-/){ - $part =~ s/\-//g; - #my @sp = split(/\s*/, $part); - #foreach my $p (@sp) {push @dspecies, $p;} - push @dspecies, $part; - } - - } - #print "-------------------------------------------------------\n"; - - my ($insertions, $deletions, $motinsertions, $motinsertionsf, $motdeletions, $motdeletionsf, $noninsertions, $nondeletions) = (0,0,0,0,0,0,0,0); - my ($binsertions, $bdeletions, $bmotinsertions,$bmotinsertionsf, $bmotdeletions, $bmotdeletionsf, $bnoninsertions, $bnondeletions) = (0,0,0,0,0,0,0,0); - my ($dinsertions, $ddeletions, $dmotinsertions,$dmotinsertionsf, $dmotdeletions, $dmotdeletionsf, $dnoninsertions, $dnondeletions) = (0,0,0,0,0,0,0,0); - my ($ninsertions, $ndeletions, $nmotinsertions,$nmotinsertionsf, $nmotdeletions, $nmotdeletionsf, $nnoninsertions, $nnondeletions) = (0,0,0,0,0,0,0,0); - my ($substitutions, $bsubstitutions, $dsubstitutions, $nsubstitutions, $indels, $subs) = (0,0,0,0,"NA","NA"); - - my @insertionsarr = (" "); - my @deletionsarr = (" "); - - my @substitutionsarr = (" "); - - - foreach my $mut (@muts){ - # print "mut = $mut\n"; - chomp $mut; - $mut =~ s/=\t/= /g; - $mut =~ s/=$/= /g; - my %mhash = (); - my @mields = split(/\t/,$mut); - - foreach my $m (@mields){ - my @fields = split(/=/,$m); - next if $fields[1] eq " "; - $mhash{$fields[0]} = $fields[1]; - } - - my $myutype = (); - my $decided = 0; - - my $localnode = $mhash{"node"}; - $localnode =~ s/[\(\)\. ,]//g; - - - foreach my $s (@bspecies){ - if ($localnode eq $s) { - $decided = 1; $myutype = "b"; - } - } - - foreach my $s (@dspecies){ - if ($localnode eq $s) { - $decided = 1; $myutype = "d"; - } - } - - $myutype = "n" if $decided != 1; - - - # print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. \n"; - # <STDIN> if $mhash{"type"} eq "insertion" && $myutype eq "b"; - - - if ($mhash{"type"} eq "substitution"){ - $substitutions++; - $bsubstitutions++ if $myutype eq "b"; - $dsubstitutions++ if $myutype eq "d"; - $nsubstitutions++ if $myutype eq "n"; - # print "substitution: from= $mhash{from}, to = $mhash{to}, and type = myutype\n"; - push @substitutionsarr, "b:$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "b"; - push @substitutionsarr, "d:$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "d"; - push @substitutionsarr, "n:$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "n"; - # print "substitutionsarr = @substitutionsarr\n"; - # <STDIN>; - } - else{ - #print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. 
indeltype=$mhash{indeltype}\n"; - if ($mhash{"type"} eq "deletion"){ - $deletions++; - - $motdeletions++ if $mhash{"indeltype"} =~ /dmot/; - $motdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/; - - $nondeletions++ if $mhash{"indeltype"} =~ /dnon/; - - $bdeletions++ if $myutype eq "b"; - $ddeletions++ if $myutype eq "d"; - $ndeletions++ if $myutype eq "n"; - - $bmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "b"; - $bmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "b"; - $bnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "b"; - - $dmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "d"; - $dmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "d"; - $dnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "d"; - - $nmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "n"; - $nmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "n"; - $nnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "n"; - - push @deletionsarr, "b:$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "b"; - push @deletionsarr, "d:$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "d"; - push @deletionsarr, "n:$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "n"; - } - - if ($mhash{"type"} eq "insertion"){ - $insertions++; - - $motinsertions++ if $mhash{"indeltype"} =~ /imot/; - $motinsertionsf++ if $mhash{"indeltype"} =~ /imotf/; - $noninsertions++ if $mhash{"indeltype"} =~ /inon/; - - $binsertions++ if $myutype eq "b"; - $dinsertions++ if $myutype eq "d"; - $ninsertions++ if $myutype eq "n"; - - $bmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "b"; - $bmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "b"; - $bnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "b"; - - $dmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "d"; - $dmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "d"; - $dnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "d"; - - $nmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "n"; - $nmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "n"; - $nnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "n"; - - push @insertionsarr, "b:$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "b"; - push @insertionsarr, "d:$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "d"; - push @insertionsarr, "n:$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "n"; - - } - } - } - - - - $indels = "ins=".join(",",@insertionsarr).";dels=".join(",",@deletionsarr) if scalar(@insertionsarr) > 1 || scalar(@deletionsarr) > 1 ; - $subs = join(",",@substitutionsarr) if scalar(@substitutionsarr) > 1; - $indels =~ s/ //g; - $subs =~ s/ //g ; - - #print "indels = $indels, subs=$subs\n"; - ##<STDIN> if $indels =~ /[a-zA-Z0-9]/ || $subs =~ /[a-zA-Z0-9]/ ; - #print "tree = $tree, indels = $indels, subs = $subs, bspecies = @bspecies, dspecies = @dspecies \n"; - my @returnarray = (); - - push (@returnarray, $insertions, $deletions, $motinsertions, $motinsertionsf, $motdeletions, $motdeletionsf, $noninsertions, $nondeletions) ; - push (@returnarray, $binsertions, $bdeletions, $bmotinsertions,$bmotinsertionsf, $bmotdeletions, $bmotdeletionsf, $bnoninsertions, $bnondeletions) ; - push (@returnarray, $dinsertions, $ddeletions, $dmotinsertions,$dmotinsertionsf, $dmotdeletions, 
$dmotdeletionsf, $dnoninsertions, $dnondeletions) ; - push (@returnarray, $ninsertions, $ndeletions, $nmotinsertions,$nmotinsertionsf, $nmotdeletions, $nmotdeletionsf, $nnoninsertions, $nnondeletions) ; - push (@returnarray, $substitutions, $bsubstitutions, $dsubstitutions, $nsubstitutions, $indels, $subs) ; - - push @returnarray, $tree; - - my @copy = @returnarray; - return (@returnarray); - -} - -sub selectBetterTree{ - my $printer = 0; - my $treestudy = $_[0]; - my $alt = $_[1]; - my $mutspt = $_[2]; - my @muts = @$mutspt; - my @trees = (); my @alternatetrees=(); - - @trees = split(/\|/,$treestudy) if $treestudy =~ /\|/; - @alternatetrees = split(/[\|;]/,$alt) if $alt =~ /[\|;\(\)]/; - - $trees[0] = $treestudy if $treestudy !~ /\|/; - $alternatetrees[0] = $alt if $alt !~ /[\|;\(\)]/; - - my @alltrees = (@trees, @alternatetrees); -# push(@alltrees,@alternatetrees); - - my %mutspecies = (); - - print "IN selectBetterTree..treestudy=$treestudy. alt=$alt. for: @_. trees=@trees<. alternatetrees=@alternatetrees\n" if $printer == 1; - #<STDIN>; - foreach my $mut (@muts){ - print colored ['green'],"mut = $mut\n" if $printer == 1; - $mut =~ /node=([A-Z,\(\) ]+)/; - my $node = $1; - $node =~s/[,\(\) ]+//g; - my @indivspecies = $node =~ /[A-Z]+/g; - #print "adding node: $node\n" if $printer == 1; - $mutspecies{$node} = $node; - - #foreach (@indivspecies) { - #$mutspecies{$mut} = $_; #print "for $_ adding $mutspecies{$_}\n"; - #} - - } - - my @treerecords = (); - my $treecount = -1; - foreach my $tree (@alltrees){ - print "checking with tree $tree\n" if $printer == 1; - $treecount++; - $treerecords[$treecount] = 0; - my @indivspecies = ($tree =~ /[A-Z]+/g); - print "indivspecies=@indivspecies\n" if $printer == 1; - foreach my $species (@indivspecies){ - print "checkin if exists species: $species\n" if $printer == 1; - $treerecords[$treecount]+=2 if exists $mutspecies{$species} && $mutspecies{$species} !~ /indeltype=[a-z]mot/; - $treerecords[$treecount]+=1.5 if exists $mutspecies{$species} && $mutspecies{$species} =~ /indeltype=[a-z]mot/; - $treerecords[$treecount]-- if !exists $mutspecies{$species}; - } - - print "for tree $tree, our treecount = $treerecords[$treecount]\n" if $printer == 1; - } - - my @best_tree = array_largest_number_arrayPosition(@treerecords); - print "treerecords = @treerecords. 
hence, best tree = @best_tree\n" if $printer == 1; - - return ($alltrees[$best_tree[0]], $treerecords[$best_tree[0]]) if scalar(@best_tree) == 1; - print "best_tree[0] = $best_tree[0], and treerecords = $treerecords[$best_tree[0]]\n" if $printer == 1; - return ("NULL", -1) if $treerecords[$best_tree[0]] < 1; - my $rando = int(rand($#trees)); - return ($alltrees[$rando], $treerecords[$rando]) if scalar(@best_tree) > 1; - -} - - - - -sub load_sameHash{ - #my $g = %$_[0]; - $sameHash{"CAGT"}="AGTC"; - $sameHash{"ATGA"}="AATG"; - $sameHash{"CAAC"}="AACC"; - $sameHash{"GGAA"}="AAGG"; - $sameHash{"TAAG"}="AAGT"; - $sameHash{"CGAG"}="AGCG"; - $sameHash{"TAGG"}="AGGT"; - $sameHash{"GCAG"}="AGGC"; - $sameHash{"TAGA"}="ATAG"; - $sameHash{"TGA"}="ATG"; - $sameHash{"CAAG"}="AAGC"; - $sameHash{"CTAA"}="AACT"; - $sameHash{"CAAT"}="AATC"; - $sameHash{"GTAG"}="AGGT"; - $sameHash{"GAAG"}="AAGG"; - $sameHash{"CGA"}="ACG"; - $sameHash{"GTAA"}="AAGT"; - $sameHash{"ACAA"}="AAAC"; - $sameHash{"GCGG"}="GGGC"; - $sameHash{"ATCA"}="AATC"; - $sameHash{"TAAC"}="AACT"; - $sameHash{"GGCA"}="AGGC"; - $sameHash{"TGAG"}="AGTG"; - $sameHash{"AACA"}="AAAC"; - $sameHash{"GAGC"}="AGCG"; - $sameHash{"ACCA"}="AACC"; - $sameHash{"TGAA"}="AATG"; - $sameHash{"ACA"}="AAC"; - $sameHash{"GAAC"}="AACG"; - $sameHash{"GCA"}="AGC"; - $sameHash{"CCAC"}="ACCC"; - $sameHash{"CATA"}="ATAC"; - $sameHash{"CAC"}="ACC"; - $sameHash{"TACA"}="ATAC"; - $sameHash{"GGAC"}="ACGG"; - $sameHash{"AGA"}="AAG"; - $sameHash{"ATAA"}="AAAT"; - $sameHash{"CA"}="AC"; - $sameHash{"CCCA"}="ACCC"; - $sameHash{"TCAA"}="AATC"; - $sameHash{"CAGA"}="AGAC"; - $sameHash{"AATA"}="AAAT"; - $sameHash{"CCA"}="ACC"; - $sameHash{"AGAA"}="AAAG"; - $sameHash{"AGTA"}="AAGT"; - $sameHash{"GACG"}="ACGG"; - $sameHash{"TCAG"}="AGTC"; - $sameHash{"ACGA"}="AACG"; - $sameHash{"CGCA"}="ACGC"; - $sameHash{"GAGT"}="AGTG"; - $sameHash{"GA"}="AG"; - $sameHash{"TA"}="AT"; - $sameHash{"TAA"}="AAT"; - $sameHash{"CAG"}="AGC"; - $sameHash{"GATA"}="ATAG"; - $sameHash{"GTA"}="AGT"; - $sameHash{"CCAA"}="AACC"; - $sameHash{"TAG"}="AGT"; - $sameHash{"CAAA"}="AAAC"; - $sameHash{"AAGA"}="AAAG"; - $sameHash{"CACG"}="ACGC"; - $sameHash{"GTCA"}="AGTC"; - $sameHash{"GGA"}="AGG"; - $sameHash{"GGAT"}="ATGG"; - $sameHash{"CGGG"}="GGGC"; - $sameHash{"CGGA"}="ACGG"; - $sameHash{"AGGA"}="AAGG"; - $sameHash{"TAAA"}="AAAT"; - $sameHash{"GAGA"}="AGAG"; - $sameHash{"ACTA"}="AACT"; - $sameHash{"GCGA"}="AGCG"; - $sameHash{"CACA"}="ACAC"; - $sameHash{"AGAT"}="ATAG"; - $sameHash{"GAGG"}="AGGG"; - $sameHash{"CGAC"}="ACCG"; - $sameHash{"GGAG"}="AGGG"; - $sameHash{"GCCA"}="AGCC"; - $sameHash{"CCAG"}="AGCC"; - $sameHash{"GAAA"}="AAAG"; - $sameHash{"CAGG"}="AGGC"; - $sameHash{"GAC"}="ACG"; - $sameHash{"CAA"}="AAC"; - $sameHash{"GACC"}="ACCG"; - $sameHash{"GGCG"}="GGGC"; - $sameHash{"GGTA"}="AGGT"; - $sameHash{"AGCA"}="AAGC"; - $sameHash{"GATG"}="ATGG"; - $sameHash{"GTGA"}="AGTG"; - $sameHash{"ACAG"}="AGAC"; - $sameHash{"CGG"}="GGC"; - $sameHash{"ATA"}="AAT"; - $sameHash{"GACA"}="AGAC"; - $sameHash{"GCAA"}="AAGC"; - $sameHash{"CAGC"}="AGCC"; - $sameHash{"GGGA"}="AGGG"; - $sameHash{"GAG"}="AGG"; - $sameHash{"ACAT"}="ATAC"; - $sameHash{"GAAT"}="AATG"; - $sameHash{"CACC"}="ACCC"; - $sameHash{"GAT"}="ATG"; - $sameHash{"GCG"}="GGC"; - $sameHash{"GCAC"}="ACGC"; - $sameHash{"GAA"}="AAG"; - $sameHash{"TGGA"}="ATGG"; - $sameHash{"CCGA"}="ACCG"; - $sameHash{"CGAA"}="AACG"; -} - - - -sub load_revHash{ - $revHash{"CTGA"}="AGTC"; - $revHash{"TCTT"}="AAAG"; - $revHash{"CTAG"}="AGCT"; - $revHash{"GGTG"}="ACCC"; - $revHash{"GCC"}="GGC"; - 
$revHash{"GCTT"}="AAGC"; - $revHash{"GCGT"}="ACGC"; - $revHash{"GTTG"}="AACC"; - $revHash{"CTCC"}="AGGG"; - $revHash{"ATC"}="ATG"; - $revHash{"CGAT"}="ATCG"; - $revHash{"TTAA"}="AATT"; - $revHash{"GTTC"}="AACG"; - $revHash{"CTGC"}="AGGC"; - $revHash{"TCGA"}="ATCG"; - $revHash{"ATCT"}="ATAG"; - $revHash{"GGTT"}="AACC"; - $revHash{"CTTA"}="AAGT"; - $revHash{"TGGC"}="AGCC"; - $revHash{"CCG"}="GGC"; - $revHash{"CGGC"}="GGCC"; - $revHash{"TTAG"}="AACT"; - $revHash{"GTG"}="ACC"; - $revHash{"CTTT"}="AAAG"; - $revHash{"TGCA"}="ATGC"; - $revHash{"CGCT"}="AGCG"; - $revHash{"TTCC"}="AAGG"; - $revHash{"CT"}="AG"; - $revHash{"C"}="G"; - $revHash{"CTCT"}="AGAG"; - $revHash{"ACTT"}="AAGT"; - $revHash{"GGTC"}="ACCG"; - $revHash{"ATTC"}="AATG"; - $revHash{"GGGT"}="ACCC"; - $revHash{"CCTA"}="AGGT"; - $revHash{"CGCG"}="GCGC"; - $revHash{"GTGT"}="ACAC"; - $revHash{"GCCC"}="GGGC"; - $revHash{"GTCG"}="ACCG"; - $revHash{"TCCC"}="AGGG"; - $revHash{"TTCA"}="AATG"; - $revHash{"AGTT"}="AACT"; - $revHash{"CCCT"}="AGGG"; - $revHash{"CCGC"}="GGGC"; - $revHash{"CTT"}="AAG"; - $revHash{"TTGG"}="AACC"; - $revHash{"ATT"}="AAT"; - $revHash{"TAGC"}="AGCT"; - $revHash{"ACTG"}="AGTC"; - $revHash{"TCAC"}="AGTG"; - $revHash{"CTGT"}="AGAC"; - $revHash{"TGTG"}="ACAC"; - $revHash{"ATCC"}="ATGG"; - $revHash{"GTGG"}="ACCC"; - $revHash{"TGGG"}="ACCC"; - $revHash{"TCGG"}="ACCG"; - $revHash{"CGGT"}="ACCG"; - $revHash{"GCTC"}="AGCG"; - $revHash{"TACG"}="ACGT"; - $revHash{"GTTT"}="AAAC"; - $revHash{"CAT"}="ATG"; - $revHash{"CATG"}="ATGC"; - $revHash{"GTTA"}="AACT"; - $revHash{"CACT"}="AGTG"; - $revHash{"TCAT"}="AATG"; - $revHash{"TTA"}="AAT"; - $revHash{"TGTA"}="ATAC"; - $revHash{"TTTC"}="AAAG"; - $revHash{"TACT"}="AAGT"; - $revHash{"TGTT"}="AAAC"; - $revHash{"CTA"}="AGT"; - $revHash{"GACT"}="AGTC"; - $revHash{"TTGC"}="AAGC"; - $revHash{"TTC"}="AAG"; - $revHash{"GCT"}="AGC"; - $revHash{"GCAT"}="ATGC"; - $revHash{"TGGT"}="AACC"; - $revHash{"CCT"}="AGG"; - $revHash{"CATC"}="ATGG"; - $revHash{"CCAT"}="ATGG"; - $revHash{"CCCG"}="GGGC"; - $revHash{"TGCC"}="AGGC"; - $revHash{"TG"}="AC"; - $revHash{"TGCT"}="AAGC"; - $revHash{"GCCG"}="GGCC"; - $revHash{"TCTG"}="AGAC"; - $revHash{"TGT"}="AAC"; - $revHash{"TTAT"}="AAAT"; - $revHash{"TAGT"}="AACT"; - $revHash{"TATG"}="ATAC"; - $revHash{"TTTA"}="AAAT"; - $revHash{"CGTA"}="ACGT"; - $revHash{"TA"}="AT"; - $revHash{"TGTC"}="AGAC"; - $revHash{"CTAT"}="ATAG"; - $revHash{"TATA"}="ATAT"; - $revHash{"TAC"}="AGT"; - $revHash{"TC"}="AG"; - $revHash{"CATT"}="AATG"; - $revHash{"TCG"}="ACG"; - $revHash{"ATTT"}="AAAT"; - $revHash{"CGTG"}="ACGC"; - $revHash{"CTG"}="AGC"; - $revHash{"TCGT"}="AACG"; - $revHash{"TCCG"}="ACGG"; - $revHash{"GTT"}="AAC"; - $revHash{"ATGT"}="ATAC"; - $revHash{"CTTG"}="AAGC"; - $revHash{"CCTT"}="AAGG"; - $revHash{"GATC"}="ATCG"; - $revHash{"CTGG"}="AGCC"; - $revHash{"TTCT"}="AAAG"; - $revHash{"CGTC"}="ACGG"; - $revHash{"CG"}="GC"; - $revHash{"TATT"}="AAAT"; - $revHash{"CTCG"}="AGCG"; - $revHash{"TCTC"}="AGAG"; - $revHash{"TCCT"}="AAGG"; - $revHash{"TGG"}="ACC"; - $revHash{"ACTC"}="AGTG"; - $revHash{"CTC"}="AGG"; - $revHash{"CGC"}="GGC"; - $revHash{"TTG"}="AAC"; - $revHash{"ACCT"}="AGGT"; - $revHash{"TCTA"}="ATAG"; - $revHash{"GTAC"}="ACGT"; - $revHash{"TTGA"}="AATC"; - $revHash{"GTCC"}="ACGG"; - $revHash{"GATT"}="AATC"; - $revHash{"T"}="A"; - $revHash{"CGTT"}="AACG"; - $revHash{"GTC"}="ACG"; - $revHash{"GCCT"}="AGGC"; - $revHash{"TGC"}="AGC"; - $revHash{"TTTG"}="AAAC"; - $revHash{"GGCT"}="AGCC"; - $revHash{"TCA"}="ATG"; - $revHash{"GTGC"}="ACGC"; - $revHash{"TGAT"}="AATC"; - 
$revHash{"TAT"}="AAT"; - $revHash{"CTAC"}="AGGT"; - $revHash{"TGCG"}="ACGC"; - $revHash{"CTCA"}="AGTG"; - $revHash{"CTTC"}="AAGG"; - $revHash{"GCTG"}="AGCC"; - $revHash{"TATC"}="ATAG"; - $revHash{"TAAT"}="AATT"; - $revHash{"ACT"}="AGT"; - $revHash{"TCGC"}="AGCG"; - $revHash{"GGT"}="ACC"; - $revHash{"TCC"}="AGG"; - $revHash{"TTGT"}="AAAC"; - $revHash{"TGAC"}="AGTC"; - $revHash{"TTAC"}="AAGT"; - $revHash{"CGT"}="ACG"; - $revHash{"ATTA"}="AATT"; - $revHash{"ATTG"}="AATC"; - $revHash{"CCTC"}="AGGG"; - $revHash{"CCGG"}="GGCC"; - $revHash{"CCGT"}="ACGG"; - $revHash{"TCCA"}="ATGG"; - $revHash{"CGCC"}="GGGC"; - $revHash{"GT"}="AC"; - $revHash{"TTCG"}="AACG"; - $revHash{"CCTG"}="AGGC"; - $revHash{"TCT"}="AAG"; - $revHash{"GTAT"}="ATAC"; - $revHash{"GTCT"}="AGAC"; - $revHash{"GCTA"}="AGCT"; - $revHash{"TACC"}="AGGT"; -} - - -sub allCaps{ - my $motif = $_[0]; - $motif =~ s/a/A/g; - $motif =~ s/c/C/g; - $motif =~ s/t/T/g; - $motif =~ s/g/G/g; - return $motif; -} - - -sub all_caps{ - my @strand = split(/\s*/,$_[0]); - for my $i (0 ... $#strand){ - if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;} - if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;} - if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;} - if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;} - } - return join("",@strand); -} -sub array_mean{ - return "NA" if scalar(@_) == 0; - my $sum = 0; - foreach my $val (@_){ - $sum = $sum + $val; - } - return ($sum/scalar(@_)); -} -sub array_sum{ - return "NA" if scalar(@_) == 0; - my $sum = 0; - foreach my $val (@_){ - $sum = $sum + $val; - } - return ($sum); -} - -sub variance{ - return "NA" if scalar(@_) == 0; - return 0 if scalar(@_) == 1; - my $mean = array_mean(@_); - my $num = 0; - return 0 if scalar(@_) == 1; -# print "mean = $mean .. array = >@_<\n"; - foreach my $ele (@_){ - # print "$num = $num + ($ele-$mean)*($ele-$mean)\n"; - $num = $num + ($ele-$mean)*($ele-$mean); - } - my $var = $num / scalar(@_); - return $var; -} - -sub array_95confIntervals{ - return "NA" if scalar(@_) <= 0; - my @sorted = sort { $a <=> $b } @_; -# print "@sorted=",scalar(@sorted), "\n"; - my $aDeechNo = int((scalar(@sorted) * 2.5) / 100); - my $saaDeNo = int((scalar(@sorted) * 97.5) / 100); - - return ($sorted[$aDeechNo], $sorted[$saaDeNo]); -} - -sub array_median{ - return "NA" if scalar(@_) == 0; - return $_[0] if scalar(@_) == 1; - my @sorted = sort { $a <=> $b } @_; - my $totalno = scalar(@sorted); - - #print "sorted = @sorted\n"; - - my $pick = (); - if ($totalno % 2 == 1){ - #print "odd set .. totalno = $totalno\n"; - my $mid = $totalno / 2; - my $onehalfno = $mid - $mid % 1; - my $secondhalfno = $onehalfno + 1; - my $onehalf = $sorted[$onehalfno-1]; - my $secondhalf = $sorted[$secondhalfno-1]; - #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n"; - - $pick = $secondhalf; - } - else{ - #print "even set .. 
totalno = $totalno\n"; - my $mid = $totalno / 2; - my $onehalfno = $mid; - my $secondhalfno = $onehalfno + 1; - my $onehalf = $sorted[$onehalfno-1]; - my $secondhalf = $sorted[$secondhalfno-1]; - #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n"; - $pick = ($onehalf + $secondhalf )/2; - - } - #print "pick = $pick..\n"; - return $pick; - -} - - -sub array_numerical_sort{ - return "NA" if scalar(@_) == 0; - my @sorted = sort { $a <=> $b } @_; - return (@sorted); -} - -sub array_smallest_number{ - return "NA" if scalar(@_) == 0; - return $_[0] if scalar(@_) == 1; - my @sorted = sort { $a <=> $b } @_; - return $sorted[0]; -} - - -sub array_largest_number{ - return "NA" if scalar(@_) == 0; - return $_[0] if scalar(@_) == 1; - my @sorted = sort { $a <=> $b } @_; - return $sorted[$#sorted]; -} - - -sub array_largest_number_arrayPosition{ - return "NA" if scalar(@_) == 0; - return 0 if scalar(@_) == 1; - my $maxpos = 0; - my @maxposes = (); - my @maxvals = (); - my $maxval = array_smallest_number(@_); - for my $i (0 ... $#_){ - if ($_[$i] > $maxval){ - $maxval = $_[$i]; - $maxpos = $i; - } - if ($_[$i] == $maxval){ - $maxval = $_[$i]; - if (scalar(@maxposes) == 0){ - push @maxposes, $i; - push @maxvals, $_[$i]; - - } - elsif ($maxvals[0] == $maxval){ - push @maxposes, $i; - push @maxvals, $_[$i]; - } - else{ - @maxposes = (); @maxvals = (); - push @maxposes, $i; - push @maxvals, $_[$i]; - } - - } - - } - return $maxpos if scalar(@maxposes) < 2; - return (@maxposes); -} - -sub array_smallest_number_arrayPosition{ - return "NA" if scalar(@_) == 0; - return 0 if scalar(@_) == 1; - my $minpos = 0; - my @minposes = (); - my @minvals = (); - my $minval = array_largest_number(@_); - my $maxval = array_smallest_number(@_); - #print "starting with $maxval, ending with $minval\n"; - for my $i (0 ... $#_){ - if ($_[$i] < $minval){ - $minval = $_[$i]; - $minpos = $i; - } - if ($_[$i] == $minval){ - $minval = $_[$i]; - if (scalar(@minposes) == 0){ - push @minposes, $i; - push @minvals, $_[$i]; - - } - elsif ($minvals[0] == $minval){ - push @minposes, $i; - push @minvals, $_[$i]; - } - else{ - @minposes = (); @minvals = (); - push @minposes, $i; - push @minvals, $_[$i]; - } - - } - - } - #print "minposes=@minposes\n"; - - return $minpos if scalar(@minposes) < 2; - return (@minposes); -} - -sub basic_stats{ - my @arr = @_; -# print " array_smallest_number= ", array_smallest_number(@arr)," array_largest_number= ", array_largest_number(@arr), " array_mean= ",array_mean(@arr),"\n"; - return ":"; -} -#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx - -sub maftoAxt_multispecies { - my $printer = 0; -# print "in maftoAxt_multispecies : got @_\n"; - my $fname=$_[0]; - open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n"; - my $treedefinition = $_[1]; - open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n"; - my $counter = 0; - my $exactspeciesset = $_[3]; - my @exactspeciesset_unarranged = split(/,/,$exactspeciesset); - - $treedefinition=~s/[\)\(, ]/\t/g; - my @species=split(/\t+/,$treedefinition); - my @exactspecies=(); - - foreach my $spec (@species){ - foreach my $espec (@exactspeciesset_unarranged){ - push @exactspecies, $spec if $spec eq $espec; - } - } -# print "exactspecies=@exactspecies\n"; - - ########### - my $select = 2; - #select = 1 if all species need sequences to be present for each block otherwise, it is 0 - #select = 2 only the allowed set make up the alignment. 
use the removeset - # information to detect alignments that have other important genomes aligned. - ########### - my @allowedset = (); - @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0; - @allowedset = join("_",0,@species) if $select == 1; - #print "species = @species , allowedset =",join("\n", @allowedset) ," \n"; - @allowedset = join("_",0,@exactspecies) if $select == 2; - #print "allowedset = @allowedset and exactspecies = @exactspecies\n"; - - my $start = 0; - my @sequences = (); - my @titles = (); - my $species_counter = "0"; - my $countermatch = 0; - my $outsideSpecies=0; - - while(my $line = <IN>){ - next if $line =~ /^#/; - next if $line =~ /^i/; - chomp $line; - #print "$line"; - my @fields = split(/\s+/,$line); - chomp $line; - if ($line =~ /^a /){ - $start = 1; - } - - if ($line =~ /^s /){ - # print "fields1 = $fields[1] , start = $start\n"; - - foreach my $sp (@species){ - if ($fields[1] =~ /$sp/){ - $species_counter = $species_counter."_".$sp; - push(@sequences, $fields[6]); - my @sp_info = split(/\./,$fields[1]); - my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]); - push(@titles, $title); - - } - } - } - - if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start == 1)){ - - my $arranged = reorderSpecies($species_counter, @species); - my $stopper = 1; - my $arrno = 0; - foreach my $set (@allowedset){ - if ($arranged eq $set){ - # print "$arranged == $set\n"; - $stopper = 0; last; - } - $arrno++; - } - - if ($stopper == 0) { - # print " accepted\n"; - @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged; - - @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged; - my $filteredseq = filter_gaps(@sequences); - - if ($filteredseq ne "SHORT"){ - $counter++; - print OUT join (" ",$counter, @titles), "\n"; - print OUT $filteredseq, "\n"; - print OUT "\n"; - $countermatch++; - } - # my @filtered_seq = split(/\t/,filter_gaps(@sequences) ); - } - else{#print "\n"; - } - - @sequences = (); @titles = (); $start = 0;$species_counter = "0"; - next; - - } - } -# print "countermatch = $countermatch\n"; -} - -sub reorderSpecies{ - my @inarr=@_; - my $currSpecies = shift (@inarr); - my $ordered_species = 0; - my @species=@inarr; - foreach my $order (@species){ - $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/; - } - return $ordered_species; - -} - -sub filter_gaps{ - my @sequences = @_; -# print "sequences sent are @sequences\n"; - my $seq_length = length($sequences[0]); - my $seq_no = scalar(@sequences); - my $allgaps = (); - for (1 ... $seq_no){ - $allgaps = $allgaps."-"; - } - - my @seq_array = (); - my $seq_counter = 0; - foreach my $seq (@sequences){ -# my @sequence = split(/\s*/,$seq); - $seq_array[$seq_counter] = [split(/\s*/,$seq)]; -# push @seq_array, [@sequence]; - $seq_counter++; - } - my $g = 0; - while ( $g < $seq_length){ - last if (!exists $seq_array[0][$g]); - my $bases = (); - for my $u (0 ... 
$#seq_array){ - $bases = $bases.$seq_array[$u][$g]; - } -# print $bases, "\n"; - if ($bases eq $allgaps){ -# print "bases are $bases, position is $g \n"; - for my $seq (@seq_array){ - splice(@$seq , $g, 1); - } - } - else { - $g++; - } - } - - my @outs = (); - - foreach my $seq (@seq_array){ - push(@outs, join("",@$seq)); - } - return "SHORT" if length($outs[0]) <=100; - return (join("\n", @outs)); -} - - -sub allowedSetOfSpecies{ - my @allowed_species = split(/_/,$_[0]); - unshift @allowed_species, 0; -# print "allowed set = @allowed_species \n"; - my @output = (); - for (0 ... scalar(@allowed_species) - 4){ - push(@output, join("_",@allowed_species)); - pop @allowed_species; - } - return join(";",reverse(@output)); - -} - - -sub orderInfo{ - my @info = split(/;/,$_[0]); -# print "info = @info"; - my @old = split(/_/,$_[1]); - my @new = split(/_/,$_[2]); - shift @old; shift @new; - my @outinfo = (); - foreach my $spe (@new){ - for my $no (0 ... $#old){ - if ($spe eq $old[$no]){ - push(@outinfo, $info[$no]); - } - } - } -# print "outinfo = @outinfo \n"; - return join(";", @outinfo); -} - -#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx - -sub printarr { - print ">::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n"; - foreach my $line (@_) {print "$line\n";} - print "::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::<\n"; -} - diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/microsatellite_birthdeath.xml --- a/tools/regVariation/microsatellite_birthdeath.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ -<tool id="microsatellite_birthdeath" name="Identify microsatellite births and deaths" version="1.0.0"> - <description> and causal mutational mechanisms from previously identified orthologous microsatellite sets</description> - <command interpreter="perl"> - microsatellite_birthdeath.pl - $alignment - $orthfile - $outfile - ${alignment.metadata.species} - "$tree_definition" - $thresholds - $separation - $simthresh - - </command> - <inputs> - <page> - <param format="maf" name="alignment" type="data" label="Select MAF alignments"/> - - <param format="txt" name="orthfile" type="data" label="Select raw microsatellite data"/> - - <param name="tree_definition" size="200" type="text" value= "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)" label="Tree definition of all species in the alignment, whether or not they were selected for microsatellite extraction" - help="For example: ((((hg18,panTro2),ponAbe2),rheMac2),calJac1)"/> - - <param name="separation" size="10" type="integer" value="40" label="Total length of flanking DNA used for sequence-similarity comparisons among species" - help="A value of 40 means: 20 bp upstream and 20 bp downstream DNA will be used for similarity comparisons."/> - - <param name="thresholds" size="15" type="text" value="9,10,12,12" label="Minimum threshold for the number of repeats for microsatellites" - help="A value of 9,10,12,12 means: all mononucleotides having fewer than 9 repeats, dinucleotides having fewer than 10 repeats, trinucleotides having fewer than 12 repeats, and tetranucleotides having fewer than 12 repeats will be excluded from the output."/> - - <param name="simthresh" size="10" type="integer" value="80" label="Percent sequence similarity of flanking regions (of the same length as the separation distance above)" - help="Enter a value from 0 to 100"/> - - - </page> - </inputs> - <outputs> - <data format="txt" name="outfile" metadata_source="orthfile"/> - 
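<!-- A worked example of the thresholds format above (the (CA) runs are hypothetical illustrations, not taken from the tool): thresholds="9,10,12,12" sets per-motif minimum repeat counts of mono=9, di=10, tri=12 and tetra=12, so a (CA)9 run would be excluded while a (CA)10 run is kept. -->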
</outputs> - <tests> - <test> - <param name="alignment" value="chr22_5sp.maf"/> - <param name="orthfile" value="chr22_5sp.microraw.tabular"/> - <param name="thresholds" value="9,10,12,12"/> - <param name="tree_definition" value="((((hg18, panTro2), ponAbe2), rheMac2), calJac1)"/> - <param name="separation" value="40"/> - <param name="simthresh" value="80"/> - <output name="outfile" file="chr22_5sp.microtab.tabular"/> - </test> - </tests> - - - <help> - -.. class:: infomark - -**What it does** - -This tool uses raw orthologous microsatellite clusters (identified by the tool "Extract orthologous microsatellites") to identify microsatellite births and deaths along individual lineages of a phylogenetic tree. - -</help> - - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/microsats_alignment_level.py --- a/tools/regVariation/microsats_alignment_level.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,323 +0,0 @@ - #!/usr/bin/env python -#Guruprasad Ananda -""" -Uses SPUTNIK to fetch microsatellites and extracts orthologous repeats from the sputnik output. -""" -from galaxy import eggs -import sys, os, tempfile, string, math, re - -def reverse_complement(text): - DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) - comp = [ch for ch in text.translate(DNA_COMP)] - comp.reverse() - return "".join(comp) - -def main(): - if len(sys.argv) != 8: - print >>sys.stderr, "Insufficient number of arguments." - sys.exit() - - infile = open(sys.argv[1],'r') - separation = int(sys.argv[2]) - outfile = sys.argv[3] - align_type = sys.argv[4] - if align_type == "2way": - align_type_len = 2 - elif align_type == "3way": - align_type_len = 3 - mono_threshold = int(sys.argv[5]) - non_mono_threshold = int(sys.argv[6]) - allow_different_units = int(sys.argv[7]) - - print "Min distance = %d bp; Min threshold for mono repeats = %d; Min threshold for non-mono repeats = %d; Allow different motifs = %s" %(separation, mono_threshold, non_mono_threshold, allow_different_units==1) - try: - fout = open(outfile, "w") - print >>fout, "#Block\tSeq1_Name\tSeq1_Start\tSeq1_End\tSeq1_Type\tSeq1_Length\tSeq1_RepeatNumber\tSeq1_Unit\tSeq2_Name\tSeq2_Start\tSeq2_End\tSeq2_Type\tSeq2_Length\tSeq2_RepeatNumber\tSeq2_Unit" - #sputnik_cmd = os.path.join(os.path.split(sys.argv[0])[0], "sputnik") - sputnik_cmd = "sputnik" - input = infile.read() - skipped = 0 - block_num = 0 - input = input.replace('\r','\n') - for block in input.split('\n\n'): - block_num += 1 - tmpin = tempfile.NamedTemporaryFile() - tmpout = tempfile.NamedTemporaryFile() - tmpin.write(block.strip()) - tmpin.flush() #make sure the block is on disk before sputnik reads the file - cmdline = sputnik_cmd + " " + tmpin.name + " > /dev/null 2>&1 >> " + tmpout.name - try: - os.system(cmdline) - except Exception, es: - continue - sputnik_out = tmpout.read() - tmpin.close() - tmpout.close() - if sputnik_out != "": - if len(block.split('>')[1:]) != 2: #len(sputnik_out.split('>')): - skipped += 1 - continue - align_block = block.strip().split('>') - - lendict = {'mononucleotide':1, 'dinucleotide':2, 'trinucleotide':3, 'tetranucleotide':4, 'pentanucleotide':5, 'hexanucleotide':6} - blockdict={} - r=0 - namelist=[] - for k,sput_block in enumerate(sputnik_out.split('>')[1:]): - whole_seq = ''.join(align_block[k+1].split('\n')[1:]).replace('\n','').strip() - p = re.compile('\n(\S*nucleotide)') - repeats = p.split(sput_block.strip()) - repeats_count = len(repeats) - j = 1 - name = repeats[0].strip() - try: - coords = re.search('\d+[-_:]\d+',name).group() - coords = 
coords.replace('_','-').replace(':','-') - except Exception, e: - coords = '0-0' - pass - r += 1 - blockdict[r]={} - try: - sp_name = name[:name.index('.')] - chr_name = name[name.index('.'):name.index('(')] - namelist.append(sp_name + chr_name) - except: - namelist.append(name[:20]) - while j < repeats_count: - try: - if repeats[j].strip() not in lendict: - j += 2 - continue - - if blockdict[r].has_key('types'): - blockdict[r]['types'].append(repeats[j].strip()) #type of microsat - else: - blockdict[r]['types'] = [repeats[j].strip()] #type of microsat - - sequence = ''.join(align_block[r].split('\n')[1:]).replace('\n','').strip() - start = int(repeats[j+1].split('--')[0].split(':')[0].strip()) - #check to see if there are gaps before the start of the repeat, and change the start accordingly - sgaps = 0 - ch_pos = start - 1 - while ch_pos >= 0: - if whole_seq[ch_pos] == '-': - sgaps += 1 - else: - break #break at the 1st non-gap character - ch_pos -= 1 - if blockdict[r].has_key('starts'): - blockdict[r]['starts'].append(start+sgaps) #start co-ords adjusted with alignment co-ords to include GAPS - else: - blockdict[r]['starts'] = [start+sgaps] - - end = int(repeats[j+1].split('--')[0].split(':')[1].strip()) - #check to see if there are gaps after the end of the repeat, and change the end accordingly - egaps = 0 - for ch in whole_seq[end:]: - if ch == '-': - egaps += 1 - else: - break #break at the 1st non-gap character - if blockdict[r].has_key('ends'): - blockdict[r]['ends'].append(end+egaps) #end co-ords adjusted with alignment co-ords to include GAPS - else: - blockdict[r]['ends'] = [end+egaps] - - repeat_seq = ''.join(repeats[j+1].replace('\r','\n').split('\n')[1:]).strip() #Repeat Sequence - repeat_len = repeats[j+1].split('--')[1].split()[1].strip() - gap_count = repeat_seq.count('-') - #print repeats[j+1].split('--')[1], len(repeat_seq), repeat_len, gap_count - repeat_len = str(int(repeat_len) - gap_count) - - rel_start = blockdict[r]['starts'][-1] - gaps_before_start = whole_seq[:rel_start].count('-') - - if blockdict[r].has_key('gaps_before_start'): - blockdict[r]['gaps_before_start'].append(gaps_before_start) #gaps before start - else: - blockdict[r]['gaps_before_start'] = [gaps_before_start] #gaps before start - - whole_seq_start= int(coords.split('-')[0]) - if blockdict[r].has_key('whole_seq_start'): - blockdict[r]['whole_seq_start'].append(whole_seq_start) #start co-ord of the whole aligned fragment - else: - blockdict[r]['whole_seq_start'] = [whole_seq_start] #start co-ord of the whole aligned fragment - - if blockdict[r].has_key('lengths'): - blockdict[r]['lengths'].append(repeat_len) #lengths - else: - blockdict[r]['lengths'] = [repeat_len] #lengths - - if blockdict[r].has_key('counts'): - blockdict[r]['counts'].append(str(int(repeat_len)/lendict[repeats[j].strip()])) #repeat count - else: - blockdict[r]['counts'] = [str(int(repeat_len)/lendict[repeats[j].strip()])] #repeat count - - if blockdict[r].has_key('units'): - blockdict[r]['units'].append(repeat_seq[:lendict[repeats[j].strip()]]) #Repeat Unit - else: - blockdict[r]['units'] = [repeat_seq[:lendict[repeats[j].strip()]]] #Repeat Unit - - except Exception, eh: - pass - j+=2 - #check the co-ords of all repeats corresponding to a sequence and remove adjacent repeats separated by less than the user-specified 'separation'. 
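# [A minimal standalone sketch of the marking step below, added for illustration;
#  the helper name is hypothetical and is not used elsewhere in this script.]
def mark_close_repeats_sketch(starts, ends, separation):
    # Return the indices of repeats lying within `separation` bp of a
    # neighbouring repeat, given parallel lists of start and end co-ords.
    marked = set()
    for ind in range(len(ends) - 1):
        if starts[ind + 1] - ends[ind] < separation:
            marked.add(ind)
            marked.add(ind + 1)
    return marked
# e.g. mark_close_repeats_sketch([0, 12, 100], [8, 20, 110], 10) -> set([0, 1])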
- delete_index_list = [] - for ind, item in enumerate(blockdict[r]['ends']): - try: - if blockdict[r]['starts'][ind+1]-item < separation: - if ind not in delete_index_list: - delete_index_list.append(ind) - if ind+1 not in delete_index_list: - delete_index_list.append(ind+1) - except Exception, ek: - pass - for index in delete_index_list: #mark them for deletion - try: - blockdict[r]['starts'][index] = 'marked' - blockdict[r]['ends'][index] = 'marked' - blockdict[r]['types'][index] = 'marked' - blockdict[r]['gaps_before_start'][index] = 'marked' - blockdict[r]['whole_seq_start'][index] = 'marked' - blockdict[r]['lengths'][index] = 'marked' - blockdict[r]['counts'][index] = 'marked' - blockdict[r]['units'][index] = 'marked' - except Exception, ej: - pass - #remove 'marked' elements from all the lists - """ - for key in blockdict[r].keys(): - for elem in blockdict[r][key]: - if elem == 'marked': - blockdict[r][key].remove(elem) - """ - #print blockdict - - #make sure that the blockdict has keys for both the species - if (1 not in blockdict) or (2 not in blockdict): - continue - - visited_2 = [0 for x in range(len(blockdict[2]['starts']))] - for ind1,coord_s1 in enumerate(blockdict[1]['starts']): - if coord_s1 == 'marked': - continue - coord_e1 = blockdict[1]['ends'][ind1] - out = [] - for ind2,coord_s2 in enumerate(blockdict[2]['starts']): - if coord_s2 == 'marked': - visited_2[ind2] = 1 - continue - coord_e2 = blockdict[2]['ends'][ind2] - #skip if the 2 repeats are not of the same type or don't have the same repeating unit. - if allow_different_units == 0: - if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]): - continue - else: - if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2) and (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2): - continue - #print >>sys.stderr, (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2) - #skip if the repeat number thresholds are not met - if blockdict[1]['types'][ind1] == 'mononucleotide': - if (int(blockdict[1]['counts'][ind1]) < mono_threshold): - continue - else: - if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold): - continue - - if blockdict[2]['types'][ind2] == 'mononucleotide': - if (int(blockdict[2]['counts'][ind2]) < mono_threshold): - continue - else: - if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold): - continue - #print "s1,e1=%s,%s; s2,e2=%s,%s" %(coord_s1,coord_e1,coord_s2,coord_e2) - if (coord_s1 in range(coord_s2,coord_e2)) or (coord_e1 in range(coord_s2,coord_e2)): - out.append(str(block_num)) - out.append(namelist[0]) - rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1] - rel_end = rel_start + int(blockdict[1]['lengths'][ind1]) - out.append(str(rel_start)) - out.append(str(rel_end)) - out.append(blockdict[1]['types'][ind1]) - out.append(blockdict[1]['lengths'][ind1]) - out.append(blockdict[1]['counts'][ind1]) - out.append(blockdict[1]['units'][ind1]) - out.append(namelist[1]) - rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2] - rel_end = rel_start + int(blockdict[2]['lengths'][ind2]) - out.append(str(rel_start)) - out.append(str(rel_end)) - out.append(blockdict[2]['types'][ind2]) - out.append(blockdict[2]['lengths'][ind2]) - out.append(blockdict[2]['counts'][ind2]) - out.append(blockdict[2]['units'][ind2]) - print >>fout, '\t'.join(out) - visited_2[ind2] = 1 - out=[] - - if 0 in visited_2: #there are still some 
elements in 2nd set which haven't found orthologs yet. - for ind2, coord_s2 in enumerate(blockdict[2]['starts']): - if coord_s2 == 'marked': - continue - if visited_2[ind2] != 0: - continue - coord_e2 = blockdict[2]['ends'][ind2] - out = [] - for ind1,coord_s1 in enumerate(blockdict[1]['starts']): - if coord_s1 == 'marked': - continue - coord_e1 = blockdict[1]['ends'][ind1] - #skip if the 2 repeats are not of the same type or don't have the same repeating unit. - if allow_different_units == 0: - if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]): - continue - else: - if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2):# and reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2: - continue - #skip if the repeat number thresholds are not met - if blockdict[1]['types'][ind1] == 'mononucleotide': - if (int(blockdict[1]['counts'][ind1]) < mono_threshold): - continue - else: - if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold): - continue - - if blockdict[2]['types'][ind2] == 'mononucleotide': - if (int(blockdict[2]['counts'][ind2]) < mono_threshold): - continue - else: - if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold): - continue - - if (coord_s2 in range(coord_s1,coord_e1)) or (coord_e2 in range(coord_s1,coord_e1)): - out.append(str(block_num)) - out.append(namelist[0]) - rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1] - rel_end = rel_start + int(blockdict[1]['lengths'][ind1]) - out.append(str(rel_start)) - out.append(str(rel_end)) - out.append(blockdict[1]['types'][ind1]) - out.append(blockdict[1]['lengths'][ind1]) - out.append(blockdict[1]['counts'][ind1]) - out.append(blockdict[1]['units'][ind1]) - out.append(namelist[1]) - rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2] - rel_end = rel_start + int(blockdict[2]['lengths'][ind2]) - out.append(str(rel_start)) - out.append(str(rel_end)) - out.append(blockdict[2]['types'][ind2]) - out.append(blockdict[2]['lengths'][ind2]) - out.append(blockdict[2]['counts'][ind2]) - out.append(blockdict[2]['units'][ind2]) - print >>fout, '\t'.join(out) - visited_2[ind2] = 1 - out=[] - - #print >>fout, blockdict - except Exception, exc: - print >>sys.stderr, "type(exc),args,exc: %s, %s, %s" %(type(exc), exc.args, exc) - -if __name__ == "__main__": - main() - diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/microsats_alignment_level.xml --- a/tools/regVariation/microsats_alignment_level.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -<tool id="microsats_align1" name="Extract Orthologous Microsatellites"> - <description> from pair-wise alignments</description> - <command interpreter="python"> - microsats_alignment_level.py $input1 $separation $out_file1 "2way" $mono_threshold $non_mono_threshold $allow_different_units - </command> - <inputs> - <page> - <param format="fasta" name="input1" type="data" label="Select data"/> - <param name="separation" size="10" type="integer" value="10" label="Minimum base pair distance between adjacent microsatellites" - help="A value of 10 means: Adjacent microsatellites separated by less than 10 base pairs will be excluded from the output."/> - <param name="mono_threshold" size="10" type="integer" value="9" label="Minimum Threshold for the number of repeats for mononucleotide microsatellites" - help="A value of 9 means: All mononucleotide microsatellites having fewer than 9 repeats will 
be excluded from the output."/> - <param name="non_mono_threshold" size="10" type="integer" value="4" label="Minimum Threshold for the number of repeats for non-mononucleotide microsatellites" - help="A value of 4 means: All non-mononucleotide microsatellites having fewer than 4 repeats will be excluded from the output."/> - <param name="allow_different_units" size="5" type="select" label="Allow orthologous positions to have different microsatellite repeat units/motifs?"> - <option value="0" selected="true">No</option> - <option value="1">Yes</option> - </param> - </page> - </inputs> - <outputs> - <data format="tabular" name="out_file1" metadata_source="input1"/> - </outputs> - <requirements> - <requirement type="package">sputnik</requirement> - </requirements> - <tests> - <test> - <param name="input1" value="2way.maf"/> - <param name="separation" value="10"/> - <param name="mono_threshold" value="9"/> - <param name="non_mono_threshold" value="4"/> - <param name="allow_different_units" value="0"/> - <output name="out_file1" file="ortho_ms.tab"/> - </test> - </tests> - - <help> - -.. class:: infomark - -**What it does** - -This tool uses a modified version of SPUTNIK to fetch microsatellite repeats from the input fasta sequences and extracts orthologous repeats from the sputnik output. The modified version allows detection of mononucleotide microsatellites. More information on SPUTNIK can be found on this website_. The modified version is available here_. - ------ - -.. class:: warningmark - -**Note** - -- Any block/s not containing exactly 2 species will be omitted. - -- This tool will filter out microsatellites based on the user input values for minimum distance and repeat number thresholds. Further, this tool will also filter out microsatellites that have no orthologous microsatellites in one of the species. - -.. _website: http://espressosoftware.com/pages/sputnik.jsp -.. _here: http://www.bx.psu.edu/svn/universe/dependencies/sputnik/ -</help> - - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/microsats_mutability.py --- a/tools/regVariation/microsats_mutability.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,489 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda -""" -This tool computes microsatellite mutability for the orthologous microsatellites fetched from 'Extract Orthologous Microsatellites from pair-wise alignments' tool. 
-""" -from galaxy import eggs -import sys, string, re, commands, tempfile, os, fileinput -from galaxy.tools.util.galaxyops import * -from bx.intervals.io import * -from bx.intervals.operations import quicksect - -fout = open(sys.argv[2],'w') -p_group = int(sys.argv[3]) #primary "group-by" feature -p_bin_size = int(sys.argv[4]) -s_group = int(sys.argv[5]) #sub-group by feature -s_bin_size = int(sys.argv[6]) -mono_threshold = 9 -non_mono_threshold = 4 -p_group_cols = [p_group, p_group+7] -s_group_cols = [s_group, s_group+7] -num_generations = int(sys.argv[7]) -region = sys.argv[8] -int_file = sys.argv[9] -if int_file != "None": #User has specified an interval file - try: - fint = open(int_file, 'r') - dbkey_i = sys.argv[10] - chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[11] ) - except: - stop_err("Unable to open input Interval file") - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def reverse_complement(text): - DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) - comp = [ch for ch in text.translate(DNA_COMP)] - comp.reverse() - return "".join(comp) - -def get_unique_elems(elems): - seen=set() - return[x for x in elems if x not in seen and not seen.add(x)] - -def get_binned_lists(uniqlist, binsize): - binnedlist=[] - uniqlist.sort() - start = int(uniqlist[0]) - bin_ind=0 - l_ind=0 - binnedlist.append([]) - while l_ind < len(uniqlist): - elem = int(uniqlist[l_ind]) - if elem in range(start,start+binsize): - binnedlist[bin_ind].append(elem) - else: - start += binsize - bin_ind += 1 - binnedlist.append([]) - binnedlist[bin_ind].append(elem) - l_ind += 1 - return binnedlist - -def fetch_weight(H,C,t): - if (H-(C-H)) < t: - return 2.0 - else: - return 1.0 - -def mutabilityEstimator(repeats1,repeats2,thresholds): - mut_num = 0.0 #Mutability Numerator - mut_den = 0.0 #Mutability denominator - for ind,H in enumerate(repeats1): - C = repeats2[ind] - t = thresholds[ind] - w = fetch_weight(H,C,t) - mut_num += ((H-C)*(H-C)*w) - mut_den += w - return [mut_num, mut_den] - -def output_writer(blk, blk_lines): - global winspecies, speciesind - all_elems_1=[] - all_elems_2=[] - all_s_elems_1=[] - all_s_elems_2=[] - for bline in blk_lines: - if not(bline): - continue - items = bline.split('\t') - seq1 = items[1] - start1 = items[2] - end1 = items[3] - seq2 = items[8] - start2 = items[9] - end2 = items[10] - if p_group_cols[0] == 6: - items[p_group_cols[0]] = int(items[p_group_cols[0]]) - items[p_group_cols[1]] = int(items[p_group_cols[1]]) - if s_group_cols[0] == 6: - items[s_group_cols[0]] = int(items[s_group_cols[0]]) - items[s_group_cols[1]] = int(items[s_group_cols[1]]) - all_elems_1.append(items[p_group_cols[0]]) #primary col elements for species 1 - all_elems_2.append(items[p_group_cols[1]]) #primary col elements for species 2 - if s_group_cols[0] != -1: #sub-group is not None - all_s_elems_1.append(items[s_group_cols[0]]) #secondary col elements for species 1 - all_s_elems_2.append(items[s_group_cols[1]]) #secondary col elements for species 2 - uniq_elems_1 = get_unique_elems(all_elems_1) - uniq_elems_2 = get_unique_elems(all_elems_2) - if s_group_cols[0] != -1: - uniq_s_elems_1 = get_unique_elems(all_s_elems_1) - uniq_s_elems_2 = get_unique_elems(all_s_elems_2) - mut1={} - mut2={} - count1 = {} - count2 = {} - """ - if p_group_cols[0] == 7: #i.e. the option chosen is group-by unit(AG, GTC, etc) - uniq_elems_1 = get_unique_units(j.sort(lambda x, y: len(x)-len(y))) - """ - if p_group_cols[0] == 6: #i.e. the option chosen is group-by repeat number. 
- uniq_elems_1 = get_binned_lists(uniq_elems_1,p_bin_size) - uniq_elems_2 = get_binned_lists(uniq_elems_2,p_bin_size) - - if s_group_cols[0] == 6: #i.e. the option chosen is subgroup-by repeat number. - uniq_s_elems_1 = get_binned_lists(uniq_s_elems_1,s_bin_size) - uniq_s_elems_2 = get_binned_lists(uniq_s_elems_2,s_bin_size) - - for pitem1 in uniq_elems_1: - repeats1 = [] - repeats2 = [] - thresholds = [] - if s_group_cols[0] != -1: #Sub-group by feature is not None - for sitem1 in uniq_s_elems_1: - repeats1 = [] - repeats2 = [] - if type(sitem1) == type(''): - sitem1 = sitem1.strip() - for bline in blk_lines: - belems = bline.split('\t') - if type(pitem1) == list: - if p_group_cols[0] == 6: - belems[p_group_cols[0]] = int(belems[p_group_cols[0]]) - if belems[p_group_cols[0]] in pitem1: - if belems[s_group_cols[0]]==sitem1: - repeats1.append(int(belems[6])) - repeats2.append(int(belems[13])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut1[str(pitem1)+'\t'+str(sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) - if region == 'align': - count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1) - elif winspecies == 2: - count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2) - else: - if type(sitem1) == list: - if s_group_cols[0] == 6: - belems[s_group_cols[0]] = int(belems[s_group_cols[0]]) - if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]] in sitem1: - repeats1.append(int(belems[6])) - repeats2.append(int(belems[13])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) - if region == 'align': - count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats1) - elif winspecies == 2: - count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats2) - else: - if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]]==sitem1: - repeats1.append(int(belems[6])) - repeats2.append(int(belems[13])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) - if region == 'align': - count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1) - elif winspecies == 2: - count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2) - else: #Sub-group by feature is None - for bline in blk_lines: - belems = bline.split('\t') - if type(pitem1) == list: - #print >>sys.stderr, "item: " + str(item1) - if p_group_cols[0] == 6: - belems[p_group_cols[0]] = int(belems[p_group_cols[0]]) - if belems[p_group_cols[0]] in pitem1: - repeats1.append(int(belems[6])) - repeats2.append(int(belems[13])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - else: - if belems[p_group_cols[0]]==pitem1: - repeats1.append(int(belems[6])) - repeats2.append(int(belems[13])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut1["%s" %(pitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) - if region == 'align': - count1["%s" 
%(pitem1)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count1[str(pitem1)]=sum(repeats1) - elif winspecies == 2: - count1[str(pitem1)]=sum(repeats2) - - for pitem2 in uniq_elems_2: - repeats1 = [] - repeats2 = [] - thresholds = [] - if s_group_cols[0] != -1: #Sub-group by feature is not None - for sitem2 in uniq_s_elems_2: - repeats1 = [] - repeats2 = [] - if type(sitem2)==type(''): - sitem2 = sitem2.strip() - for bline in blk_lines: - belems = bline.split('\t') - if type(pitem2) == list: - if p_group_cols[0] == 6: - belems[p_group_cols[1]] = int(belems[p_group_cols[1]]) - if belems[p_group_cols[1]] in pitem2 and belems[s_group_cols[1]]==sitem2: - repeats2.append(int(belems[13])) - repeats1.append(int(belems[6])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) - #count2[str(pitem2)+'\t'+str(sitem2)]=len(repeats2) - if region == 'align': - count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2) - elif winspecies == 2: - count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1) - else: - if type(sitem2) == list: - if s_group_cols[0] == 6: - belems[s_group_cols[1]] = int(belems[s_group_cols[1]]) - if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]] in sitem2: - repeats2.append(int(belems[13])) - repeats1.append(int(belems[6])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) - if region == 'align': - count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2) - elif winspecies == 2: - count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1) - else: - if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]]==sitem2: - repeats2.append(int(belems[13])) - repeats1.append(int(belems[6])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) - if region == 'align': - count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2) - elif winspecies == 2: - count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1) - else: #Sub-group by feature is None - for bline in blk_lines: - belems = bline.split('\t') - if type(pitem2) == list: - if p_group_cols[0] == 6: - belems[p_group_cols[1]] = int(belems[p_group_cols[1]]) - if belems[p_group_cols[1]] in pitem2: - repeats2.append(int(belems[13])) - repeats1.append(int(belems[6])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - else: - if belems[p_group_cols[1]]==pitem2: - repeats2.append(int(belems[13])) - repeats1.append(int(belems[6])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut2["%s" %(pitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) - if region == 'align': - count2["%s" %(pitem2)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count2["%s" %(pitem2)]=sum(repeats2) - elif winspecies == 2: - count2["%s" %(pitem2)]=sum(repeats1) - for key in mut1.keys(): - if 
key in mut2.keys(): - mut = (mut1[key][0]+mut2[key][0])/(mut1[key][1]+mut2[key][1]) - count = count1[key] - del mut2[key] - else: - unit_found = False - if p_group_cols[0] == 7 or s_group_cols[0] == 7: #if it is Repeat Unit (AG, GCT etc.) check for reverse-complements too - if p_group_cols[0] == 7: - this,other = 0,1 - else: - this,other = 1,0 - groups1 = key.split('\t') - mutn = mut1[key][0] - mutd = mut1[key][1] - count = 0 - for key2 in mut2.keys(): - groups2 = key2.split('\t') - if groups1[other] == groups2[other]: - if groups1[this] in groups2[this]*2 or reverse_complement(groups1[this]) in groups2[this]*2: - #mut = (mut1[key][0]+mut2[key2][0])/(mut1[key][1]+mut2[key2][1]) - mutn += mut2[key2][0] - mutd += mut2[key2][1] - count += int(count2[key2]) - unit_found = True - del mut2[key2] - #break - if unit_found: - mut = mutn/mutd - else: - mut = mut1[key][0]/mut1[key][1] - count = count1[key] - mut = "%.2e" %(mut/num_generations) - if region == 'align': - print >>fout, str(blk) + '\t'+seq1 + '\t' + seq2 + '\t' +key.strip()+ '\t'+str(mut) + '\t'+ str(count) - elif region == 'win': - fout.write("%s\t%s\t%s\t%s\n" %(blk,key.strip(),mut,count)) - fout.flush() - - #catch any remaining repeats, for instance if the orthologous position contained different repeat units - for remaining_key in mut2.keys(): - mut = mut2[remaining_key][0]/mut2[remaining_key][1] - mut = "%.2e" %(mut/num_generations) - count = count2[remaining_key] - if region == 'align': - print >>fout, str(blk) + '\t'+seq1 + '\t'+seq2 + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count) - elif region == 'win': - fout.write("%s\t%s\t%s\t%s\n" %(blk,remaining_key.strip(),mut,count)) - fout.flush() - #print >>fout, blk + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count) - -def counter(node, start, end, report_func): - if start <= node.start < end and start < node.end <= end: - report_func(node) - if node.right: - counter(node.right, start, end, report_func) - if node.left: - counter(node.left, start, end, report_func) - elif node.start < start and node.right: - counter(node.right, start, end, report_func) - elif node.start >= end and node.left and node.left.maxend > start: - counter(node.left, start, end, report_func) - - -def main(): - infile = sys.argv[1] - - for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - - if len( elems ) != 15: - stop_err( "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly." 
) - global winspecies, speciesind - if region == 'win': - if dbkey_i in elems[1]: - winspecies = 1 - speciesind = 1 - elif dbkey_i in elems[8]: - winspecies = 2 - speciesind = 8 - else: - stop_err("The species build corresponding to your interval file is not present in the Microsatellite file.") - - fin = open(infile, 'r') - skipped = 0 - blk=0 - win=0 - linestr="" - - if region == 'win': - - msats = NiceReaderWrapper( fileinput.FileInput( infile ), - chrom_col = speciesind, - start_col = speciesind+1, - end_col = speciesind+2, - strand_col = -1, - fix_strand = True) - msatTree = quicksect.IntervalTree() - for item in msats: - if type( item ) is GenomicInterval: - msatTree.insert( item, msats.linenum, item.fields ) - - for iline in fint: - try: - iline = iline.rstrip('\r\n') - if not(iline) or iline == "": - continue - ielems = iline.strip("\r\n").split('\t') - ichr = ielems[chr_col_i] - istart = int(ielems[start_col_i]) - iend = int(ielems[end_col_i]) - isrc = "%s.%s" %(dbkey_i,ichr) - if isrc not in msatTree.chroms: - continue - result = [] - root = msatTree.chroms[isrc] #root node for the chrom - counter(root, istart, iend, lambda node: result.append( node )) - if not(result): - continue - tmpfile1 = tempfile.NamedTemporaryFile('wb+') - for node in result: - tmpfile1.write("%s\n" % "\t".join( node.other )) - - tmpfile1.seek(0) - output_writer(iline, tmpfile1.readlines()) - except: - skipped+=1 - if skipped: - print "Skipped %d intervals as invalid." %(skipped) - elif region == 'align': - if s_group_cols[0] != -1: - print >>fout, "#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount" - else: - print >>fout, "#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount" - prev_bnum = -1 - try: - for line in fin: - line = line.strip("\r\n") - if not(line) or line == "": - continue - elems = line.split('\t') - try: - assert int(elems[0]) - assert len(elems) == 15 - except: - continue - new_bnum = int(elems[0]) - if new_bnum != prev_bnum: - if prev_bnum != -1: - output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n')) - linestr = line + "\n" - else: - linestr += line - linestr += "\n" - prev_bnum = new_bnum - output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n')) - except Exception, ea: - print >>sys.stderr, ea - skipped += 1 - if skipped: - print "Skipped %d lines as invalid." 
%(skipped) -if __name__ == "__main__": - main() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/microsats_mutability.xml --- a/tools/regVariation/microsats_mutability.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,121 +0,0 @@ -<tool id="microsats_mutability1" name="Estimate microsatellite mutability" version="1.1.0"> - <description>by specified attributes</description> - <command interpreter="python"> - microsats_mutability.py - $input1 - $out_file1 - ${pri_condition.primary_group} - #if $pri_condition.primary_group == "6": - ${pri_condition.binsize} ${pri_condition.subgroup} -1 - #else: - 0 ${pri_condition.sub_condition.subgroup} - #if $pri_condition.sub_condition.subgroup == "6": - ${pri_condition.sub_condition.s_binsize} - #else: - -1 - #end if - #end if - $gens - ${region.type} - #if $region.type == "win": - ${region.input2} $input2.dbkey $input2.metadata.chromCol,$input2.metadata.startCol,$input2.metadata.endCol,$input2.metadata.strandCol - #else: - "None" - #end if - </command> - <inputs> - <page> - <param name="input1" type="data" format="tabular" label="Select dataset containing Orthologous microsatellites"/> - <conditional name="region"> - <param name="type" type="select" label="Estimate rates corresponding to" multiple="false"> - <option value="align">Alignment block</option> - <option value="win">Intervals in your history</option> - </param> - <when value="win"> - <param format="interval" name="input2" type="data" label="Choose intervals"> - <validator type="unspecified_build" /> - </param> - </when> - <when value="align" /> - </conditional> - <param name="gens" size="10" type="integer" value="1" label="Number of generations between the two species in input file"/> - <conditional name="pri_condition"> - <param name="primary_group" type="select" label="Group by" multiple="false"> - <option value="4">Motif type (mono/di/tri etc.)</option> - <option value="7">Repeat Unit (AG, GCT etc.)</option> - <option value="6">Repeat Number </option> - </param> - <when value="6"> - <param name="binsize" size="10" type="integer" value="1" label="Bin-size" help="Bin-size denotes the number of repeat numbers to be considered as a group. Bin-size of 5 will group every 5 consecutive repeat numbers into a group."/> - <param name="subgroup" type="select" label="Sub-group by" multiple="false"> - <option value="-1">None</option> - <option value="4">Motif type (mono/di/tri etc.)</option> - <option value="7">Repeat Unit (AG, GCT etc.)</option> - </param> - </when> - <when value="7"> - <conditional name="sub_condition"> - <param name="subgroup" type="select" label="Sub-group by" multiple="false"> - <option value="-1">None</option> - <option value="4">Motif type (mono/di/tri etc.)</option> - <option value="6">Repeat Number </option> - </param> - <when value="-1"></when> - <when value="4"></when> - <when value="6"> - <param name="s_binsize" size="10" type="integer" value="1" label="Bin size" help="Bin-size denotes the number of repeat numbers to be considered as a group. 
Bin-size of 5 will group every 5 consecutive repeat numbers into a group."/> - </when> - </conditional> - </when> - <when value="4"> - <conditional name="sub_condition"> - <param name="subgroup" type="select" label="Sub-group by" multiple="false"> - <option value="-1">None</option> - <option value="7">Repeat Unit (AG, GCT etc.)</option> - <option value="6">Repeat Number </option> - </param> - <when value="-1"></when> - <when value="7"></when> - <when value="6"> - <param name="s_binsize" size="10" type="integer" value="1" label="Bin size" help="Bin-size denotes the number of repeat numbers to be considered as a group. Bin-size of 5 will group every 5 consecutive repeat numbers into a group."/> - </when> - </conditional> - </when> - </conditional> - </page> - </inputs> - <outputs> - <data format="tabular" name="out_file1" /> - </outputs> - <!-- - <tests> - <test> - <param name="input1" value="ortho_ms.tab"/> - <param name="type" value="align"/> - <param name="gens" value="1"/> - <param name="primary_group" value="4"/> - <param name="sub_condition|subgroup" value="7"/> - <output name="out_file1" file="ortho_ms_mut.tab"/> - </test> - </tests> - --> -<help> -.. class:: infomark - -**What it does** - -This tool computes microsatellite mutability for the orthologous microsatellites fetched from 'Extract Orthologous Microsatellites from pair-wise alignments' tool. - -Mutability is computed according to the method described in the following paper: - -*Webster et al., Microsatellite evolution inferred from human-chimpanzee genomic sequence alignments, Proc Natl Acad Sci 2002 June 25; 99(13): 8748-8753* - ------ - -.. class:: warningmark - -**Note** - -The user-selected group and sub-group features, the computed mutability, and the count of repeats used to compute that mutability are added as columns to the output. -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl --- a/tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5392 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; -use Term::ANSIColor; -use File::Basename; -use IO::Handle; -use Cwd; -use File::Path; -use File::Temp qw/ tempfile tempdir /; -use vars qw($distance @thresholds @tags $printer $mergestarts $mergeends $mergemicros $interrtypecord $microscanned $interrcord $interr_poscord $no_of_interruptionscord $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord $prinkter); - -$ENV{'PATH'} .= ':' . dirname($0); -my $date = `date`; - -my ($mafile, $orthfile, $threshold_array, $species_set, $tree_definition, $separation) = @ARGV; -if (!$mafile or !$orthfile or !$threshold_array or !$separation or !$tree_definition or !$species_set) { die "missing arguments\n"; } - -#------------------------------------------------------------------------------- -# WHICH SPUTNIK USED? 
-my $sputnikpath = (); -$sputnikpath = "sputnik"; -#print "sputnik_Mac-PowerPC non-existant\n" if !-e $sputnikpath; -#exit if !-e $sputnikpath; -#$sputnikpath = "bx-sputnik" ; -#print "ARGV input = @ARGV\n"; -#print "ARGV input :\n mafile=$mafile\n orthfile=$orthfile\n threshold_array=$threshold_array\n species_set=$species_set\n tree_definition=$tree_definition\n separation=$separation\n"; -#------------------------------------------------------------------------------- -# RUNFILE -#------------------------------------------------------------------------------- -$distance = 1; #bp -$distance++; -#------------------------------------------------------------------------------- -# MICROSATELLITE THRESHOLD SETTINGS (LENGTH, BP) -$threshold_array=~ s/,/_/g; -my @thresharr = split("_",$threshold_array); -my @thresholds=@thresharr; -my $mono_threshold = $thresharr[0]; -my $di_threshold = $thresharr[1]; -my $tri_threshold = $thresharr[2]; -my $tetra_threshold = $thresharr[3]; -#my $threshold_array = join("_",($mono_threshold, $di_threshold, $tri_threshold, $tetra_threshold)); -my $tdir = tempdir( CLEANUP => 0 ); -chdir $tdir; -my $dir = getcwd; -#print "current dit=$dir\n"; -#------------------------------------------------------------------------------- -# CREATE AXT FILES IN FORWARD AND REVERSE ORDERS IF NECESSARY -my @chrfiles=(); - -#my $mafile = "/Users/ydk/work/rhesus_microsat/results/galay/align.txt"; #$ARGV[0]; -my $chromt=int(rand(10000)); -my $p_chr=$chromt; - - -my @exactspeciesset_unarranged = split(/,/,$species_set); -$tree_definition=~s/[\)\(, ]/\t/g; -my @treespecies=split(/\t+/,$tree_definition); -my @exactspecies=(); - -foreach my $spec (@treespecies){ - foreach my $espec (@exactspeciesset_unarranged){ - push @exactspecies, $spec if $spec eq $espec; - } -} -#print "exactspecies=@exactspecies\n"; -my $focalspec = $exactspecies[0]; -my $arranged_species_set=join(".",@exactspecies); -my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt"); -#print "sending to maftoAxt_multispecies: $mafile, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n"; -maftoAxt_multispecies($mafile, $tree_definition, $chr_name, $species_set); -#print "done maf to axt conversion\n"; -my $reverse_chr_name = join(".",("chr".$p_chr."r"),$arranged_species_set, "net", "axt"); -artificial_axdata_inverter ($chr_name, $reverse_chr_name); -#print "reverse_chr_name=$reverse_chr_name\n"; -#------------------------------------------------------------------------------- -# FIND THE CORRESPONDING CHIMP CHROMOSOME FROM FILE ORTp_chrS.TXT -foreach my $direct ("reverse_direction","forward_direction"){ - $p_chr=$chromt; - #print "direction = $direct\n"; - $p_chr = $p_chr."r" if $direct eq "reverse_direction"; - $p_chr = $p_chr if $direct eq "forward_direction"; - my $config = $species_set; - $config=~s/,/./g; - my @orgs = split(/\./,$arranged_species_set); - #print "ORGS= @orgs\n"; - my @tag=@orgs; - - - my $tags = join(",", @tag); - my @tags=@tag; - chomp $p_chr; - $tags = join("_", split(/,/, $tags)); - my $pchr = "chr".$p_chr; - - my $ptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1])."-".$threshold_array; - my @sp_tags = (); - - #print "orgs=@orgs, pchr=$pchr, hence, ptag = $ptag\n"; - foreach my $sp (@tag){ - push(@sp_tags, ($sp.".".$ptag)); - } - - my $preptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... 
scalar(@orgs)-1]); - my @presp_tags = (); - - foreach my $sp (@tag){ - push(@presp_tags, ($sp.".".$preptag)); - } - - my $resultdir = ""; - my $orthdir = ""; - my $filtereddir = ""; - my $pipedir = ""; - - my @title_queries = (); - push(@title_queries, "^[0-9]+"); - my $sep="\\s"; - for my $or (0 ... $#orgs){ - my $title = join($sep, ($orgs[$or], "[A-Za-z]+[0-9a-zA-Z]+", "[0-9]+", "[0-9]+", "[\\-\\+]")); - #$title =~ s/chr\\+\\s+\+/chr/g; - push(@title_queries, $title); - } - my $title_query = join($sep, @title_queries); - #print "title_queries=@title_queries\n"; - #print "query = >$title_query<\n"; - #print "orgs = @orgs\n"; - #------------------------------------------------------------------------------- - # GET AXTNET FILES, EDIT THEM AND SPLIT THEM INTO HUMAN AND CHIMP INPUT FILES - my $t1input = $pchr.".".$arranged_species_set.".net.axt"; - - my @t1outputs = (); - - foreach my $sp (@presp_tags){ - push(@t1outputs, $sp."_gap_op"); - } - - multi_species_t1($t1input,$tags,(join(",", @t1outputs)), $title_query); - #print "t1outputs=@t1outputs\n"; - #print "done t1\n"; - #------------------------------------------------------------------------------- - #START T2.PL - - my $stag = (); my $tag1 = (); my $tag2 = (); my $schrs = (); - - for my $t (0 ... scalar(@tags)-1){ - multi_species_t2($t1outputs[$t], $tag[$t]); - } - #------------------------------------------------------------------------------- - #START T2.2.PL - - my @temp_tags = @tag; - - foreach my $sp (@presp_tags){ - my $t2input = $sp."_nogap_op_unrand"; - multi_species_t2_2($t2input, shift(@temp_tags)); - } - undef (@temp_tags); - - #------------------------------------------------------------------------------- - #START SPUTNIK - - my @jobIDs = (); - @temp_tags = @tag; - my @sput_filelist = (); - - foreach my $sp (@presp_tags){ - #print "sp = $sp\n"; - my $sputnikoutput = $pipedir.$sp."_sput_op0"; - my $sputnikinput = $pipedir.$sp."_nogap_op_unrand"; - push(@sput_filelist, $sputnikinput); - my $sputnikcommand = $sputnikpath." ".$sputnikinput." 
> ".$sputnikoutput; - #print "$sputnikcommand\n"; - my @sputnikcommand_system = $sputnikcommand; - system(@sputnikcommand_system); - } - - #------------------------------------------------------------------------------- - #START SPUTNIK OUTPUT CORRECTOR - - foreach my $sp (@presp_tags){ - my $corroutput = $pipedir.$sp."_sput_op1"; - my $corrinput = $pipedir.$sp."_sput_op0"; - sputnikoutput_corrector($corrinput,$corroutput); - - my $t4output = $pipedir.$sp."_sput_op2"; - multi_species_t4($corroutput,$t4output); - - my $t5output = $pipedir.$sp."_sput_op3"; - multi_species_t5($t4output,$t5output); - #print "done t5.pl for $sp\n"; - - my $t6output = $pipedir.$sp."_sput_op4"; - multi_species_t6($t5output,$t6output,scalar(@orgs)); - } - #------------------------------------------------------------------------------- - #START T9.PL FOR T10.PL AND FOR INTERRUPTED HUNTING - - foreach my $sp (@presp_tags){ - my $t9output = $pipedir.$sp."_gap_op_unrand_match"; - my $t9sequence = $pipedir.$sp."_gap_op_unrand2"; - my $t9micro = $pipedir.$sp."_sput_op4"; - t9($t9micro,$t9sequence,$t9output); - - my $t9output2 = $pipedir.$sp."_nogap_op_unrand2_match"; - my $t9sequence2 = $pipedir.$sp."_nogap_op_unrand2"; - t9($t9micro,$t9sequence2,$t9output2); - } - #print "done both t9.pl for all orgs\n"; - - #------------------------------------------------------------------------------- - # FIND COMPOUND MICROSATELLITES - - @jobIDs = (); - my $species_counter = 0; - - foreach my $sp (@presp_tags){ - my $simple_microsats=$pipedir.$sp."_sput_op4_simple"; - my $compound_microsats=$pipedir.$sp."_sput_op4_compound"; - my $input_micro = $pipedir.$sp."_sput_op4"; - my $input_seq = $pipedir.$sp."_nogap_op_unrand2_match"; - multiSpecies_compound_microsat_hunter3($input_micro,$input_seq,$simple_microsats,$compound_microsats,$orgs[$species_counter], scalar(@sp_tags), $threshold_array ); - $species_counter++; - } - - #------------------------------------------------------------------------------- - # READING AND FILTERING SIMPLE MICROSATELLITES - my $spcounter2=0; - foreach my $sp (@sp_tags){ - my $presp = $presp_tags[$spcounter2]; - $spcounter2++; - my $simple_microsats=$pipedir.$presp."_sput_op4_simple"; - my $simple_filterout = $pipedir.$sp."_sput_op4_simple_filtered"; - my $simple_residue = $pipedir.$sp."_sput_op4_simple_residue"; - multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags)); - } - - #------------------------------------------------------------------------------- - # ANALYZE COMPOUND MICROSATELLITES FOR BEING INTERRUPTED MICROSATS - - $species_counter = 0; - foreach my $sp (@sp_tags){ - my $presp = $presp_tags[$species_counter]; - my $compound_microsats = $pipedir.$presp."_sput_op4_compound"; - my $analyzed_simple_microsats=$pipedir.$presp."_sput_op4_compound_interrupted"; - my $analyzed_compound_microsats=$pipedir.$presp."_sput_op4_compound_pure"; - my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match"; - multiSpecies_compound_microsat_analyzer($compound_microsats,$seq_file,$analyzed_simple_microsats,$analyzed_compound_microsats,$orgs[$species_counter], scalar(@sp_tags)); - $species_counter++; - } - #------------------------------------------------------------------------------- - # REANALYZE COMPOUND MICROSATELLITES FOR PRESENCE OF SIMPLE ONES WITHIN THEM.. 
- $species_counter = 0;
-
- foreach my $sp (@sp_tags){
- my $presp = $presp_tags[$species_counter];
- my $compound_microsats = $pipedir.$presp."_sput_op4_compound_pure";
- my $compound_interrupted = $pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
- my $compound_compound = $pipedir.$presp."_sput_op4_compound_compound";
- my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
- multiSpecies_compoundClarifyer($compound_microsats,$seq_file,$compound_interrupted,$compound_compound,$orgs[$species_counter], scalar(@sp_tags), "2_4_6_8", "3_4_6_8", "2_4_6_8");
- $species_counter++;
- }
- #-------------------------------------------------------------------------------
- # READING AND FILTERING SIMPLE AND COMPOUND MICROSATELLITES
- $species_counter = 0;
-
- foreach my $sp (@sp_tags){
- my $presp = $presp_tags[$species_counter];
-
- my $simple_microsats=$pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
- my $simple_filterout = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
- my $simple_residue = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_residue";
- multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
-
- my $simple_microsats2 = $pipedir.$presp."_sput_op4_compound_interrupted";
- my $simple_filterout2 = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
- my $simple_residue2 = $pipedir.$sp."_sput_op4_compound_interrupted_residue";
- multiSpecies_filtering_interrupted_microsats($simple_microsats2, $simple_filterout2, $simple_residue2,$threshold_array,$threshold_array,scalar(@sp_tags));
-
- my $compound_microsats=$pipedir.$presp."_sput_op4_compound_compound";
- my $compound_filterout = $pipedir.$sp."_sput_op4_compound_compound_filtered";
- my $compound_residue = $pipedir.$sp."_sput_op4_compound_compound_residue";
- multispecies_filtering_compound_microsats($compound_microsats, $compound_filterout, $compound_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
- $species_counter++;
- }
- #print "done filtering both simple and compound microsatellites \n";
-
- #-------------------------------------------------------------------------------
-
- my @combinedarray = ();
- my @combinedarray_indicators = ("mononucleotide", "dinucleotide", "trinucleotide", "tetranucleotide");
- my @combinedarray_tags = ("mono", "di", "tri", "tetra");
- $species_counter = 0;
-
- foreach my $sp (@sp_tags){
- my $simple_interrupted = $pipedir.$sp."_simple_analyzed_simple";
- push @{$combinedarray[$species_counter]}, $pipedir.$sp."_simple_analyzed_simple_mono", $pipedir.$sp."_simple_analyzed_simple_di", $pipedir.$sp."_simple_analyzed_simple_tri", $pipedir.$sp."_simple_analyzed_simple_tetra";
- $species_counter++;
- }
-
- #-------------------------------------------------------------------------------
- # PUT TOGETHER THE INTERRUPTED AND SIMPLE MICROSATELLITES BASED ON THEIR MOTIF SIZE FOR FURTHER EXTENSION
- my $sp_counter = 0;
- foreach my $sp (@sp_tags){
- my $analyzed_simple = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
- my $clarifyed_simple = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
- my $simple = $pipedir.$sp."_sput_op4_simple_filtered";
- my $simple_analyzed_simple = $pipedir.$sp."_simple_analyzed_simple";
- `cat $analyzed_simple $clarifyed_simple $simple > $simple_analyzed_simple`;
- for my $i (0 ...
3){ - `grep "$combinedarray_indicators[$i]" $simple_analyzed_simple > $combinedarray[$sp_counter][$i]`; - } - $sp_counter++; - } - #print "\ndone grouping interrupted & simple microsats based on their motif size for further extention\n"; - - #------------------------------------------------------------------------------- - # BREAK CHROMOSOME INTO PARTS OF CERTAIN NO. CONTIGS EACH, FOR FUTURE SEARCHING OF INTERRUPTED MICROSATELLITES - # ESPECIALLY DI, TRI AND TETRANUCLEOTIDE MICROSATELLITES - @temp_tags = @sp_tags; - my $increment = 1000000; - my @splist = (); - my $targetdir = $pipedir; - $species_counter=0; - - foreach my $sp (@sp_tags){ - my $presp = $presp_tags[$species_counter]; - $species_counter++; - my $localtag = shift @temp_tags; - my $locallist = $targetdir.$localtag."_".$p_chr."_list"; - push(@splist, $locallist); - my $input = $pipedir.$presp."_nogap_op_unrand2_match"; - chromosome_unrand_breaker($input,$targetdir,$locallist,$increment, $localtag, $pchr); - } - - - my @unionarray = (); - #print "splist=@splist\n"; - #------------------------------------------------------------------------------- - # FIND INTERRUPTED MICROSATELLITES - - $species_counter = 0; - - for my $i (0 .. $#combinedarray){ - - @jobIDs = (); - open (JLIST1, "$splist[$i]") or die "Cannot open file $splist[$i]: $!"; - - while (my $sp1 = <JLIST1>){ - #print "$splist[$i]: sp1=$sp1\n"; - chomp $sp1; - - for my $j (0 ... $#combinedarray_tags){ - my $interr = $sp1."_interr_".$combinedarray_tags[$j]; - my $simple = $sp1."_simple_".$combinedarray_tags[$j]; - push @{$unionarray[$i]}, $interr, $simple; - multiSpecies_interruptedMicrosatHunter($combinedarray[$i][$j],$sp1,$interr ,$simple, $orgs[$species_counter], scalar(@sp_tags), "3_4_6_8"); - } - } - $species_counter++; - } - close JLIST1; - #------------------------------------------------------------------------------- - # REUNION AND ZIPPING BEFORE T10.PL - - my @allarray = (); - - for my $i (0 ... $#sp_tags){ - my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats"; - unlink $localfile if -e $localfile; - push(@allarray, $localfile); - - my $unfiltered_localfile= $localfile."_unfiltered"; - my $residue_localfile= $localfile."_residue"; - - unlink $unfiltered_localfile; - #unlink $unfiltered_localfile; - for my $j (0 ... $#{$unionarray[$i]}){ - #print "listing files for species $i and list number $j= \n$unionarray[$i][$j] \n"; - `cat $unionarray[$i][$j] >> $unfiltered_localfile`; - unlink $unionarray[$i][$j]; - } - - multiSpecies_filtering_interrupted_microsats($unfiltered_localfile, $localfile, $residue_localfile,$threshold_array,$threshold_array,scalar(@sp_tags) ); - my $analyzed_compound = $pipedir.$sp_tags[$i]."_sput_op4_compound_compound_filtered"; - my $simple_residue = $pipedir.$sp_tags[$i]."_sput_op4_simple_residue"; - my $compound_residue = $pipedir.$sp_tags[$i]."_sput_op4_compound_residue"; - - `cat $analyzed_compound >> $localfile`; - } - #------------------------------------------------------------------------------- - # MERGING MICROSATELLITES THAT ARE VERY CLOSE TO EACH OTHER, INCLUDING THOSE FOUND BY SEARCHING IN 2 OPPOSIT DIRECTIONS - - my $toescape=0; - - - for my $i (0 ... 
$#sp_tags){ - my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats"; - $localfile =~ /$focalspec\-(chr[0-9a-zA-Z]+)\./; - my $direction = $1; - #print "localfile = $localfile , direction = $direction\n"; -# `gzip $reverse_chr_name` if $direction =~ /chr[0-9a-zA-Z]+r/ && $switchboard{"deleting_processFiles"} != 1; - $toescape =1 if $direction =~ /chr[0-9a-zA-Z]+r/; - last if $direction =~ /chr[0-9a-zA-Z]+r/; - my $nogap_sequence = $pipedir.$presp_tags[$i]."_nogap_op_unrand2_match"; - my $gap_sequence = $pipedir.$presp_tags[$i]."_gap_op_unrand_match"; - my $reverselocal = $localfile; - $reverselocal =~ s/\-chr([0-9a-zA-Z]+)\./-chr$1r./g; - merge_interruptedMicrosats($nogap_sequence,$localfile, $reverselocal ,scalar(@sp_tags)); - #------------------------------------------------------------------------------- - my $forward_separate = $localfile."_separate"; - my $reverse_separate = $reverselocal."_separate"; - my $diff = $forward_separate."_diff"; - my $miss = $forward_separate."_miss"; - my $common = $forward_separate."_common"; - forward_reverse_sputoutput_comparer($nogap_sequence,$forward_separate, $reverse_separate, $diff, $miss, $common ,scalar(@sp_tags)); - #------------------------------------------------------------------------------- - my $symmetrical_file = $localfile."_symmetrical"; - my $merged_file = $localfile."_merged"; - #print "cating: $merged_file $common into -> $symmetrical_file \n"; - `cat $merged_file $common > $symmetrical_file`; - #------------------------------------------------------------------------------- - my $t10output = $symmetrical_file."_fin_hit_all_2"; - new_multispecies_t10($gap_sequence, $symmetrical_file, $t10output, join(".", @orgs)); - #------------------------------------------------------------------------------- - } - next if $toescape == 1; - #------------------------------------------------------------------------------------------------ - # BRINGING IT ALL TOGETHER: FINDING ORTHOLOGOUS MICROSATELLITES AMONG THE SPECIES - - - my @micros_array = (); - my $sampletag = (); - for my $i (0 ... $#sp_tags){ - my $finhitFile = $pipedir.$sp_tags[$i]."_allmicrosats_symmetrical_fin_hit_all_2"; - push(@micros_array, $finhitFile); - $sampletag = $sp_tags[$i]; - } - #$sampletag =~ s/^([A-Z]+\.)/ORTH_/; - #$sampletag = $sampletag."_monoThresh-".$mono_threshold."bp"; - my $orthanswer = multiSpecies_orthFinder4($t1input, join(":",@micros_array), $orthfile, join(":", @orgs), $separation); -} -$date = `date`; -#print "date = $date\n"; -#remove_tree($tdir); -#------------------------------------------------------------------------------------------------ -#------------------------------------------------------------------------------------------------ -#------------------------------------------------------------------------------------------------ -#------------------------------------------------------------------------------------------------ - -#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx - -sub maftoAxt_multispecies { - #print "in maftoAxt_multispecies : got @_\n"; - my $fname=$_[0]; - open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n"; - my $treedefinition = $_[1]; - open(OUT,">$_[2]") or die "Cannot open $_[2]: $! 
\n"; - my $counter = 0; - my $exactspeciesset = $_[3]; - my @exactspeciesset_unarranged = split(/,/,$exactspeciesset); - - $treedefinition=~s/[\)\(, ]/\t/g; - my @species=split(/\t+/,$treedefinition); - my @exactspecies=(); - - foreach my $spec (@species){ - foreach my $espec (@exactspeciesset_unarranged){ - push @exactspecies, $spec if $spec eq $espec; - } - } - #print "exactspecies=@exactspecies\n"; - - ########### - my $select = 2; - #select = 1 if all species need sequences to be present for each block otherwise, it is 0 - #select = 2 only the allowed set make up the alignment. use the removeset - # information to detect alignmenets that have other important genomes aligned. - ########### - my @allowedset = (); - @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0; - @allowedset = join("_",0,@species) if $select == 1; - #print "species = @species , allowedset =",join("\n", @allowedset) ," \n"; - @allowedset = join("_",0,@exactspecies) if $select == 2; - #print "allowedset = @allowedset and exactspecies = @exactspecies\n"; - - my $start = 0; - my @sequences = (); - my @titles = (); - my $species_counter = "0"; - my $countermatch = 0; - my $outsideSpecies=0; - - while(my $line = <IN>){ - next if $line =~ /^#/; - next if $line =~ /^i/; - chomp $line; - #print "$line"; - my @fields = split(/\s+/,$line); - chomp $line; - if ($line =~ /^a /){ - $start = 1; - } - - if ($line =~ /^s /){ - # print "fields1 = $fields[1] , start = $start\n"; - - foreach my $sp (@species){ - if ($fields[1] =~ /$sp/){ - $species_counter = $species_counter."_".$sp; - push(@sequences, $fields[6]); - my @sp_info = split(/\./,$fields[1]); - my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]); - push(@titles, $title); - - } - } - } - - if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){ - my $arranged = reorderSpecies($species_counter, @species); - my $stopper = 1; - my $arrno = 0; - foreach my $set (@allowedset){ - if ($arranged eq $set){ - # print "$arranged == $set\n"; - $stopper = 0; last; - } - $arrno++; - } - - if ($stopper == 0) { - # print " accepted\n"; - @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged; - @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged; - my $filteredseq = filter_gaps(@sequences); - - if ($filteredseq ne "SHORT"){ - $counter++; - print OUT join (" ",$counter, @titles), "\n"; - print OUT $filteredseq, "\n"; - print OUT "\n"; - $countermatch++; - } - } - else{#print "\n"; - } - - @sequences = (); @titles = (); $start = 0;$species_counter = "0"; - next; - - } - } -# print "countermatch = $countermatch\n"; -} - -sub reorderSpecies{ - my @inarr=@_; - my $currSpecies = shift (@inarr); - my $ordered_species = 0; - my @species=@inarr; - foreach my $order (@species){ - $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/; - } - return $ordered_species; - -} - -sub filter_gaps{ - my @sequences = @_; -# print "sequences sent are @sequences\n"; - my $seq_length = length($sequences[0]); - my $seq_no = scalar(@sequences); - my $allgaps = (); - for (1 ... 
$seq_no){ - $allgaps = $allgaps."-"; - } - - my @seq_array = (); - my $seq_counter = 0; - foreach my $seq (@sequences){ -# my @sequence = split(/\s*/,$seq); - $seq_array[$seq_counter] = [split(/\s*/,$seq)]; -# push @seq_array, [@sequence]; - $seq_counter++; - } - my $g = 0; - while ( $g < $seq_length){ - last if (!exists $seq_array[0][$g]); - my $bases = (); - for my $u (0 ... $#seq_array){ - $bases = $bases.$seq_array[$u][$g]; - } -# print $bases, "\n"; - if ($bases eq $allgaps){ -# print "bases are $bases, position is $g \n"; - for my $seq (@seq_array){ - splice(@$seq , $g, 1); - } - } - else { - $g++; - } - } - - my @outs = (); - - foreach my $seq (@seq_array){ - push(@outs, join("",@$seq)); - } - return "SHORT" if length($outs[0]) <=100; - return (join("\n", @outs)); -} - - -sub allowedSetOfSpecies{ - my @allowed_species = split(/_/,$_[0]); - unshift @allowed_species, 0; -# print "allowed set = @allowed_species \n"; - my @output = (); - for (0 ... scalar(@allowed_species) - 4){ - push(@output, join("_",@allowed_species)); - pop @allowed_species; - } - return join(";",reverse(@output)); - -} - - -sub orderInfo{ - my @info = split(/;/,$_[0]); -# print "info = @info"; - my @old = split(/_/,$_[1]); - my @new = split(/_/,$_[2]); - shift @old; shift @new; - my @outinfo = (); - foreach my $spe (@new){ - for my $no (0 ... $#old){ - if ($spe eq $old[$no]){ - push(@outinfo, $info[$no]); - } - } - } -# print "outinfo = @outinfo \n"; - return join(";", @outinfo); -} - -#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx - -#xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx -sub artificial_axdata_inverter{ - open(IN,"<$_[0]") or die "Cannot open file $_[0]: $!"; - open(OUT,">$_[1]") or die "Cannot open file $_[1]: $!"; - my $linecounter=0; - while (my $line = <IN>){ - $linecounter++; - #print "$linecounter\n"; - chomp $line; - my $final_line = $line; - my $trycounter = 0; - if ($line =~ /^[a-zA-Z\-]/){ - # while ($final_line eq $line){ - my @fields = split(/\s*/,$line); - - $final_line = join("",reverse(@fields)); - # print colored ['red'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; - # $trycounter++; - # print "trying again....$trycounter : $final_line\n" if $final_line eq $line; - # } - } - - # print colored ['yellow'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; - if ($line =~ /^[0-9]/){ - $line =~ s/chr([A-Z0-9a-b]+)/chr$1r/g; - $final_line = $line; - } - print OUT $final_line,"\n"; - #print "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; - } - close OUT; -} -#xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx - - -#xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx - -sub multi_species_t1 { - - my $input1 = $_[0]; -# print "@_\n"; #<STDIN>; - my @tags = split(/_/, $_[1]); - my @outputs = split(/,/, $_[2]); - my $title_query = $_[3]; - my @handles = (); - - open(FILEB,"<$input1")or die "Cannot open file: $input1 $!"; - my $i = 0; - foreach my $path (@outputs){ - $handles[$i] = IO::Handle->new(); - open ($handles[$i], ">$path") or die "Can't open $path : $!"; - $i++; - } - - my $curdef; - my $start = 0; - - while (my $line = <FILEB> ) { - if ($line =~ /^\d/){ - $line =~ s/ +/\t/g; - my @fields = 
split(/\s+/, $line); - if (($line =~ /$title_query/)){ - my $title = $line; - my $counter = 0; - foreach my $tag (@tags){ - $line = <FILEB>; - print {$handles[$counter]} ">",$tag,"\t",$title, " ",$line; - $counter++; - } - } - else{ - foreach my $tag (@tags){ - my $tine = <FILEB>; - } - } - - } - } - - foreach my $hand (@handles){ - $hand->close(); - } - - close FILEB; -} - -#xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx - -#xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx - -sub multi_species_t2{ - - my $input = $_[0]; - my $species = $_[1]; - my $output1 = $input."_unr"; - - #------------------------------------------------------------------------------------------ - open (FILEF1, "<$input") or die "Cannot open file $input :$!"; - open (FILEF2, ">$output1") or die "Cannot open file $output1 :$!"; - - my $line1 = <FILEF1>; - - while($line1){ - { - # chomp($line); - if ($line1 =~ (m/^\>$species/)){ - chomp($line1); - print FILEF2 $line1; - $line1 = <FILEF1>; - chomp($line1); - print FILEF2 "\t", $line1,"\n"; - } - } - $line1 = <FILEF1>; - } - - close FILEF1; - close FILEF2; - #------------------------------------------------------------------------------------------ - - my $output2 = $output1."and"; - my $output3 = $output1."and2"; - open(IN,"<$output1"); - open (FILEF3, ">$output2"); - open (FILEF4, ">$output3"); - - - while (<IN>){ - my $line = $_; - chomp($line); - my @fields=split (/\t/, $line); - # print $line,"\n"; - if($fields[5] ne "chrUn_random"){ - print FILEF3 join ("\t",@fields[0 ... scalar(@fields)-2]), "\n", $fields[scalar(@fields)-1], "\n"; - print FILEF4 join ("\t",@fields[0 ... scalar(@fields)-2]), "\t", $fields[scalar(@fields)-1], "\n"; - } - } - - - close IN; - close FILEF3; - close FILEF4; - unlink $output1; - - #------------------------------------------------------------------------------------------ - # OLD T3.PL RUDIMENT - - my $t3output = $output2; - $t3output =~ s/gap_op_unrand/nogap_op_unrand/g; - - open(IN,"<$output2"); - open(OUTA,">$t3output"); - - - while (<IN>){ - s/-//g unless /^>/; - print OUTA; - } - - close IN; - close OUTA; - #------------------------------------------------------------------------------------------ -} -#xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx - - -#xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxxmulti_species_t2_2 xxxxxxx -sub multi_species_t2_2{ - #print "IN multi_species_t2_2 : @_\n"; - my $input = $_[0]; - my $species = $_[1]; - my $output1 = $input."2"; - - - open (FILEF1, "<$input"); - open (FILEF2, ">$output1"); - - my $line1 = <FILEF1>; - - while($line1){ - { - # chomp($line); - if ($line1 =~ (m/^\>$species/)){ - chomp($line1); - print FILEF2 $line1; - $line1 = <FILEF1>; - chomp($line1); - print FILEF2 "\t", $line1,"\n"; - } - } - $line1 = <FILEF1>; - } - - close FILEF1; - close FILEF2; -} - -#xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx - - -#xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx -sub sputnikoutput_corrector{ - my $input = $_[0]; - my $output = $_[1]; - open(IN,"<$input") or die "Cannot open file $input :$!"; - open(OUT,">$output") or die "Cannot open file $output :$!"; - my $tine; - while (my $line=<IN>){ - if($line =~/length /){ - $tine = $line; - $tine =~ s/\s+/\t/g; - my @fields = 
split(/\t/,$tine); - if ($fields[6] > 60){ - print OUT $line; - $line = <IN>; - - while (($line !~ /nucleotide/) && ($line !~ /^>/)){ - chomp $line; - print OUT $line; - $line = <IN>; - } - print OUT "\n"; - print OUT $line; - } - else{ - print OUT $line; - } - } - else{ - print OUT $line; - } - } - close IN; - close OUT; -} -#xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx - - -#xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx -sub multi_species_t4{ -# print "multi_species_t4 : @_\n"; - my $input = $_[0]; - my $output = $_[1]; - open (FILEA, "<$input"); - open (FILEB, ">$output"); - - my $line = <FILEA>; - - while ($line) { - # chomp $line; - if ($line =~ />/) { - chomp $line; - print FILEB $line, "\n"; - } - - - if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){ - chomp $line; - print FILEB $line, " " ; - $line = <FILEA>; - chomp $line; - print FILEB $line,"\n"; - } - - $line = <FILEA>; - } - - - close FILEA; - close FILEB; - -} - -#xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx - - -#xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx -sub multi_species_t5{ - - my $input = $_[0]; - my $output = $_[1]; - - open(FILEB,"<$input"); - open(FILEC,">$output"); - - my $curdef; - - while (my $line = <FILEB> ) { - - if ($line =~ /^>/){ - chomp $line; - $curdef = $line; - next; - } - - if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){ - print FILEC $curdef," ",$line; - } - - } - - - close FILEB; - close FILEC; - -} -#xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx - - -#xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx -sub multi_species_t6{ - my $input = $_[0]; - my $output = $_[1]; - my $focalstrand=$_[3]; -# print "inpput = @_\n"; - open (FILE, "<$input"); - open (FILE_MICRO, ">$output"); - my $linecounter=0; - while (my $line = <FILE>){ - $linecounter++; - chomp $line; - #print "line = $line\n"; - #MONO# - $line =~ /$focalspec\s[a-zA-Z]+[0-9a-zA-Z]+\s[0-9]+\s[0-9]+\s([+\-])/; - my $strand=$1; - my $no_of_species = ($line =~ s/\s+[+\-]\s+/ /g); - #print "line = $line\n"; - my $specfieldsend = 2 + ($no_of_species*4) - 1; - my @fields = split(/\s+/, $line); - my @speciesdata = @fields[0 ... $specfieldsend]; - $line =~ /([a-z]+nucleotide)\s([0-9]+)\s:\s([0-9]+)/; - my ($tide, $start, $end) = ($1, $2, $3); - #print "no_of_species=$no_of_species.. 
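# NOTE (editorial sketch, not in the original file): the mono()/di()/tri()/
# tetra()/penta() helpers defined below share one recipe - cut the repeat
# tract into fixed-width chunks with unpack and keep the first chunk as the
# motif. A generic equivalent (motif_of is a hypothetical name):
#
#   sub motif_of {
#       my ($tract, $unit_len) = @_;           # e.g. ("AGAGAGAG", 2)
#       return substr($tract, 0, $unit_len);   # first repeat unit -> "AG"
#   }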
speciesdata = @speciesdata and ($tide, $start, $end)\n"; - if($line =~ /mononucleotide/){ - print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], mono($fields[$#fields]),),"\n"; - } - #DI# - elsif($line =~ /dinucleotide/){ - print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], di($fields[$#fields]),),"\n"; - } - #TRI# - elsif($line =~ /trinucleotide/ ){ - print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tri($fields[$#fields]),),"\n"; - } - #TETRA# - elsif($line =~ /tetranucleotide/){ - print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tetra($fields[$#fields]),),"\n"; - } - #PENTA# - elsif($line =~ /pentanucleotide/){ - #print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], penta($fields[$#fields]),),"\n"; - } - else{ - # print "not: @fields\n"; - } - } -# print "linecounter=$linecounter\n"; - close FILE; - close FILE_MICRO; -} - -sub mono { - my $st = $_[0]; - my $tp = unpack "A1"x(length($st)/1),$st; - my $var1 = substr($tp, 0, 1); - return join ("\t", $var1); -} -sub di { - my $st = $_[0]; - my $tp = unpack "A2"x(length($st)/2),$st; - my $var1 = substr($tp, 0, 2); - return join ("\t", $var1); -} -sub tri { - my $st = $_[0]; - my $tp = unpack "A3"x(length($st)/3),$st; - my $var1 = substr($tp, 0, 3); - return join ("\t", $var1); -} -sub tetra { - my $st = $_[0]; - my $tp = unpack "A4"x(length($st)/4),$st; - my $var1 = substr($tp, 0, 4); - return join ("\t", $var1); -} -sub penta { - my $st = $_[0]; - my $tp = unpack "A5"x(length($st)/5),$st; - my $var1 = substr($tp, 0, 5); - return join ("\t", $var1); -} - -#xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx - - -#xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx -sub t9{ - my $input1 = $_[0]; - my $input2 = $_[1]; - my $output = $_[2]; - - - open(IN1,"<$input1") if -e $input1; - open(IN2,"<$input2") or die "cannot open file $_[1] : $!"; - open(OUT,">$output") or die "cannot open file $_[2] : $!"; - - - my %seen = (); - my $prevkey = 0; - - if (-e $input1){ - while (my $line = <IN1>){ - chomp($line); - my @fields = split(/\t/,$line); - my $key1 = join ("_",@fields[0,1,3,4,5]); - # print "key in t9 = $key1\n"; - $seen{$key1}++ if ($prevkey ne $key1) ; - $prevkey = $key1; - } -# print "done first hash\n"; - close IN1; - } - - while (my $line = <IN2>){ - # print $line, "**\n"; - if (-e $input1){ - chomp($line); - my @fields = split(/\t/,$line); - my $key2 = join ("_",@fields[0,1,3,4,5]); - if (exists $seen{$key2}){ - print OUT "$line\n" ; - delete $seen{$key2}; - } - } - else { - print OUT "$line\n" ; - } - } - - close IN2; - close OUT; -} -#xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx - - -sub multiSpecies_compound_microsat_hunter3{ - - my $input1 = $_[0]; ###### the *_sput_op4_ii file - my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2" - my $output1 = $_[2]; ###### plain microsatellite file - my $output2 = $_[3]; ###### compound microsatellite file - my $org = $_[4]; ###### 1 or 2 - $no_of_species = $_[5]; - #print "IN multiSpecies_compound_microsat_hunter3: @_\n"; - #my @tags = 
split(/\t/,$info); - sub compoundify; - open(IN,"<$input1") or die "Cannot open file $input1 $!"; - open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; - open(OUT,">$output1") or die "Cannot open file $output1 $!"; - open(OUT2,">$output2") or die "Cannot open file $output2 $!"; - $infocord = 2 + (4*$no_of_species) - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; - - my @thresholds = ("0"); - push(@thresholds, split(/_/,$_[6])); - sub thresholdCheck; - my %micros = (); - while (my $line = <IN>){ - # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n"; - next if $line =~ /\t\t/; - if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $2, $3, $4, $5); - # print $key, "#-#-#-#-#-#-#-#\n"; - push (@{$micros{$key}},$line); - } - else{ - } - } - close IN; - my @deletedlines = (); - - my $linecount = 0; - - while(my $sine = <SEQ>){ - my %microstart=(); - my %microend=(); - - my @sields = split(/\t/,$sine); - - my $key = (); - - if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - $key = join("\t",$1, $2, $3, $4, $5); - # print $key, "<-<-<-<-<-<-<-<\n"; - } - else{ - } - - if (exists $micros{$key}){ - $linecount++; - my @microstring = @{$micros{$key}}; - my @tempmicrostring = @{$micros{$key}}; - - foreach my $line (@tempmicrostring){ - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - push (@{$microstart{$start}},$line); - push (@{$microend{$end}},$line); - } - my $firstflag = 'down'; - while( my $line =shift(@microstring)){ - # print "-----------\nline = $line "; - chomp $line; - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - my $startmicro = $line; - my $endmicro = $line; - - # print "fields=@fields, start = $start end=$end, startcord=$startcord, endcord=$endcord\n"; - - delete ($microstart{$start}); - delete ($microend{$end}); - my $flag = 'down'; - my $startflag = 'down'; - my $endflag = 'down'; - my $prestart = $start - $distance; - my $postend = $end + $distance; - my @compoundlines = (); - my %compoundhash = (); - push (@compoundlines, $line); - push (@{$compoundhash{$line}},$line); - my $startrank = 1; - my $endrank = 1; - - while( ($startflag eq "down") || ($endflag eq "down") ){ - if ((($prestart < 0) && $firstflag eq "up") || (($postend > length($sields[$sequencepos])) && $firstflag eq "up") ) { -# print "coming to the end of sequence,prestart = $prestart & post end = $postend and sequence length =", length($sields[$sequencepos])," so exiting\n"; - last; - } - - $firstflag = "up"; - if ($startflag eq "down"){ - for my $i ($prestart ... 
$start){ - - if(exists $microend{$i}){ - chomp $microend{$i}[0]; - if(exists $compoundhash{$microend{$i}[0]}) {next;} - # print "sending from microend $startmicro, $microend{$i}[0] |||\n"; - if (identityMatch_thresholdCheck($startmicro, $microend{$i}[0], $startrank) eq "proceed"){ - push(@compoundlines, $microend{$i}[0]); - # print "accepted\n"; - my @tields = split(/\t/,$microend{$i}[0]); - $startmicro = $microend{$i}[0]; - chomp $startmicro; - $start = $tields[$startcord]; - $flag = 'down'; - $startrank++; - # print "startcompund = $microend{$i}[0]\n"; - delete $microend{$i}; - delete $microstart{$start}; - $startflag = 'down'; - $prestart = $start - $distance; - last; - } - else{ - $flag = 'up'; - $startflag = 'up'; - } - } - else{ - $flag = 'up'; - $startflag = 'up'; - } - } - } - - $endrank = $startrank; - - if ($endflag eq "down"){ - for my $i ($end ... $postend){ - - if(exists $microstart{$i} ){ - chomp $microstart{$i}[0]; - if(exists $compoundhash{$microstart{$i}[0]}) {next;} - # print "sending from microstart $endmicro, $microstart{$i}[0] |||\n"; - - if(identityMatch_thresholdCheck($endmicro,$microstart{$i}[0], $endrank) eq "proceed"){ - push(@compoundlines, $microstart{$i}[0]); - # print "accepted\n"; - my @tields = split(/\t/,$microstart{$i}[0]); - $end = $tields[$endcord]-0; - $endmicro = $microstart{$i}[0]; - $endrank++; - chomp $endmicro; - $flag = 'down'; - # print "endcompund = $microstart{$i}[0]\n"; - delete $microstart{$i}; - delete $microend{$end}; - shift @microstring; - $postend = $end + $distance; - $endflag = 'down'; - last; - } - else{ - $flag = 'up'; - $endflag = 'up'; - } - } - else{ - $flag = 'up'; - $endflag = 'up'; - } - } - } - # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n"; - } #end while( $flag eq "down") - # print "compoundlines = @compoundlines \n"; - if (scalar (@compoundlines) == 1){ - print OUT $line,"\n"; - } - if (scalar (@compoundlines) > 1){ - my $compoundline = compoundify(\@compoundlines, $sields[$sequencepos]); - # print $compoundline,"\n"; - print OUT2 $compoundline,"\n"; - } - } #end foreach my $line (@microstring){ - } #if (exists $micros{$key}){ - - - } - - close OUT; - close OUT2; -} - - -#------------------------------------------------------------------------------------------------ -sub compoundify{ - my ($compoundlines, $sequence) = @_; -# print "\nfound to compound : @$compoundlines and$sequence \n"; - my $noOfComps = @$compoundlines; -# print "Number of elements in hash is $noOfComps\n"; - my @starts; - my @ends; - foreach my $line (@$compoundlines){ -# print "compoundify.. line = $line \n"; - chomp $line; - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - # print "start = $start, end = $end \n"; - push(@starts, $start); - push(@ends,$end); - } - my @temp = @$compoundlines; - my $startline=$temp[0]; - my @mields = split(/\t/,$startline); - my $startcoord = $mields[$startcord]; - my $startgapsign=$mields[$endcord]; - my @startsorted = sort { $a <=> $b } @starts; - my @endsorted = sort { $a <=> $b } @ends; - my @intervals; - for my $end (0 ... 
(scalar(@endsorted)-2)){ - my $interval = substr($sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1)); - push(@intervals,$interval); - # print "interval = $interval =\n"; - # print "substr(sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1))\n"; - } - push(@intervals,""); - my $compoundmicrosat=(); - my $multiunit=""; - foreach my $line (@$compoundlines){ - my @fields = split(/\t/,$line); - my $component="[".$fields[$microsatcord]."]".shift(@intervals); - $compoundmicrosat=$compoundmicrosat.$component; - $multiunit=$multiunit."[".$fields[$motifcord]."]"; -# print "multiunit = $multiunit\n"; - } - my $compoundcopy = $compoundmicrosat; - $compoundcopy =~ s/\[|\]//g; - my $compoundlength = $mields[$startcord] + length($compoundcopy) - 1; - - - my $compoundline = join("\t",(@mields[0 ... $infocord], "compound",@mields[$startcord ... $startcord+1],$compoundlength,$compoundmicrosat, $multiunit)); - return $compoundline; -} - -#------------------------------------------------------------------------------------------------ - -sub identityMatch_thresholdCheck{ - my $line1 = $_[0]; - my $line2 = $_[1]; - my $rank = $_[2]; - my @lields1 = split(/\t/,$line1); - my @lields2 = split(/\t/,$line2); -# print "recieved $line1 && $line2\n motif comparison: ", length($lields1[$motifcord])," : ",length($lields2[$motifcord]),"\n"; - - if (length($lields1[$motifcord]) == length($lields2[$motifcord])){ - my $probe = $lields1[$motifcord].$lields1[$motifcord]; - #print "$probe :: $lields2[$motifcord]\n"; - return "proceed" if $probe =~ /$lields2[$motifcord]/; - #print "line recieved\n"; - if ($rank ==1){ - return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed"; - } - else { - return "proceed" if thresholdCheck($line2) eq "proceed"; - return "stop"; - } - } - else{ - if ($rank ==1){ - return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed"; - } - else { - return "proceed" if thresholdCheck($line2) eq "proceed"; - return "stop"; - } - } - return "stop"; -} -#------------------------------------------------------------------------------------------------ - -sub thresholdCheck{ - my @checkthresholds=(0,@thresholds); - #print "IN thresholdCheck: @_\n"; - my $line = $_[0]; - my @lields = split(/\t/,$line); - return "proceed" if length($lields[$microsatcord]) >= $checkthresholds[length($lields[$motifcord])]; - return "stop"; -} -#xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx - -sub multiSpecies_filtering_interrupted_microsats{ -# print "IN multiSpecies_filtering_interrupted_microsats: @_\n"; - my $unfiltered = $_[0]; - my $filtered = $_[1]; - my $residue = $_[2]; - my $no_of_species = $_[5]; - open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!"; - open(FIL,">$filtered") or die "Cannot open file $filtered: $!"; - open(RES,">$residue") or die "Cannot open file $residue: $!"; - - $infocord = 2 + (4*$no_of_species) - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - - - 
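# NOTE (editorial sketch): two conventions used throughout these subroutines,
# illustrated with hypothetical values rather than pipeline data.
#
# 1. Column offsets: every record is tab-separated and each of the N aligned
#    species contributes a four-column block, so the microsatellite columns
#    sit at fixed offsets past column 2 + 4*N (0-based), e.g. for N = 3:
#
#       my $no_of_species = 3;
#       my $microsatcord  = 2 + (4 * $no_of_species) + 5 - 1;  # column 18: repeat sequence
#       my $motifcord     = 2 + (4 * $no_of_species) + 6 - 1;  # column 19: motif
#
# 2. Thresholds: strings like "2_4_6_8" are split on "_" into an array that
#    is indexed by motif length, giving the minimum tract length per motif
#    size (after a leading 0 placeholder), in the spirit of thresholdCheck()
#    above:
#
#       my @thresholds = (0, split /_/, "2_4_6_8");            # index 1 (mono) => 2bp ... index 4 (tetra) => 8bp
#       my $ok = length($microsat) >= $thresholds[ length($motif) ];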
my @sub_thresholds = (0); - - push(@sub_thresholds, split(/_/,$_[3])); - my @thresholds = (0); - - push(@thresholds, split(/_/,$_[4])); - - while (my $line = <UNF>) { - next if $line !~ /[a-z]/; - #print $line; - chomp $line; - my @fields = split(/\t/,$line); - my $motif = $fields[$motifcord]; - my $realmotif = $motif; - #print "motif = $motif\n"; - if ($motif =~ /^\[/){ - $motif =~ s/^\[//g; - my @motifs = split(/\]/,$motif); - $realmotif = $motifs[0]; - } -# print "realmotif = $realmotif"; - my $motif_size = length($realmotif); - - my $microsat = $fields[$microsatcord]; -# print "microsat = $microsat\n"; - $microsat =~ s/^\[|\]$//sg; - my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat); - - $microsat = join("",@microsats); - if (length($microsat) < $thresholds[$motif_size]) { - # print length($microsat)," < ",$thresholds[$motif_size],"\n"; - print RES $line,"\n"; next; - } - my @lengths = (); - foreach my $mic (@microsats){ - push(@lengths, length($mic)); - } - if (largest_microsat(@lengths) < $sub_thresholds[$motif_size]) { - # print largest_microsat(@lengths)," < ",$sub_thresholds[$motif_size],"\n"; - print RES $line,"\n"; next;} - else {print FIL $line,"\n"; next; - } - } - close FIL; - close RES; - -} - -sub largest_microsat{ - my $counter = 0; - my($max) = shift(@_); - foreach my $temp (@_) { - #print "finding largest array: $maxcounter \n"; - if($temp > $max){ - $max = $temp; - } - } - return($max); -} - -#xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx -sub multiSpecies_compound_microsat_analyzer{ - ####### PARAMETER ######## - ########################## - - my $input1 = $_[0]; ###### the *_sput_op4_ii file - my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" - my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format - my $output2 = $_[3]; ###### the pure compound microsatellites - my $org = $_[4]; - my $no_of_species = $_[5]; -# print "IN multiSpecies_compound_microsat_analyzer: $input1\n $input2\n $output1\n $output2\n $org\n $no_of_species\n"; - $infocord = 2 + (4*$no_of_species) - 1; - $typecord = 2 + (4*$no_of_species) + 1 - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - - open(IN,"<$input1") or die "Cannot open file $input1 $!"; - open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; - - open(OUT,">$output1") or die "Cannot open file $output1 $!"; - open(OUT2,">$output2") or die "Cannot open file $output2 $!"; - - -# print "opened files \n"; - my %micros = (); - my $keycounter=0; - my $linecounter=0; - while (my $line = <IN>){ - $linecounter++; - if ($line =~ /([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12); - push (@{$micros{$key}},$line); - $keycounter++; - } - else{ - # print "no key\n"; - } - } - close IN; - my @deletedlines = (); -# print "done hash . 
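# NOTE (editorial sketch): t9() earlier and this analyzer both use the same
# hash-join pattern - index one file by a composite coordinate key, then
# stream the second file and act only on records whose key was indexed.
# Schematically (field positions here are illustrative):
#
#   my %seen;
#   while (my $line = <$first>) {
#       my @f = split /\t/, $line;
#       $seen{ join "_", @f[0, 1, 3, 4, 5] }++;
#   }
#   while (my $line = <$second>) {
#       my @f = split /\t/, $line;
#       print $line if exists $seen{ join "_", @f[0, 1, 3, 4, 5] };
#   }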
linecounter=$linecounter, keycounter=$keycounter\n"; - #--------------------------------------------------------------------------------------------------- - # NOW READING THE SEQUENCE FILE - my $keyfound=0; - my $keyexists=0; - my $inter=0; - my $pure=0; - - while(my $sine = <SEQ>){ - my %microstart=(); - my %microend=(); - my @sields = split(/\t/,$sine); - my $key = (); - if ($sine =~ /([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12); - $keyfound++; - } - else{ - - } - if (exists $micros{$key}){ - $keyexists++; - my @microstring = @{$micros{$key}}; - - my @filteredmicrostring; - - foreach my $line (@microstring){ - chomp $line; - my $copy_line = $line; - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - # FOR COMPOUND MICROSATELLITES - if ($fields[$typecord] eq "compound"){ - $line = compound_microsat_analyser($line); - if ($line eq "NULL") { - print OUT2 "$copy_line\n"; - $pure++; - next; - } - else{ - print OUT "$line\n"; - $inter++; - next; - } - } - } - - } #if (exists $micros{$key}){ - } - close OUT; - close OUT2; -# print "keyfound=$keyfound, keyexists=$keyexists, pure=$pure, inter=$inter\n"; -} - -sub compound_microsat_analyser{ - my $line = $_[0]; - my @fields = split(/\t/,$line); - my $motifline = $fields[$motifcord]; - my $microsat = $fields[$microsatcord]; - $motifline =~ s/^\[|\]$//g; - $microsat =~ s/^\[|\]$//g; - $microsat =~ s/-//g; - my @interruptions = (); - my @motields = split(/\]\[/,$motifline); - my @microields = split(/\][a-zA-Z|-]*\[/,$microsat); - my @inields = split(/[.*]/,$microsat); - shift @inields; - my @motifcount = scalar(@motields); - my $prevmotif = $motields[0]; - my $prevmicro = $microields[0]; - my $prevphase = substr($microields[0],-(length($motields[0])),length($motields[0])); - my $localflag = 'down'; - my @infoarray = (); - - for my $l (1 ... (scalar(@motields)-1)){ - my $probe = $prevmotif.$prevmotif; - if (length $prevmotif != length $motields[$l]) {$localflag = "up"; last;} - - if ($probe =~ /$motields[$l]/i){ - my $curr_endphase = substr($microields[$l],-length($motields[$l]),length($motields[$l])); - my $curr_startphase = substr($microields[$l],0,length($motields[$l])); - if ($curr_startphase =~ /$prevphase/i) { - $infoarray[$l-1] = "insertion"; - } - else { - $infoarray[$l-1] = "indel/substitution"; - } - - $prevmotif = $motields[$l]; $prevmicro = $microields[$l]; $prevphase = $curr_endphase; - next; - } - else {$localflag = "up"; last;} - } - if ($localflag eq 'up') {return "NULL";} - - if (length($prevmotif) == 1) {$fields[$typecord] = "mononucleotide";} - if (length($prevmotif) == 2) {$fields[$typecord] = "dinucleotide";} - if (length($prevmotif) == 3) {$fields[$typecord] = "trinucleotide";} - if (length($prevmotif) == 4) {$fields[$typecord] = "tetranucleotide";} - if (length($prevmotif) == 5) {$fields[$typecord] = "pentanucleotide";} - - @microields = split(/[\[|\]]/,$microsat); - my @microsats = (); - my @positions = (); - my $lengthtracker = 0; - - for my $i (0 ... 
(scalar(@microields ) - 1)){ - if ($i%2 == 0){ - push(@microsats,$microields[$i]); - $lengthtracker = $lengthtracker + length($microields[$i]); - - } - else{ - push(@interruptions,$microields[$i]); - push(@positions, $lengthtracker+1); - $lengthtracker = $lengthtracker + length($microields[$i]); - } - - } - my $returnline = join("\t",(join("\t",@fields),join(",",(@infoarray)),join(",",(@interruptions)),join(",",(@positions)),scalar(@interruptions))); - return($returnline); -} - -#xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx - -sub multiSpecies_compoundClarifyer{ -# print "IN multiSpecies_compoundClarifyer: @_\n"; - my $input1 = $_[0]; ###### the *_sput_compound - my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" - my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format - my $output2 = $_[3]; ###### compound file - my $org = $_[4]; - my $no_of_species = $_[5]; - @thresholds = "0"; - push(@thresholds, split(/_/,$_[6])); - - - $infocord = 2 + (4*$no_of_species) - 1; - $typecord = 2 + (4*$no_of_species) + 1 - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; - - $interr_poscord = $motifcord + 3; - $no_of_interruptionscord = $motifcord + 4; - $interrcord = $motifcord + 2; - $interrtypecord = $motifcord + 1; - - - open(IN,"<$input1") or die "Cannot open file $input1 $!"; - open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; - - open(INT,">$output1") or die "Cannot open file $output2 $!"; - open(COMP,">$output2") or die "Cannot open file $output2 $!"; - #open(CH,">changed") or die "Cannot open file changed $!"; - -# print "opened files \n"; - my $linecounter = 0; - my $microcounter = 0; - - my %micros = (); - while (my $line = <IN>){ - # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; - $linecounter++; - if ($line =~ /([a-zA-Z0-9]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([a-zA-Z0-9]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([a-zA-Z0-9]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12); - # print $key, "#-#-#-#-#-#-#-#\n"; - push (@{$micros{$key}},$line); - $microcounter++; - } - else {print $line;} - } -# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; - close IN; - my @deletedlines = (); -# print "done hash \n"; - $linecounter = 0; - #--------------------------------------------------------------------------------------------------- - # NOW READING THE SEQUENCE FILE - my @microsat_types = qw(_ mononucleotide dinucleotide trinucleotide tetranucleotide); - $printer = 0; - - while(my $sine = <SEQ>){ - my %microstart=(); - my %microend=(); - my @sields = split(/\t/,$sine); - my $key = (); - if ($sine =~ /([a-z0-9A-Z]+)\s+(chr[0-9a-zA-Z]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+(chr[0-9a-zA-Z]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+(chr[0-9a-zA-Z]+)\s+([0-9]+)\s+([0-9]+)\s/ ) { - $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, 
$10, $11, $12); - } - else{ -# print "no key in $sine\nfor pattern ([a-z0-9A-Z]+) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) / \n"; - } - - if (exists $micros{$key}){ - my @microstring = @{$micros{$key}}; - delete $micros{$key}; - - foreach my $line (@microstring){ -# print "#---------#---------#---------#---------#---------#---------#---------#---------\n" if $printer == 1; -# print "microsat = $line" if $printer == 1; - $linecounter++; - my $copy_line = $line; - my @mields = split(/\t/,$line); - my @fields = @mields; - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - my $microsat = $fields[$microsatcord]; - my $motifline = $fields[$motifcord]; - my $microsatcopy = $microsat; - my $positioner = $microsat; - $positioner =~ s/[a-zA-Z|-]/_/g; - $microsatcopy =~ s/^\[|\]$//gs; - chomp $microsatcopy; - my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); - my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); - my $absolutstart = 1; my $absolutend = $absolutstart + ($end-$start); -# print "absolut: start = $absolutstart, end = $absolutend\n" if $printer == 1; - shift @inields; - #print "inields =@inields<\n"; - $motifline =~ s/^\[|\]$//gs; - chomp $motifline; - #print "microsat = $microsat, its copy = $microsatcopy motifline = $motifline<\n"; - my @motields = split(/\]\[/,$motifline); - my $seq = $microsatcopy; - $seq =~ s/\[|\]//g; - my $seqlen = length($seq); - $seq = " ".$seq; - - my $longestmotif_no = longest_array_element(@motields); - my $shortestmotif_no = shortest_array_element(@motields); - #print "shortest motif = $motields[$shortestmotif_no], longest motif = $motields[$longestmotif_no] \n"; - - my $search = $motields[$longestmotif_no].$motields[$longestmotif_no]; - if ((length($motields[$longestmotif_no]) == length($motields[$shortestmotif_no])) && ($search !~ /$motields[$shortestmotif_no]/) ){ - print COMP $line; - next; - } - - my @shortestmotif_nos = (); - for my $m (0 ... $#motields){ - push(@shortestmotif_nos, $m) if (length($motields[$m]) == length($motields[$shortestmotif_no]) ); - } - ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------ - my $newleft =(); - my $leftstopper = 0; my $rightstopper = 0; - foreach my $shortmotif_no (@shortestmotif_nos){ - next if $shortmotif_no == 0; - my $last_left = $shortmotif_no; #$#motields; - my $last_hitter = 0; - for (my $i =($shortmotif_no-1); $i>=0; $i--){ - my $search = $motields[$shortmotif_no]; - if (length($motields[$shortmotif_no]) == 1){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;} - if( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){ - $last_hitter = 1; - $last_left = $i+1; last; - } - my $probe = $motields[$i]; - if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];} - - if ($probe !~ /$search/){ - $last_hitter = 1; - $last_left = $i+1; - # print "hit the last match: before $microields[$i]..last left = $last_left.. 
exiting.\n"; - last; - } - $last_left--;$last_hitter = 1; - # print "passed tests, last left = $last_left\n"; - } - # print "comparing whether $last_left < $shortmotif_no, lasthit = $last_hitter\n"; - if (($last_left) < $shortmotif_no && $last_hitter == 1) {$leftstopper=0; last;} - else {$leftstopper = 1; - # print "leftstopper = 1\n"; - } - } - - ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------ - my $newright =(); - foreach my $shortmotif_no (@shortestmotif_nos){ - next if $shortmotif_no == $#motields; - my $last_right = $shortmotif_no;# -1; - for my $i ($shortmotif_no+1 ... $#motields){ - my $search = $motields[$shortmotif_no]; - if (length($motields[$shortmotif_no]) == 1 ){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;} - if ( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){ - $last_right = $i-1; last; - } - my $probe = $motields[$i]; - if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];} - if ( $probe !~ /$search/){ - $last_right = $i-1; last; - } - $last_right++; - } - if (($last_right) > $shortmotif_no) {$rightstopper=0; last;# print "rightstopper = 0\n"; - } - else {$rightstopper = 1; - } - } - - - if ($rightstopper == 1 && $leftstopper == 1){ - print COMP $line; -# print "rightstopper == 1 && leftstopper == 1\n" if $printer == 1; - next; - } - -# print "pased initial testing phase \n" if $printer == 1; - my @outputs = (); - my @orig_starts = (); - my @orig_ends = (); - for my $mic (0 ... $#microields){ - my $miclen = length($microields[$mic]); - my $microleftlen = 0; - #print "\nmic = $mic\n"; - if($mic > 0){ - for my $submin (0 ... $mic-1){ - my $interval = (); - if (!exists $inields[$submin]) {$interval = "";} - else {$interval = $inields[$submin];} - #print "inield =$interval< and microield =$microields[$submin]<\n "; - $microleftlen = $microleftlen + length($microields[$submin]) + length($interval); - } - } - push(@orig_starts,($microleftlen+1)); - push(@orig_ends, ($microleftlen+1 + $miclen -1)); - } - - ############# F I N A L L Y S T U D Y I N G S E Q U E N C E S #########@@@@#########@@@@#########@@@@#########@@@@#########@@@@ - - - for my $mic (0 ... $#microields){ - my $miclen = length($microields[$mic]); - my $microleftlen = 0; - if($mic > 0){ - for my $submin (0 ... $mic-1){ - # if(!exists $inields[$submin]) {$inields[$submin] = "";} - my $interval = (); - if (!exists $inields[$submin]) {$interval = "";} - else {$interval = $inields[$submin];} - #print "inield =$interval< and microield =$microields[$submin]<\n "; - $microleftlen = $microleftlen + length($microields[$submin]) + length($interval); - } - } - $fields[$startcord] = $microleftlen+1; - $fields[$endcord] = $fields[$startcord] + $miclen -1; - $fields[$typecord] = $microsat_types[length($motields[$mic])]; - $fields[$microsatcord] = $microields[$mic]; - $fields[$motifcord] = $motields[$mic]; - my $templine = join("\t", (@fields[0 .. $motifcord]) ); - my $orig_templine = join("\t", (@fields[0 .. 
$motifcord]) ); - my $newline; - my $lefter = 1; my $righter = 1; - if ( $fields[$startcord] < 2){$lefter = 0;} - if ($fields[$endcord] == $seqlen){$righter = 0;} - - while($lefter == 1){ - $newline = left_extender($templine, $seq,$org); -# print "returned line from left extender= $newline \n" if $printer == 1; - if ($newline eq $templine){$templine = $newline; last;} - else {$templine = $newline;} - - if (left_extention_permission_giver($templine) eq "no") {last;} - } - while($righter == 1){ - $newline = right_extender($templine, $seq,$org); -# print "returned line from right extender= $newline \n" if $printer == 1; - if ($newline eq $templine){$templine = $newline; last;} - else {$templine = $newline;} - if (right_extention_permission_giver($templine) eq "no") {last;} - } - my @tempfields = split(/\t/,$templine); - $tempfields[$microsatcord] =~ s/\]|\[//g; - $tempfields[$motifcord] =~ s/^\[|\]$//gs; - my @tempmotields = split(/\]\[/,$tempfields[$motifcord]); - - if (scalar(@tempmotields) == 1 && $templine eq $orig_templine) { -# print "scalar ( tempmotields) = 1\n" if $printer == 1; - next; - } - my $prevmotif = shift(@tempmotields); - my $stopper = 0; - - foreach my $tempmot (@tempmotields){ - if (length($tempmot) != length($prevmotif)) {$stopper = 1; last;} - my $search = $prevmotif.$prevmotif; - if ($search !~ /$tempmot/) {$stopper = 1; last;} - $prevmotif = $tempmot; - } - if ( $stopper == 1) { -# print "length tempmot != length prevmotif\n" if $printer == 1; - next; - } - my $lastend = 0; - #---------------------------------------------------------- - my $left_captured = (); my $right_captured = (); - my $left_bp = (); my $right_bp = (); - # print "new startcord = $tempfields[$startcord] , new endcord = $tempfields[$endcord].. orig strts = @orig_starts and orig ends = @orig_ends\n"; - for my $o (0 ... 
$#orig_starts){ -# print "we are talking abut tempstart:$tempfields[$startcord] >= origstart:$lastend && tempstart:$tempfields[$startcord] <= origend: $orig_ends[$o] \n" if $printer == 1; -# print "we are talking abut tempend:$tempfields[$endcord] >= origstart:$lastend && tempstart:$tempfields[$endcord] >= origend: $orig_ends[$o] \n" if $printer == 1; - - if (($tempfields[$startcord] > $lastend) && ($tempfields[$startcord] <= $orig_ends[$o])){ # && ($tempfields[$startcord] != $fields[$startcord]) -# print "motif captured on left is $microields[$o] from $microsat\n" if $printer == 1; - $left_captured = $o; - $left_bp = $orig_ends[$o] - $tempfields[$startcord] + 1; - } - elsif ($tempfields[$endcord] > $lastend && $tempfields[$endcord] <= $orig_ends[$o]){ #&& $tempfields[$endcord] != $fields[$endcord]) -# print "motif captured on right is $microields[$o] from $microsat\n" if $printer == 1; - $right_captured = $o; - $right_bp = $tempfields[$endcord] - $orig_starts[$o] + 1; - } - $lastend = $orig_ends[$o] - } -# print "leftcaptured = $left_captured, right = $right_captured\n" if $printer==1; - my $leftmotif = (); my $left_trashed = (); - if ($tempfields[$startcord] != $fields[$startcord]) { - $leftmotif = $motields[$left_captured]; -# print "$left_captured in @microields: $motields[$left_captured]\n" if $printer == 1; - if ( $left_captured !~ /[0-9]+/) {print $line,"\n", $templine,"\n"; } - $left_trashed = length($microields[$left_captured]) - $left_bp; - } - my $rightmotif = (); my $right_trashed = (); - if ($tempfields[$endcord] != $fields[$endcord]) { -# print "$right_captured in @microields: $motields[$right_captured]\n" if $printer == 1; - $rightmotif = $motields[$right_captured]; - $right_trashed = length($microields[$right_captured]) - $right_bp; - } - - ########## P A R A M S #####################@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@ - $stopper = 0; - my $deletioner = 0; - #if($tempfields[$startcord] != $fields[$startcord]){ -# print "enter left: tempfields,startcord : $tempfields[$startcord] != $absolutstart && left_captured: $left_captured != 0 \n" if $printer==1; - if ($left_captured != 0){ -# print "at line 370, going: 0 ... $left_captured-1 \n" if $printer == 1; - for my $e (0 ... $left_captured-1){ - if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e]) )){ -# print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; - $deletioner++; last; - } - if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){ -# print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; - $deletioner++; last; - } - if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){ -# print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; - $deletioner++; last; - } - } - } - #} -# print "after left search, deletioner = $deletioner\n" if $printer == 1; - if ($deletioner >= 1) { -# print "deletioner = $deletioner\n" if $printer == 1; - next; - } - - $deletioner = 0; - - #if($tempfields[$endcord] != $fields[$endcord]){ -# print "if tempfields endcord: $tempfields[$endcord] != absolutend: $absolutend\n and $right_captured != $#microields\n" if $printer==1; - if ($right_captured != $#microields){ -# print "at line 394, going: $right_captured+1 ... $#microields \n" if $printer == 1; - for my $e ($right_captured+1 ... 
$#microields){ - if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e])) ){ -# print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; - $deletioner++; last; - } - if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){ -# print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; - $deletioner++; last; - } - if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){ -# print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; - $deletioner++; last; - } - } - } - #} -# print "deletioner = $deletioner\n" if $printer == 1; - if ($deletioner >= 1) { - next; - } - my $leftMotifs_notCaptured = (); - my $rightMotifs_notCaptured = (); - - if ($tempfields[$startcord] != $fields[$startcord] ){ - #print "in left params: (length($leftmotif) == 1 && $tempfields[$startcord] != $fields[$startcord]) ... and .... $left_trashed > (1.5* length($leftmotif]) && ($tempfields[$startcord] != $fields[$startcord])\n"; - if (length($leftmotif) == 1 && $left_trashed > 3){ -# print "invaded left motif is long mononucleotide" if $printer == 1; - next; - - } - elsif ((length($leftmotif) != 1 && $left_trashed > ( thrashallow($leftmotif)) && ($tempfields[$startcord] != $fields[$startcord]) ) ){ -# print "invaded left motif too long" if $printer == 1; - next; - } - } - if ($tempfields[$endcord] != $fields[$endcord] ){ - #print "in right params: after $tempfields[$endcord] != $fields[$endcord] ..... (length($rightmotif)==1 && $tempfields[$endcord] != $fields[$endcord]) ... and ... $right_trashed > (1.5* length($rightmotif))\n"; - if (length($rightmotif)==1 && $right_trashed){ -# print "invaded right motif is long mononucleotide" if $printer == 1; - next; - - } - elsif (length($rightmotif) !=1 && ($right_trashed > ( thrashallow($rightmotif)) && $tempfields[$endcord] != $fields[$endcord])){ -# print "invaded right motif too long" if $printer == 1; - next; - - } - } - push @outputs, $templine; - } - if (scalar(@outputs) == 0){ print COMP $line; next;} - # print "outputs are:", join("\n",@outputs),"\n"; - if (scalar(@outputs) == 1){ - my @oields = split(/\t/,$outputs[0]); - my $start = $oields[$startcord]+$mields[$startcord]-1; - my $end = $start+($oields[$endcord]-$oields[$startcord]); - $oields[$startcord] = $start; $oields[$endcord] = $end; - print INT join("\t",@oields), "\n"; - # print CH $line,; - } - if (scalar(@outputs) > 1){ - my $motif_min = 10; - my $chosen_one = $outputs[0]; - foreach my $micro (@outputs){ - my @oields = split(/\t/,$micro); - my $tempmotif = $oields[$motifcord]; - $tempmotif =~ s/^\[|\]$//gs; - my @omots = split(/\]\[/, $tempmotif); - # print "motif_min = $motif_min, current motif = $tempmotif\n"; - my $start = $oields[$startcord]+$mields[$startcord]-1; - my $end = $start+($oields[$endcord]-$oields[$startcord]); - $oields[$startcord] = $start; $oields[$endcord] = $end; - if(length($omots[0]) < $motif_min) { - $chosen_one = join("\t",@oields); - $motif_min = length($omots[0]); - } - } - print INT $chosen_one, "\n"; - # print "chosen one is ".$chosen_one, "\n"; - # print CH $line; - - - } - - } - - } #if (exists $micros{$key}){ - else{ - } - } - close INT; - close COMP; -} -sub left_extender{ - #print "left extender\n"; - my ($line, $seq, $org) = @_; -# print "in left extender... 
line passed = $line and sequence is $seq\n"; - chomp $line; - my @fields = split(/\t/,$line); - my $rstart = $fields[$startcord]; - my $microsat = $fields[$microsatcord]; - $microsat =~ s/\[|\]//g; - my $rend = $rstart + length($microsat)-1; - $microsat =~ s/-//g; - my $motif = $fields[$motifcord]; - my $firstmotif = (); - - if ($motif =~ /^\[/){ - $motif =~ s/^\[//g; - $motif =~ /([a-zA-Z]+)\].*/; - $firstmotif = $1; - } - else {$firstmotif = $motif;} - - #print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n"; - my $leftphase = substr($microsat, 0,length($firstmotif)); - my $phaser = $leftphase.$leftphase; - my @phase = split(/\s*/,$leftphase); - my @phases; - my @copy_phases = @phases; - my $crawler=0; - for (0 ... (length($leftphase)-1)){ - push(@phases, substr($phaser, $crawler, length($leftphase))); - $crawler++; - } - - my $start = $rstart; - my $end = $rend; - - my $leftseq = substr($seq, 0, $start); -# print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n"; - my @extentions = (); - my @trappeds = (); - my @intervalposs = (); - my @trappedposs = (); - my @trappedphases = (); - my @intervals = (); - my $firstmotif_length = length($firstmotif); - foreach my $phase (@phases){ -# print "left phase\t",substr($leftseq, -10),"\t$phase\n"; -# print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n"; - if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){ -# print "in left pattern\n"; - my $trapped = $1; - my $trappedpos = length($leftseq)-length($trapped); - my $interval = $3; - my $intervalpos = index($trapped, $interval) + 1; -# print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n"; - - my $extention = substr($trapped, 0, length($trapped)-length($interval)); - my $leftpeep = substr($seq, 0, ($start-length($trapped))); - my @passed_overhangs; - - for my $i (1 ... length($phase)-1){ - my $overhang = substr($phase, -length($phase)+$i); -# print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n"; - #TEMPORARY... 
BETTER METHOD NEEDED - $leftpeep =~ s/-//g; - if ($leftpeep =~ /$overhang$/i){ - push(@passed_overhangs,$overhang); -# print "l overhang\n"; - } - } - - if(scalar(@passed_overhangs)>0){ - my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)]; - $extention = $overhang.$extention; - $trapped = $overhang.$trapped; - #print "trapped extended to $trapped \n"; - $trappedpos = length($leftseq)-length($trapped); - } - - push(@extentions,$extention); -# print "extentions = @extentions \n"; - - push(@trappeds,$trapped ); - push(@intervalposs,length($extention)+1); - push(@trappedposs, $trappedpos); -# print "trappeds = @trappeds\n"; - push(@trappedphases, substr($extention,0,length($phase))); - push(@intervals, $interval); - } - } - if (scalar(@trappeds == 0)) {return $line;} - - my $nikaal = shortest_array_element(@intervals); - - if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} - $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord]; - ##print "new fields 9 = $fields[9]\n"; - $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]); - - if($fields[$microsatcord] !~ /^\[/i){ - $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; - } - - $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord]; - - if (exists ($fields[$motifcord+1])){ - $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1]; - } - else{$fields[$motifcord+1] = "indel/deletion";} - ##print "new fields 14 = $fields[14]\n"; - - if (exists ($fields[$motifcord+2])){ - $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2]; - } - else{$fields[$motifcord+2] = $intervals[$nikaal];} - my @seventeen=(); - if (exists ($fields[$motifcord+3])){ - @seventeen = split(/,/,$fields[$motifcord+3]); - # #print "scalarseventeen =@seventeen<-\n"; - for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);} - $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen); - $fields[$motifcord+4] = $fields[$motifcord+4]+1; - } - - else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1} - - ##print "new fields 16 = $fields[16]\n"; - ##print "new fields 17 = $fields[17]\n"; - - - my $returnline = join("\t",@fields); - my $pastline = $returnline; - if ($fields[$microsatcord] =~ /\[/){ - $returnline = multiSpecies_compoundClarifyer_merge($returnline); - } - return $returnline; -} -sub right_extender{ - my ($line, $seq, $org) = @_; - chomp $line; - my @fields = split(/\t/,$line); - my $rstart = $fields[$startcord]; - my $microsat = $fields[$microsatcord]; - $microsat =~ s/\[|\]//g; - my $rend = $rstart + length($microsat)-1; - $microsat =~ s/-//g; - my $motif = $fields[$motifcord]; - my $temp_lastmotif = (); - - if ($motif =~ /\]$/s){ - $motif =~ s/\]$//sg; - $motif =~ /.*\[([a-zA-Z]+)/; - $temp_lastmotif = $1; - } - else {$temp_lastmotif = $motif;} - my $lastmotif = substr($microsat,-length($temp_lastmotif)); - ##print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n"; - my $rightphase = substr($microsat, -length($lastmotif)); - my $phaser = $rightphase.$rightphase; - my @phase = split(/\s*/,$rightphase); - my @phases; - my @copy_phases = @phases; - my $crawler=0; - for (0 ... 
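-# Added note (not original code): the loop below builds every cyclic rotation
-# ("phase") of the terminal motif by sliding a motif-length window over the
-# doubled string. E.g. rightphase "ACG" gives phaser "ACGACG" and phases
-# ("ACG","CGA","GAC"); each rotation is then anchored as a regex against the
-# flanking sequence to the right.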
(length($rightphase)-1)){
- push(@phases, substr($phaser, $crawler, length($rightphase)));
- $crawler++;
- }
- 
- my $start = $rstart;
- my $end = $rend;
- 
- my $rightseq = substr($seq, $end+1);
- my @extentions = ();
- my @trappeds = ();
- my @intervalposs = ();
- my @trappedposs = ();
- my @trappedphases = ();
- my @intervals = ();
- my $lastmotif_length = length($lastmotif);
- foreach my $phase (@phases){
- if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){
- my $trapped = $1;
- my $trappedpos = $end+1;
- my $interval = $2;
- my $intervalpos = index($trapped, $interval) + 1;
- 
- my $extention = substr($trapped, length($interval));
- my $rightpeep = substr($seq, ($end+length($trapped))+1);
- my @passed_overhangs = (); # was = "", which pre-seeded the list with an empty element
- 
- #TEMPORARY... BETTER METHOD NEEDED
- $rightpeep =~ s/-//g;
- 
- for my $i (1 ... length($phase)-1){
- my $overhang = substr($phase,0, $i);
-# #print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n";
- if ($rightpeep =~ /^$overhang/i){
- push(@passed_overhangs, $overhang);
-# #print "r overhang\n";
- }
- }
- if (scalar(@passed_overhangs) > 0){
- my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)]; # element lookup, not a one-item slice
- $extention = $extention.$overhang;
- $trapped = $trapped.$overhang;
-# #print "trapped extended to $trapped \n";
- }
- 
- push(@extentions,$extention);
- ##print "extentions = @extentions \n";
- 
- push(@trappeds,$trapped );
- push(@intervalposs,$intervalpos);
- push(@trappedposs, $trappedpos);
-# #print "trappeds = @trappeds\n";
- push(@trappedphases, substr($extention,0,length($phase)));
- push(@intervals, $interval);
- }
- }
- if (scalar(@trappeds) == 0) {return $line;}
- 
-# my $nikaal = longest_array_element(@trappeds);
- my $nikaal = shortest_array_element(@intervals);
- 
-# #print "longest element found = $nikaal \n";
- 
- if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
- $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]";
- ##print "new fields 9 = $fields[9]";
- $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]);
- 
- ##print "new fields 11 = $fields[11]\n";
- 
- if($fields[$microsatcord] !~ /^\[/i){
- $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
- }
- 
- $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]";
- ##print "new fields 12 = $fields[12]\n";
- 
- ##print "scalar of fields = ",scalar(@fields),"\n";
- if (exists ($fields[$motifcord+1])){
-# print " print fields = @fields.. scalar=", scalar(@fields),".. 
motifcord+1 = $motifcord + 1 \n " if !exists $fields[$motifcord+1];
-# <STDIN> if !exists $fields[$motifcord+1];
- $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion";
- }
- else{$fields[$motifcord+1] = "indel/deletion";}
- ##print "new fields 14 = $fields[14]\n";
- 
- if (exists ($fields[$motifcord+2])){
- $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal];
- }
- else{$fields[$motifcord+2] = $intervals[$nikaal];}
- ##print "new fields 15 = $fields[15]\n";
- 
- my @seventeen=();
- if (exists ($fields[$motifcord+3])){
- ##print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n";
-# print " print fields = @fields\n " if !exists $fields[$motifcord+3];
-# <STDIN> if !exists $fields[$motifcord+3]; # debugging pause commented out: a live <STDIN> would stall batch runs
- my $currpos = length($microsat)+$intervalposs[$nikaal];
- $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos;
- $fields[$motifcord+4] = $fields[$motifcord+4]+1;
- 
- }
- 
- else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1}
- 
- ##print "new fields 16 = $fields[16]\n";
- 
- ##print "new fields 17 = $fields[17]\n";
- my $returnline = join("\t",@fields);
- my $pastline = $returnline;
- if ($fields[$microsatcord] =~ /\[/){
- $returnline = multiSpecies_compoundClarifyer_merge($returnline);
- }
- #print "finally right-extended line = ",$returnline,"\n";
- return $returnline;
-}
-sub longest_array_element{
- # returns the index (in the passed list) of the longest element
- my $counter = 0;
- my($max) = shift(@_);
- my $maxcounter = 0;
- foreach my $temp (@_) {
- $counter++;
- #print "finding longest element: $maxcounter \n" if $prinkter == 1;
- if(length($temp) > length($max)){
- $max = $temp;
- $maxcounter = $counter;
- }
- }
- return($maxcounter);
-}
-sub shortest_array_element{
- # returns the index (in the passed list) of the shortest element
- my $counter = 0;
- my($min) = shift(@_);
- my $mincounter = 0;
- foreach my $temp (@_) {
- $counter++;
- #print "finding shortest element: $mincounter \n" if $prinkter == 1;
- if(length($temp) < length($min)){
- $min = $temp;
- $mincounter = $counter;
- }
- }
- return($mincounter);
-}
- 
- 
-sub left_extention_permission_giver{
- my @fields = split(/\t/,$_[0]);
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/(^\[)|-//g;
- my $motif = $fields[$motifcord];
- my $firstmotif = ();
- my $firststretch = ();
- my @stretches=();
- if ($motif =~ /^\[/){
- $motif =~ s/^\[//g;
- $motif =~ /([a-zA-Z]+)\].*/;
- $firstmotif = $1;
- @stretches = split(/\]/,$microsat);
- $firststretch = $stretches[0];
- ##print "firststretch = $firststretch\n";
- }
- else {$firstmotif = $motif;$firststretch = $microsat;}
- 
- if (length($firststretch) < $thresholds[length($firstmotif)]){
- return "no";
- }
- else {return "yes";}
- 
-}
-sub right_extention_permission_giver{
- my @fields = split(/\t/,$_[0]);
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/-|(\]$)//sg;
- my $motif = $fields[$motifcord];
- my $temp_lastmotif = ();
- my $laststretch = ();
- my @stretches=();
- 
- 
- if ($motif =~ /\]/){
- $motif =~ s/\]$//gs;
- $motif =~ /.*\[([a-zA-Z]+)$/;
- $temp_lastmotif = $1;
- @stretches = split(/\[/,$microsat);
- $laststretch = pop(@stretches);
- ##print "last stretch = $laststretch\n";
- }
- else {$temp_lastmotif = $motif; $laststretch = $microsat;}
- 
- if (length($laststretch) < $thresholds[length($temp_lastmotif)]){
- return "no";
- }
- else { return "yes";}
- 
- 
-}
-sub multiSpecies_compoundClarifyer_merge{
- my $line = $_[0];
- #print "sent for merging: $line \n";
- my @mields = split(/\t/,$line);
- my @fields = @mields;
- my $microsat = $fields[$microsatcord];
- my $motifline = $fields[$motifcord];
- my 
$microsatcopy = $microsat; - $microsatcopy =~ s/^\[|\]$//sg; - my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); - my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); - shift @inields; - #print "inields =@inields<\n"; - $motifline =~ s/^\[|\]$//sg; - my @motields = split(/\]\[/,$motifline); - my @firstmotifs = (); - my @lastmotifs = (); - for my $i (0 ... $#microields){ - $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i])); - $lastmotifs[$i] = substr($microields[$i],-length($motields[$i])); - } - #print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n"; - my @mergelist = (); - my @inter_poses = split(/,/,$fields[$interr_poscord]); - my $no_of_interruptions = $fields[$no_of_interruptionscord]; - my @interruptions = split(/,/,$fields[$interrcord]); - my @interrtypes = split(/,/,$fields[$interrtypecord]); - my $stopper = 0; - for my $i (0 ... $#motields-1){ - #print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n"; - if (($lastmotifs[$i] eq $firstmotifs[$i+1]) && !exists $inields[$i]){ - $stopper = 1; - push(@mergelist, ($i)."_".($i+1)); - } - } - - return $line if scalar(@mergelist) == 0; - - foreach my $merging (@mergelist){ - my @sets = split(/_/, $merging); - my @tempmicro = (); - my @tempmot = (); - for my $i (0 ... $sets[0]-1){ - push(@tempmicro, "[".$microields[$i]."]"); - push(@tempmicro, $inields[$i]); - push(@tempmot, "[".$motields[$i]."]"); - #print "adding pre-motifs number $i\n"; - } - my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]"; - push (@tempmicro, $pusher); - push(@tempmot, "[".$motields[$sets[0]]."]"); - my $outcoming = -2; - for my $i ($sets[1]+1 ... $#microields-1){ - push(@tempmicro, "[".$microields[$i]."]"); - push(@tempmicro, $inields[$i]); - push(@tempmot, "[".$motields[$i]."]"); - #print "adding post-motifs number $i\n"; - $outcoming = $i; - } - if ($outcoming != -2){ - #print "outcoming = $outcoming \n"; - push(@tempmicro, "[".$microields[$outcoming+1 ]."]"); - push(@tempmot,"[". $motields[$outcoming+1]."]"); - } - $fields[$microsatcord] = join("",@tempmicro); - $fields[$motifcord] = join("",@tempmot); - - splice(@interrtypes, $sets[0], 1); - $fields[$interrtypecord] = join(",",@interrtypes); - splice(@interruptions, $sets[0], 1); - $fields[$interrcord] = join(",",@interruptions); - splice(@inter_poses, $sets[0], 1); - $fields[$interr_poscord] = join(",",@inter_poses); - $no_of_interruptions = $no_of_interruptions - 1; - } - - if ($no_of_interruptions == 0){ - $fields[$microsatcord] =~ s/^\[|\]$//sg; - $fields[$motifcord] =~ s/^\[|\]$//sg; - $line = join("\t", @fields[0 ... 
$motifcord]); - } - else{ - $line = join("\t", @fields); - } - return $line; -} - -sub thrashallow{ - my $motif = $_[0]; - return 4 if length($motif) == 2; - return 6 if length($motif) == 3; - return 8 if length($motif) == 4; - -} - -#xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx -sub multispecies_filtering_compound_microsats{ - my $unfiltered = $_[0]; - my $filtered = $_[1]; - my $residue = $_[2]; - my $no_of_species = $_[5]; - open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!"; - open(FIL,">$filtered") or die "Cannot open file $filtered: $!"; - open(RES,">$residue") or die "Cannot open file $residue: $!"; - - $infocord = 2 + (4*$no_of_species) - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - - my @sub_thresholds = ("0"); - push(@sub_thresholds, split(/_/,$_[3])); - my @thresholds = ("0"); - push(@thresholds, split(/_/,$_[4])); - - while (my $line = <UNF>) { - if ($line !~ /compound/){ - print FIL $line,"\n"; next; - } - chomp $line; - my @fields = split(/\t/,$line); - my $motifline = $fields[$motifcord]; - $motifline =~ s/^\[|\]$//g; - my @motifs = split(/\]\[/,$motifline); - my $microsat = $fields[$microsatcord]; - $microsat =~ s/^\[|\]$|-//g; - my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat); - - my $stopper = 0; - for my $i (0 ... $#motifs){ - my @common = (); - my $probe = $motifs[$i].$motifs[$i]; - my $motif_size = length($motifs[$i]); - - for my $j (0 ... 
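-# Added note (not original code): rotation-equivalence test. $probe is the
-# motif doubled, so every cyclic rotation of the motif occurs in $probe as a
-# substring; e.g. probe "CACA" matches "AC", so "CA" and "AC" stretches of the
-# same motif size are pooled before their longest run is compared against
-# $sub_thresholds for that motif size.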
$#motifs){ - next if length($motifs[$i]) != length($motifs[$j]); - push(@common, length($microsats[$j])) if $probe =~ /$motifs[$j]/i; - } - - if (largest_microsat(@common) < $sub_thresholds[$motif_size]) {$stopper = 1; last;} - else {next;} - } - - if ($stopper == 1){ - print RES $line,"\n"; - } - else { print FIL $line,"\n"; } - } - close FIL; - close RES; -} - -#xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx - -sub chromosome_unrand_breaker{ -# print "IN chromosome_unrand_breaker: @_\n "; - my $input1 = $_[0]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" - my $dir = $_[1]; ###### directory where subsets are put - my $output2 = $_[2]; ###### list of subset files - my $increment = $_[3]; - my $info = $_[4]; - my $chr = $_[5]; - open(SEQ,"<$input1") or die "Cannot open file $input1 $!"; - - open(OUT,">$output2") or die "Cannot open file $output2 $!"; - - #--------------------------------------------------------------------------------------------------- - # NOW READING THE SEQUENCE FILE - - my $seed = 0; - my $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment); - print OUT $subset,"\n"; - open(SUB,">$subset"); - - while(my $sine = <SEQ>){ - $seed++; - print SUB $sine; - - if ($seed%$increment == 0 ){ - close SUB; - $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment); - open(SUB,">$subset"); - print SUB $sine; - print OUT $subset,"\n"; - # print $subset,"\n"; - } - } - close OUT; - close SUB; -} -#xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx -sub multiSpecies_interruptedMicrosatHunter{ -# print "IN multiSpecies_interruptedMicrosatHunter: @_\n"; - my $input1 = $_[0]; ###### the *_sput_op4_ii file - my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" - my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format - my $output2 = $_[3]; ###### uninterrupted microsatellite file - my $org = $_[4]; - my $no_of_species = $_[5]; - - my @thresholds = "0"; - push(@thresholds, split(/_/,$_[6])); - -# print "thresholds = @thresholds \n"; - $infocord = 2 + (4*$no_of_species) - 1; - $typecord = 2 + (4*$no_of_species) + 1 - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; - - $interr_poscord = $motifcord + 3; - $no_of_interruptionscord = $motifcord + 4; - $interrcord = $motifcord + 2; - $interrtypecord = $motifcord + 1; - - - $prinkter = 0; -# print "prionkytet = $prinkter\n"; - - open(IN,"<$input1") or die "Cannot open file $input1 $!"; - open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; - - open(INT,">$output1") or die "Cannot open file $output2 $!"; - open(UNINT,">$output2") or die "Cannot open file $output2 $!"; - -# print "opened files !!\n"; - my $linecounter = 0; - my $microcounter = 0; - - my 
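-# Added note (not original code): record layout implied by the cord
-# arithmetic above: each record carries four alignment columns per species,
-# then info/type/start/strand/end/microsat/motif columns. With
-# no_of_species = 2 the 0-based indices come out as infocord = 9,
-# startcord = 11, microsatcord = 14, motifcord = 15; sequencepos = 12 indexes
-# the sequence column of the separate sequence lines read from SEQ.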
%micros = (); - while (my $line = <IN>){ - # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; - $linecounter++; - if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s+(chr[0-9a-zA-Z]+)\s([0-9]+)\s+([0-9]+)\s/ ) { - my $key = join("\t",$1, $2, $3, $4, $5); - # print $key, "#-#-#-#-#-#-#-#\n" if $prinkter == 1; - push (@{$micros{$key}},$line); - $microcounter++; - } - else {#print $line if $prinkter == 1; - } - } -# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; - close IN; - my @deletedlines = (); -# print "done hash \n"; - $linecounter = 0; - #--------------------------------------------------------------------------------------------------- - # NOW READING THE SEQUENCE FILE - while(my $sine = <SEQ>){ - #print $linecounter,"\n" if $linecounter % 1000 == 0; - my %microstart=(); - my %microend=(); - my @sields = split(/\t/,$sine); - my $key = (); - if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - $key = join("\t",$1, $2, $3, $4, $5); - # print $key, "<-<-<-<-<-<-<-<\n"; - } - - # $prinkter = 1 if $sine =~ /^>H\t499\t/; - - if (exists $micros{$key}){ - my @microstring = @{$micros{$key}}; - delete $micros{$key}; - my @filteredmicrostring; -# print "sequence = $sields[$sequencepos]" if $prinkter == 1; - foreach my $line (@microstring){ - $linecounter++; - my $copy_line = $line; - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - -# print $line if $prinkter == 1; - #LOOKING FOR LEFTWARD EXTENTION OF MICROSATELLITE - my $newline; - while(1){ - # print "\n before left sequence = $sields[$sequencepos]\n" if $prinkter == 1; - if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;} - - $newline = multiSpecies_interruptedMicrosatHunter_left_extender($line, $sields[$sequencepos],$org); - if ($newline eq $line){$line = $newline; last;} - else {$line = $newline;} - - if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;} -# print "returned line from left extender= $line \n" if $prinkter == 1; - } - while(1){ - # print "sequence = $sields[$sequencepos]\n" if $prinkter == 1; - if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;} - - $newline = multiSpecies_interruptedMicrosatHunter_right_extender($line, $sields[$sequencepos],$org); - if ($newline eq $line){$line = $newline; last;} - else {$line = $newline;} - - if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;} -# print "returned line from right extender= $line \n" if $prinkter == 1; - } -# print "\n>>>>>>>>>>>>>>>>\n In the end, the line is: \n$line\n<<<<<<<<<<<<<<<<\n" if $prinkter == 1; - - my @tempfields = split(/\t/,$line); - if ($tempfields[$microsatcord] =~ /\[/){ - print INT $line,"\n"; - } - else{ - print UNINT $line,"\n"; - } - - if ($line =~ /NULL/){ next; } - push(@filteredmicrostring, $line); - push (@{$microstart{$start}},$line); - push (@{$microend{$end}},$line); - } - - my $firstflag = 'down'; - - } #if (exists $micros{$key}){ - } - close INT; - close UNINT; -# print "final number of lines = $linecounter\n"; -} - -sub multiSpecies_interruptedMicrosatHunter_left_extender{ - my ($line, $seq, $org) = @_; -# print "left extender, like passed = $line\n" if $prinkter == 1; -# print "in left extender... 
line passed = $line and sequence is $seq\n" if $prinkter == 1; - chomp $line; - my @fields = split(/\t/,$line); - my $rstart = $fields[$startcord]; - my $microsat = $fields[$microsatcord]; - $microsat =~ s/\[|\]//g; - my $rend = $rstart + length($microsat)-1; - $microsat =~ s/-//g; - my $motif = $fields[$motifcord]; - my $firstmotif = (); - - if ($motif =~ /^\[/){ - $motif =~ s/^\[//g; - $motif =~ /([a-zA-Z]+)\].*/; - $firstmotif = $1; - } - else {$firstmotif = $motif;} - -# print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n" if $prinkter == 1; - my $leftphase = substr($microsat, 0,length($firstmotif)); - my $phaser = $leftphase.$leftphase; - my @phase = split(/\s*/,$leftphase); - my @phases; - my @copy_phases = @phases; - my $crawler=0; - for (0 ... (length($leftphase)-1)){ - push(@phases, substr($phaser, $crawler, length($leftphase))); - $crawler++; - } - - my $start = $rstart; - my $end = $rend; - - my $leftseq = substr($seq, 0, $start); -# print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n" if $prinkter == 1; - my @extentions = (); - my @trappeds = (); - my @intervalposs = (); - my @trappedposs = (); - my @trappedphases = (); - my @intervals = (); - my $firstmotif_length = length($firstmotif); - foreach my $phase (@phases){ -# print "left phase\t",substr($leftseq, -10),"\t$phase\n" if $prinkter == 1; -# print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n" if $prinkter == 1; - if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){ -# print "in left pattern\n" if $prinkter == 1; - my $trapped = $1; - my $trappedpos = length($leftseq)-length($trapped); - my $interval = $3; - my $intervalpos = index($trapped, $interval) + 1; -# print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n" if $prinkter == 1; - - my $extention = substr($trapped, 0, length($trapped)-length($interval)); - my $leftpeep = substr($seq, 0, ($start-length($trapped))); - my @passed_overhangs; - - for my $i (1 ... length($phase)-1){ - my $overhang = substr($phase, -length($phase)+$i); -# print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n" if $prinkter == 1; - #TEMPORARY... 
BETTER METHOD NEEDED - $leftpeep =~ s/-//g; - if ($leftpeep =~ /$overhang$/i){ - push(@passed_overhangs,$overhang); -# print "l overhang\n" if $prinkter == 1; - } - } - - if(scalar(@passed_overhangs)>0){ - my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)]; - $extention = $overhang.$extention; - $trapped = $overhang.$trapped; -# print "trapped extended to $trapped \n" if $prinkter == 1; - $trappedpos = length($leftseq)-length($trapped); - } - - push(@extentions,$extention); -# print "extentions = @extentions \n" if $prinkter == 1; - - push(@trappeds,$trapped ); - push(@intervalposs,length($extention)+1); - push(@trappedposs, $trappedpos); -# print "trappeds = @trappeds\n" if $prinkter == 1; - push(@trappedphases, substr($extention,0,length($phase))); - push(@intervals, $interval); - } - } - if (scalar(@trappeds == 0)) {return $line;} - -############################ my $nikaal = longest_array_element(@trappeds); - my $nikaal = shortest_array_element(@intervals); - -# print "longest element found = $nikaal \n" if $prinkter == 1; - - if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} - $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord]; - #print "new fields 9 = $fields[9]\n" if $prinkter == 1; - $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]); - - #print "new fields 9 = $fields[9]\n" if $prinkter == 1; - - if($fields[$microsatcord] !~ /^\[/i){ - $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; - } - - $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord]; - #print "new fields 14 = $fields[12]\n" if $prinkter == 1; - - #print "scalar of fields = ",scalar(@fields),"\n" if $prinkter == 1; - - - if (scalar(@fields) > $motifcord+1){ - $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1]; - } - else{$fields[$motifcord+1] = "indel/deletion";} - #print "new fields 14 = $fields[14]\n" if $prinkter == 1; - - if (scalar(@fields)>$motifcord+2){ - $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2]; - } - else{$fields[$motifcord+2] = $intervals[$nikaal];} - #print "new fields 15 = $fields[15]\n" if $prinkter == 1; - - my @seventeen=(); - - if (scalar(@fields)>$motifcord+3){ - @seventeen = split(/,/,$fields[$motifcord+3]); - # print "scalarseventeen =@seventeen<-\n" if $prinkter == 1; - for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);} - $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen); - $fields[$motifcord+4] = $fields[$motifcord+4]+1; - } - - else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1} - - #print "new fields 16 = $fields[16]\n" if $prinkter == 1; - #print "new fields 17 = $fields[17]\n" if $prinkter == 1; - -# return join("\t",@fields); - my $returnline = join("\t",@fields); - my $pastline = $returnline; - if ($fields[$microsatcord] =~ /\[/){ - $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline); - } -# print "finally left-extended line = ",$returnline,"\n" if $prinkter == 1; - return $returnline; -} - -sub multiSpecies_interruptedMicrosatHunter_right_extender{ -# print "right extender\n" if $prinkter == 1; - my ($line, $seq, $org) = @_; -# print "in right extender... 
line passed = $line\n" if $prinkter == 1;
-# print "line = $line, sequence = ",$seq, "\n" if $prinkter == 1;
- chomp $line;
- my @fields = split(/\t/,$line);
- my $rstart = $fields[$startcord];
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/\[|\]//g;
- my $rend = $rstart + length($microsat)-1;
- $microsat =~ s/-//g;
- my $motif = $fields[$motifcord];
- my $temp_lastmotif = ();
- 
- if ($motif =~ /\]$/){
- $motif =~ s/\]$//g;
- $motif =~ /.*\[([a-zA-Z]+)/;
- $temp_lastmotif = $1;
- }
- else {$temp_lastmotif = $motif;}
- my $lastmotif = substr($microsat,-length($temp_lastmotif));
-# print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n" if $prinkter == 1;
- my $rightphase = substr($microsat, -length($lastmotif));
- my $phaser = $rightphase.$rightphase;
- my @phase = split(/\s*/,$rightphase);
- my @phases;
- my @copy_phases = @phases;
- my $crawler=0;
- for (0 ... (length($rightphase)-1)){
- push(@phases, substr($phaser, $crawler, length($rightphase)));
- $crawler++;
- }
- 
- my $start = $rstart;
- my $end = $rend;
- 
- my $rightseq = substr($seq, $end+1);
-# print "length of sequence = " ,length($seq), "the coordinate to start from = ", $end+1, "\n" if $prinkter == 1;
-# print "right phases are @phases , end = $end right sequence = ",substr($rightseq,0,10),"\n" if $prinkter == 1;
- my @extentions = ();
- my @trappeds = ();
- my @intervalposs = ();
- my @trappedposs = ();
- my @trappedphases = ();
- my @intervals = ();
- my $lastmotif_length = length($lastmotif);
- foreach my $phase (@phases){
-# print "right phase\t$phase\t",substr($rightseq,0,10),"\n" if $prinkter == 1;
-# print "search pattern = (([a-zA-Z|-]{0,$lastmotif_length})($phase)+) \n" if $prinkter == 1;
- if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){
-# print "in right pattern\n" if $prinkter == 1;
- my $trapped = $1;
- my $trappedpos = $end+1;
- my $interval = $2;
- my $intervalpos = index($trapped, $interval) + 1;
-# print "trapped = $trapped, interval = $interval\n" if $prinkter == 1;
- 
- my $extention = substr($trapped, length($interval));
- my $rightpeep = substr($seq, ($end+length($trapped))+1);
- my @passed_overhangs = (); # was = "", which pre-seeded the list with an empty element
- 
- #TEMPORARY... BETTER METHOD NEEDED
- $rightpeep =~ s/-//g;
- 
- for my $i (1 ... 
length($phase)-1){
- my $overhang = substr($phase,0, $i);
-# print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n" if $prinkter == 1;
- if ($rightpeep =~ /^$overhang/i){
- push(@passed_overhangs, $overhang);
-# print "r overhang\n" if $prinkter == 1;
- }
- }
- if (scalar(@passed_overhangs) > 0){
- my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)]; # element lookup, not a one-item slice
- $extention = $extention.$overhang;
- $trapped = $trapped.$overhang;
-# print "trapped extended to $trapped \n" if $prinkter == 1;
- }
- 
- push(@extentions,$extention);
- #print "extentions = @extentions \n" if $prinkter == 1;
- 
- push(@trappeds,$trapped );
- push(@intervalposs,$intervalpos);
- push(@trappedposs, $trappedpos);
-# print "trappeds = @trappeds\n" if $prinkter == 1;
- push(@trappedphases, substr($extention,0,length($phase)));
- push(@intervals, $interval);
- }
- }
- if (scalar(@trappeds) == 0) {return $line;}
- 
-################################### my $nikaal = longest_array_element(@trappeds);
- my $nikaal = shortest_array_element(@intervals);
- 
-# print "longest element found = $nikaal \n" if $prinkter == 1;
- 
- if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
- $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]";
- $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]);
- 
- 
- if($fields[$microsatcord] !~ /^\[/i){
- $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
- }
- 
- $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]";
- 
- 
- if (scalar(@fields) > $motifcord+1){
- $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion";
- }
- else{$fields[$motifcord+1] = "indel/deletion";}
- 
- if (scalar(@fields)>$motifcord+2){
- $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal];
- }
- else{$fields[$motifcord+2] = $intervals[$nikaal];}
- 
- my @seventeen=();
- if (scalar(@fields)>$motifcord+3){
- #print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n" if $prinkter == 1;
- my $currpos = length($microsat)+$intervalposs[$nikaal];
- $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos;
- $fields[$motifcord+4] = $fields[$motifcord+4]+1;
- 
- }
- 
- else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1}
- 
-# print "finally right-extended line = ",join("\t",@fields),"\n" if $prinkter == 1;
-# return join("\t",@fields);
- 
- my $returnline = join("\t",@fields);
- my $pastline = $returnline;
- if ($fields[$microsatcord] =~ /\[/){
- $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline);
- }
-# print "finally right-extended line = ",$returnline,"\n" if $prinkter == 1;
- return $returnline;
- 
-}
- 
-sub multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver{
- my @fields = split(/\t/,$_[0]);
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/(^\[)|-//sg;
- my $motif = $fields[$motifcord];
- chomp $motif;
-# print $motif, "\n" if $motif !~ /^\[/;
- my $firstmotif = ();
- my $firststretch = ();
- my @stretches=();
- 
-# print "motif = $motif, microsat = $microsat\n" if $prinkter == 1;
- if ($motif =~ /^\[/){
- $motif =~ s/^\[//sg;
- $motif =~ /([a-zA-Z]+)\].*/;
- $firstmotif = $1;
- @stretches = split(/\]/,$microsat);
- $firststretch = $stretches[0];
- #print "firststretch = $firststretch\n" if $prinkter == 1;
- }
- else {$firstmotif = $motif;$firststretch = $microsat;}
-# print "if length:firststretch - length($firststretch) < 
threshes length :firstmotif ($firstmotif) - $thresholds[length($firstmotif)]\n" if $prinkter == 1;
- if (length($firststretch) < $thresholds[length($firstmotif)]){
- return "no";
- }
- else {return "yes";}
- 
-}
-sub multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver{
- my @fields = split(/\t/,$_[0]);
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/-|(\]$)//sg;
- my $motif = $fields[$motifcord];
- chomp $motif;
- my $temp_lastmotif = ();
- my $laststretch = ();
- my @stretches=();
- 
- 
- if ($motif =~ /\]/){
- $motif =~ s/\]$//sg;
- $motif =~ /.*\[([a-zA-Z]+)$/;
- $temp_lastmotif = $1;
- @stretches = split(/\[/,$microsat);
- $laststretch = pop(@stretches);
- #print "last stretch = $laststretch\n" if $prinkter == 1;
- }
- else {$temp_lastmotif = $motif; $laststretch = $microsat;}
- 
- if (length($laststretch) < $thresholds[length($temp_lastmotif)]){
- return "no";
- }
- else { return "yes";}
- 
- 
-}
-sub checking_substitutions{
- 
- my ($line, $seq, $startprobes, $endprobes) = @_;
- #print "sequence = $seq \n" if $prinkter == 1;
- #print "COMMAND = \n $line, \n $seq, \n $startprobes \n, $endprobes\n";
- # <STDIN>;
- my @seqarray = split(/\s*/,$seq);
- my @startsubst_probes = split(/\|/,$startprobes);
- my @endsubst_probes = split(/\|/,$endprobes);
- chomp $line;
- my @fields = split(/\t/,$line);
- my $start = $fields[11] - $fields[10];
- my $end = $fields[13] - $fields[10];
- my $motif = $fields[9]; #IN FUTURE, USE THIS AS A PROBE, LIKE MOTIF = $FIELDS[9].$FIELDS[9]
- $motif =~ s/\[|\]//g;
- my $microsat = $fields[14];
- $microsat =~ s/\[|\]//g;
- #------------------------------------------------------------------------
- # GETTING START AND END PHASES
- my $startphase = substr($microsat,0, length($motif));
- my $endphase = substr($microsat,-length($motif), length($motif));
- #print "start and end phases are - $startphase and $endphase\n";
- my $startflag = 'down';
- my $endflag = 'down';
- my $substitution_distance = length($motif);
- my $prestart = $start - $substitution_distance;
- my $postend = $end + $substitution_distance;
- my @endadds = ();
- my @startadds = ();
- if (($prestart < 0) || ($postend > scalar(@seqarray))) {
- return; # was "last;", which is invalid outside a loop; bail out of the sub instead
- }
- #------------------------------------------------------------------------#------------------------------------------------------------------------
- # CHECKING FOR SUBSTITUTION PROBES NOW
- 
- if ($fields[8] ne "mononucleotide"){
- while ($startflag eq "down"){
- my $search = join("",@seqarray[$prestart...($start-1)]);
- #print "search is from $prestart...($start-1) = $search\n";
- foreach my $probe (@startsubst_probes){
- #print "\t\tprobe = $probe\n";
- if ($search =~ /^$probe/){
- #print "\tfound addition to the left - $search \n";
- my $copyprobe = $probe;
- my $type;
- my $subspos = 0;
- my $interruption = "";
- if ($search eq $startphase) { $type = "NONE";}
- else{
- $copyprobe =~ s/\[a-zA-Z\]/^/g;
- $subspos = index($copyprobe,"^") + 1;
- $type = "substitution";
- $interruption = substr($search, $subspos,1);
- }
- my $addinfo = join("\t",$prestart, $start, $search, $type, $interruption, $subspos);
- #print "adding information: $addinfo \n";
- push(@startadds, $addinfo);
- $prestart = $prestart - $substitution_distance;
- $start = $start-$substitution_distance;
- $startflag = 'down';
- 
- last;
- }
- else{
- $startflag = 'up';
- }
- }
- }
- #<STDIN>;
- while ($endflag eq "down"){
- my $search = join("",@seqarray[($end+1)...$postend]);
- #print "search is from ($end+1)...$postend] = $search\n";
- 
- foreach my $probe 
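-# Added note (not original code): as implied by the s/\[a-zA-Z\]/^/g below,
-# each probe appears to be the end phase with one position replaced by the
-# wildcard class [a-zA-Z]; a flank that matches a probe but differs from the
-# pure $endphase is recorded as a substitution at the wildcard offset.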
(@endsubst_probes){ - #print "\t\tprobe = $probe\n"; - if ($search =~ /$probe$/){ - my $copyprobe = $probe; - my $type; - my $subspos = 0; - my $interruption = ""; - if ($search eq $endphase) { $type = "NONE";} - else{ - $copyprobe =~ s/\[a-zA-Z\]/^/g; - $subspos = index($copyprobe,"^") + 1; - $type = "substitution"; - $interruption = substr($search, $subspos,1); - } - my $addinfo = join("\t",$end, $postend, $search, $type, $interruption, $subspos); - #print "adding information: $addinfo \n"; - push(@endadds, $addinfo); - $postend = $postend + $substitution_distance; - $end = $end+$substitution_distance; - push(@endadds, $search); - $endflag = 'down'; - last; - } - else{ - $endflag = 'up'; - } - } - } - #print "startadds = @startadds, endadds = @endadds \n"; - - } -} -sub microsat_packer{ - my $microsat = $_[0]; - my $addition = $_[1]; - - - -} -sub multiSpecies_interruptedMicrosatHunter_merge{ - $prinkter = 0; -# print "~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~\n"; - my $line = $_[0]; -# print "sent for mering: $line \n" if $prinkter ==1; - my @mields = split(/\t/,$line); - my @fields = @mields; - my $microsat = allCaps($fields[$microsatcord]); - my $motifline = allCaps($fields[$motifcord]); - my $microsatcopy = $microsat; -# print "microsat = $microsat|\n" if $prinkter ==1; - $microsatcopy =~ s/^\[|\]$//sg; - chomp $microsatcopy; - my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); - my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); - shift @inields; -# print "inields =",join("|",@inields)," microields = ",join("|",@microields)," and count of microields = ", $#microields,"\n" if $prinkter ==1; - $motifline =~ s/^\[|\]$//sg; - my @motields = split(/\]\[/,$motifline); - my @firstmotifs = (); - my @lastmotifs = (); - for my $i (0 ... $#microields){ - $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i])); - $lastmotifs[$i] = substr($microields[$i],-length($motields[$i])); - } -# print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n" if $prinkter ==1; - my @mergelist = (); - my @inter_poses = split(/,/,$fields[$interr_poscord]); - my $no_of_interruptions = $fields[$no_of_interruptionscord]; - my @interruptions = split(/,/,$fields[$interrcord]); - my @interrtypes = split(/,/,$fields[$interrtypecord]); - my $stopper = 0; - for my $i (0 ... $#motields-1){ -# print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n:$lastmotifs[$i] eq $firstmotifs[$i+1]?\n" if $prinkter ==1; - if ((allCaps($lastmotifs[$i]) eq allCaps($firstmotifs[$i+1])) && (!exists $inields[$i] || $inields[$i] !~ /[a-zA-Z]/)){ - $stopper = 1; - push(@mergelist, ($i)."_".($i+1)); #<STDIN> if $prinkter ==1; - } - } - -# print "mergelist = @mergelist\n" if $prinkter ==1; - return $line if scalar(@mergelist) == 0; -# print "merging @mergelist\n" if $prinkter ==1; -# <STDIN> if $prinkter ==1; - - foreach my $merging (@mergelist){ - my @sets = split(/_/, $merging); -# print "sets = @sets\n" if $prinkter ==1; - my @tempmicro = (); - my @tempmot = (); -# print "for loop going from 0 ... ", $sets[0]-1, "\n" if $prinkter ==1; - for my $i (0 ... $sets[0]-1){ -# print " adding pre- i = $i adding: microields= $microields[$i]. 
motields = $motields[$i], inields = |$inields[$i]|\n" if $prinkter ==1; - push(@tempmicro, "[".$microields[$i]."]"); - push(@tempmicro, $inields[$i]); - push(@tempmot, "[".$motields[$i]."]"); -# print "adding pre-motifs number $i\n" if $prinkter ==1; -# print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; - } -# print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; -# print "now pushing ", "[",$microields[$sets[0]]," and ",$microields[$sets[1]],"]\n" if $prinkter ==1; - my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]"; -# print "middle is, from @motields - @sets, number 0 which is is\n"; -# print ": $motields[$sets[0]]\n"; - push (@tempmicro, $pusher); - push(@tempmot, "[".$motields[$sets[0]]."]"); - push (@tempmicro, $inields[$sets[1]]) if $sets[1] != $#microields && exists $sets[1] && exists $inields[$sets[1]]; - my $outcoming = -2; -# print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; -# print "for loop going from ",$sets[1]+1, " ... ", $#microields, "\n" if $prinkter ==1; - for my $i ($sets[1]+1 ... $#microields){ -# print " adding post- i = $i adding: microields= $microields[$i]. motields = $motields[$i]\n" if $prinkter ==1; - push(@tempmicro, "[".$microields[$i]."]") if exists $microields[$i]; - push(@tempmicro, $inields[$i]) unless $i == $#microields || !exists $inields[$i]; - push(@tempmot, "[".$motields[$i]."]"); -# print "adding post-motifs number $i\n" if $prinkter ==1; - $outcoming = $i; - } -# print "____________________________________________________________________________\n"; - $prinkter = 0; - $fields[$microsatcord] = join("",@tempmicro); - $fields[$motifcord] = join("",@tempmot); -# print "tempmot = @tempmot, tempmicro = @tempmicro . microsat = $fields[$microsatcord] and motif = $fields[$motifcord] \n" if $prinkter ==1; - - splice(@interrtypes, $sets[0], 1); - $fields[$interrtypecord] = join(",",@interrtypes); - splice(@interruptions, $sets[0], 1); - $fields[$interrcord] = join(",",@interruptions); - splice(@inter_poses, $sets[0], 1); - $fields[$interr_poscord] = join(",",@inter_poses); - $no_of_interruptions = $no_of_interruptions - 1; - } - - if ($no_of_interruptions == 0 && $line !~ /compound/){ - $fields[$microsatcord] =~ s/^\[|\]$//sg; - $fields[$motifcord] =~ s/^\[|\]$//sg; - $line = join("\t", @fields[0 ... 
$motifcord]); - } - else{ - $line = join("\t", @fields); - } -# print "post merging, the line is $line\n" if $prinkter ==1; - #<STDIN> if $stopper ==1; - return $line; -} -sub interval_asseser{ - my $pre_phase = $_[0]; my $post_phase = $_[1]; my $inter = $_[3]; -} -#--------------------------------------------------------------------------------------------------- -sub allCaps{ - my $motif = $_[0]; - $motif =~ s/a/A/g; - $motif =~ s/c/C/g; - $motif =~ s/t/T/g; - $motif =~ s/g/G/g; - return $motif; -} - - -#xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx chromosome_unrand_breamultiSpecies_interruptedMicrosatHunterker xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx -sub merge_interruptedMicrosats{ -# print "IN merge_interruptedMicrosats: @_\n"; - my $input0 = $_[0]; ######looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2" - my $input1 = $_[1]; ###### the *_sput_op4_ii file - my $input2 = $_[2]; ###### the *_sput_op4_ii file - $no_of_species = $_[3]; - - my $output1 = $_[1]."_separate"; #$_[3]; ###### plain microsatellite file forward - my $output2 = $_[2]."_separate"; ##$_[4]; ###### plain microsatellite file reverse - my $output3 = $_[1]."_merged"; ##$_[5]; ###### plain microsatellite file forward - #my $output4 = $_[2]."_merged"; ##$_[6]; ###### plain microsatellite file reverse - #my $info = $_[4]; - #my @tags = split(/\t/,$info); - - open(SEQ,"<$input0") or die "Cannot open file $input0 $!"; - open(INF,"<$input1") or die "Cannot open file $input1 $!"; - open(INR,"<$input2") or die "Cannot open file $input2 $!"; - open(OUTF,">$output1") or die "Cannot open file $output1 $!"; - open(OUTR,">$output2") or die "Cannot open file $output2 $!"; - open(MER,">$output3") or die "Cannot open file $output3 $!"; - #open(MERR,">$output4") or die "Cannot open file $output4 $!"; - - - - $printer = 0; - -# print "files opened \n"; - $infocord = 2 + (4*$no_of_species) - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - $typecord = $infocord + 1; - my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; - - $interrtypecord = $motifcord + 1; - $interrcord = $motifcord + 2; - $interr_poscord = $motifcord + 3; - $no_of_interruptionscord = $motifcord + 4; - $mergestarts = $no_of_interruptionscord+ 1; - $mergeends = $no_of_interruptionscord+ 2; - $mergemicros = $no_of_interruptionscord+ 3; - - # NOW ADDING FORWARD MICROSATELLITES TO HASH - my %fmicros = (); - my $microcounter=0; - my $linecounter = 0; - while (my $line = <INF>){ - # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; - $linecounter++; - if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $2, $4, $5); - # print $key, "#-#-#-#-#-#-#-#\n"; - push (@{$fmicros{$key}},$line); - $microcounter++; - } - else {print $line;} - } -# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; - close INF; - my @deletedlines = (); -# print "done forward hash \n"; - $linecounter = 0; - #--------------------------------------------------------------------------------------------------- - # NOW ADDING REVERSE MICROSATELLITES TO HASH - my %rmicros 
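-# Added note (not original code): reverse-strand records are keyed exactly
-# like the forward ones above: four of the five header captures (the chr
-# field $3 is skipped) joined by tabs, so microsat records and sequence
-# records for the same block can be matched by a single hash lookup.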
= (); - $microcounter=0; - while (my $line = <INR>){ - # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; - $linecounter++; - if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $2, $4, $5); - # print $key, "#-#-#-#-#-#-#-#\n"; - push (@{$rmicros{$key}},$line); - $microcounter++; - } - else {print "cant make key\n";} - } -# print "number of reverse microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; - close INR; -# print "done reverse hash \n"; - $linecounter = 0; - - #------------------------------------------------------------------------------------------------ - - while(my $sine = <SEQ>){ - #<STDIN> if $sine =~ /16349128/; - next if $sine !~ /[a-zA-Z0-9]/; -# print "-" x 150, "\n" if $printer == 1; - my @sields = split(/\t/,$sine); - my @merged = (); - - my $key = (); - - if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - $key = join("\t",$1, $2, $4, $5); - # print $key, "<-<-<-<-<-<-<-<\n"; - } - # print "key = $key\n"; - - my @sets1; - my @sets2; - chomp $sields[$sequencepos]; - my $rev_sequence = reverse($sields[$sequencepos]); - $rev_sequence =~ s/ //g; - $rev_sequence = " ".$rev_sequence; - next if (!exists $fmicros{$key} && !exists $rmicros{$key}); - - if (exists $fmicros{$key}){ - # print "line no : $linecount\n"; - my @raw_microstring = @{$fmicros{$key}}; - my %starts = (); my %ends = (); -# print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;} - my @microstring=(); - for my $u (0 ... $#raw_microstring){ - my @tields = split(/\t/,$raw_microstring[$u]); - next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; - push(@microstring, $raw_microstring[$u]); - $starts{$tields[$startcord]} = $tields[$startcord]; - $ends{$tields[$endcord]} = $tields[$endcord]; - } - - # print "founf microstring in forward\n: @microstring\n"; - chomp @microstring; - my $clusterresult = (find_clusters(@microstring, $sields[$sequencepos])); - @sets1 = split("\=", $clusterresult); - my @temp = split(/_/,$sets1[0]) ; $microscanned+= scalar(@temp); - # print "sets = ", join("<all\nmerged>", @sets1), "\n<<-sets1\n"; <STDIN>; - } #if (exists $micros{$key}){ - - if (exists $rmicros{$key}){ - # print "line no : $linecount\n"; - my @raw_microstring = @{$rmicros{$key}}; - my %starts = (); my %ends = (); -# print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;} - my @microstring=(); - for my $u (0 ... 
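-# Added note (not original code): this pass deduplicates the raw records; a
-# record is dropped only when BOTH its start and its end coordinates were
-# already seen, so two distinct microsats sharing just one boundary survive.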
$#raw_microstring){ - my @tields = split(/\t/,$raw_microstring[$u]); - next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; - push(@microstring, $raw_microstring[$u]); - $starts{$tields[$startcord]} = $tields[$startcord]; - $ends{$tields[$endcord]} = $tields[$endcord]; - } - # print "founf microstring in reverse\n: @microstring\n"; <STDIN>; - chomp @microstring; - # print "sending reversed sequence\n"; - my $clusterresult = (find_clusters(@microstring, $rev_sequence ) ); - @sets2 = split("\=", $clusterresult); - my @temp = split(/_/,$sets2[0]) ; $microscanned+= scalar(@temp); - } #if (exists $micros{$key}){ - - my @popout1 = (); - my @popout2 = (); - my @forwardset = (); - if (exists $sets2[1] ){ - if(exists $sets1[0]) { - push (@popout1, $sets1[0],$sets2[1]); - my @forwardset = split("=", popOuter(@popout1, $rev_sequence ));# - print OUTF join("\n",split("_", $forwardset[0])), "\n"; - my @localmerged = split("_", $forwardset[1]); - my $sequence = $sields[$sequencepos]; - $sequence =~ s/ //g; - for my $j (0 ... $#localmerged){ - $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence)); - } - - push (@merged, @localmerged); - - } - else{ - my @localmerged = split("_", $sets2[1]); - my $sequence = $sields[$sequencepos]; - $sequence =~ s/ //g; - for my $j (0 ... $#localmerged){ - $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence)); - } - - push (@merged, @localmerged); - } - } - elsif (exists $sets1[0]){ - print OUTF join("\n",split("_", $sets1[0])), "\n"; - } - - my @reverseset= (); - if (exists $sets1[1]){ - if (exists $sets2[0]){ - push (@popout2, $sets2[0],$sets1[1]); - # print "popout2 = @popout2\n"; - my @reverseset = split("=", popOuter(@popout2, $sields[$sequencepos])); - #print "reverseset = $reverseset[1] < --- reverseset1\n"; - print OUTR join("\n",split("_", $reverseset[0])), "\n"; - push(@merged, (split("_", $reverseset[1]))); - } - else{ - push(@merged, (split("_", $sets1[1]))); - } - } - elsif (exists $sets2[0]){ - print OUTR join("\n",split("_", $sets2[0])), "\n"; - - } - - if (scalar @merged > 0){ - my @filtered_merged = split("__",(filterDuplicates_merged(@merged))); - print MER join("\n", @filtered_merged),"\n"; - } - # <STDIN> if $sine =~ /16349128/; - - } - close(SEQ); - close(INF); - close(INR); - close(OUTF); - close(OUTR); - close(MER); - -} -sub find_clusters{ - my @input = @_; - my $sequence = pop(@input); - $sequence =~ s/ //g; - my @microstring0 = @input; -# print "IN: find_clusters:\n"; - my %microstart=(); - my %microend=(); - my @nonmerged = (); - my @mergedSet = (); -# print "set of microsats = @microstring \n"; - my @microstring = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @microstring0; -# print "microstring = ", join("\n",@microstring0) ," \n---->\n", join("\n", @microstring),"\n ,,+." 
if $printer == 1; - #<STDIN> if $printer == 1; - my @tempmicrostring = @microstring; - foreach my $line (@tempmicrostring){ - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/; - # print " starts >>> start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n"; - push (@{$microstart{$start}},$line); - push (@{$microend{$end}},$line); - } - my $firstflag = 'down'; - while( my $line =shift(@microstring)){ -# print "-----------\nline = $line \n" if $printer == 1; - chomp $line; - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/ || $distance !~ /[0-9]+/ ; - my $startmicro = $line; - my $endmicro = $line; -# print "start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n"; - - delete ($microstart{$start}); - delete ($microend{$end}); - my $flag = 'down'; - my $startflag = 'down'; - my $endflag = 'down'; - my $prestart = $start - $distance; - my $postend = $end + $distance; - my @compoundlines = (); - my %compoundhash = (); - push (@compoundlines, $line); - push (@{$compoundhash{$line}},$line); - my $startrank = 1; - my $endrank = 1; - - while( ($startflag eq "down") || ($endflag eq "down") ){ -# print "prestart=$prestart, post end =$postend.. seqlen =", length($sequence)," firstflag = $firstflag \n" if $printer == 1; - if ( (($prestart < 0) && $firstflag eq "up") || (($postend > length($sequence) && $firstflag eq "up")) ){ -# print "coming to the end of sequence,post end = $postend and sequence length =", length($sequence)," so exiting\n" if $printer == 1; - last; - } - - $firstflag = "up"; - if ($startflag eq "down"){ - for my $i ($prestart ... $end){ - if(exists $microend{$i}){ - chomp $microend{$i}[0]; - if(exists $compoundhash{$microend{$i}[0]}) {next;} - chomp $microend{$i}[0]; - push(@compoundlines, $microend{$i}[0]); - my @tields = split(/\t/,$microend{$i}[0]); - $startmicro = $microend{$i}[0]; - chomp $startmicro; - $flag = 'down'; - $startrank++; -# print "deleting $microend{$i}[0] and $microstart{$tields[$startcord]}[0]\n" if $printer == 1; - delete $microend{$i}; - delete $microstart{$tields[$startcord]}; - $end = $tields[$endcord]; - $startflag = 'down'; - $prestart = $tields[$startcord] - $distance; - last; - } - else{ - $flag = 'up'; - $startflag = 'up'; - } - } - } - - if ($endflag eq "down"){ - - for my $i ($start ... 
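-# Added note (not original code): rightward cluster growth. Positions from
-# $start out to $postend (= end + $distance) are probed against %microstart;
-# a hit absorbs that microsat into @compoundlines, pushes $end and $postend
-# further right, and the scan resumes, so chains of microsats closer than
-# $distance apart collapse into one compound candidate.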
$postend){ -# print "$start ----> $i -----> $postend\n" if $printer == 1; - if(exists $microstart{$i} ){ - chomp $microstart{$i}[0]; - if(exists $compoundhash{$microstart{$i}[0]}) {next;} - chomp $microstart{$i}[0]; - push(@compoundlines, $microstart{$i}[0]); - my @tields = split(/\t/,$microstart{$i}[0]); - $endmicro = $microstart{$i}[0]; - $endrank++; - chomp $endmicro; - $flag = 'down'; -# print "deleting $microend{$tields[$endcord]}[0]\n" if $printer == 1; - - delete $microstart{$i} if exists $microstart{$i} ; - delete $microend{$tields[$endcord]} if exists $microend{$tields[$endcord]}; -# print "done\n" if $printer == 1; - - shift @microstring; - $end = $tields[$endcord]; - $postend = $tields[$endcord] + $distance; - $endflag = 'down'; - last; - } - else{ - $flag = 'up'; - $endflag = 'up'; - } -# print "out of the if\n" if $printer == 1; - } -# print "out of the for\n" if $printer == 1; - - } -# print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n"; - } #end while( $flag eq "down") -# print "compoundlines = @compoundlines \n" if $printer == 1; - - if (scalar (@compoundlines) == 1){ - push(@nonmerged, $line); - - } - if (scalar (@compoundlines) > 1){ -# print "FROM CLUSTERER\n" if $printer == 1; - push(@mergedSet,merge_microsats(@compoundlines, $sequence) ); - } - } #end foreach my $line (@microstring){ -# print join("\n",@mergedSet),"<-----mergedSet\n" if $printer == 1; -#<STDIN> if scalar(@mergedSet) > 0; -# print "EXIT: find_clusters\n"; -return (join("_",@nonmerged). "=".join("_",@mergedSet)); -} - -sub custom { - $a->[$startcord+1] <=> $b->[$startcord+1]; -} - -sub popOuter { -# print "\nIN: popOuter @_\n"; - my @all = split ("_",$_[0]); -# <STDIN> if !defined $_[0]; - my @merged = split ("_",$_[1]); - my $sequence = $_[2]; - my $seqlen = length($sequence); -# print "\nIN: popOuter @_\n" if scalar(@_) != 3; -# <STDIN> if scalar(@_) != 3; - my %microstart=(); - my %microend=(); - my @mergedSet = (); - my @nonmerged = (); -# print "\n\n\n all = @all\n<--all\n"; - - foreach my $line (@all){ - my @fields = split(/\t/,$line); - my $start = $seqlen - $fields[$startcord]+ 1; - my $end = $seqlen - $fields[$endcord] + 1; - push (@{$microstart{$start}},$line); - push (@{$microend{$end}},$line); - } - my $firstflag = 'down'; - - my %forPopouting = (); - - while( my $line =shift(@merged)){ -# print "\n MErgedline: $line \n" if $printer == 1; - chomp $line; - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - my $startmicro = $line; - my $endmicro = $line; - - delete ($microstart{$start}); - delete ($microend{$end}); - my $flag = 'down'; - my $startflag = 'down'; - my $endflag = 'down'; - my $prestart = $start - $distance; - my $postend = $end + $distance; - my @compoundlines = (); - my %compoundhash = (); - push (@compoundlines, $line); - my $startrank = 1; - my $endrank = 1; - - # print "\nstart = $start, end = $end\n"; - # <STDIN>; - for my $i ($start ... 
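# Clarifying aside (not part of the original script): find_clusters (ending
# above) and popOuter (continuing below) hand two lists back in one string --
# unmerged records joined by "_", then "=", then merged records joined by
# "_". Callers therefore split on "=" first and on "_" second, e.g. with
# hypothetical variable names:
#   my @sets      = split /=/, find_clusters(@lines, $sequence);
#   my @nonmerged = split /_/, $sets[0];
#   my @merged    = split /_/, $sets[1];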
$end){ - if(exists $microend{$i}){ - # print "\nmicrosat exists: $microend{$i}[0] microsat exists\n"; - chomp $microend{$i}[0]; - my @fields = split(/\t/,$microend{$i}[0]); - delete $microstart{$seqlen - $fields[$startcord] + 1}; - my $invertseq = $sequence; - $invertseq =~ s/ //g; - push(@compoundlines, invert_microsat($microend{$i}[0] , $invertseq )); - delete $microend{$i}; - - } - - if(exists $microstart{$i} ){ - # print "\nmicrosat exists: $microstart{$i}[0] microsat exists\n"; - - chomp $microstart{$i}[0]; - my @fields = split(/\t/,$microstart{$i}[0]); - delete $microend{$seqlen - $fields[$endcord] + 1}; - my $invertseq = $sequence; - $invertseq =~ s/ //g; - push(@compoundlines, invert_microsat($microstart{$i}[0], $invertseq) ); - delete $microstart{$i}; - } - } - - if (scalar (@compoundlines) == 1){ - push(@mergedSet,join("\t",@compoundlines) ); - } - else { -# print "FROM POPOUTER\n" if $printer == 1; - push(@mergedSet, merge_microsats(@compoundlines, $sequence) ); - } - } - - foreach my $key (sort keys %microstart) { - push(@nonmerged,$microstart{$key}[0]); - } - - return (join("_",@nonmerged). "=".join("_",@mergedSet) ); -} - - - -sub invert_justCoordinates{ - my $microsat = $_[0]; -# print "IN invert_justCoordinates\n" if $printer == 1; - chomp $microsat; - my $seqLength = $_[1]; - my @fields = split(/\t/,$microsat); - my $start = $seqLength - $fields[$endcord] + 1; - my $end = $seqLength - $fields[$startcord] + 1; - $fields[$startcord] = $start; - $fields[$endcord] = $end; - $fields[$microsatcord] = reverse_micro($fields[$microsatcord]); -# print "RETURNIG: ", join("\t",@fields), "\n" if $printer == 1; - return join("\t",@fields); -} - -sub largest_number{ - my $counter = 0; - my($max) = shift(@_); - foreach my $temp (@_) { - #print "finding largest array: $maxcounter \n"; - if($temp > $max){ - $max = $temp; - } - } - return($max); -} -sub smallest_number{ - my $counter = 0; - my($min) = shift(@_); - foreach my $temp (@_) { - #print "finding largest array: $maxcounter \n"; - if($temp < $min){ - $min = $temp; - } - } - return($min); -} - - -sub filterDuplicates_merged{ - my @merged = @_; - my %revmerged = (); - my @fmerged = (); - foreach my $micro (@merged) { - my @fields = split(/\t/,$micro); - if ($fields[3] =~ /chr[A-Z0-9a-z]+r/){ - my $key = join("_",$fields[1], $fields[$startcord], $fields[$endcord]); - # print "adding ... \n$key\n$micro\n"; - push(@{$revmerged{$key}}, $micro); - } - else{ - # print "pushing.. 
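# Illustrative sketch (not part of the original script): invert_justCoordinates
# (above) maps 1-based coordinates found on the reverse-complemented sequence
# back onto the forward strand. On a sequence of length L, a repeat spanning
# [s, e] in reverse orientation spans [L - e + 1, L - s + 1] in forward
# orientation; with hypothetical numbers:
{
    my ($L, $s, $e) = (100, 10, 20);
    my ($fwd_s, $fwd_e) = ($L - $e + 1, $L - $s + 1);   # 81 and 91
}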
$micro\n"; - push(@fmerged, $micro); - } - } -# print "\n"; - foreach my $micro (@fmerged) { - my @fields = split(/\t/,$micro); - my $key = join("_",$fields[1], $fields[$startcord], $fields[$endcord]); - # print "searching for key $key\n"; - if (exists $revmerged{$key}){ - # print "deleting $revmerged{$key}[0]\n"; - delete $revmerged{$key}; - } - } - foreach my $key (sort keys %revmerged) { - push(@fmerged,$revmerged{$key}[0]); - } -# print "returning ", join("\n", @fmerged),"\n" ; - return join("__", @fmerged); -} - -sub invert_microsat{ - my $micro = $_[0]; - chomp $micro; - if ($micro =~ /chr[A-Z0-9a-z]+r/) { $micro =~ s/chr([0-9a-b]+)r/chr$1/g ;} - else { $micro =~ s/chr([0-9a-b]+)/chr$1r/g ; } - my $sequence = $_[1]; - $sequence =~ s/ //g; - my $seqlen = length($sequence); - my @fields = split(/\t/,$micro); - my $start = $seqlen - $fields[$endcord] +1; - my $end = $seqlen - $fields[$startcord] +1; - $fields[$startcord] = $start; - $fields[$endcord] = $end; - $fields[$motifcord] = reverse_micro($fields[$motifcord]); - $fields[$microsatcord] = reverse_micro($fields[$microsatcord]); - if ($fields[$typecord] ne "compound" && exists $fields[$no_of_interruptionscord] ){ - my @intertypes = split(/,/,$fields[$interrtypecord]); - my @inters = split(/,/,$fields[$interrcord]); - my @interposes = split(/,/,$fields[$interr_poscord]); - $fields[$interrtypecord] = join(",",reverse(@intertypes)); - $fields[$no_of_interruptionscord] = scalar(@interposes); - for my $i (0 ... $fields[$no_of_interruptionscord]-1){ - if (exists $inters[$i] && $inters[$i] =~ /[a-zA-Z]/){ - $inters[$i] = reverse($inters[$i]); - $interposes[$i] = $interposes[$i] + length($inters[$i]) - 1; - } - else{ - $inters[$i] = ""; - $interposes[$i] = $interposes[$i] - 1; - } - $interposes[$i] = ($end - $start + 1) - $interposes[$i] + 1; - } - - $fields[$interrcord] = join(",",reverse(@inters)); - $fields[$interr_poscord] = join(",",reverse(@interposes)); - } - - my $finalmicrosat = join("\t", @fields); - return $finalmicrosat; - -} -sub reverse_micro{ - my $micro = reverse($_[0]); - my @strand = split(/\s*/,$micro); - for my $i (0 ... 
$#strand){ - if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;} - if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;} - } - return join("",@strand); -} - -#xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx - -sub forward_reverse_sputoutput_comparer { -# print "IN forward_reverse_sputoutput_comparer: @_\n"; - my $input0 = $_[0]; ###### the *nogap_unrand_match file - my $input1 = $_[1]; ###### the real file, *sput* data - my $input2 = $_[2]; ###### the reverse file, *sput* data - my $output1 = $_[3]; ###### microsats different in real file - my $output2 = $_[4]; ###### microsats missing in real file - my $output3 = $_[5]; ###### microsats common among real and reverse file - my $no_of_species = $_[6]; - - $infocord = 2 + (4*$no_of_species) - 1; - $typecord = 2 + (4*$no_of_species) + 1 - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; - $interrtypecord = $motifcord + 1; - $interrcord = $motifcord + 2; - $interr_poscord = $motifcord + 3; - $no_of_interruptionscord = $motifcord + 4; - $mergestarts = $no_of_interruptionscord+ 1; - $mergeends = $no_of_interruptionscord+ 2; - $mergemicros = $no_of_interruptionscord+ 3; - - - open(SEQ,"<$input0") or die "Cannot open file $input0 $!"; - open(INF,"<$input1") or die "Cannot open file $input1 $!"; - open(INR,"<$input2") or die "Cannot open file $input2 $!"; - - open(DIFF,">$output1") or die "Cannot open file $output1 $!"; - #open(MISS,">$output2") or die "Cannot open file $output2 $!"; - open(SAME,">$output3") or die "Cannot open file $output3 $!"; - - -# print "opened files \n"; - my $linecounter = 0; - my $fcounter = 0; - my $rcounter = 0; - - $printer = 0; - #--------------------------------------------------------------------------------------------------- - # NOW ADDING FORWARD MICROSATELLITES TO HASH - my %fmicros = (); - my $microcounter=0; - while (my $line = <INF>){ - $linecounter++; - if ($line =~ /([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); - # print $key, "#-#-#-#-#-#-#-#\n"; - push (@{$fmicros{$key}},$line); - $microcounter++; - } - else { - #print $line; - } - } -# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; - close INF; - my @deletedlines = (); -# print "done forward hash \n"; - $linecounter = 0; - #--------------------------------------------------------------------------------------------------- - # NOW ADDING REVERSE MICROSATELLITES TO HASH - my %rmicros = (); - $microcounter=0; - while (my $line = <INR>){ - $linecounter++; - if ($line =~ /([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); - # print $key, "#-#-#-#-#-#-#-#\n"; - push (@{$rmicros{$key}},$line); - $microcounter++; - } - else {} - } -# print "number 
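# Illustrative sketch (not part of the original script): the *cord variables
# assigned above are 0-based column indices into the tab-delimited microsat
# records, and they shift with the species count because every species
# contributes four leading columns. A helper computing the same offsets (the
# hash keys are hypothetical; the arithmetic mirrors the assignments above):
sub _sketch_column_offsets {
    my $n = shift;                     # number of species in the alignment
    my %c;
    $c{info}     = 2 + 4 * $n - 1;     # last per-species column, 0-based
    $c{type}     = $c{info} + 1;       # repeat type, e.g. "dinucleotide"
    $c{start}    = $c{info} + 2;
    $c{strand}   = $c{info} + 3;
    $c{end}      = $c{info} + 4;
    $c{microsat} = $c{info} + 5;       # the repeat sequence itself
    $c{motif}    = $c{info} + 6;       # the bracketed motif string
    return %c;
}
# e.g. for 3 species this gives start == 15, matching
# $startcord = 2 + (4*3) + 2 - 1 above.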
of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; - close INR; -# print "done reverse hash \n"; - $linecounter = 0; - #--------------------------------------------------------------------------------------------------- - #--------------------------------------------------------------------------------------------------- - # NOW READING THE SEQUENCE FILE - while(my $sine = <SEQ>){ - my %microstart=(); - my %microend=(); - my @sields = split(/\t/,$sine); - my $key = (); - if ($sine =~ /([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); - } - else { - next; - } - $printer = 0; - my $sequence = $sields[$sequencepos]; - chomp $sequence; - $sequence =~ s/ //g; - my @localfs = (); - my @localrs = (); - - if (exists $fmicros{$key}){ - @localfs = @{$fmicros{$key}}; - delete $fmicros{$key}; - } - - my %forwardstarts = (); - my %forwardends = (); - - foreach my $f (@localfs){ - my @fields = split(/\t/,$f); - push (@{$forwardstarts{$fields[$startcord]}},$f); - push (@{$forwardends{$fields[$endcord]}},$fields[$startcord]); - } - - if (exists $rmicros{$key}){ - @localrs = @{$rmicros{$key}}; - delete $rmicros{$key}; - } - else{ - } - - foreach my $r (@localrs){ - chomp $r; - my @rields = split(/\t/,$r); -# print "rields = @rields\n" if $printer == 1; - my $reciprocalstart = length($sequence) - $rields[$endcord] + 1; - my $reciprocalend = length($sequence) - $rields[$startcord] + 1; -# print "reciprocal start = $reciprocalstart = ",length($sequence)," - $rields[$endcord] + 1\n" if $printer == 1; - my $microsat = reverse_micro(all_caps($rields[$microsatcord])); - my @localcollection=(); - for my $i ($reciprocalstart+1 ... $reciprocalend-1){ - if (exists $forwardstarts{$i}){ - push(@localcollection, $forwardstarts{$i}[0] ); - delete $forwardstarts{$i}; - } - if (exists $forwardends{$i}){ - next if !exists $forwardstarts{$forwardends{$i}[0]}; - push(@localcollection, $forwardstarts{$forwardends{$i}[0]}[0] ); - } - } - if (exists $forwardstarts{$reciprocalstart} && exists $forwardends{$reciprocalend}) {push(@localcollection,$forwardstarts{$reciprocalstart}[0]);} - - if (scalar(@localcollection) == 0){ - print SAME invert_microsat($r,($sequence) ), "\n"; - } - - elsif (scalar(@localcollection) == 1){ -# print "f microsat = $localcollection[0]\n" if $printer == 1; - my @lields = split(/\t/,$localcollection[0]); - $lields[$microsatcord]=all_caps($lields[$microsatcord]); -# print "comparing: $microsat and $lields[$microsatcord]\n" if $printer == 1; -# print "coordinates are: $lields[$startcord]-$lields[$endcord] and $reciprocalstart-$reciprocalend\n" if $printer == 1; - if ($microsat eq $lields[$microsatcord]){ - chomp $localcollection[0]; - print SAME $localcollection[0], "\n"; - } - if ($microsat ne $lields[$microsatcord]){ - chomp $localcollection[0]; - my $newmicro = microsatChooser(join("\t",@lields), join("\t",@rields), $sequence); -# print "newmicro = $newmicro\n" if $printer == 1; - if ($newmicro =~ /[a-zA-Z]/){ - print SAME $newmicro,"\n"; - } - else{ - print DIFF join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... 
$#rields] ),"\n"; -# print join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n" if $printer == 1; -# print "@rields\n@lields\n" if $printer == 1; - } - } - } - else{ -# print "multiple found for $r --> ", join("\t",@localcollection),"\n" if $printer == 1; - } - } - } - - close(SEQ); - close(INF); - close(INR); - close(DIFF); - close(SAME); - -} -sub all_caps{ - my @strand = split(/\s*/,$_[0]); - for my $i (0 ... $#strand){ - if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;} - if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;} - if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;} - if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;} - } - return join("",@strand); -} -sub microsatChooser{ - my $forward = $_[0]; - my $reverse = $_[1]; - my $sequence = $_[2]; - my $seqLength = length($sequence); - $sequence =~ s/ //g; - my @fields = split(/\t/,$forward); - my @rields = split(/\t/,$reverse); - my $r_start = $seqLength - $rields[$endcord] + 1; - my $r_end = $seqLength - $rields[$startcord] + 1; - - - my $f_microsat = $fields[$microsatcord]; - my $r_microsat = $rields[$microsatcord]; - - if ($fields[$typecord] =~ /\./ && $rields[$typecord] =~ /\./) { - return $forward if length($f_microsat) >= length($r_microsat); - return invert_microsat($reverse, $sequence) if length($f_microsat) < length($r_microsat); - } - return $forward if all_caps($fields[$motifcord]) eq all_caps($rields[$motifcord]) && $fields[$startcord] == $rields[$startcord] && $fields[$endcord] == $rields[$endcord]; - - my $f_microsat_copy = $f_microsat; - my $r_microsat_copy = $r_microsat; - $f_microsat_copy =~ s/^\[|\]$//g; - $r_microsat_copy =~ s/^\[|\]$//g; - - my @f_microields = split(/\][a-zA-Z]*\[/,$f_microsat_copy); - my @r_microields = split(/\][a-zA-Z]*\[/,$r_microsat_copy); - my @f_intields = split(/\][a-zA-Z]*\[/,$f_microsat_copy); - my @r_intields = split(/\][a-zA-Z]*\[/,$r_microsat_copy); - - my $f_motif = $fields[$motifcord]; - my $r_motif = $rields[$motifcord]; - my $f_motif_copy = $f_motif; - my $r_motif_copy = $r_motif; - $f_motif_copy =~ s/^\[|\]$//g; - $r_motif_copy =~ s/^\[|\]$//g; - - my @f_motields = split(/\]\[/,$f_motif_copy); - my @r_motields = split(/\]\[/,$r_motif_copy); - - my $f_purestretch = join("",@f_microields); - my $r_purestretch = join("",@r_microields); - - if ($fields[$typecord]=~/nucleotide/ && $rields[$typecord]=~/nucleotide/){ -# print "now.. 
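# Clarifying aside (not part of the original script): all_caps (above)
# upper-cases a/c/t/g one character at a time; for plain sequence strings
# Perl's built-in uc() gives the same result, e.g.
#   uc("acGTtag") eq all_caps("acGTtag");   # both yield "ACGTTAG"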
studying $forward\n$reverse\n" if $printer == 1; - if ($fields[$typecord] eq $rields[$typecord]){ -# print "comparing motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1; - - if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){ - my $subset_answer = isSubset($forward, $reverse, $seqLength); -# print "subset answer = $subset_answer\n" if $printer == 1; - return $forward if $subset_answer == 1; - return invert_microsat($reverse, $sequence) if $subset_answer == 2; - return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch); - return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch); - return $forward if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) >= length($r_purestretch); - return invert_microsat($reverse, $sequence) if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) < length($r_purestretch); - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence) if $subset_answer == 3 ; - } - elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){ - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } - elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){ - return $forward; - } - elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){ - return invert_microsat($reverse, $sequence); - } - } - else{ - my $fmotlen = (); - my $rmotlen = (); - $fmotlen =1 if $fields[$typecord] eq "mononucleotide"; - $fmotlen =2 if $fields[$typecord] eq "dinucleotide"; - $fmotlen =3 if $fields[$typecord] eq "trinucleotide"; - $fmotlen =4 if $fields[$typecord] eq "tetranucleotide"; - $rmotlen =1 if $rields[$typecord] eq "mononucleotide"; - $rmotlen =2 if $rields[$typecord] eq "dinucleotide"; - $rmotlen =3 if $rields[$typecord] eq "trinucleotide"; - $rmotlen =4 if $rields[$typecord] eq "tetranucleotide"; - - if ($fmotlen < $rmotlen){ - if (abs($fields[$startcord] - $r_start) <= $fmotlen || abs($fields[$endcord] - $r_end) <= $fmotlen ){ - return $forward; - } - else{ - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } - } - if ($fmotlen > $rmotlen){ - if (abs($fields[$startcord] - $r_start) <= $rmotlen || abs($fields[$endcord] - $r_end) <= $rmotlen){ - return invert_microsat($reverse, $sequence); - } - else{ - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } - } - } - } - if ($fields[$typecord] eq "compound" && $rields[$typecord] eq "compound"){ -# print "comparing compound motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1; - if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){ - my $subset_answer = isSubset($forward, $reverse, $seqLength); -# print "subset answer = $subset_answer\n" if $printer == 1; - return $forward if $subset_answer == 1; - return invert_microsat($reverse, $sequence) if $subset_answer == 2; -# print length($f_purestretch) ,">", length($r_purestretch)," \n" if $printer == 1; - return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch); - return 
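# Clarifying aside (not part of the original script): the repeated
# "$fmotlen =N if ... eq '...nucleotide'" chains above encode a fixed
# type-to-motif-length mapping, equivalent to a lookup table such as
#   my %motif_length = ( mononucleotide => 1, dinucleotide    => 2,
#                        trinucleotide  => 3, tetranucleotide => 4 );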
invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch); - if ($subset_answer == 3){ - if ($fields[$startcord] < $r_start || $fields[$endcord] > $r_end){ - if (abs($fields[$startcord] - $r_start) < length($f_motields[0]) || abs($fields[$endcord] - $r_end) < length($f_motields[$#f_motields]) ){ - return $forward; - } - else{ - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } - } - if ($fields[$startcord] > $r_start || $fields[$endcord] < $r_end){ - if (abs($fields[$startcord] - $r_start) < length($r_motields[0]) || abs($fields[$endcord] - $r_end) < length($r_motields[$#r_motields]) ){ - return invert_microsat($reverse, $sequence); - } - else{ - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } - } - } - } - elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){ - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } - elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){ - return $forward; - } - elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){ - return invert_microsat($reverse, $sequence); - } - - } - - if ($fields[$typecord] eq "compound" && $rields[$typecord] =~ /nucleotide/){ -# print "one compound, one nucleotide\n" if $printer == 1; - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } - if ($fields[$typecord] =~ /nucleotide/ && $rields[$typecord]eq "compound"){ -# print "one compound, one nucleotide\n" if $printer == 1; - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } -} - -sub isSubset{ - my $forward = $_[0]; my @fields = split(/\t/,$forward); - my $reverse = $_[1]; my @rields = split(/\t/,$reverse); - my $seqLength = $_[2]; - my $r_start = $seqLength - $rields[$endcord] + 1; - my $r_end = $seqLength - $rields[$startcord] + 1; -# print "we have $fields[$startcord] -> $fields[$endcord] && $r_start -> $r_end\n" if $printer == 1; - return "0" if $fields[$startcord] == $r_start && $fields[$endcord] == $r_end; - return "1" if $fields[$startcord] <= $r_start && $fields[$endcord] >= $r_end; - return "2" if $r_start <= $fields[$startcord] && $r_end >= $fields[$endcord]; - return "3"; -} - -sub motifBYmotif_match{ - my $forward = $_[0]; - my $reverse = $_[1]; - $forward =~ s/^\[|\]$//g; - $reverse =~ s/^\[|\]$//g; - my @f_motields=split(/\]\[/, $forward); - my @r_motields=split(/\]\[/, $reverse); - my $finalresult = 0; - - if (scalar(@f_motields) != scalar(@r_motields)){ - my $subresult = 0; - my @mega = (); my @sub = (); - @mega = @f_motields if scalar(@f_motields) > scalar(@r_motields); - @sub = @f_motields if scalar(@f_motields) > scalar(@r_motields); - @mega = @r_motields if scalar(@f_motields) < scalar(@r_motields); - @sub = @r_motields if scalar(@f_motields) < scalar(@r_motields); - - for my $i (0 ... $#sub){ - my $probe = $sub[$i].$sub[$i]; -# print "probing $probe and $mega[$i]\n" if $printer == 1; - if ($probe =~ /$mega[$i]/) {$subresult = 1; } - else {$subresult = 0; last; } - } - - return 0 if $subresult == 0; - return 2 if $subresult == 1 && scalar(@f_motields) > scalar(@r_motields); # r is subset of f - return 3 if $subresult == 1 && scalar(@f_motields) < scalar(@r_motields); # ^reverse - - } - else{ - for my $i (0 ... 
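# Clarifying aside (not part of the original script): isSubset (above)
# compares the forward hit with the reverse hit after mapping the reverse
# coordinates onto the forward strand, and returns:
#   0 = identical span,
#   1 = the reverse span lies inside the forward span,
#   2 = the forward span lies inside the reverse span,
#   3 = neither (shifted or partially overlapping spans).
# microsatChooser keeps the enclosing hit for codes 1/2, breaks the tie at
# code 0 by the longer pure stretch, and at code 3 either merges the two
# hits or keeps the longer one, depending on slided_microsat.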
$#f_motields){ - my $probe = $f_motields[$i].$f_motields[$i]; - if ($probe =~ /$r_motields[$i]/) {$finalresult = 1 ;} - else {$finalresult = 0 ;last;} - } - } -# print "finalresult = $finalresult\n" if $printer == 1; - return $finalresult; -} - -sub merge_microsats{ - my @input = @_; - my $sequence = pop(@input); - $sequence =~ s/ //g; - my @seq_string = @input; -# print "IN: merge_microsats\n"; -# print "recieved for merging: ", join("\n", @seq_string), "\nsequence = $sequence\n"; - my $start; - my $end; - my @micros = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @seq_string; -# print "\nrearranged into @micros \n"; - my (@motifs, @microsats, @interruptiontypes, @interruptions, @interrposes, @no_of_interruptions, @types, @starts, @ends, @mergestart, @mergeend, @mergemicro) = (); - my @fields = (); - for my $i (0 ... $#micros){ - chomp $micros[$i]; - @fields = split(/\t/,$micros[$i]); - push(@types, $fields[$typecord]); - push(@motifs, $fields[$motifcord]); - - if (exists $fields[$interrtypecord]){ push(@interruptiontypes, $fields[$interrtypecord]);} - else { push(@interruptiontypes, "NA"); } - if (exists $fields[$interrcord]) {push(@interruptions, $fields[$interrcord]);} - else { push(@interruptions, "NA"); } - if (exists $fields[$interr_poscord]) { push(@interrposes, $fields[$interr_poscord]);} - else { push(@interrposes, "NA"); } - if (exists $fields[$no_of_interruptionscord]) {push(@no_of_interruptions, $fields[$no_of_interruptionscord]);} - else { push(@no_of_interruptions, "NA"); } - if(exists $fields[$mergestarts]) { @mergestart = (@mergestart, split(/\./,$fields[$mergestarts]));} - else { push(@mergestart, $fields[$startcord]); } - if(exists $fields[$mergeends]) { @mergeend = (@mergeend, split(/\./,$fields[$mergeends]));} - else { push(@mergeend, $fields[$endcord]); } - if(exists $fields[$mergemicros]) { push(@mergemicro, $fields[$mergemicros]);} - else { push(@mergemicro, $fields[$microsatcord]); } - - - } - $start = smallest_number(@mergestart); - $end = largest_number(@mergeend); - my $microsat_entry = "[".substr( $sequence, $start-1, ($end - $start + 1) )."]"; - my $microsat = join("\t", @fields[0 ... 
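# Clarifying aside (not part of the original script): merge_microsats
# (beginning above) collapses overlapping hits into one record whose
# type/motif/interruption columns keep the per-hit values joined by ".",
# while the merged repeat itself is re-cut from the sequence between the
# smallest start and the largest end; with hypothetical coordinates,
#   starts (12, 30) and ends (25, 44) give the merged span 12..44, i.e.
#   my $entry = "[" . substr($sequence, 12 - 1, 44 - 12 + 1) . "]";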
$infocord], join(".", @types), $start, $fields[$strandcord], $end, $microsat_entry , join(".", @motifs), join(".", @interruptiontypes),join(".", @interruptions),join(".", @interrposes),join(".", @no_of_interruptions), join(".", @mergestart), join(".", @mergeend) , join(".", @mergemicro)); - return $microsat; -} - -sub slided_microsat{ - my $forward = $_[0]; my @fields = split(/\t/,$forward); - my $reverse = $_[1]; my @rields = split(/\t/,$reverse); - my $seqLength = $_[2]; - my $r_start = $seqLength - $rields[$endcord] + 1; - my $r_end = $seqLength - $rields[$startcord] + 1; - my $motlen =(); - $motlen =1 if $fields[$typecord] eq "mononucleotide"; - $motlen =2 if $fields[$typecord] eq "dinucleotide"; - $motlen =3 if $fields[$typecord] eq "trinucleotide"; - $motlen =4 if $fields[$typecord] eq "tetranucleotide"; - - if (abs($fields[$startcord] - $r_start) < $motlen || abs($fields[$endcord] - $r_end) < $motlen ) { - return 0; - } - else{ - return 1; - } - -} - -#xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx - - - -#xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx -sub new_multispecies_t10{ - my $input1 = $_[0]; #gap_op_unrand_match - my $input2 = $_[1]; #sput - my $output = $_[2]; #output - my $bin = $output."_bin"; - my $orgs = join("|",split(/\./,$_[3])); - my @organisms = split(/\./,$_[3]); - my $no_of_species = scalar(@organisms); #3 - my $t10info = $output."_info"; - $prinkter = 0; - - open (MATCH, "<$input1"); - open (SPUT, "<$input2"); - open (OUT, ">$output"); - open (INFO, ">$t10info"); - - - sub microsat_bracketer; - sub custom; - my %seen = (); - $infocord = 2 + (4*$no_of_species) - 1; - $typecord = 2 + (4*$no_of_species) + 1 - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; - #---------------------------------------------------------------------------------------------------------------# - # MAKING A HASH FROM SPUT, WITH HASH KEYS GENERATED BELOW AND SEQUENCES STORED AS VALUES # - #---------------------------------------------------------------------------------------------------------------# - my $linecounter = 0; - my $microcounter = 0; - while (my $line = <SPUT>){ - chomp $line; - # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n"; - next if $line !~ /[0-9a-z]+/; - $linecounter++; - # my $key = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13); - # print $key, "#-#-#-#-#-#-#-#\n"; - if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $2, $3, $4, $5); -# print "key = $key\n" if $prinkter == 1; - push (@{$seen{$key}},$line); - $microcounter++; - } - else { print "could not make key in SPUT : \n$line \n"; - } - } -# print "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n"; -# print INFO "done hash.. 
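# Illustrative sketch (not part of the original script): both passes above
# and below pair records across files through a composite hash key (block
# id, species, chromosome, start, end) cut from the line header, so a hash
# lookup replaces a quadratic line-by-line search. In isolation, with a
# hypothetical record and hash name:
{
    my %seen_sketch;
    my $hdr = "17\thg18\tchr22\t14430\t15670";
    if ($hdr =~ /^([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/) {
        my $key = join "\t", $1, $2, $3, $4, $5;
        push @{ $seen_sketch{$key} }, $hdr;   # bucket lines from one block
    }
}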
linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n"; - close SPUT; - - #---------------------------------------------------------------------------------------------------------------- - - #-------------------------------------------------------------------------------------------------------# - # THE ENTIRE CODE BELOW IS DEVOTED TO GENERATING HASH KEYS FROM MATCH FOLLOWED BY # - # USING THESE HASH KEYS TO CORRESPOND EACH SEQUENCE IN FIRST FILE TO ITS MICROSAT REPEATS IN # - # SECOND FILE FOLLOWED BY # - # FINDING THE EXACT LOCATION OF EACH MICROSAT REPEAT WITHIN EACH SEQUENCE USING THE 'index' FUNCTION # - #-------------------------------------------------------------------------------------------------------# - my $ref = 0; - my $ref2 = 0; - my $ref3 = 0; - my $ref4 = 0; - my $deletes= 0; - my $duplicates = 0; - my $neighbors = 0; - my $tooshort = 0; - my $prevmicrol=(); - my $startnotfound = 0; - my $matchkeysformed = 0; - my $keysused = 0; - - while (my $line = <MATCH>) { -# print colored ['magenta'], $line if $prinkter == 1; - next if $line !~ /[a-zA-Z0-9]/; - chomp $line; - my @fields2 = split(/\t/,$line); - my $key2 = (); - # $key2 = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13); - if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - $matchkeysformed++; - $key2 = join("\t",$1, $2, $3, $4, $5); -# print "key = $key2 \n" if $prinkter == 1; - } - else{ -# print "could not make ker in SEQ : $line\n"; - next; - } - my $sequence = $fields2[$sequencepos]; - $sequence =~ s/\*/-/g; - my $count = 0; - if (exists $seen{$key2}){ - $keysused++; - my @unsorted_raw = @{$seen{$key2}}; - delete $seen{$key2}; - my @sequencearr = split(/\s*/, $sequence); - -# print "sequencearr = @sequencearr\n" if $prinkter == 1; - - my $counter; - - my %start_database = (); - my %end_database = (); - foreach my $uns (@unsorted_raw){ - my @uields = split(/\t/,$uns); - $start_database{$uields[$startcord]} = $uns; - $end_database{$uields[$endcord]} = $uns; - } - - my @unsorted = (); - my %starts = (); my %ends = (); -# print colored ['yellow'],"unsorted, unfiltered microats = \n" if $prinkter == 1; foreach (@unsorted_raw) {print colored ['blue'],$_,"\n" if $prinkter == 1;} - for my $u (0 ... 
$#unsorted_raw){ - my @tields = split(/\t/,$unsorted_raw[$u]); - next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; - push(@unsorted, $unsorted_raw[$u]); - $starts{$tields[$startcord]} = $unsorted_raw[$u]; -# print "in starts : $tields[$startcord] -> $unsorted_raw[$u]\n" if $prinkter == 1; - } - - my $basecounter= 0; - my $gapcounter = 0; - my $poscounter = 0; - - for my $s (@sequencearr){ - - $poscounter++; - if ($s eq "-"){ - $gapcounter++; next; - } - else{ - $basecounter++; - } - - - #print "s = $s, poscounter = $poscounter, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1; - #print "s = $s, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1; - #print "s = $s, gapcpunter = $gapcounter\n" if $prinkter == 1; - - if (exists $starts{$basecounter}){ - my $locus = $starts{$basecounter}; -# print "locus identified = $locus\n" if $prinkter == 1; - my @fields3 = split(/\t/,$locus); - my $start = $fields3[$startcord]; - my $end = $fields3[$endcord]; - my $motif = $fields3[$motifcord]; - my $microsat = $fields3[$microsatcord]; - my @leftbracketpos = (); - my @rightbracketpos = (); - my $bracket_picker = 'no'; - my $leftbrackets=(); - my $rightbrackets = (); - my $micro_cpy = $microsat; -# print "microsat = $microsat\n" if $prinkter == 1; - while($microsat =~ m/\[/g) {push(@leftbracketpos, (pos($microsat))); $leftbrackets = join("__",@leftbracketpos);$bracket_picker='yes';} - while($microsat =~ m/\]/g) {push(@rightbracketpos, (pos($microsat))); $rightbrackets = join("__",@rightbracketpos);} - $microsat =~ s/[\[\]\-\*]//g; -# print "microsat = $microsat\n" if $prinkter == 1; - my $human_search = join '-*', split //, $microsat; - my $temp = substr($sequence, $poscounter-1); -# print "with poscounter = $poscounter\n" if $prinkter == 1; - my $search_result = (); - my $posnow = (); - while ($temp =~ /($human_search)/gi){ - $search_result = $1; - # $posnow = pos($temp); - last; - } - - my @gapspos = (); - while($search_result =~ m/-/g) {push(@gapspos, (pos($search_result))); } - my $gaps = join("__",@gapspos); - - my $final_microsat = $search_result; - if ($bracket_picker eq "yes"){ - $final_microsat = microsat_bracketer($search_result, $gaps,$leftbrackets,$rightbrackets); - } - - my $outsentence = join("\t",join ("\t",@fields3[0 ... $infocord]),$fields3[$typecord],$fields3[$motifcord],$gapcounter,$poscounter,$fields3[$strandcord],$poscounter + length($search_result) -1 ,$final_microsat); - - if ($bracket_picker eq "yes") { - $outsentence = $outsentence."\t".join("\t",@fields3[($motifcord+1) ... $#fields3]); - } - print OUT $outsentence,"\n"; - } - } - } - } - my $unusedkeys = scalar(keys %seen); - print INFO "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n"; - print INFO "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n"; - print INFO "microsatellites too short = $tooshort\n"; - print INFO "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... 
unusedkeys=$unusedkeys\n"; - - #print "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n"; - #print "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n"; - #print "microsatellites too short = $tooshort\n"; - #print "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n"; - #print "unused keys = \n",join("\n", (keys %seen)),"\n"; - close (MATCH); - close (SPUT); - close (OUT); - close (INFO); -} - -sub microsat_bracketer{ -# print "in bracketer: @_\n"; - my ($microsat, $gapspos, $leftbracketpos, $rightbracketpos) = @_; - my @gaps = split(/__/,$gapspos); - my @lefts = split(/__/,$leftbracketpos); - my @rights = split(/__/,$rightbracketpos); - my @new=(); - my $pure = $microsat; - $pure =~ s/-//g; - my $off = 0; - my $finallength = length($microsat) + scalar(@lefts)+scalar(@rights); - push(@gaps, 0); - push(@lefts,0); - push(@rights,0); - - for my $i (1 ... $finallength){ -# print "1 current i = >$i<>, right = >$rights[0]< gap = $gaps[0] left = >$lefts[0]< and $rights[0] == $i\n"; - if($rights[0] == $i){ - # print "pushed a ]\n"; - push(@new, "]"); - shift(@rights); - push(@rights,0); - for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;} - next; - } - if($gaps[0] == $i){ - # print "pushed a -\n"; - push(@new, "-"); - shift(@gaps); - push(@gaps, 0); - for my $j (0 ... scalar(@rights)-1) {$rights[$j]++;} - for my $j (0 ... scalar(@lefts)-1) {$lefts[$j]++;} - - next; - } - if($lefts[0] == $i){ -# print "pushed a [\n"; - push(@new, "["); - shift(@lefts); - push(@lefts,0); - for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;} - next; - } - else{ - my $pushed = substr($pure,$off,1); - $off++; - push(@new,$pushed ); -# print "pushed an alphabet, now new = @new, pushed = $pushed\n"; - next; - } - } - my $returnmicrosat = join("",@new); -# print "final microsat = $returnmicrosat \n"; - return($returnmicrosat); -} - -#xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx -sub multiSpecies_orthFinder4{ - #print "IN multiSpecies_orthFinder4: @_\n"; - my @handles = (); - #1 SEPT 30TH 2008 - #2 THIS CODE (multiSpecies_orthFinder4.pl) IS BEING MADE SO THAT IN THE REMOVAL OF MICROSATELLITES THAT ARE CLOSER TO EACH OTHER - #3 THAN 50 BP (THE 50BP RADIUS OF EXCLUSION), WE ARE LOOKING ACROSS ALIGNMENT BLOCKS.. AND NOT JUST LOOKING WITHIN THE ALIGNMENT BLOCKS. THIS WILL - #4 POTENTIALLY REMOVE EVEN MORE MICROSATELLITES THAN BEFORE, BUT THIS WILL RESCUE THOSE MICROSATELLITES THAT WERE LOST - #5 DUE TO OUR PREVIOUS REQUIREMENT FROM VERSION 3, THAT MICROSATELLITES THAT ARE CLOSER TO THE BOUNDARY THAN 25 BP NEED TO BE REMOVED - #6 SUCH A REQUIREMENT WAS A CRUDE WAY TO IMPOSE THE ABOVE 50 BP RADIUS OF EXCLUSION ACROSS THE ALIGNMENT BLOCKS WITHOUT ACTUALLY - #7 CHECKING COORDINATES OF THE EXCLUDED MICROSATELLITES. - #8 IN ORDER TO TAKE CARE OF THE CASES WHERE MICROSATELLITES ARE PERILOUSLY CLOSE TO ENDS OF THE ALIGNMENT BLOCKS, WE IMPOSE HERE - #9 A NEW REQUIREMENT THAT FOR A MICROSATELLITE TO BE CONSIDERED, ALL THE SPECIES NEED TO HAVE AT LEAST 10 BP OF NON-MICROSATELLITE SEQUENCE - #10 ON EITHER SIDE OF IT.. GAPLESS. THIS INFORMATION IS STORED IN THE VARIABLE: $FLANK_SUPPORT. 
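# Illustrative sketch (not part of the original script): the MATCH loop
# further above re-anchors microsats reported on ungapped coordinates inside
# the gapped alignment row by walking the row with two counters, one for
# alignment columns and one for real bases. The same mapping in isolation,
# with a hypothetical row:
{
    my $row = "AC--GTA-C";
    my ($col, $base, %ungapped2col) = (0, 0);
    for my $ch (split //, $row) {
        $col++;
        $ungapped2col{ ++$base } = $col if $ch ne "-";
    }
    # $ungapped2col{3} == 5: the third base, G, sits in alignment column 5
}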
THIS PART, INSTEAD OF BEING INCLUDED IN - #11 THIS CODE, WILL BE INCLUDED IN A NEW CODE THAT WE WILL BE WRITING AS PART OF THE PIPELINE: multiSpecies_microsatSetSelector.pl - - #1 trial run: - #2 perl ../../../codes/multiSpecies_orthFinder4.pl /gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr22.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt H.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:C.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:O.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:R.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:M.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2 orth22 hg18:panTro2:ponAbe2:rheMac2:calJac1 50 - - $prinkter=0; - - ############# - my $CLUSTER_DIST = $_[4]; - ############# - - - my $aligns = $_[0]; - my @micros = split(/:/, $_[1]); - my $orth = $_[2]; - #my $not_orth = "notorth"; - @tags = split(/:/, $_[3]); - - $no_of_species=scalar(@tags); - my $junkfile = $orth."_junk"; - #open(JUNK,">$junkfile"); - - #my $info = $output1."_info"; - #print "inputs are : \n"; foreach(@micros){print $_,"\n";} - #print "info = @_\n"; - - - open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!"; - open (ORTH, ">$orth"); - my $output=$orth."_out"; - open (OUTP, ">$output"); - - - #open (NORTH, ">$not_orth"); - #open (INF, ">$info"); - my $i = 0; - foreach my $path (@micros){ - $handles[$i] = IO::Handle->new(); - open ($handles[$i], "<$path") or die "Can't open microsat file $path : $!"; - $i++; - } - - #print "Opened files\n"; - - - $infocord = 2 + (4*$no_of_species) - 1; - $typecord = 2 + (4*$no_of_species) + 1 - 1; - $motifcord = $typecord + 1; - $gapcord = $motifcord+1; - $startcord = $gapcord + 1; - $strandcord = $startcord + 1; - $endcord = $strandcord + 1; - $microsatcord = $endcord + 1; - $sequencepos = 2 + (4*$no_of_species) + 1 -1 ; - #$sequencepos = 17; - # GENERATING HASHES CONTAINING CHIMP AND HUMAN DATA FROM ABOVE FILES - #---------------------------------------------------------------------------------------------------------------- - my @hasharr = (); - foreach my $path (@micros){ - open(READ, "<$path") or die "Cannot open file $path :$!"; - my %single_hash = (); - my $key = (); - my $counter = 0; - while (my $line = <READ>){ - $counter++; - # print $line; - chomp $line; - my @fields1 = split(/\t/,$line); - if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { - $key = join("\t",$1, $2, $4, $5); - -# print "key = : $key\n" if $prinkter == 1; - -# print $line if $prinkter == 1; - push (@{$single_hash{$key}},$line); - } - else{ - # print "microsat line incompatible\n"; - } - } - push @hasharr, {%single_hash}; - # print "@{$single_hash{$key}} \n"; -# print "done $path: counter = $counter\n" if $prinkter == 1; - close READ; - } -# print "Done hashes\n"; - #---------------------------------------------------------------------------------------------------------------- - my $question=(); - #---------------------------------------------------------------------------------------------------------------- - my @contigstarts = (); - my @contigends = (); - - my %contigclusters = (); - my %contigclustersFirstStartOnly=(); - my %contigclustersLastEndOnly=(); - my %contigclustersLastEndLengthOnly=(); - my %contigclustersFirstStartLengthOnly=(); - my %contigpath=(); - my $dotcounter = 0; - while (my $line = <BO>){ -# print "x" x 
60, "\n" if $prinkter == 1; - $dotcounter++; - - - -# print "." if $dotcounter % 100 ==0; -# print "\n" if $dotcounter % 5000 ==0; - next if $line !~ /^[0-9]+/; -# print $line if $prinkter == 1; - chomp $line; - my @fields2 = split(/\t/,$line); - my $key2 = (); - if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { - $key2 = join("\t",$1, $2, $4, $5); - } - else { -# print "seq line $line incompatible\n" if $prinkter == 1; - next;} - - - - - - - my @sequences = (); - for (0 ... $#tags){ - my $seq = <BO>; - # print $seq; - chomp $seq; - push(@sequences , " ".$seq); - } - my @origsequences = @sequences; - my $seqcopy = $sequences[0]; - my @strings = (); - $seqcopy =~ s/[a-zA-Z]|-/x/g; - my @string = split(/\s*/,$seqcopy); - - for my $s (0 ... $#tags){ - $sequences[$s] =~ s/-//g; - $sequences[$s] =~ s/[a-zA-Z]/x/g; - # print "length of sequence = ",length($sequences[$s]),"\n"; - my @tempstring = split(/\s*/,$sequences[$s]); - push(@strings, [@tempstring]) - - } - - my @species_list = (); - my @micro_count = 0; - my @starthash = (); - my $stopper = 1; - my @endhash = (); - - my @currentcontigstarts=(); - my @currentcontigends=(); - my @currentcontigchrs=(); - - for my $i (0 ... $#tags){ -# print "searching for : if exists hasharr: $i : $tags[$i] : $key2 \n" if $prinkter == 1; - my @temparr = (); - - if (exists $hasharr[$i]{$key2}){ - @temparr = @{$hasharr[$i]{$key2}}; - -# print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1; - $line =~ /$tags[$i]\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/; -# print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1; - my $startkey = $1."_".$2; print "adding start key for this alignment block: $startkey to species $tags[$i]\n" if $prinkter == 1; - my $endkey = $1."_".$3; print "adding end key for this alignment block: $endkey to species $tags[$i]\n" if $prinkter == 1; - $contigstarts[$i]{$startkey}= $key2; - $contigends[$i]{$endkey}= $key2; -# print "confirming existence: \n" if $prinkter == 1; -# print "present \n" if exists $contigends[$i]{$endkey} && $prinkter == 1; -# print "absent \n" if !exists $contigends[$i]{$endkey} && $prinkter == 1; - $currentcontigchrs[$i]=$1; - $currentcontigstarts[$i]=$2; - $currentcontigends[$i]=$3; - - } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"} - else { - push (@starthash, {0 => "0"}); - push (@endhash, {0 => "0"}); - $currentcontigchrs[$i] = 0; - next; - } - $stopper = 0; - # print "exists: @temparr\n" if $prinkter == 1; - push(@micro_count, scalar(@temparr)); - push(@species_list, [@temparr]); - my @tempstart = (); my @tempend = (); - my %localends = (); - my %localhash = (); - # print "---------------------------\n"; - - foreach my $templine (@temparr){ -# print "templine = $templine\n" if $prinkter == 1; - my @tields = split(/\t/,$templine); - my $start = $tields[$startcord]; # - $tields[$gapcord]; - my $end = $tields[$endcord]; #- $tields[$gapcord]; - my $realstart = $tields[$startcord]- $tields[$gapcord]; - my $gapsinmicrosat = ($tields[$microsatcord] =~ s/-/-/g); - $gapsinmicrosat = 0 if $gapsinmicrosat !~ /[0-9]+/; - # print "infocord = $infocord typecord = $typecord motifcord = $motifcord gapcord = $gapcord startcord = $startcord strandcord = $strandcord endcord = $endcord microsatcord = $microsatcord sequencepos = $sequencepos\n"; - my $realend = $tields[$endcord]- $tields[$gapcord]- $gapsinmicrosat; - # print "real start = $realstart, realend = $realend \n"; - for my $pos ($realstart ... 
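# Clarifying aside (not part of the original script): the loop being entered
# here "paints" every alignment column covered by a microsat with a tag of
# the form "species:start-end"; painted runs separated by at most
# $CLUSTER_DIST unpainted columns are later fused by findClusters into one
# cluster string such as "0:3-5.1:7-8". In miniature:
#   my @pos = ("x") x 10;               # "x" marks an unpainted column
#   $pos[$_] .= ",0:3-5" for 3 .. 5;    # species 0's repeat
#   $pos[$_] .= ",1:7-8" for 7 .. 8;    # species 1's repeat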
$realend){ $strings[$i][$pos] = $strings[$i][$pos].",".$i.":".$start."-".$end;} - push(@tempstart, $start); - push(@tempend, $end); - $localhash{$start."-".$end} = $templine; - } - push @starthash, {%localhash}; - my $foundclusters =findClusters(join("!",@{$strings[$i]}), $CLUSTER_DIST); - - my @clusters = split(/_/,$foundclusters); - - my $clustno = 0; - - foreach my $cluster (@clusters) { - my @constituenst = split(/,/,$cluster); -# print "clusters returned: @constituenst\n" if $prinkter == 1; - } - - @string = split("_",stringPainter(join("_",@string),$foundclusters)); - - - } - next if $stopper == 1; - -# print colored ['blue'],"FINAL:\n" if $prinkter == 1; - my $finalclusters =findClusters(join("!",@string), 1); -# print colored ['blue'],"----------------------\n" if $prinkter == 1; - my @clusters = split(/,/,$finalclusters); -# print "@string\n" if $prinkter == 1; -# print "@clusters\n" if $prinkter == 1; -# print "------------------------------------------------------------------\n" if $prinkter == 1; - - my $clustno = 0; - - # foreach my $cluster (@clusters) { - # my @constituenst = split(/,/,$cluster); - # print "clusters returned: @constituenst\n"; - # } - - next if (scalar @clusters == 0); - - my @contigcluster=(); - my $clusterno=0; - my @contigClusterstarts=(); - my @contigClusterends = (); - - foreach my $clust (@clusters){ - # print "cluster: $clust\n"; - $clusterno++; - my @localclust = split(/\./, $clust); - my @result = (); - my @starts = (); - my @ends = (); - - for my $i (0 ... $#localclust){ - # print "localclust[$i]: $localclust[$i]\n"; - my @pattern = split(/:/, $localclust[$i]); - my @cords = split(/-/, $pattern[1]); - push (@starts, $cords[0]); - push (@ends, $cords[1]); - } - - my $extremestart = smallest_number(@starts); - my $extremeend = largest_number(@ends); - push(@contigClusterstarts, $extremestart); - push(@contigClusterends, $extremeend); -# print "cluster starts from $extremestart and ends at $extremeend \n" if $prinkter == 1 ; - - foreach my $clustparts (@localclust){ - my @pattern = split(/:/, $clustparts); - # print "printing from pattern: $pattern[1]: $starthash[$pattern[0]]{$pattern[1]}\n"; - push (@result, $starthash[$pattern[0]]{$pattern[1]}); - } - push(@contigcluster, join("\t", @result)); -# print join("\t", @result),"<-result \n" if $prinkter == 1 ; - } - - - my $firstclusterstart = smallest_number(@contigClusterstarts); - my $lastclusterend = largest_number(@contigClusterends); - - - $contigclustersFirstStartOnly{$key2}=$firstclusterstart; - $contigclustersLastEndOnly{$key2} = $lastclusterend; - $contigclusters{$key2}=[ @contigcluster ]; -# print "currentcontigchr are @currentcontigchrs , firstclusterstart = $firstclusterstart, lastclusterend = $lastclusterend\n " if $prinkter == 1; - for my $i (0 ... $#tags){ - #1 check if there exists adjacent alignment block wrt coordinates of this species. - next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block.. - #2 no need to worry about proximity of anything in adjacent block! - - #1 BELOW, the following is really to calclate the distance between the end coordinate of the - #2 cluster and the end of the gap-free sequence of each species. this is so that if an - #3 adjacent alignment block is found lateron, the exact distance between the potentially - #4 adjacent microsat clusters can be found here. the exact start coordinate will be used - #5 immediately below. 
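# Clarifying aside (not part of the original script): the adjacency test
# below treats a block whose focal-species end coordinate equals the current
# block's start coordinate as its left neighbour, and fuses clusters
# straddling the junction when
#     (shortest gap-free tail after the last cluster of the left block)
#   + (shortest gap-free head before the first cluster of the right block)
# is less than $CLUSTER_DIST; e.g. a 12 bp tail and a 30 bp head fuse under
# the 50 bp radius, since 12 + 30 = 42 < 50.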
- # print "full sequence = $origsequences[$i] and its length = ",length($origsequences[$i])," \n" if $prinkter == 1; - - my $species_startsubstring = substr($origsequences[$i], 0, $firstclusterstart); - my $species_endsubstring = (); - - if (length ($origsequences[$i]) <= $lastclusterend+1){ $species_endsubstring = "";} - else{ $species_endsubstring = substr($origsequences[$i], $lastclusterend+1);} - -# print "\nnot defined species_endsubstring...\n" if !defined $species_endsubstring && $prinkter == 1; -# print "for species: $tags[$i]: \n" if $prinkter == 1; - - $species_startsubstring =~ s/-| //g; - $species_endsubstring =~ s/-| //g; - $contigclustersLastEndLengthOnly{$key2}[$i]=length($species_endsubstring); - $contigclustersFirstStartLengthOnly{$key2}[$i]=length($species_startsubstring); - - - -# print "species_startsubstring = $species_startsubstring, and its length =",length($species_startsubstring)," \n" if $prinkter == 1; -# print "species_endsubstring = $species_endsubstring, and its length =",length($species_endsubstring)," \n" if $prinkter == 1; -# print "attaching to contigclustersLastEndOnly: $key2: $i\n" if $prinkter == 1; - -# print "just confirming: $contigclustersLastEndLengthOnly{$key2}[$i] \n" if $prinkter == 1; - - } - - - } -# print "\ndone the job of filling... \n"; - #/////////////////////////////////////////////////////////////////////////////////////// - #/////////////////////////////////////////////////////////////////////////////////////// - #/////////////////////////////////////////////////////////////////////////////////////// - #/////////////////////////////////////////////////////////////////////////////////////// - $prinkter=0; - open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!"; - - my %clusteringpaths=(); - my %clustersholder=(); - my %foundkeys=(); - my %clusteringpathsRev=(); - - - my $totalcount=(); - my $founkeys_enteredcount=(); - my $transfered=0; - my $complete_transfered=0; - my $plain_transfered=0; - my $existing_removed=0; - - while (my $line = <BO>){ -# print "x" x 60, "\n" if $prinkter == 1; - next if $line !~ /^[0-9]+/; - #print $line; - chomp $line; - my @fields2 = split(/\t/,$line); - my $key2 = (); - if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { - $key2 = join("\t",$1, $2, $4, $5); - } - - else {print "seq line $line incompatible\n"; next;} -# print "KEY = : $key2\n" if $prinkter == 1; - - - my @currentcontigstarts=(); - my @currentcontigends=(); - my @currentcontigchrs=(); - my @clusters = (); - my @clusterscopy=(); - if (exists $contigclusters{$key2}){ - @clusters = @{$contigclusters{$key2}}; - @clusterscopy=@clusters; - for my $i (0 ... $#tags){ - # print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1; - if ($line =~ /$tags[$i]\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/){ - # print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1; - my $startkey = $1."_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1; - my $endkey = $1."_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1; - $currentcontigchrs[$i]=$1; - $currentcontigstarts[$i]=$2; - $currentcontigends[$i]=$3; - } - else { - $currentcontigchrs[$i] = 0; - # print "no microsat clusters for $key2\n" if $prinkter == 1; next; - } - } - } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"} - - my @sequences = (); - for (0 ... 
$#tags){ - my $seq = <BO>; - # print $seq; - chomp $seq; - push(@sequences , " ".$seq); - } - - next if scalar @currentcontigchrs == 0; - - # print "contigchrs= @currentcontigchrs \n" if $prinkter == 1; - my %visitedcontigs=(); - - for my $i (0 ... $#tags){ - #1 check if there exists adjacent alignment block wrt coordinates of this species. - next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block.. - #2 no need to worry about proximity of anything in adjacent block! - @clusters=@clusterscopy; - #1 BELOW, the following is really to calclate the distance between the end coordinate of the - #2 cluster and the end of the gap-free sequence of each species. this is so that if an - #3 adjacent alignment block is found lateron, the exact distance between the potentially - #4 adjacent microsat clusters can be found here. the exact start coordinate will be used - #5 immediately below. - my $firstclusterstart = $contigclustersFirstStartOnly{$key2}; - my $lastclusterend = $contigclustersLastEndOnly{$key2}; - - my $key3 = $currentcontigchrs[$i]."_".($currentcontigstarts[$i]); -# print "check if exists $key3 in contigends for $i\n" if $prinkter == 1; - - if (exists($contigends[$i]{$key3}) && !exists $visitedcontigs{$contigends[$i]{$key3}}){ - $visitedcontigs{$contigends[$i]{$key3}} = $contigends[$i]{$key3}; #1 this array keeps track of adjacent contigs that we have already visited, thus saving computational time and potential redundancies# - # print "just checking the hash visitedcontigs: ",$visitedcontigs{$contigends[$i]{$key3}} ,"\n" if $prinkter == 1; - - #1 extract coordinates of the last cluster of this found alignment block -# print "key of the found alignment block = ", $contigends[$i]{$key3},"\n" if $prinkter == 1; - # print "we are trying to mine: contigclustersAllLastEndLengthOnly_raw: $contigends[$i]{$key3}: $i \n" if $prinkter == 1; - # print "EXISTS\n" if exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1; - # print "does NOT EXIST\n" if !exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1; - my @contigclustersAllFirstStartLengthOnly_raw=@{$contigclustersFirstStartLengthOnly{$key2}}; - my @contigclustersAllLastEndLengthOnly_raw=@{$contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}}; - my @contigclustersAllFirstStartLengthOnly=(); my @contigclustersAllLastEndLengthOnly=(); - - for my $val (0 ... $#contigclustersAllFirstStartLengthOnly_raw){ - # print "val = $val\n" if $prinkter == 1; - if (defined $contigclustersAllFirstStartLengthOnly_raw[$val]){ - push(@contigclustersAllFirstStartLengthOnly, $contigclustersAllFirstStartLengthOnly_raw[$val]) if $contigclustersAllFirstStartLengthOnly_raw[$val] =~ /[0-9]+/; - } - } - # print "-----\n" if $prinkter == 1; - for my $val (0 ... $#contigclustersAllLastEndLengthOnly_raw){ - # print "val = $val\n" if $prinkter == 1; - if (defined $contigclustersAllLastEndLengthOnly_raw[$val]){ - push(@contigclustersAllLastEndLengthOnly, $contigclustersAllLastEndLengthOnly_raw[$val]) if $contigclustersAllLastEndLengthOnly_raw[$val] =~ /[0-9]+/; - } - } - - - # print "our two arrays are: starts = <@contigclustersAllFirstStartLengthOnly> ......... 
and ends = <@contigclustersAllLastEndLengthOnly>\n" if $prinkter == 1; - # print "the last cluster's end in that one is: ",smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly)," = ", smallest_number(@contigclustersAllFirstStartLengthOnly)," + ",smallest_number(@contigclustersAllLastEndLengthOnly),"\n" if $prinkter == 1; - - # if ($contigclustersFirstStartLengthOnly{$key2}[$i] + $contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}[$i] < 50){ - if (smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly) < $CLUSTER_DIST){ - my @regurgitate = @{$contigclusters{$contigends[$i]{$key3}}}; - $regurgitate[$#regurgitate]=~s/\n//g; - $regurgitate[$#regurgitate] = $regurgitate[$#regurgitate]."\t".shift(@clusters); - delete $contigclusters{$contigends[$i]{$key3}}; - $contigclusters{$contigends[$i]{$key3}}=[ @regurgitate ]; - delete $contigclusters{$key2}; - $contigclusters{$key2}= [ @clusters ] if scalar(@clusters) >0; - $contigclusters{$key2}= [ "" ] if scalar(@clusters) ==0; - - if (scalar(@clusters) < 1){ - # print "$key2-> $clusteringpaths{$key2} in the loners\n" if exists $foundkeys{$key2}; - $clusteringpaths{$key2}=$contigends[$i]{$key3}; - $clusteringpathsRev{$contigends[$i]{$key3}}=$key2; - print OUTP "$contigends[$i]{$key3} -> $clusteringpathsRev{$contigends[$i]{$key3}}\n"; - # print " clusteringpaths $key2 -> $contigends[$i]{$key3}\n"; - $founkeys_enteredcount-- if exists $foundkeys{$key2}; - $existing_removed++ if exists $foundkeys{$key2}; -# print "$key2->",@{$contigclusters{$key2}},"->>$foundkeys{$key2}\n" if exists $foundkeys{$key2} && $prinkter == 1; - delete $foundkeys{$key2} if exists $foundkeys{$key2}; - $complete_transfered++; - } - else{ - print OUTP "$key2-> 0 not so lonely\n" if !exists $clusteringpathsRev{$key2}; - $clusteringpaths{$key2}=$key2 if !exists $clusteringpaths{$key2}; - $clusteringpathsRev{$key2}=0 if !exists $clusteringpathsRev{$key2}; - - $founkeys_enteredcount++ if !exists $foundkeys{$key2}; - $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2}; - # print "adding foundkeys entry $foundkeys{$key2}\n"; - $transfered++; - } - #$contigclusters{$key2}=[ @contigcluster ]; - } - } - else{ - # print "adjacent block with species $tags[$i] does not exist\n" if $prinkter == 1; - $plain_transfered++; - print OUTP "$key2-> 0 , going straight\n" if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2}; - $clusteringpaths{$key2}=$key2 if exists $contigclusters{$key2} && !exists $clusteringpaths{$key2}; - $clusteringpathsRev{$key2}=0 if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2}; - $founkeys_enteredcount++ if !exists $foundkeys{$key2} && exists $contigclusters{$key2}; - $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2} && exists $contigclusters{$key2}; - # print "adding foundkeys entry $foundkeys{$key2}\n"; - - } - $totalcount++; - - } - - - } - close BO; - #close (NORTH); - #/////////////////////////////////////////////////////////////////////////////////////// - #/////////////////////////////////////////////////////////////////////////////////////// - #/////////////////////////////////////////////////////////////////////////////////////// - #/////////////////////////////////////////////////////////////////////////////////////// - - my $founkeys_count=(); - my $nopath_count=(); - my $pathed_count=0; - foreach my $key2 (keys %foundkeys){ - #print "x" x 60, "\n"; -# print "x" if $dotcounter % 100 ==0; -# print "\n" if 
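# Clarifying aside (not part of the original script): %clusteringpaths and
# %clusteringpathsRev hold forward and backward links between alignment-block
# keys whose clusters were fused, i.e. one doubly linked chain per run of
# adjacent blocks ("0" marks a chain end). The printing pass below walks each
# chain down to its tail via the reverse links and then climbs back up
# collecting clusters, so every fused cluster is emitted exactly once.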
$dotcounter % 5000 ==0; - $founkeys_count++; - my $key = $key2; -# print "$key2 -> $clusteringpaths{$key2}\n" if $prinkter == 1; - if ($clusteringpaths{$key} eq $key){ -# print "printing hit the alignment block immediately... no path needed\n" if $prinkter == 1; - $nopath_count++; - delete $foundkeys{$key2}; - print ORTH join ("\n",@{$contigclusters{$key2}}),"\n"; - } - else{ - my @pool=(); - my $key3=(); - $pathed_count++; -# print "going reverse... clusteringpathsRev, $key = $clusteringpathsRev{$key}\n" if exists $clusteringpathsRev{$key} && $prinkter == 1; -# print "going reverse... clusteringpathsRev $key does not exist\n" if !exists $clusteringpathsRev{$key} && $prinkter == 1; - if ($clusteringpathsRev{$key} eq "0") { - next; - } - else{ - my $yek3 = $clusteringpathsRev{$key}; - my $yek = $key; -# print "caught in the middle of a path, now goin down from $yek to $yek3, which is $clusteringpathsRev{$key} \n" if $prinkter == 1; - while ($yek3 ne "0"){ -# print "$yek->$yek3," if $prinkter == 1; - $yek = $yek3; - $yek3 = $clusteringpathsRev{$yek}; - } -# print "\nfinally reached the end of path: $yek3, and the next in line is $yek, and its up-route is $clusteringpaths{$yek}\n" if $prinkter == 1; - $key3 = $clusteringpaths{$yek}; - $key = $yek; - } - -# print "now that we are at bottom of the path, lets start climbing up again\n" if $prinkter == 1; - - while($key ne $key3){ -# print "KEEY $key->$key3\n" if $prinkter == 1; -# print "our contigcluster = @{$contigclusters{$key}}\n----------\n" if $prinkter == 1; - - if (scalar(@{$contigclusters{$key}}) > 0) {push @pool, @{$contigclusters{$key}}; print "now pool = @pool\n" if $prinkter == 1;} - delete $foundkeys{$key3}; - $key=$key3; - $key3=$clusteringpaths{$key}; - } -# print "\nfinally, adding the first element of path: @{$contigclusters{$key}}\n AND printing the contents:\n" if $prinkter == 1; - my @firstcontig= @{$contigclusters{$key}}; - delete $foundkeys{$key2} if exists $foundkeys{$key2} ; - delete $foundkeys{$key} if exists $foundkeys{$key}; - - unshift @pool, pop @firstcontig; -# print join("\t",@pool),"\n" if $prinkter == 1; - print ORTH join ("\n",@firstcontig),"\n" if scalar(@firstcontig) > 0; - print ORTH join ("\t",@pool),"\n"; - # join(); - } - - } - #close (NORTH); -# print "founkeys_entered =$founkeys_enteredcount, plain_transfered=$plain_transfered,existing_removed=$existing_removed,founkeys_count =$founkeys_count, nopath_count =$nopath_count, transfered = $transfered, complete_transfered = $complete_transfered, totalcount = $totalcount, pathed=$pathed_count\n" if $prinkter == 1; - close (BO); - close (ORTH); - close (OUTP); - return 1; - -} -sub stringPainter{ - my @string = split(/_/,$_[0]); -# print $_[0], " <- in stringPainter\n"; -# print $_[1], " <- in clusters\n"; - - my @clusters = split(/,/, $_[1]); - for my $i (0 ... $#clusters){ - my $cluster = $clusters[$i]; -# print "cluster = $cluster\n"; - my @parts = split(/\./,$cluster); - my @cord = split(/:|-/,shift(@parts)); - my $minstart = $cord[1]; - my $maxend = $cord[2]; -# print "minstart = $minstart , maxend = $maxend\n"; - - for my $j (0 ... $#parts){ -# print "oing thri $parts[$j]\n"; - my @cord = split(/:|-/,$parts[$j]); - $minstart = $cord[1] if $cord[1] < $minstart; - $maxend = $cord[2] if $cord[2] > $maxend; - } -# print "minstart = $minstart , maxend = $maxend\n"; - for my $pos ($minstart ... 
$maxend){ $string[$pos] = $string[$pos].",".$cluster;} - - - } -# print "@string <-done from function stringPainter\n"; - return join("_",@string); -} - -sub findClusters{ - my $continue = 0; - my @mapped_clusters = (); - my $clusterdist = $_[1]; - my $previous = 'x'; - my @localcluster = (); - my $cluster_starts = (); - my $cluster_ends = (); - my $localcluster_start = (); - my $localcluster_end = (); - my @record_cluster = (); - my @string = split(/\!/, $_[0]); - my $zerolength=0; - - for my $pos_pos (1 ... $#string){ - my $pos = $string[$pos_pos]; -# print $pos, "\n"; - if ($continue == 0 && $pos eq "x") {next;} - - if ($continue == 1 && $pos eq "x" && $zerolength <= $clusterdist){ - if ($zerolength == 0) {$localcluster_end = $pos_pos-1}; - $zerolength++; - $continue = 1; - } - - if ($continue == 1 && $pos eq "x" && $zerolength > $clusterdist) { - $zerolength = 0; - $continue = 0; - my %seen; - my @uniqed = grep !$seen{$_}++, @localcluster; -# print "caught cluster : @uniqed \n"; - push(@mapped_clusters, [@uniqed]); -# print "clustered:\n@uniqed\n"; - @localcluster = (); - @record_cluster = (); - - } - - if ($pos ne "x"){ - $zerolength = 0; - $continue = 1; - $pos =~ s/x,//g; - my @entries = split(/,/,$pos); - $localcluster_end = 0; - $localcluster_start = 0; - push(@record_cluster,$pos); - - if ($continue == 0){ - @localcluster = (); - @localcluster = (@localcluster, @entries); - $localcluster_start = $pos_pos; - } - - if ($continue == 1 ) { - @localcluster = (@localcluster, @entries); - } - } - } - - if (scalar(@localcluster) > 0){ - my %seen; - my @uniqed = grep !$seen{$_}++, @localcluster; - # print "caught cluster : @uniqed \n"; - push(@mapped_clusters, [@uniqed]); - # print "clustered:\n@uniqed\n"; - @localcluster = (); - @record_cluster = (); - } - - my @returner = (); - - foreach my $clust (@mapped_clusters){ - my @localclust = @$clust; - my @result = (); - foreach my $clustparts (@localclust){ - push(@result,$clustparts); - } - push(@returner , join(".",@result)); - } -# print "returnig: ", join(",",@returner), "\n"; - return join(",",@returner); -} -#xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.xml --- a/tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ -<tool id="multispecies_orthologous_microsats" name="Extract orthologous microsatellites" version="1.0.1"> - <description> for multiple (>2) species alignments</description> - <command interpreter="perl"> - multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl - $input1 - $out_file1 - $thresholds - $species - "$treedefinition" - $separation - - </command> - <inputs> - <page> - <param format="maf" name="input1" type="data" label="Select MAF alignments"/> - <param name="separation" size="10" type="integer" value="10" label="Minimum base pair distance between adjacent microsatellite blocks" - help="A value of 10 means: Adjacent microsatellites separated by less than 10 base pairs will be excluded from the output."/> - <param name="thresholds" size="15" type="text" value="9,10,12,12" label="Minimum Threshold for the number of repeats for microsatellites" - help="A value of 9,10,12,12 means: All monos having fewer than 9 repeats, dis having fewer than 5 repeats, tris having fewer than 4 repeats, tetras having fewer 
than 3 repeats will be excluded from the output."/> - <param name="species" type="select" label="Select species" display="checkboxes" multiple="true" help="NOTE: Currently users are requested to select one of these three combinations: hg18-panTro2-ponAbe2, hg18-panTro2-ponAbe2-rheMac2 or hg18-panTro2-ponAbe2-rheMac2-calJac1"> - <options> - <filter type="data_meta" ref="input1" key="species" /> - </options> - </param> - <param name="treedefinition" size="200" type="text" value = "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)" label="Tree definition of all species above whether or not selected for microsatellite extraction" - help="For example: ((((hg18,panTro2),ponAbe2),rheMac2),calJac1)"/> - </page> - </inputs> - <outputs> - <data format="txt" name="out_file1" metadata_source="input1"/> - </outputs> - <requirements> - <requirement type="binary">sputnik</requirement> - </requirements> - <tests> - <test> - <param name="input1" value="chr22_5sp.maf"/> - <param name="thresholds" value="9,10,12,12"/> - <param name="species" value="hg18,panTro2,ponAbe2,rheMac2,calJac1"/> - <param name="treedefinition" value="((((hg18, panTro2), ponAbe2), rheMac2), calJac1)"/> - <param name="separation" value="10"/> - <output name="out_file1" file="chr22_5sp.microraw.tabular"/> - </test> - </tests> - - <help> - -.. class:: infomark - -**What it does** - -This tool finds ortholgous microsatellite blocks between aligned species - -</help> - - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/parseMAF_smallIndels.pl --- a/tools/regVariation/parseMAF_smallIndels.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,698 +0,0 @@ -#!/usr/bin/perl -w -# a program to get indels -# input is a MAF format 3-way alignment file -# from 3-way blocks only at this time -# translate seq2, seq3, etc coordinates to + if align orient is reverse complement - -use strict; -use warnings; - -# declare and initialize variables -my $fh; # variable to store filehandle -my $record; -my $offset; -my $library = $ARGV[0]; -my $count = 0; -my $count2 = 0; -my $count3 = 0; -my $count4 = 0; -my $start1 = my $start2 = my $start3 = my $start4 = my $start5 = my $start6 = 0; -my $orient = ""; -my $outgroup = $ARGV[2]; -my $ingroup1 = my $ingroup2 = ""; -my $count_seq1insert = my $count_seq1delete = 0; -my $count_seq2insert = my $count_seq2delete = 0; -my $count_seq3insert = my $count_seq3delete = 0; -my @seq1_insert_lengths = my @seq1_delete_lengths = (); -my @seq2_insert_lengths = my @seq2_delete_lengths = (); -my @seq3_insert_lengths = my @seq3_delete_lengths = (); -my @seq1_insert = my @seq1_delete = my @seq2_insert = my @seq2_delete = my @seq3_insert = my @seq3_delete = (); -my @seq1_insert_startOnly = my @seq1_delete_startOnly = my @seq2_insert_startOnly = my @seq2_delete_startOnly = (); -my @seq3_insert_startOnly = my @seq3_delete_startOnly = (); -my @indels = (); - -# check to make sure correct files -my $usage = "usage: parseMAF_smallIndels.pl [MAF.in] [small_Indels_summary.out] [outgroup]\n"; -die $usage unless @ARGV == 3; - -# perform some standard subroutines -$fh = open_file($library); - -$offset = tell($fh); - -#my $ofile = $ARGV[2]; -#unless (open(OFILE, ">$ofile")){ -# print "Cannot open output file \"$ofile\"\n\n"; -# exit; -#} - -my $ofile2 = $ARGV[1]; -unless (open(OFILE2, ">$ofile2")){ - print "Cannot open output file \"$ofile2\"\n\n"; - exit; -} - - -# header line for output files -#print OFILE "# small indel events, parsed from MAF 3-way alignment file, coords are translated from (-) to 
(+) if necessary\n"; -#print OFILE "#align\tingroup1\tingroup1_coord\tingroup1_orient\tingroup2\tingroup2_coord\tingroup2_orient\toutgroup\toutgroup_coord\toutgroup_orient\tindel_type\n"; - -#print OFILE2 "# small indels summary, parsed from MAF 3-way alignment file, coords are translated from (-) to (+) if necessary\n"; -print OFILE2 "#block\tindel_type\tindel_length\tingroup1\tingroup1_start\tingroup1_end\tingroup1_alignSize\tingroup1_orient\tingroup2\tingroup2_start\tingroup2_end\tingroup2_alignSize\tingroup2_orient\toutgroup\toutgroup_start\toutgroup_end\toutgroup_alignSize\toutgroup_orient\n"; - -# main body of program -while ($record = get_next_record($fh) ){ - if ($record=~ m/\s*##maf(.*)\s*# maf/s){ - next; - } - - my @sequences = get_sequences_within_block($record); - my @seq_info = get_indels_within_block(@sequences); - get_indels_lengths(@seq_info); - - $offset = tell($fh); - $count++; - -} - -get_starts_only(@seq1_insert); -get_starts_only(@seq1_delete); -get_starts_only(@seq2_insert); -get_starts_only(@seq2_delete); -get_starts_only(@seq3_insert); -get_starts_only(@seq3_delete); - -# print some things to keep track of progress -#print "# $library\n"; -#print "# number of records = $count\n"; -#print "# number of sequence \"s\" lines = $count2\n"; -if ($count3 != 0){ - print "Skipped $count3 blocks with only 2 seqs;\n"; -} -#print "# number of records with only h-m = $count4\n\n"; - -print "Ingroup1 = $ingroup1; Ingroup2 = $ingroup2; Outgroup = $outgroup;\n"; -print "# of ingroup1 inserts = $count_seq1insert;\n"; -print "# of ingroup1 deletes = $count_seq1delete;\n"; -print "# of ingroup2 inserts = $count_seq2insert;\n"; -print "# of ingroup2 deletes = $count_seq2delete;\n"; -print "# of outgroup3 inserts = $count_seq3insert;\n"; -print "# of outgroup3 deletes = $count_seq3delete\n"; - - -#close OFILE; - -if ($count == $count3){ - print STDERR "Skipped all blocks since none of them contain 3-way alignments.\n"; - exit -1; -} - -###################SUBROUTINES##################################### - -# subroutine to open file -sub open_file { - my($filename) = @_; - my $fh; - - unless (open($fh, $filename)){ - print "Cannot open file $filename\n"; - exit; - } - return $fh; -} - -# get next record -sub get_next_record { - my ($fh) = @_; - my ($offset); - my ($record) = ""; - my ($save_input_separator) = $/; - - $/ = "a score"; - - $record = <$fh>; - - $/ = $save_input_separator; - return $record; -} - -# get header info -sub get_sequences_within_block{ - my (@alignment) = @_; - my @lines = (); - - my @sequences = (); - - @lines = split ("\n", $record); - foreach (@lines){ - chomp($_); - if (m/^\s*$/){ - next; - } - elsif (m/^\s*=(\d+\.*\d*)/){ - - }elsif (m/^\s*s(.*)$/){ - $count2++; - - push (@sequences,$_); - } - } - return @sequences; -} - -sub get_indels_within_block{ - my (@sequences) = @_; - my $line1 = my $line2 = my $line3 = ""; - my @line1 = my @line2 = my @line3 = (); - my $score = 0; - my $start1 = my $align_length1 = my $end1 = my $seq_length1 = 0; - my $start2 = my $align_length2 = my $end2 = my $seq_length2 = 0; - my $start3 = my $align_length3 = my $end3 = my $seq_length3 = 0; - my $seq1 = my $orient1 = ""; - my $seq2 = my $orient2 = ""; - my $seq3 = my $orient3 = ""; - my $start1_plus = my $end1_plus = 0; - my $start2_plus = my $end2_plus = 0; - my $start3_plus = my $end3_plus = 0; - my @test = (); - my $test = ""; - my $header = ""; - my @header = (); - my $sequence1 = my $sequence2 = my $sequence3 =""; - my @array_return = (); - my $test1 = 0; - my $line1_stat = 
my $line2_stat = my $line3_stat = ""; - - # process 3-way blocks only - if (scalar(@sequences) == 3){ - $line1 = $sequences[0]; - chomp ($line1); - $line2 = $sequences[1]; - chomp ($line2); - $line3 = $sequences[2]; - chomp ($line3); - # check order of sequences and assign uniformly seq1= human, seq2 = chimp, seq3 = macaque - if ($line1 =~ m/$outgroup/){ - $line1_stat = "out"; - $line2=~ s/^\s*//; - $line2 =~ s/\s+/\t/g; - @line2 = split(/\t/, $line2); - if (($ingroup1 eq "") || ($line2[1] =~ m/$ingroup1/)){ - $line2_stat = "in1"; - $line3_stat = "in2"; - } - else{ - $line3_stat = "in1"; - $line2_stat = "in2"; } - } - elsif ($line2 =~ m/$outgroup/){ - $line2_stat = "out"; - $line1=~ s/^\s*//; - $line1 =~ s/\s+/\t/g; - @line1 = split(/\t/, $line1); - if (($ingroup1 eq "") || ($line1[1] =~ m/$ingroup1/)){ - $line1_stat = "in1"; - $line3_stat = "in2"; - } - else{ - $line3_stat = "in1"; - $line1_stat = "in2"; } - } - elsif ($line3 =~ m/$outgroup/){ - $line3_stat = "out"; - $line1=~ s/^\s*//; - $line1 =~ s/\s+/\t/g; - @line1 = split(/\t/, $line1); - if (($ingroup1 eq "") || ($line1[1] =~ m/$ingroup1/)){ - $line1_stat = "in1"; - $line2_stat = "in2"; - } - else{ - $line2_stat = "in1"; - $line1_stat = "in2"; } - } - - #print "# l1 = $line1_stat\n"; - #print "# l2 = $line2_stat\n"; - #print "# l3 = $line3_stat\n"; - my $linei1 = my $linei2 = my $lineo = ""; - my @linei1 = my @linei2 = my @lineo = (); - - if ($line1_stat eq "out"){ - $lineo = $line1; - } - elsif ($line1_stat eq "in1"){ - $linei1 = $line1; - } - else{ - $linei2 = $line1; - } - - if ($line2_stat eq "out"){ - $lineo = $line2; - } - elsif ($line2_stat eq "in1"){ - $linei1 = $line2; - } - else{ - $linei2 = $line2; - } - - if ($line3_stat eq "out"){ - $lineo = $line3; - } - elsif ($line3_stat eq "in1"){ - $linei1 = $line3; - } - else{ - $linei2 = $line3; - } - - $linei1=~ s/^\s*//; - $linei1 =~ s/\s+/\t/g; - @linei1 = split(/\t/, $linei1); - $end1 =($linei1[2]+$linei1[3]-1); - $seq1 = $linei1[1].":".$linei1[3]; - $ingroup1 = (split(/\./, $seq1))[0]; - $start1 = $linei1[2]; - $align_length1 = $linei1[3]; - $orient1 = $linei1[4]; - $seq_length1 = $linei1[5]; - $sequence1 = $linei1[6]; - $test1 = length($sequence1); - my $total_length1 = $test1+$start1; - my @array1 = ($start1,$end1,$orient1,$seq_length1); - ($start1_plus, $end1_plus) = convert_coords(@array1); - - $linei2=~ s/^\s*//; - $linei2 =~ s/\s+/\t/g; - @linei2 = split(/\t/, $linei2); - $end2 =($linei2[2]+$linei2[3]-1); - $seq2 = $linei2[1].":".$linei2[3]; - $ingroup2 = (split(/\./, $seq2))[0]; - $start2 = $linei2[2]; - $align_length2 = $linei2[3]; - $orient2 = $linei2[4]; - $seq_length2 = $linei2[5]; - $sequence2 = $linei2[6]; - my $test2 = length($sequence2); - my $total_length2 = $test2+$start2; - my @array2 = ($start2,$end2,$orient2,$seq_length2); - ($start2_plus, $end2_plus) = convert_coords(@array2); - - $lineo=~ s/^\s*//; - $lineo =~ s/\s+/\t/g; - @lineo = split(/\t/, $lineo); - $end3 =($lineo[2]+$lineo[3]-1); - $seq3 = $lineo[1].":".$lineo[3]; - $start3 = $lineo[2]; - $align_length3 = $lineo[3]; - $orient3 = $lineo[4]; - $seq_length3 = $lineo[5]; - $sequence3 = $lineo[6]; - my $test3 = length($sequence3); - my $total_length3 = $test3+$start3; - my @array3 = ($start3,$end3,$orient3,$seq_length3); - ($start3_plus, $end3_plus) = convert_coords(@array3); - - #print "# l1 = $ingroup1\n"; - #print "# l2 = $ingroup2\n"; - #print "# l3 = $outgroup\n"; - - my $ABC = ""; - my $coord1 = my $coord2 = my $coord3 = 0; - $coord1 = $start1_plus; - $coord2 = $start2_plus; - $coord3 = 
$start3_plus; - - for (my $position = 0; $position < $test1; $position++) { - my $indelType = ""; - my $indel_line = ""; - # seq1 deletes - if ((substr($sequence1,$position,1) eq "-") - && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) - && (substr($sequence3,$position,1) !~ m/[-*\#$?^@]/)){ - $ABC = join("",($ABC,"X")); - my @s = split(/:/, $seq1); - $indelType = $s[0]."_delete"; - - #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; - $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); - push (@indels,$indel_line); - push (@seq1_delete,$indel_line); - $coord2++; $coord3++; - } - # seq2 deletes - elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/) - && (substr($sequence2,$position,1) eq "-") - && (substr($sequence3,$position,1) !~ m/[-*\$?^]/)){ - $ABC = join("",($ABC,"Y")); - my @s = split(/:/, $seq2); - $indelType = $s[0]."_delete"; - #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; - $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); - push (@indels,$indel_line); - push (@seq2_delete,$indel_line); - $coord1++; - $coord3++; - - } - # seq1 inserts - elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/) - && (substr($sequence2,$position,1) eq "-") - && (substr($sequence3,$position,1) eq "-")){ - $ABC = join("",($ABC,"Z")); - my @s = split(/:/, $seq1); - $indelType = $s[0]."_insert"; - #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; - $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); - push (@indels,$indel_line); - push (@seq1_insert,$indel_line); - $coord1++; - } - # seq2 inserts - elsif ((substr($sequence1,$position,1) eq "-") - && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) - && (substr($sequence3,$position,1) eq "-")){ - $ABC = join("",($ABC,"W")); - my @s = split(/:/, $seq2); - $indelType = $s[0]."_insert"; - #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; - $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); - push (@indels,$indel_line); - push (@seq2_insert,$indel_line); - $coord2++; - } - # seq3 deletes - elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/) - && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) - && (substr($sequence3,$position,1) eq "-")){ - $ABC = join("",($ABC,"S")); - my @s = split(/:/, $seq3); - $indelType = $s[0]."_delete"; - #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; - $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); - push (@indels,$indel_line); - push (@seq3_delete,$indel_line); - $coord1++; $coord2++; - } - # seq3 inserts - elsif ((substr($sequence1,$position,1) eq "-") - && (substr($sequence2,$position,1) eq "-") - && (substr($sequence3,$position,1) !~ m/[-*\#$?^@]/)){ - $ABC = join("",($ABC,"T")); - my @s = split(/:/, $seq3); - $indelType = $s[0]."_insert"; - #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; - $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); - 
push (@indels,$indel_line); - push (@seq3_insert,$indel_line); - $coord3++; - }else{ - $ABC = join("",($ABC,"N")); - $coord1++; $coord2++; $coord3++; - } - - } - @array_return=($seq1,$seq2,$seq3,$ABC); - return (@array_return); - - } - # ignore pairwise cases for now, just count the number of blocks - elsif (scalar(@sequences) == 2){ - my $ABC = ""; - my $coord1 = my $coord2 = my $coord3 = 0; - $count3++; - - $line1 = $sequences[0]; - $line2 = $sequences[1]; - chomp ($line1); - chomp ($line2); - - if ($line2 !~ m/$ingroup2/){ - $count4++; - } - } -} - - -sub get_indels_lengths{ - my (@array) = @_; - if (scalar(@array) == 4){ - my $seq1 = $array[0]; my $seq2 = $array[1]; my $seq3 = $array[2]; my $ABC = $array[3]; - - while ($ABC =~ m/(X+)/g) { - push (@seq1_delete_lengths,length($1)); - $count_seq1delete++; - } - - while ($ABC =~ m/(Y+)/g) { - push (@seq2_delete_lengths,length($1)); - $count_seq2delete++; - } - while ($ABC =~ m/(S+)/g) { - push (@seq3_delete_lengths,length($1)); - $count_seq3delete++; - } - while ($ABC =~ m/(Z+)/g) { - push (@seq1_insert_lengths,length($1)); - $count_seq1insert++; - } - while ($ABC =~ m/(W+)/g) { - push(@seq2_insert_lengths,length($1)); - $count_seq2insert++; - } - while ($ABC =~ m/(T+)/g) { - push (@seq3_insert_lengths,length($1)); - $count_seq3insert++; - } - } - elsif (scalar(@array) == 0){ - next; - } - -} -# convert to coordinates to + strand if align orient = - -sub convert_coords{ - my (@array) = @_; - my $s = $array[0]; - my $e = $array[1]; - my $o = $array[2]; - my $l = $array[3]; - my $start_plus = my $end_plus = 0; - - if ($o eq "-"){ - $start_plus = ($l - $e); - $end_plus = ($l - $s); - }elsif ($o eq "+"){ - $start_plus = $s; - $end_plus = $e; - } - - return ($start_plus, $end_plus); -} - -# get first line only for each event -sub get_starts_only{ - my (@test) = @_; - my $seq1 = my $seq2 = my $seq3 = my $indelType = my $old_seq1 = my $old_seq2 = my $old_seq3 = my $old_indelType = my $old_line = ""; - my $coord1 = my $coord2 = my $coord3 = my $old_coord1 = my $old_coord2 = my $old_coord3 = 0; - - my @matches = (); - my @seq1_insert = my @seq1_delete = my @seq2_insert = my @seq2_delete = my @seq3_insert = my @seq3_delete = (); - - - foreach my $line (@test){ - chomp($line); - $line =~ s/^\s*//; - $line =~ s/\s+/\t/g; - my @line1 = split(/\t/, $line); - $seq1 = $line1[1]; - $coord1 = $line1[2]; - $seq2 = $line1[4]; - $coord2 = $line1[5]; - $seq3 = $line1[7]; - $coord3 = $line1[8]; - $indelType = $line1[10]; - if ($indelType =~ m/$ingroup1/ && $indelType =~ m/insert/){ - if ($coord1 != $old_coord1+1 || ($coord2 != $old_coord2 || $coord3 != $old_coord3)){ - $start1++; - push (@seq1_insert_startOnly,$line); - } - } - elsif ($indelType =~ m/$ingroup1/ && $indelType =~ m/delete/){ - if ($coord1 != $old_coord1 || ($coord2 != $old_coord2+1 || $coord3 != $old_coord3+1)){ - $start2++; - push(@seq1_delete_startOnly,$line); - } - } - elsif ($indelType =~ m/$ingroup2/ && $indelType =~ m/insert/){ - if ($coord2 != $old_coord2+1 || ($coord1 != $old_coord1 || $coord3 != $old_coord3)){ - $start3++; - push(@seq2_insert_startOnly,$line); - } - } - elsif ($indelType =~ m/$ingroup2/ && $indelType =~ m/delete/){ - if ($coord2 != $old_coord2 || ($coord1 != $old_coord1+1 || $coord3 != $old_coord3+1)){ - $start4++; - push(@seq2_delete_startOnly,$line); - } - } - elsif ($indelType =~ m/$outgroup/ && $indelType =~ m/insert/){ - if ($coord3 != $old_coord3+1 || ($coord1 != $old_coord1 || $coord2 != $old_coord2)){ - $start5++; - push(@seq3_insert_startOnly,$line); - } - } - 
elsif ($indelType =~ m/$outgroup/ && $indelType =~ m/delete/){ - if ($coord3 != $old_coord3 || ($coord1 != $old_coord1+1 ||$coord2 != $old_coord2+1)){ - $start6++; - push(@seq3_delete_startOnly,$line); - } - } - $old_indelType = $indelType; - $old_seq1 = $seq1; - $old_coord1 = $coord1; - $old_seq2 = $seq2; - $old_coord2 = $coord2; - $old_seq3 = $seq3; - $old_coord3 = $coord3; - $old_line = $line; - } -} -# append lengths to each event start line -my $counter1; my $counter2; my $counter3; my $counter4; my $counter5; my $counter6; -my @final1 = my @final2 = my @final3 = my @final4 = my @final5 = my @final6 = (); -my $final_line1 = my $final_line2 = my $final_line3 = my $final_line4 = my $final_line5 = my $final_line6 = ""; - - -for ($counter1 = 0; $counter1 < @seq3_insert_startOnly; $counter1++){ - $final_line1 = join("\t",($seq3_insert_startOnly[$counter1],$seq3_insert_lengths[$counter1])); - push (@final1,$final_line1); -} - -for ($counter2 = 0; $counter2 < @seq3_delete_startOnly; $counter2++){ - $final_line2 = join("\t",($seq3_delete_startOnly[$counter2],$seq3_delete_lengths[$counter2])); - push(@final2,$final_line2); -} - -for ($counter3 = 0; $counter3 < @seq2_insert_startOnly; $counter3++){ - $final_line3 = join("\t",($seq2_insert_startOnly[$counter3],$seq2_insert_lengths[$counter3])); - push(@final3,$final_line3); -} - -for ($counter4 = 0; $counter4 < @seq2_delete_startOnly; $counter4++){ - $final_line4 = join("\t",($seq2_delete_startOnly[$counter4],$seq2_delete_lengths[$counter4])); - push(@final4,$final_line4); -} - -for ($counter5 = 0; $counter5 < @seq1_insert_startOnly; $counter5++){ - $final_line5 = join("\t",($seq1_insert_startOnly[$counter5],$seq1_insert_lengths[$counter5])); - push(@final5,$final_line5); -} - -for ($counter6 = 0; $counter6 < @seq1_delete_startOnly; $counter6++){ - $final_line6 = join("\t",($seq1_delete_startOnly[$counter6],$seq1_delete_lengths[$counter6])); - push(@final6,$final_line6); -} - -# format final output -# # if inserts, increase coords for the sequence inserted, other sequences give coords for 5' and 3' bases flanking the gap -# # for deletes, increase coords for other 2 sequences and the one deleted give coords for 5' and 3' bases flanking the gap - -get_final_format(@final5); -get_final_format(@final6); -get_final_format(@final3); -get_final_format(@final4); -get_final_format(@final1); -get_final_format(@final2); - -sub get_final_format{ - my (@final) = @_; - foreach (@final){ - my $event_line = $_; - my @events = split(/\t/, $event_line); - my $event_type = $events[10]; - my @name_align1 = split(/:/, $events[1]); - my @name_align2 = split(/:/, $events[4]); - my @name_align3 = split(/:/, $events[7]); - my $seq1_event_start = my $seq1_event_end = my $seq2_event_start = my $seq2_event_end = my $seq3_event_start = my $seq3_event_end = 0; - my $final_event_line = ""; - # seq1_insert - if ($event_type =~ m/$ingroup1/ && $event_type =~ m/insert/){ - # only increase coord for human - # remember that other two sequnences, the gap spans (coord - 1) --> coord - $seq1_event_start = ($events[2]); - $seq1_event_end = ($events[2]+$events[11]-1); - $seq2_event_start = ($events[5]-1); - $seq2_event_end = ($events[5]); - $seq3_event_start = ($events[8]-1); - $seq3_event_end = ($events[8]); - $final_event_line = 
join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); - } - # seq1_delete - elsif ($event_type =~ m/$ingroup1/ && $event_type =~ m/delete/){ - # only increase coords for seq2 and seq3 - # remember for seq1, the gap spans (coord - 1) --> coord - $seq1_event_start = ($events[2]-1); - $seq1_event_end = ($events[2]); - $seq2_event_start = ($events[5]); - $seq2_event_end = ($events[5]+$events[11]-1); - $seq3_event_start = ($events[8]); - $seq3_event_end = ($events[8]+$events[11]-1); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); - } - # seq2_insert - elsif ($event_type =~ m/$ingroup2/ && $event_type =~ m/insert/){ - # only increase coords for seq2 - # remember that other two sequnences, the gap spans (coord - 1) --> coord - $seq1_event_start = ($events[2]-1); - $seq1_event_end = ($events[2]); - $seq2_event_start = ($events[5]); - $seq2_event_end = ($events[5]+$events[11]-1); - $seq3_event_start = ($events[8]-1); - $seq3_event_end = ($events[8]); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); - } - # seq2_delete - elsif ($event_type =~ m/$ingroup2/ && $event_type =~ m/delete/){ - # only increase coords for seq1 and seq3 - # remember for seq2, the gap spans (coord - 1) --> coord - $seq1_event_start = ($events[2]); - $seq1_event_end = ($events[2]+$events[11]-1); - $seq2_event_start = ($events[5]-1); - $seq2_event_end = ($events[5]); - $seq3_event_start = ($events[8]); - $seq3_event_end = ($events[8]+$events[11]-1); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); - } - # start testing w/seq3_insert - elsif ($event_type =~ m/$outgroup/ && $event_type =~ m/insert/){ - # only increase coord for rheMac - # remember that other two sequnences, the gap spans (coord - 1) --> coord - $seq1_event_start = ($events[2]-1); - $seq1_event_end = ($events[2]); - $seq2_event_start = ($events[5]-1); - $seq2_event_end = ($events[5]); - $seq3_event_start = ($events[8]); - $seq3_event_end = ($events[8]+$events[11]-1); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); - } - # seq3_delete - elsif ($event_type =~ m/$outgroup/ && $event_type =~ m/delete/){ - # only increase coords for seq1 and seq2 - # remember for seq3, the gap spans (coord - 1) --> coord - $seq1_event_start = ($events[2]); - $seq1_event_end = ($events[2]+$events[11]-1); - $seq2_event_start = ($events[5]); - $seq2_event_end = 
($events[5]+$events[11]-1); - $seq3_event_start = ($events[8]-1); - $seq3_event_end = ($events[8]); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); - - } - - print OFILE2 "$final_event_line\n"; - } -} -close OFILE2; diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/quality_filter.py --- a/tools/regVariation/quality_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,245 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda -""" -Filter based on nucleotide quality (PHRED score). - -usage: %prog input out_file primary_species mask_species score mask_char mask_region mask_region_length -""" - - -from __future__ import division -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -pkg_resources.require( "lrucache" ) -try: - pkg_resources.require("numpy") -except: - pass - -import psyco_full -import sys -import os, os.path -from UserDict import DictMixin -from bx.binned_array import BinnedArray, FileBinnedArray -from bx.bitset import * -from bx.bitset_builders import * -from fpconst import isNaN -from bx.cookbook import doc_optparse -from galaxy.tools.exception_handling import * -import bx.align.maf - -class FileBinnedArrayDir( DictMixin ): - """ - Adapter that makes a directory of FileBinnedArray files look like - a regular dict of BinnedArray objects. - """ - def __init__( self, dir ): - self.dir = dir - self.cache = dict() - def __getitem__( self, key ): - value = None - if key in self.cache: - value = self.cache[key] - else: - fname = os.path.join( self.dir, "%s.qa.bqv" % key ) - if os.path.exists( fname ): - value = FileBinnedArray( open( fname ) ) - self.cache[key] = value - if value is None: - raise KeyError( "File does not exist: " + fname ) - return value - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def load_scores_ba_dir( dir ): - """ - Return a dict-like object (keyed by chromosome) that returns - FileBinnedArray objects created from "key.ba" files in `dir` - """ - return FileBinnedArrayDir( dir ) - -def bitwise_and ( string1, string2, maskch ): - result=[] - for i,ch in enumerate(string1): - try: - ch = int(ch) - except: - pass - if string2[i] == '-': - ch = 1 - if ch and string2[i]: - result.append(string2[i]) - else: - result.append(maskch) - return ''.join(result) - -def main(): - # Parsing Command Line here - options, args = doc_optparse.parse( __doc__ ) - - try: - #chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols ) - inp_file, out_file, pri_species, mask_species, qual_cutoff, mask_chr, mask_region, mask_length, loc_file = args - qual_cutoff = int(qual_cutoff) - mask_chr = int(mask_chr) - mask_region = int(mask_region) - if mask_region != 3: - mask_length = int(mask_length) - else: - mask_length_r = int(mask_length.split(',')[0]) - mask_length_l = int(mask_length.split(',')[1]) - except: - stop_err( "Data issue, click the pencil icon in the history item to correct the metadata attributes of the input dataset." ) - - if pri_species == 'None': - stop_err( "No primary species selected, try again by selecting at least one primary species." ) - if mask_species == 'None': - stop_err( "No mask species selected, try again by selecting at least one species to mask." 
) - - mask_chr_count = 0 - mask_chr_dict = {0:'#', 1:'$', 2:'^', 3:'*', 4:'?', 5:'N'} - mask_reg_dict = {0:'Current pos', 1:'Current+Downstream', 2:'Current+Upstream', 3:'Current+Both sides'} - - #ensure dbkey is present in the twobit loc file - filepath = None - try: - pspecies_all = pri_species.split(',') - pspecies_all2 = pri_species.split(',') - pspecies = [] - filepaths = [] - for line in open(loc_file): - if pspecies_all2 == []: - break - if line[0:1] == "#": - continue - fields = line.split('\t') - try: - build = fields[0] - for i,dbkey in enumerate(pspecies_all2): - if dbkey == build: - pspecies.append(build) - filepaths.append(fields[1]) - del pspecies_all2[i] - else: - continue - except: - pass - except Exception, exc: - stop_err( 'Initialization errorL %s' % str( exc ) ) - - if len(pspecies) == 0: - stop_err( "Quality scores are not available for the following genome builds: %s" % ( pspecies_all2 ) ) - if len(pspecies) < len(pspecies_all): - print "Quality scores are not available for the following genome builds: %s" %(pspecies_all2) - - scores_by_chrom = [] - #Get scores for all the primary species - for file in filepaths: - scores_by_chrom.append(load_scores_ba_dir( file.strip() )) - - try: - maf_reader = bx.align.maf.Reader( open(inp_file, 'r') ) - maf_writer = bx.align.maf.Writer( open(out_file,'w') ) - except Exception, e: - stop_err( "Your MAF file appears to be malformed: %s" % str( e ) ) - - maf_count = 0 - for block in maf_reader: - status_strings = [] - for seq in range (len(block.components)): - src = block.components[seq].src - dbkey = src.split('.')[0] - chr = src.split('.')[1] - if not (dbkey in pspecies): - continue - else: #enter if the species is a primary species - index = pspecies.index(dbkey) - sequence = block.components[seq].text - s_start = block.components[seq].start - size = len(sequence) #this includes the gaps too - status_str = '1'*size - status_list = list(status_str) - if status_strings == []: - status_strings.append(status_str) - ind = 0 - s_end = block.components[seq].end - #Get scores for the entire sequence - try: - scores = scores_by_chrom[index][chr][s_start:s_end] - except: - continue - pos = 0 - while pos < (s_end-s_start): - if sequence[ind] == '-': #No score for GAPS - ind += 1 - continue - score = scores[pos] - if score < qual_cutoff: - score = 0 - - if not(score): - if mask_region == 0: #Mask Corresponding position only - status_list[ind] = '0' - ind += 1 - pos += 1 - elif mask_region == 1: #Mask Corresponding position + downstream neighbors - for n in range(mask_length+1): - try: - status_list[ind+n] = '0' - except: - pass - ind = ind + mask_length + 1 - pos = pos + mask_length + 1 - elif mask_region == 2: #Mask Corresponding position + upstream neighbors - for n in range(mask_length+1): - try: - status_list[ind-n] = '0' - except: - pass - ind += 1 - pos += 1 - elif mask_region == 3: #Mask Corresponding position + neighbors on both sides - for n in range(-mask_length_l,mask_length_r+1): - try: - status_list[ind+n] = '0' - except: - pass - ind = ind + mask_length_r + 1 - pos = pos + mask_length_r + 1 - else: - pos += 1 - ind += 1 - - status_strings.append(''.join(status_list)) - - if status_strings == []: #this block has no primary species - continue - output_status_str = status_strings[0] - for stat in status_strings[1:]: - try: - output_status_str = bitwise_and (status_strings[0], stat, '0') - except Exception, e: - break - - for seq in range (len(block.components)): - src = block.components[seq].src - dbkey = src.split('.')[0] - if 
dbkey not in mask_species.split(','): - continue - sequence = block.components[seq].text - sequence = bitwise_and (output_status_str, sequence, mask_chr_dict[mask_chr]) - block.components[seq].text = sequence - mask_chr_count += output_status_str.count('0') - maf_writer.write(block) - maf_count += 1 - - maf_reader.close() - maf_writer.close() - print "No. of blocks = %d; No. of masked nucleotides = %s; Mask character = %s; Mask region = %s; Cutoff used = %d" %(maf_count, mask_chr_count, mask_chr_dict[mask_chr], mask_reg_dict[mask_region], qual_cutoff) - - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/quality_filter.xml --- a/tools/regVariation/quality_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ -<tool id="qualityFilter" name="Filter nucleotides" version="1.0.1"> - <description> based on quality scores</description> - <command interpreter="python"> - quality_filter.py - $input - $out_file1 - $primary_species - $mask_species - $score - $mask_char - ${mask_region.region} - #if $mask_region.region == "3" - ${mask_region.lengthr},${mask_region.lengthl} - #elif $mask_region.region == "0" - 1 - #else - ${mask_region.length} - #end if - ${GALAXY_DATA_INDEX_DIR}/quality_scores.loc - </command> - <inputs> - <param format="maf" name="input" type="data" label="Select data"/> - <param name="primary_species" type="select" label="Use quality scores of" display="checkboxes" multiple="true"> - <options> - <filter type="data_meta" ref="input" key="species" /> - </options> - </param> - <param name="mask_species" type="select" label="Mask Species" display="checkboxes" multiple="true"> - <options> - <filter type="data_meta" ref="input" key="species" /> - </options> - </param> - <param name="score" size="10" type="integer" value="20" label="Quality score cut-off" help="Cut-off value of 20 means mask all nucleotides having quality score less than or equal to 20"/> - <param name="mask_char" size="5" type="select" label="Mask character"> - <option value="0" selected="true">#</option> - <option value="1">$</option> - <option value="2">^</option> - <option value="3">*</option> - <option value="4">?</option> - <option value="5">N</option> - </param> - <conditional name="mask_region"> - <param name="region" type="select" label="Mask region"> - <option value="0" selected="true">Only the corresponding nucleotide </option> - <option value="1">Corresponding column + right-side neighbors</option> - <option value="2">Corresponding column + left-side neighbors</option> - <option value="3">Corresponding column + neighbors on both sides</option> - </param> - <when value="0"> - </when> - <when value="1"> - <param name="length" size="10" type="integer" value="2" label="Number of right-side neighbors"/> - </when> - <when value="2"> - <param name="length" size="10" type="integer" value="2" label="Number of left-side neighbors"/> - </when> - <when value="3"> - <param name="lengthr" size="10" type="integer" value="2" label="Number of neighbors on right-side" /> - <param name="lengthl" size="10" type="integer" value="2" label="Number of neighbors on left-side" /> - </when> - </conditional> - </inputs> - <outputs> - <data format="maf" name="out_file1" metadata_source="input"/> - </outputs> - <requirements> - <requirement type="python-module">numpy</requirement> - </requirements> - <tests> - <test> - <param name="input" value="6.maf"/> - <param name="primary_species" value="panTro2"/> - <param name="mask_species" value="hg18"/> - 
<param name="score" value="50"/> - <param name="mask_char" value="0"/> - <param name="region" value="0" /> - <output name="out_file1" file="6_quality_filter.maf"/> - </test> - </tests> - <help> - -.. class:: infomark - -**What it does** - -This tool takes a MAF file as input and filters nucleotides in every alignment block of the MAF file based on their quality/PHRED scores. - ------ - -.. class:: warningmark - -**Note** - -Any block/s not containing the primary species (species whose quality scores is to be used), will be omitted. -Also, any primary species whose quality scores are not available in Galaxy will be considered as a non-primary species. This info will appear as a message in the job history panel. - ------ - -**Example** - -- For the following alignment block:: - - a score=4050.0 - s hg18.chrX 3719221 48 - 154913754 tattttacatttaaaataaatatgtaaatatatattttatatttaaaa - s panTro2.chrX 3560945 48 - 155361357 tattttatatttaaaataaagatgtaaatatatattttatatttaaaa - -- running this tool with **Primary species as panTro2**, **Mask species as hg18, panTro2**, **Quality cutoff as 20**, **Mask character as #** and **Mask region as only the corresponding position** will return:: - - a score=4050.0 - s hg18.chrX 3719221 48 - 154913754 ###tttac#####a###a#atatgtaaat###tattt#####ttaaaa - s panTro2.chrX 3560945 48 - 155361357 ###tttat#####a###a#agatgtaaat###tattt#####ttaaaa - - where, the positions containing # represent panTro2 nucleotides having quality scores less than 20. - </help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/qv_to_bqv.py --- a/tools/regVariation/qv_to_bqv.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ -#!/usr/bin/env python - -""" -Adapted from bx/scripts/qv_to_bqv.py - -Convert a qual (qv) file to several BinnedArray files for fast seek. -This script takes approximately 4 seconds per 1 million base pairs. - -The input format is fasta style quality -- fasta headers followed by -whitespace separated integers. - -usage: %prog qual_file output_file -""" - -import pkg_resources -pkg_resources.require( "bx-python" ) -pkg_resources.require( "numpy" ) -import string -import psyco_full -import sys, re, os, tempfile -from bx.binned_array import BinnedArrayWriter -from bx.cookbook import * -import fileinput - -def load_scores_ba_dir( dir ): - """ - Return a dict-like object (keyed by chromosome) that returns - FileBinnedArray objects created from "key.ba" files in `dir` - """ - return FileBinnedArrayDir( dir ) - -def main(): - args = sys.argv[1:] - try: - qual_file_dir = args[0] - #mydir="/home/gua110/Desktop/chimp_quality_scores/chr22.qa" - mydir="/home/gua110/Desktop/rhesus_quality_scores/rheMac2.qual.qv" - qual_file_dir = mydir.replace(mydir.split("/")[-1], "") - output_file = args[ 1 ] - fo = open(output_file,"w") - except: - print "usage: qual_file output_file" - sys.exit() - - tmpfile = tempfile.NamedTemporaryFile() - cmdline = "ls " + qual_file_dir + "*.qa | cat >> " + tmpfile.name - os.system (cmdline) - for qual_file in tmpfile.readlines(): - qual = fileinput.FileInput( qual_file.strip() ) - outfile = None - outbin = None - base_count = 0 - mega_count = 0 - - for line in qual: - line = line.rstrip("\r\n") - if line.startswith(">"): - # close old - if outbin and outfile: - print "\nFinished region " + region + " at " + str(base_count) + " base pairs." - outbin.finish() - outfile.close() - # start new file - region = line.lstrip(">") - #outfname = output_file + "." 
+ region + ".bqv" #CHANGED - outfname = qual_file.strip() + ".bqv" - print >>fo, "Writing region " + region + " to file " + outfname - outfile = open( outfname , "wb") - outbin = BinnedArrayWriter(outfile, typecode='b', default=0) - base_count = 0 - mega_count = 0 - else: - if outfile and outbin: - nums = line.split() - for val in nums: - outval = int(val) - assert outval <= 255 and outval >= 0 - outbin.write(outval) - base_count += 1 - if (mega_count * 1000000) <= base_count: - sys.stdout.write(str(mega_count)+" ") - sys.stdout.flush() - mega_count = base_count // 1000000 + 1 - if outbin and outfile: - print "\nFinished region " + region + " at " + str(base_count) + " base pairs." - outbin.finish() - outfile.close() - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/qv_to_bqv.xml --- a/tools/regVariation/qv_to_bqv.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,17 +0,0 @@ -<tool id="qv2bqv" name="qv2bqv"> - <description></description> - <command interpreter="python">qv_to_bqv.py "$input1" $output</command> - <inputs> - <param name="input1" type="data" format="interval" help="Directory" /> - </inputs> - <outputs> - <data format="text" name="output" metadata_source="input1" /> - </outputs> - <tests> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2.bed" /> - <output name="output" file="gops-coverage.dat" /> - </test> - </tests> -</tool> \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/rcve.py --- a/tools/regVariation/rcve.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,143 +0,0 @@ -#!/usr/bin/env python - -from galaxy import eggs - -import sys, string -from rpy import * -import numpy - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def sscombs(s): - if len(s) == 1: - return [s] - else: - ssc = sscombs(s[1:]) - return [s[0]] + [s[0]+comb for comb in ssc] + ssc - - -infile = sys.argv[1] -y_col = int(sys.argv[2])-1 -x_cols = sys.argv[3].split(',') -outfile = sys.argv[4] - -print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) -fout = open(outfile,'w') - -for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - -if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." ) - -y_vals = [] -x_vals = [] - -for k,col in enumerate(x_cols): - x_cols[k] = int(col)-1 - x_vals.append([]) - """ - try: - float( elems[x_cols[k]] ) - except: - try: - msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] ) - except: - msg = "This operation cannot be performed on non-numeric data." 
- stop_err( msg )
- """
-NA = 'NA'
-for ind,line in enumerate( file( infile )):
- if line and not line.startswith( '#' ):
- try:
- fields = line.split("\t")
- try:
- yval = float(fields[y_col])
- except Exception, ey:
- yval = r('NA')
- #print >>sys.stderr, "ey = %s" %ey
- y_vals.append(yval)
- for k,col in enumerate(x_cols):
- try:
- xval = float(fields[col])
- except Exception, ex:
- xval = r('NA')
- #print >>sys.stderr, "ex = %s" %ex
- x_vals[k].append(xval)
- except:
- pass
-
-x_vals1 = numpy.asarray(x_vals).transpose()
-dat= r.list(x=array(x_vals1), y=y_vals)
-
-set_default_mode(NO_CONVERSION)
-try:
- full = r.lm(r("y ~ x"), data= r.na_exclude(dat)) #full model includes all the predictor variables specified by the user
-except RException, rex:
- stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contains no numeric values.")
-set_default_mode(BASIC_CONVERSION)
-
-summary = r.summary(full)
-fullr2 = summary.get('r.squared','NA')
-
-if fullr2 == 'NA':
- stop_err("Error in linear regression")
-
-if len(x_vals) < 10:
- s = ""
- for ch in range(len(x_vals)):
- s += str(ch)
-else:
- stop_err("This tool only works with fewer than 10 predictors.")
-
-print >>fout, "#Model\tR-sq\tRCVE_Terms\tRCVE_Value"
-all_combos = sorted(sscombs(s), key=len)
-all_combos.reverse()
-for j,cols in enumerate(all_combos):
- #if len(cols) == len(s): #Same as the full model above
- # continue
- if len(cols) == 1:
- x_vals1 = x_vals[int(cols)]
- else:
- x_v = []
- for col in cols:
- x_v.append(x_vals[int(col)])
- x_vals1 = numpy.asarray(x_v).transpose()
- dat= r.list(x=array(x_vals1), y=y_vals)
- set_default_mode(NO_CONVERSION)
- red = r.lm(r("y ~ x"), data= dat) #Reduced model
- set_default_mode(BASIC_CONVERSION)
- summary = r.summary(red)
- redr2 = summary.get('r.squared','NA')
- try:
- rcve = (float(fullr2)-float(redr2))/float(fullr2)
- except:
- rcve = 'NA'
- col_str = ""
- for col in cols:
- col_str = col_str + str(int(x_cols[int(col)]) + 1) + " "
- col_str = col_str.strip()
- rcve_col_str = ""
- for col in s:
- if col not in cols:
- rcve_col_str = rcve_col_str + str(int(x_cols[int(col)]) + 1) + " "
- rcve_col_str = rcve_col_str.strip()
- if len(cols) == len(s): #full model
- rcve_col_str = "-"
- rcve = "-"
- try:
- redr2 = "%.4f" %(float(redr2))
- except:
- pass
- try:
- rcve = "%.4f" %(float(rcve))
- except:
- pass
- print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,rcve_col_str,rcve) diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/rcve.xml
--- a/tools/regVariation/rcve.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,70 +0,0 @@
-<tool id="rcve1" name="Compute RCVE" version="1.0.0">
- <description> </description>
- <command interpreter="python">
- rcve.py
- $input1
- $response_col
- $predictor_cols
- $out_file1
- 1>/dev/null
- </command>
- <inputs>
- <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? 
See TIP below."/>
- <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
- <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true">
- <validator type="no_options" message="Please select at least one column."/>
- </param>
- </inputs>
- <outputs>
- <data format="input" name="out_file1" metadata_source="input1" />
- </outputs>
- <requirements>
- <requirement type="python-module">rpy</requirement>
- </requirements>
- <tests>
- <!-- Test data with valid values -->
- <test>
- <param name="input1" value="reg_inp.tab"/>
- <param name="response_col" value="1"/>
- <param name="predictor_cols" value="2,3,4"/>
- <output name="out_file1" file="rcve_out.dat"/>
- </test>
-
- </tests>
- <help>
-
-.. class:: infomark
-
-**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters*
-
------
-
-.. class:: infomark
-
-**What it does**
-
-This tool computes the RCVE (Relative Contribution to Variance) for all possible variable subsets using the following formula:
-
-**RCVE(i) = [R-sq (full: 1,2,..,i..,p-1) - R-sq(without i: 1,2,...,p-1)] / R-sq (full: 1,2,..,i..,p-1)**,
-which denotes the case where the 'i'th predictor is dropped.
-
-
-In general,
-**RCVE(X+) = [R-sq (full: {X,X+}) - R-sq(reduced: {X})] / R-sq (full: {X,X+})**,
-where,
-
-- {X,X+} denotes the set of all predictors,
-- X+ is the set of predictors for which we compute RCVE (and therefore drop from the full model to obtain a reduced one),
-- {X} is the set of the predictors that are left in the reduced model after excluding {X+}
-
-
-The 4 columns in the output are described below:
-
-- Column 1 (Model): denotes the variables present in the model ({X})
-- Column 2 (R-sq): denotes the R-squared value corresponding to the model in Column 1
-- Column 3 (RCVE_Terms): denotes the variable/s for which RCVE is computed ({X+}). These are the variables that are absent in the reduced model in Column 1. A '-' in this column indicates that the model in Column 1 is the Full model.
-- Column 4 (RCVE): denotes the RCVE value corresponding to the variable/s in Column 3. A '-' in this column indicates that the model in Column 1 is the Full model.
-
-
- </help>
-</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/substitution_rates.py
--- a/tools/regVariation/substitution_rates.py Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,118 +0,0 @@
-#!/usr/bin/env python
-#Guruprasad Ananda
-"""
-Estimates substitution rates from pairwise alignments using the JC69 model. 
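-
-For each alignment block (or each interval, when an interval file is
-supplied) the tool reports L, the number of nucleotides compared, N, the
-number of mismatches, and the observed proportion of differing sites
-p = N/L.  The script stops at p; for reference, this is a minimal sketch
-of the JC69 correction that such a p is conventionally plugged into
-(jc69_distance is a hypothetical helper, not a function defined in this
-file):
-
-    from math import log
-
-    def jc69_distance(n_mismatches, n_sites):
-        # Observed proportion of differing sites.
-        p = n_mismatches / float(n_sites)
-        # JC69-corrected substitution rate; the estimate is
-        # undefined once p reaches 0.75.
-        if p >= 0.75:
-            raise ValueError("p >= 0.75: JC69 distance is undefined")
-        return -0.75 * log(1.0 - (4.0 / 3.0) * p)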
-""" - -from galaxy import eggs -from galaxy.tools.util.galaxyops import * -from galaxy.tools.util import maf_utilities -import bx.align.maf -import sys, fileinput - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -if len(sys.argv) < 3: - stop_err("Incorrect number of arguments.") - -inp_file = sys.argv[1] -out_file = sys.argv[2] -fout = open(out_file, 'w') -int_file = sys.argv[3] -if int_file != "None": #The user has specified an interval file - dbkey_i = sys.argv[4] - chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] ) - - -def rateEstimator(block): - global alignlen, mismatches - - src1 = block.components[0].src - sequence1 = block.components[0].text - start1 = block.components[0].start - end1 = block.components[0].end - len1 = int(end1)-int(start1) - len1_withgap = len(sequence1) - mismatch = 0.0 - - for seq in range (1,len(block.components)): - src2 = block.components[seq].src - sequence2 = block.components[seq].text - start2 = block.components[seq].start - end2 = block.components[seq].end - len2 = int(end2)-int(start2) - for nt in range(len1_withgap): - if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': #Not a gap or masked character - if sequence1[nt].upper() != sequence2[nt].upper(): - mismatch += 1 - - if int_file == "None": - p = mismatch/min(len1,len2) - print >>fout, "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%.4f" %(src1,start1,end1,src2,start2,end2,min(len1,len2),mismatch,p) - else: - mismatches += mismatch - alignlen += min(len1,len2) - -def main(): - skipped = 0 - not_pairwise = 0 - - if int_file == "None": - try: - maf_reader = bx.align.maf.Reader( open(inp_file, 'r') ) - except: - stop_err("Your MAF file appears to be malformed.") - print >>fout, "#Seq1\tStart1\tEnd1\tSeq2\tStart2\tEnd2\tL\tN\tp" - for block in maf_reader: - if len(block.components) != 2: - not_pairwise += 1 - continue - try: - rateEstimator(block) - except: - skipped += 1 - else: - index, index_filename = maf_utilities.build_maf_index( inp_file, species = [dbkey_i] ) - if index is None: - print >> sys.stderr, "Your MAF file appears to be malformed." 
- sys.exit() - win = NiceReaderWrapper( fileinput.FileInput( int_file ), - chrom_col=chr_col_i, - start_col=start_col_i, - end_col=end_col_i, - strand_col=strand_col_i, - fix_strand=True) - species=None - mincols = 0 - global alignlen, mismatches - - for interval in win: - alignlen = 0 - mismatches = 0.0 - src = "%s.%s" % ( dbkey_i, interval.chrom ) - for block in maf_utilities.get_chopped_blocks_for_region( index, src, interval, species, mincols ): - if len(block.components) != 2: - not_pairwise += 1 - continue - try: - rateEstimator(block) - except: - skipped += 1 - if alignlen: - p = mismatches/alignlen - else: - p = 'NA' - interval.fields.append(str(alignlen)) - interval.fields.append(str(mismatches)) - interval.fields.append(str(p)) - print >>fout, "\t".join(interval.fields) - #num_blocks += 1 - - if not_pairwise: - print "Skipped %d non-pairwise blocks" %(not_pairwise) - if skipped: - print "Skipped %d blocks as invalid" %(skipped) -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/substitution_rates.xml --- a/tools/regVariation/substitution_rates.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -<tool id="subRate1" name="Estimate substitution rates " version="1.0.0"> - <description> for non-coding regions</description> - <command interpreter="python"> - substitution_rates.py - $input - $out_file1 - #if $region.type == "win": - ${region.input2} ${region.input2.dbkey} ${region.input2.metadata.chromCol},$region.input2.metadata.startCol,$region.input2.metadata.endCol,$region.input2.metadata.strandCol - #else: - "None" - #end if - </command> - <inputs> - <param format="maf" name="input" type="data" label="Select pair-wise alignment data"/> - <conditional name="region"> - <param name="type" type="select" label="Estimate rates corresponding to" multiple="false"> - <option value="align">Alignment block</option> - <option value="win">Intervals in your history</option> - </param> - <when value="win"> - <param format="interval" name="input2" type="data" label="Choose intervals"> - <validator type="unspecified_build" /> - </param> - </when> - <when value="align" /> - </conditional> - </inputs> - <outputs> - <data format="tabular" name="out_file1" metadata_source="input"/> - </outputs> - - <tests> - <test> - <param name="input" value="Interval2Maf_pairwise_out.maf"/> - <param name="type" value="align"/> - <output name="out_file1" file="subRates1.out"/> - </test> - </tests> - - <help> - -.. class:: infomark - -**What it does** - -This tool takes a pairwise MAF file as input and estimates substitution rate according to Jukes-Cantor JC69 model. The 3 new columns appended to the output are explained below: - -- L: number of nucleotides compared -- N: number of different nucleotides -- p = N/L - ------ - -.. class:: warningmark - -**Note** - -Any block/s not containing exactly two sequences, will be omitted. - - </help> -</tool> \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/substitutions.py --- a/tools/regVariation/substitutions.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ -#!/usr/bin/env python -#Guruprasad ANanda -""" -Fetches substitutions from pairwise alignments. 
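-
-Reported coordinates are in ungapped sequence space: the script converts
-an alignment-column index into a sequence coordinate by subtracting the
-gap characters that precede it, as in start + col -
-text[0:col].count('-').  A minimal sketch of that mapping, where
-column_to_seq_coord is a hypothetical name rather than a function
-defined in this file:
-
-    def column_to_seq_coord(start, text, col):
-        # start: component start from the MAF 's' line; text: its
-        # aligned sequence.  Gaps before column col do not advance
-        # the sequence coordinate.
-        return start + col - text[:col].count('-')
-
-For example, with start=100 and text='AC--GT', column 4 (the 'G') maps
-to 100 + 4 - 2 = 102.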
-"""
-
-from galaxy import eggs
-
-from galaxy.tools.util import maf_utilities
-
-import bx.align.maf
-import sys
-import os, fileinput
-
-def stop_err(msg):
-    sys.stderr.write(msg)
-    sys.exit()
-
-if len(sys.argv) < 3:
-    stop_err("Incorrect number of arguments.")
-
-inp_file = sys.argv[1]
-out_file = sys.argv[2]
-fout = open(out_file, 'w')
-
-def fetchSubs(block):
-    src1 = block.components[0].src
-    sequence1 = block.components[0].text
-    start1 = block.components[0].start
-    end1 = block.components[0].end
-    len1 = int(end1) - int(start1)
-    len1_withgap = len(sequence1)
-
-    for seq in range(1, len(block.components)):
-        src2 = block.components[seq].src
-        sequence2 = block.components[seq].text
-        start2 = block.components[seq].start
-        end2 = block.components[seq].end
-        len2 = int(end2) - int(start2)
-        sub_begin = None
-        sub_end = None
-        begin = False
-
-        for nt in range(len1_withgap):
-            if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': #Not a gap or masked character
-                if sequence1[nt].upper() != sequence2[nt].upper():
-                    if not begin:
-                        sub_begin = nt
-                        begin = True
-                    sub_end = nt
-                else:
-                    if begin:
-                        print >>fout, "%s\t%s\t%s" % (src1, start1+sub_begin-sequence1[0:sub_begin].count('-'), start1+sub_end-sequence1[0:sub_end].count('-'))
-                        print >>fout, "%s\t%s\t%s" % (src2, start2+sub_begin-sequence2[0:sub_begin].count('-'), start2+sub_end-sequence2[0:sub_end].count('-'))
-                        begin = False
-            else:
-                if begin:
-                    print >>fout, "%s\t%s\t%s" % (src1, start1+sub_begin-sequence1[0:sub_begin].count('-'), start1+sub_end-sequence1[0:sub_end].count('-'))
-                    print >>fout, "%s\t%s\t%s" % (src2, start2+sub_begin-sequence2[0:sub_begin].count('-'), start2+sub_end-sequence2[0:sub_end].count('-'))
-                    begin = False
-
-def main():
-    skipped = 0
-    not_pairwise = 0
-    try:
-        maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
-    except:
-        stop_err("Your MAF file appears to be malformed.")
-    print >>fout, "#Chr\tStart\tEnd"
-    for block in maf_reader:
-        if len(block.components) != 2:
-            not_pairwise += 1
-            continue
-        try:
-            fetchSubs(block)
-        except:
-            skipped += 1
-
-    if not_pairwise:
-        print "Skipped %d non-pairwise blocks" % (not_pairwise)
-    if skipped:
-        print "Skipped %d blocks" % (skipped)
-
-if __name__ == "__main__":
-    main()
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/substitutions.xml
--- a/tools/regVariation/substitutions.xml	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,38 +0,0 @@
-<tool id="substitutions1" name="Fetch substitutions " version="1.0.0">
-  <description> from pairwise alignments</description>
-  <command interpreter="python">
-    substitutions.py
-    $input
-    $out_file1
-  </command>
-  <inputs>
-    <param format="maf" name="input" type="data" label="Select pair-wise alignment data"/>
-  </inputs>
-  <outputs>
-    <data format="tabular" name="out_file1" metadata_source="input"/>
-  </outputs>
-
-  <tests>
-    <test>
-      <param name="input" value="Interval2Maf_pairwise_out.maf"/>
-      <output name="out_file1" file="subs.out"/>
-    </test>
-  </tests>
-  <help>
-
-.. class:: infomark
-
-**What it does**
-
-This tool takes a pairwise MAF file as input and fetches the substitutions from each alignment block.
-
------
-
-.. class:: warningmark
-
-**Note**
-
-Any block not containing exactly two sequences will be omitted.
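
-----

For illustration only, a simplified standalone sketch of the gap-aware coordinate mapping the script performs (the function name and the example sequences/coordinates are hypothetical, not part of the tool)::

  # Report runs of substituted columns in one aligned row, converting
  # alignment columns to sequence coordinates by discounting gaps.
  def sub_runs(src, start, seq1, seq2, masked='-#$^*?'):
      begin = end = None
      for col, (a, b) in enumerate(zip(seq1, seq2)):
          if a not in masked and b not in masked and a.upper() != b.upper():
              if begin is None:
                  begin = col
              end = col
          elif begin is not None:
              yield (src, start + begin - seq1[:begin].count('-'),
                          start + end - seq1[:end].count('-'))
              begin = None
      if begin is not None:
          yield (src, start + begin - seq1[:begin].count('-'),
                      start + end - seq1[:end].count('-'))

  # list(sub_runs('hg18.chr1', 100, 'ACGT-A', 'ACTTTA'))
  # -> [('hg18.chr1', 102, 102)]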
-
- </help>
-</tool>
\ No newline at end of file
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/t_test_two_samples.pl
--- a/tools/regVariation/t_test_two_samples.pl	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,109 +0,0 @@
-# A program to implement the non-pooled t-test for two samples where the alternative hypothesis is two-sided or one-sided.
-# The first input file is a TABULAR format file representing the first sample and consisting of one column only.
-# The second input file is a TABULAR format file representing the second sample and consisting of one column only.
-# The third input is the sidedness of the t-test: either two-sided, or one-sided with m1 less than m2,
-# or one-sided with m1 greater than m2.
-# The fourth input is the equality status of the standard deviations of both populations.
-# The output file is a TXT file representing the result of the two-sample t-test.
-
-use strict;
-use warnings;
-
-#variables to handle the motif information (not used by this tool; retained from the template it was derived from)
-my $motif;
-my $motifName = "";
-my $motifNumber = 0;
-my $totalMotifsNumber = 0;
-my @motifNamesArray = ();
-
-# check that the correct number of arguments was supplied
-my $usage = "usage: non_pooled_t_test_two_samples_galaxy.pl [TABULAR.in] [TABULAR.in] [testSidedness] [standardDeviationEquality] [TXT.out] \n";
-die $usage unless @ARGV == 5;
-
-#get the input arguments
-my $firstSampleInputFile = $ARGV[0];
-my $secondSampleInputFile = $ARGV[1];
-my $testSidedness = $ARGV[2];
-my $standardDeviationEquality = $ARGV[3];
-my $outputFile = $ARGV[4];
-
-#open the input files
-open (INPUT1, "<", $firstSampleInputFile) || die("Could not open file $firstSampleInputFile \n");
-open (INPUT2, "<", $secondSampleInputFile) || die("Could not open file $secondSampleInputFile \n");
-open (OUTPUT, ">", $outputFile) || die("Could not open file $outputFile \n");
-
-#variable to store the name of the R script file
-my $r_script;
-
-# R script to implement the two-sample t-test
-#construct an R script file and save it in the same directory where the perl file is located
-$r_script = "non_pooled_t_test_two_samples.r";
-open(Rcmd,">", $r_script) or die "Cannot open $r_script \n\n";
-print Rcmd "
-    sampleTable1 <- read.table(\"$firstSampleInputFile\", header=FALSE);
-    sample1 <- sampleTable1[, 1];
-
-    sampleTable2 <- read.table(\"$secondSampleInputFile\", header=FALSE);
-    sample2 <- sampleTable2[, 1];
-
-    testSideStatus <- \"$testSidedness\";
-    STEqualityStatus <- \"$standardDeviationEquality\";
-
-    #open the output text file
-    sink(file = \"$outputFile\");
-
-    #check if the t-test is two-sided
-    if (testSideStatus == \"two-sided\"){
-        #check if the standard deviations are equal in both populations
-        if (STEqualityStatus == \"equal\"){
-            #two-sample t-test where standard deviations are assumed to be equal, the test is two-sided
-            testResult <- t.test(sample1, sample2, var.equal = TRUE);
-        } else{
-            #two-sample t-test where standard deviations are assumed to be unequal, the test is two-sided
-            testResult <- t.test(sample1, sample2, var.equal = FALSE);
-        }
-    } else{ #the t-test is one-sided
-        #check if the t-test is one-sided with m1 < m2
-        if (testSideStatus == \"one-sided:_m1_less_than_m2\"){
-            #check if the standard deviations are equal in both populations
-            if (STEqualityStatus == \"equal\"){
-                #two-sample t-test where standard deviations are assumed to be equal, the test is one-sided: Ha: m1 < m2
-                testResult <- t.test(sample1, sample2, var.equal = TRUE, alternative = \"less\");
-            } else{
-                #two-sample t-test where standard deviations are assumed to be unequal, the test is one-sided: Ha: m1 < m2
-                testResult <- t.test(sample1, sample2, var.equal = FALSE, alternative = \"less\");
-            }
-        } else{ #the t-test is one-sided with m1 > m2
-            #check if the standard deviations are equal in both populations
-            if (STEqualityStatus == \"equal\"){
-                #two-sample t-test where standard deviations are assumed to be equal, the test is one-sided: Ha: m1 > m2
-                testResult <- t.test(sample1, sample2, var.equal = TRUE, alternative = \"greater\");
-            } else{
-                #two-sample t-test where standard deviations are assumed to be unequal, the test is one-sided: Ha: m1 > m2
-                testResult <- t.test(sample1, sample2, var.equal = FALSE, alternative = \"greater\");
-            }
-        }
-    }
-
-    #save the output of the t-test into the output text file
-    testResult;
-
-    #close the output text file
-    sink();
-
-    #eof" . "\n";
-
-close Rcmd;
-
-system("R --no-restore --no-save --no-readline < $r_script > $r_script.out");
-
-#close the input and output files
-close(OUTPUT);
-close(INPUT2);
-close(INPUT1);
-
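For orientation, a minimal sketch of the same test in Python (illustrative, not part of the tool; assumes SciPy 1.6+ for the `alternative` keyword), using the two one-column samples from the example in the tool's help below:

  from scipy import stats

  sample1 = [5, 4, 8, 6, 7, 2, 1, 1, 0, 6, 4, 5, 7, 5, 3, 2, 5, 8, 7, 6, 4]
  sample2 = [2, 3, 5, 1, 2, 7, 5, 4, 3, 2, 7, 6, 0, 8, 4, 6, 9, 2, 4, 5, 6]

  # equal_var=True  -> pooled t-test   (the tool's "Equal" option)
  # equal_var=False -> Welch's t-test  (the tool's "Unequal" option)
  # alternative is 'two-sided', 'less' or 'greater', mirroring the sidedness select
  t, p = stats.ttest_ind(sample1, sample2, equal_var=True, alternative='two-sided')
  print('t = %.4f, p-value = %.4f' % (t, p))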
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/t_test_two_samples.xml
--- a/tools/regVariation/t_test_two_samples.xml	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,160 +0,0 @@
-<tool id="t_test_two_samples" name="T Test for Two Samples" version="1.0.0">
-  <description></description>
-
-  <command interpreter="perl">
-    t_test_two_samples.pl $inputFile1 $inputFile2 $inputTestSidedness3 $inputStandardDeviationEquality4 $outputFile1
-  </command>
-
-  <inputs>
-    <param format="tabular" name="inputFile1" type="data" label="Select the first sample tabular file"/>
-    <param format="tabular" name="inputFile2" type="data" label="Select the second sample tabular file"/>
-
-    <param name="inputTestSidedness3" type="select" label="Choose the test sidedness:">
-      <option value="two-sided">Two-sided</option>
-      <option value="one-sided:_m1_less_than_m2">One-sided: m1 less than m2</option>
-      <option value="one-sided:_m1_greater_than_m2">One-sided: m1 greater than m2</option>
-    </param>
-
-    <param name="inputStandardDeviationEquality4" type="select" label="Choose the standard deviation equality status of the two populations:">
-      <option value="equal">Equal</option>
-      <option value="unequal">Unequal</option>
-    </param>
-  </inputs>
-
-  <outputs>
-    <data format="text" name="outputFile1"/>
-  </outputs>
-
-  <tests>
-    <test>
-      <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
-      <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
-      <param name="inputTestSidedness3" value="Two-sided" />
-      <param name="inputStandardDeviationEquality4" value="Equal" />
-      <output name="outputFile1" file="t_test_result1.text" />
-    </test>
-
-    <test>
-      <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
-      <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
-      <param name="inputTestSidedness3" value="Two-sided" />
-      <param name="inputStandardDeviationEquality4" value="Unequal" />
-      <output name="outputFile1" file="t_test_result2.text" />
-    </test>
-
-    <test>
-      <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
-      <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
-      <param name="inputTestSidedness3" value="One-sided: m1 less than m2" />
-      <param name="inputStandardDeviationEquality4" value="Equal" />
-      <output name="outputFile1" file="t_test_result3.text" />
-    </test>
-
-    <test>
-      <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
-      <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
-      <param name="inputTestSidedness3" value="One-sided: m1 less than m2" />
-      <param name="inputStandardDeviationEquality4" value="Unequal" />
-      <output name="outputFile1" file="t_test_result4.text" />
-    </test>
-
-    <test>
-      <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
-      <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
-      <param name="inputTestSidedness3" value="One-sided: m1 greater than m2" />
-      <param name="inputStandardDeviationEquality4" value="Equal" />
-      <output name="outputFile1" file="t_test_result5.text" />
-    </test>
-
-    <test>
-      <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
-      <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
-      <param name="inputTestSidedness3" value="One-sided: m1 greater than m2" />
-      <param name="inputStandardDeviationEquality4" value="Unequal" />
-      <output name="outputFile1" file="t_test_result6.text" />
-    </test>
-  </tests>
-
-  <help>
-
-.. class:: infomark
-
-**What it does**
-
-This program implements the non-pooled t-test for two samples where the alternative hypothesis is two-sided or one-sided. The program takes four inputs and produces one output:
-
-- The first input file is a TABULAR format file representing the first sample and consisting of one column only.
-- The second input file is a TABULAR format file representing the second sample and consisting of one column only.
-- The third input is the sidedness of the t-test: either two-sided, or one-sided with m1 less than m2, or one-sided with m1 greater than m2.
-- The fourth input is the equality status of the standard deviations of both populations.
-- The output file is a TXT file representing the result of the two-sample t-test.
-
-
-**Example**
-
-Let us have the first input file representing the first sample as follows::
-
-  5
-  4
-  8
-  6
-  7
-  2
-  1
-  1
-  0
-  6
-  4
-  5
-  7
-  5
-  3
-  2
-  5
-  8
-  7
-  6
-  4
-
-And the second input file representing the second sample as follows::
-
-  2
-  3
-  5
-  1
-  2
-  7
-  5
-  4
-  3
-  2
-  7
-  6
-  0
-  8
-  4
-  6
-  9
-  2
-  4
-  5
-  6
-
-Running the program and choosing "Two-sided" and "Equal" as parameters will give the following output::
-
-  Two Sample t-test
-
-  data: sample1 and sample2
-  t = -0.3247, df = 40, p-value = 0.7471
-  alternative hypothesis: true difference in means is not equal to 0
-  95 percent confidence interval:
-  -1.720030 1.243839
-  sample estimates:
-  mean of x mean of y
-  4.333333 4.571429
-
-
-  </help>
-
-</tool>
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/windowSplitter.py
--- a/tools/regVariation/windowSplitter.py	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,85 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Split into windows.
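The Galaxy wrapper invokes this script with two positional arguments beyond
those shown in the usage line below: windowSplitter.py input size out_file
makesliding offset -l chrom,start,end,strand (makesliding is 0 or 1; an
offset of 0 disables sliding).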
- -usage: %prog input size out_file - -l, --cols=N,N,N,N: Columns for chrom, start, end, strand in file -""" - -import sys, re, os - -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - # Parsing Command Line here - options, args = doc_optparse.parse( __doc__ ) - - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols ) - inp_file, winsize, out_file, makesliding, offset = args - winsize = int(winsize) - offset = int(offset) - makesliding = int(makesliding) - if strand_col_1 <= 0: - strand = "+" #if strand is not defined, default it to + - except: - stop_err( "Data issue, click the pencil icon in the history item to correct the metadata attributes of the input dataset." ) - - fo = open(out_file,'w') - - skipped_lines = 0 - first_invalid_line = 0 - invalid_line = None - if offset == 0: - makesliding = 0 - - for i, line in enumerate( file( inp_file ) ): - line = line.strip() - if line and line[0:1] != "#": - try: - elems = line.split('\t') - if strand_col_1 != -1: - strand = elems[strand_col_1] - start = int(elems[start_col_1]) - end = int(elems[end_col_1]) - if makesliding == 0: - numwin = (end - start)/winsize - else: - numwin = (end - start)/offset - if numwin > 0: - for win in range(numwin): - elems_1 = elems - elems_1[start_col_1] = str(start) - elems_1[end_col_1] = str(start + winsize) - fo.write( "%s\n" % '\t'.join( elems_1 ) ) - if makesliding == 0: - start = start + winsize - else: - start = start + offset - if start+winsize > end: - break - except: - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - - fo.close() - - if makesliding == 1: - print 'Window size=%d, Sliding=Yes, Offset=%d' %(winsize, offset) - else: - print 'Window size=%d, Sliding=No' %(winsize) - if skipped_lines > 0: - print 'Skipped %d invalid lines starting with #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/windowSplitter.xml --- a/tools/regVariation/windowSplitter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ -<tool id="winSplitter" name="Make windows"> - <description></description> - <command interpreter="python">windowSplitter.py $input $size $out_file1 ${wintype.choice} ${wintype.offset} -l ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol}</command> - <inputs> - <!--<param label="Genome" name="dbkey" type="genomebuild"/>--> - <param format="interval" name="input" type="data" label="Select data"/> - <param name="size" size="10" type="integer" value="500" label="Window size"/> - <conditional name="wintype"> - <param name="choice" type="select" label="Make sliding windows?"> - <option value="0" selected="true">No</option> - <option value="1">Yes</option> - </param> - <when value="0"> - <param name="offset" type="hidden" value="0" /> - </when> - <when value="1"> - <param name="offset" size="10" type="integer" value="10" label="Offset size"/> - </when> - </conditional> - </inputs> - <outputs> - <data format="interval" name="out_file1" metadata_source="input"/> - </outputs> - <tests> - <test> - <param name="input" value="4.bed"/> - <param name="size" value="5000"/> - <param name="choice" value="1"/> - <param name="offset" 
value="4000"/> - <output name="out_file1" file="4_windows.bed"/> - </test> - </tests> - <help> - -.. class:: infomark - -**What it does** - -This tool splits the intervals in the input file into smaller intervals based on the specified window-size and window type. - ------ - -.. class:: warningmark - -**Note** - -The positions at the end of the input interval which do not fit into the last window or a new window of required size, will be omitted from the output. - ------ - -.. class:: infomark - -**About formats** - -**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones: - -The first three BED fields (required) are:: - - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) - 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). - -The additional BED fields (optional) are:: - - 4. name - The name of the BED line. - 5. score - A score between 0 and 1000. - 6. strand - Defines the strand - either '+' or '-'. - 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. - 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. - 9. reserved - This should always be set to zero. - 10. blockCount - The number of blocks (exons) in the BED line. - 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. - 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. - 13. expCount - The number of experiments. - 14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount. - 15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount. 
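
-----

The fixed and sliding windowing rule described under **What it does** can be sketched as follows (illustrative standalone Python, not the tool's own code)::

  def windows(start, end, size, offset=None):
      """Yield (start, end) windows; slide by offset when given."""
      step = offset or size
      while start + size <= end:
          yield (start, start + size)
          start += step

  # list(windows(1000, 4700, 1000))      -> 3 fixed windows, as in the example below
  # list(windows(1000, 4700, 1000, 500)) -> 6 sliding windows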
- ------ - -**Example** - -- For the following dataset:: - - chr22 1000 4700 NM_174568 0 + - -- running this tool with **Window size as 1000**, will return:: - - chr22 1000 2000 NM_174568 0 + - chr22 2000 3000 NM_174568 0 + - chr22 3000 4000 NM_174568 0 + - -- running this tool to make **Sliding windows** of **size 1000** and **offset 500**, will return:: - - chr22 1000 2000 NM_174568 0 + - chr22 1500 2500 NM_174568 0 + - chr22 2000 3000 NM_174568 0 + - chr22 2500 3500 NM_174568 0 + - chr22 3000 4000 NM_174568 0 + - chr22 3500 4500 NM_174568 0 + - - </help> - - -</tool> \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/listFiles.py --- a/tools/rgenetics/listFiles.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,227 +0,0 @@ -#Provides Upload tool with access to list of available files -import glob,sys -import galaxy.app as thisapp -import galaxy.util - -from elementtree.ElementTree import XML - -librepos = '/usr/local/galaxy/data/rg' -myrepos = '/home/rerla/galaxy' -marchinirepos = '/usr/local/galaxy/data/rg/snptest' - -from galaxy.tools.parameters import DataToolParameter - -#Provides Upload tool with access to list of available builds - -builds = [] -#Read build names and keys from galaxy.util -for dbkey, build_name in galaxy.util.dbnames: - builds.append((build_name,dbkey,False)) - -#Return available builds -def get_available_builds(defval='hg18'): - for i,x in enumerate(builds): - if x[1] == defval: - x = list(x) - x[2] = True - builds[i] = tuple(x) - return builds - - - -def get_tabular_cols( input, outformat='gg' ): - """numeric only other than rs for strict genome graphs - otherwise tabular. Derived from galaxy tool source around August 2007 by Ross""" - columns = [] - seenCnames = {} - elems = [] - colnames = ['Col%d' % x for x in range(input.metadata.columns+1)] - strict = (outformat=='gg') - for i, line in enumerate( file ( input.file_name ) ): - if line and not line.startswith( '#' ): - line = line.rstrip('\r\n') - elems = line.split( '\t' ) - - """ - Strict gg note: - Since this tool requires users to select only those columns - that contain numerical values, we'll restrict the column select - list appropriately other than the first column which must be a marker - """ - if len(elems) > 0: - for col in range(1, input.metadata.columns+1): - isFloat = False # short circuit common result - try: - val = float(elems[col-1]) - isFloat = True - except: - val = elems[col-1] - if val: - if i == 0: # header row - colnames[col] = val - if isFloat or (not strict) or (col == 1): # all in if not GG - option = colnames[col] - if not seenCnames.get(option,None): # new - columns.append((option,str(col),False)) - seenCnames[option] = option - #print 'get_tab: %d=%s. Columns=%s' % (i,line,str(columns)) - if len(columns) > 0 and i > 10: - """ - We have our select list built, so we can break out of the outer most for loop - """ - break - if i == 30: - break # Hopefully we never get here... 
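# (Editorial note, not original code) The loop below pre-selects the first
# few (up to five) of the collected (name, column-index, selected) options
# so the Galaxy select list comes up with sensible defaults.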
- for option in range(min(5,len(columns))): - (x,y,z) = columns[option] - columns[option] = (x,y,True) - return columns # sorted select options - -def get_marchini_dir(): - """return the filesystem directory for snptest style files""" - return marchinirepos - - -def get_lib_SNPTESTCaCofiles(): - """return a list of file names - without extensions - available for caco studies - These have a common file name with both _1 and _2 suffixes""" - d = get_marchini_dir() - testsuffix = '.gen_1' # glob these - flist = glob.glob('%s/*%s' % (d,testsuffix)) - flist = [x.split(testsuffix)[0] for x in flist] # leaves with a list of file set names - if len(flist) > 0: - dat = [(flist[0],flist[0],True),] - dat += [(x,x,False) for x in flist[1:]] - else: - dat = [('No Marchini CaCo files found in %s - convert some using the Marchini converter tool' % d,'None',True),] - return dat - -def getChropt(): - """return dynamic chromosome select options - """ - c = ['X','Y'] - c += ['%d' % x for x in range(1,23)] - dat = [(x,x,False) for x in c] - x,y,z = dat[3] - dat[3] = (x,y,True) - return dat - - -def get_phecols(fname=''): - """ return a list of phenotype columns for a multi-select list - prototype: - foo = ('fake - not yet implemented','not implemented','False') - dat = [foo for x in range(5)] - return dat - """ - try: - header = file(fname,'r').next().split() - except: - return [('get_phecols unable to open file %s' % fname,'None',False),] - dat = [(x,x,False) for x in header] - return dat - -#Return various kinds of files - -def get_lib_pedfiles(): - dat = glob.glob('%s/ped/*.ped' % librepos) - dat += glob.glob('%s/ped/*.ped' % myrepos) - dat.sort() - if len(dat) > 0: - dat = [x.split('.ped')[0] for x in dat] - dat = [(x,x,'True') for x in dat] - else: - dat = [('No ped files - add some to %s/ped or %s/ped' % (librepos,myrepos),'None',True),] - return dat - -def get_lib_phefiles(): - ext = 'phe' - dat = glob.glob('%s/pheno/*.%s' % (librepos,ext)) - dat += glob.glob('%s/pheno/*.%s' % (myrepos,ext)) - dat.sort() - if len(dat) > 0: - dat = [(x,x,'False') for x in dat] - else: - dat = [('No %s files - add some to %s/pheno or %s/pheno' % (ext,librepos,myrepos),'None',True),] - return dat - -def get_lib_bedfiles(): - dat = glob.glob('%s/plinkbed/*.bed' % librepos) - dat += glob.glob('%s/plinkbed/*.bed' % myrepos) - dat.sort() - if len(dat) > 0: - dat = [x.split('.bed')[0] for x in dat] - dat = [(x,x,False) for x in dat] - else: - dat = [('No bed files - Please import some to %s/plinkbed or %s/plinkbed' % (librepos,myrepos),'None',True),] - return dat - -def get_lib_fbatfiles(): - dat = glob.glob('%s/plinkfbat/*.ped' % librepos) - dat += glob.glob('%s/plinkfbat/*.ped' % myrepos) - dat.sort() - if len(dat) > 0: - dat = [(x,x,False) for x in dat] - else: - dat = [('No fbat bed files - Please import some to %s/plinkfbat or %s/plinkfbat' % (librepos,myrepos),'None',True),] - return dat - -def get_lib_mapfiles(): - dat = glob.glob('%s/ped/*.map' % librepos) - dat += glob.glob('%s/ped/*.map' % myrepos) - dat.sort() - if len(dat) > 0: - dat = [(x,x,False) for x in dat] - else: - dat = [('No map files - add some to %s/ped' % librepos,'None',True),] - return dat - -def get_my_pedfiles(): - dat = glob.glob('%s/*.ped' % myrepos) - if len(dat) > 0: - dat = [(x,x,False) for x in dat] - else: - dat = [('No ped files - add some to %s' % librepos,'None',True),] - return dat - -def get_my_mapfiles(): - dat = glob.glob('%s/*.map' % myrepos) - if len(dat) > 0: - dat = [(x,x,'True') for x in dat] - else: - dat = [('No ped files - add 
some to %s' % librepos,'None',True),] - return dat - -def get_lib_xlsfiles(): - dat = glob.glob('%s/*.xls' % librepos) - if len(dat) > 0: - dat = [(x,x,False) for x in dat] - else: - dat = [('No ped files - add some to %s' % librepos,'None',True),] - return dat - -def get_lib_htmlfiles(): - dat = glob.glob('%s/*.html' % librepos) - if len(dat) > 0: - dat = [(x,x,False) for x in dat] - else: - dat = [('No ped files - add some to %s' % librepos,'None',True),] - return dat - -def get_my_xlsfiles(): - dat = glob.glob('%s/*.xls' % myrepos) - if len(dat) > 0: - dat = [(x,x,False) for x in dat] - else: - dat = [('No ped files - add some to %s' % librepos,'None',True),] - return dat - -def get_my_htmlfiles(): - dat = glob.glob('%s/*.html' % myrepos) - if len(dat) > 0: - dat = [(x,x,False) for x in dat] - else: - dat = [('No ped files - add some to %s' % librepos,'None',True),] - return dat - - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/plinkbinJZ.py --- a/tools/rgenetics/plinkbinJZ.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,868 +0,0 @@ -#!/usr/bin/env python2.4 -""" -""" - -import optparse,os,subprocess,gzip,struct,time,commands -from array import array - -#from AIMS import util -#from pga import util as pgautil - -__FILE_ID__ = '$Id: plinkbinJZ.py,v 1.14 2009/07/13 20:16:50 rejpz Exp $' - -VERBOSE = True - -MISSING_ALLELES = set(['N', '0', '.', '-','']) - -AUTOSOMES = set(range(1, 23) + [str(c) for c in range(1, 23)]) - -MAGIC_BYTE1 = '00110110' -MAGIC_BYTE2 = '11011000' -FORMAT_SNP_MAJOR_BYTE = '10000000' -FORMAT_IND_MAJOR_BYTE = '00000000' -MAGIC1 = (0, 3, 1, 2) -MAGIC2 = (3, 1, 2, 0) -FORMAT_SNP_MAJOR = (2, 0, 0, 0) -FORMAT_IND_MAJOR = (0, 0, 0, 0) -HEADER_LENGTH = 3 - -HOM0 = 3 -HOM1 = 0 -MISS = 2 -HET = 1 -HOM0_GENO = (0, 0) -HOM1_GENO = (1, 1) -HET_GENO = (0, 1) -MISS_GENO = (-9, -9) - -GENO_TO_GCODE = { - HOM0_GENO: HOM0, - HET_GENO: HET, - HOM1_GENO: HOM1, - MISS_GENO: MISS, - } - -CHROM_REPLACE = { - 'X': '23', - 'Y': '24', - 'XY': '25', - 'MT': '26', - 'M': '26', -} - -MAP_LINE_EXCEPTION_TEXT = """ -One or more lines in the *.map file has only three fields. -The line was: - -%s - -If you are running rgGRR through EPMP, this is usually a -sign that you are using an old version of the map file. -You can correct the problem by re-running Subject QC. If -you have already tried this, please contact the developers, -or file a bug. 
-""" - -INT_TO_GCODE = { - 0: array('i', (0, 0, 0, 0)), 1: array('i', (2, 0, 0, 0)), 2: array('i', (1, 0, 0, 0)), 3: array('i', (3, 0, 0, 0)), - 4: array('i', (0, 2, 0, 0)), 5: array('i', (2, 2, 0, 0)), 6: array('i', (1, 2, 0, 0)), 7: array('i', (3, 2, 0, 0)), - 8: array('i', (0, 1, 0, 0)), 9: array('i', (2, 1, 0, 0)), 10: array('i', (1, 1, 0, 0)), 11: array('i', (3, 1, 0, 0)), - 12: array('i', (0, 3, 0, 0)), 13: array('i', (2, 3, 0, 0)), 14: array('i', (1, 3, 0, 0)), 15: array('i', (3, 3, 0, 0)), - 16: array('i', (0, 0, 2, 0)), 17: array('i', (2, 0, 2, 0)), 18: array('i', (1, 0, 2, 0)), 19: array('i', (3, 0, 2, 0)), - 20: array('i', (0, 2, 2, 0)), 21: array('i', (2, 2, 2, 0)), 22: array('i', (1, 2, 2, 0)), 23: array('i', (3, 2, 2, 0)), - 24: array('i', (0, 1, 2, 0)), 25: array('i', (2, 1, 2, 0)), 26: array('i', (1, 1, 2, 0)), 27: array('i', (3, 1, 2, 0)), - 28: array('i', (0, 3, 2, 0)), 29: array('i', (2, 3, 2, 0)), 30: array('i', (1, 3, 2, 0)), 31: array('i', (3, 3, 2, 0)), - 32: array('i', (0, 0, 1, 0)), 33: array('i', (2, 0, 1, 0)), 34: array('i', (1, 0, 1, 0)), 35: array('i', (3, 0, 1, 0)), - 36: array('i', (0, 2, 1, 0)), 37: array('i', (2, 2, 1, 0)), 38: array('i', (1, 2, 1, 0)), 39: array('i', (3, 2, 1, 0)), - 40: array('i', (0, 1, 1, 0)), 41: array('i', (2, 1, 1, 0)), 42: array('i', (1, 1, 1, 0)), 43: array('i', (3, 1, 1, 0)), - 44: array('i', (0, 3, 1, 0)), 45: array('i', (2, 3, 1, 0)), 46: array('i', (1, 3, 1, 0)), 47: array('i', (3, 3, 1, 0)), - 48: array('i', (0, 0, 3, 0)), 49: array('i', (2, 0, 3, 0)), 50: array('i', (1, 0, 3, 0)), 51: array('i', (3, 0, 3, 0)), - 52: array('i', (0, 2, 3, 0)), 53: array('i', (2, 2, 3, 0)), 54: array('i', (1, 2, 3, 0)), 55: array('i', (3, 2, 3, 0)), - 56: array('i', (0, 1, 3, 0)), 57: array('i', (2, 1, 3, 0)), 58: array('i', (1, 1, 3, 0)), 59: array('i', (3, 1, 3, 0)), - 60: array('i', (0, 3, 3, 0)), 61: array('i', (2, 3, 3, 0)), 62: array('i', (1, 3, 3, 0)), 63: array('i', (3, 3, 3, 0)), - 64: array('i', (0, 0, 0, 2)), 65: array('i', (2, 0, 0, 2)), 66: array('i', (1, 0, 0, 2)), 67: array('i', (3, 0, 0, 2)), - 68: array('i', (0, 2, 0, 2)), 69: array('i', (2, 2, 0, 2)), 70: array('i', (1, 2, 0, 2)), 71: array('i', (3, 2, 0, 2)), - 72: array('i', (0, 1, 0, 2)), 73: array('i', (2, 1, 0, 2)), 74: array('i', (1, 1, 0, 2)), 75: array('i', (3, 1, 0, 2)), - 76: array('i', (0, 3, 0, 2)), 77: array('i', (2, 3, 0, 2)), 78: array('i', (1, 3, 0, 2)), 79: array('i', (3, 3, 0, 2)), - 80: array('i', (0, 0, 2, 2)), 81: array('i', (2, 0, 2, 2)), 82: array('i', (1, 0, 2, 2)), 83: array('i', (3, 0, 2, 2)), - 84: array('i', (0, 2, 2, 2)), 85: array('i', (2, 2, 2, 2)), 86: array('i', (1, 2, 2, 2)), 87: array('i', (3, 2, 2, 2)), - 88: array('i', (0, 1, 2, 2)), 89: array('i', (2, 1, 2, 2)), 90: array('i', (1, 1, 2, 2)), 91: array('i', (3, 1, 2, 2)), - 92: array('i', (0, 3, 2, 2)), 93: array('i', (2, 3, 2, 2)), 94: array('i', (1, 3, 2, 2)), 95: array('i', (3, 3, 2, 2)), - 96: array('i', (0, 0, 1, 2)), 97: array('i', (2, 0, 1, 2)), 98: array('i', (1, 0, 1, 2)), 99: array('i', (3, 0, 1, 2)), - 100: array('i', (0, 2, 1, 2)), 101: array('i', (2, 2, 1, 2)), 102: array('i', (1, 2, 1, 2)), 103: array('i', (3, 2, 1, 2)), - 104: array('i', (0, 1, 1, 2)), 105: array('i', (2, 1, 1, 2)), 106: array('i', (1, 1, 1, 2)), 107: array('i', (3, 1, 1, 2)), - 108: array('i', (0, 3, 1, 2)), 109: array('i', (2, 3, 1, 2)), 110: array('i', (1, 3, 1, 2)), 111: array('i', (3, 3, 1, 2)), - 112: array('i', (0, 0, 3, 2)), 113: array('i', (2, 0, 3, 2)), 114: array('i', (1, 0, 3, 2)), 115: array('i', 
(3, 0, 3, 2)), - 116: array('i', (0, 2, 3, 2)), 117: array('i', (2, 2, 3, 2)), 118: array('i', (1, 2, 3, 2)), 119: array('i', (3, 2, 3, 2)), - 120: array('i', (0, 1, 3, 2)), 121: array('i', (2, 1, 3, 2)), 122: array('i', (1, 1, 3, 2)), 123: array('i', (3, 1, 3, 2)), - 124: array('i', (0, 3, 3, 2)), 125: array('i', (2, 3, 3, 2)), 126: array('i', (1, 3, 3, 2)), 127: array('i', (3, 3, 3, 2)), - 128: array('i', (0, 0, 0, 1)), 129: array('i', (2, 0, 0, 1)), 130: array('i', (1, 0, 0, 1)), 131: array('i', (3, 0, 0, 1)), - 132: array('i', (0, 2, 0, 1)), 133: array('i', (2, 2, 0, 1)), 134: array('i', (1, 2, 0, 1)), 135: array('i', (3, 2, 0, 1)), - 136: array('i', (0, 1, 0, 1)), 137: array('i', (2, 1, 0, 1)), 138: array('i', (1, 1, 0, 1)), 139: array('i', (3, 1, 0, 1)), - 140: array('i', (0, 3, 0, 1)), 141: array('i', (2, 3, 0, 1)), 142: array('i', (1, 3, 0, 1)), 143: array('i', (3, 3, 0, 1)), - 144: array('i', (0, 0, 2, 1)), 145: array('i', (2, 0, 2, 1)), 146: array('i', (1, 0, 2, 1)), 147: array('i', (3, 0, 2, 1)), - 148: array('i', (0, 2, 2, 1)), 149: array('i', (2, 2, 2, 1)), 150: array('i', (1, 2, 2, 1)), 151: array('i', (3, 2, 2, 1)), - 152: array('i', (0, 1, 2, 1)), 153: array('i', (2, 1, 2, 1)), 154: array('i', (1, 1, 2, 1)), 155: array('i', (3, 1, 2, 1)), - 156: array('i', (0, 3, 2, 1)), 157: array('i', (2, 3, 2, 1)), 158: array('i', (1, 3, 2, 1)), 159: array('i', (3, 3, 2, 1)), - 160: array('i', (0, 0, 1, 1)), 161: array('i', (2, 0, 1, 1)), 162: array('i', (1, 0, 1, 1)), 163: array('i', (3, 0, 1, 1)), - 164: array('i', (0, 2, 1, 1)), 165: array('i', (2, 2, 1, 1)), 166: array('i', (1, 2, 1, 1)), 167: array('i', (3, 2, 1, 1)), - 168: array('i', (0, 1, 1, 1)), 169: array('i', (2, 1, 1, 1)), 170: array('i', (1, 1, 1, 1)), 171: array('i', (3, 1, 1, 1)), - 172: array('i', (0, 3, 1, 1)), 173: array('i', (2, 3, 1, 1)), 174: array('i', (1, 3, 1, 1)), 175: array('i', (3, 3, 1, 1)), - 176: array('i', (0, 0, 3, 1)), 177: array('i', (2, 0, 3, 1)), 178: array('i', (1, 0, 3, 1)), 179: array('i', (3, 0, 3, 1)), - 180: array('i', (0, 2, 3, 1)), 181: array('i', (2, 2, 3, 1)), 182: array('i', (1, 2, 3, 1)), 183: array('i', (3, 2, 3, 1)), - 184: array('i', (0, 1, 3, 1)), 185: array('i', (2, 1, 3, 1)), 186: array('i', (1, 1, 3, 1)), 187: array('i', (3, 1, 3, 1)), - 188: array('i', (0, 3, 3, 1)), 189: array('i', (2, 3, 3, 1)), 190: array('i', (1, 3, 3, 1)), 191: array('i', (3, 3, 3, 1)), - 192: array('i', (0, 0, 0, 3)), 193: array('i', (2, 0, 0, 3)), 194: array('i', (1, 0, 0, 3)), 195: array('i', (3, 0, 0, 3)), - 196: array('i', (0, 2, 0, 3)), 197: array('i', (2, 2, 0, 3)), 198: array('i', (1, 2, 0, 3)), 199: array('i', (3, 2, 0, 3)), - 200: array('i', (0, 1, 0, 3)), 201: array('i', (2, 1, 0, 3)), 202: array('i', (1, 1, 0, 3)), 203: array('i', (3, 1, 0, 3)), - 204: array('i', (0, 3, 0, 3)), 205: array('i', (2, 3, 0, 3)), 206: array('i', (1, 3, 0, 3)), 207: array('i', (3, 3, 0, 3)), - 208: array('i', (0, 0, 2, 3)), 209: array('i', (2, 0, 2, 3)), 210: array('i', (1, 0, 2, 3)), 211: array('i', (3, 0, 2, 3)), - 212: array('i', (0, 2, 2, 3)), 213: array('i', (2, 2, 2, 3)), 214: array('i', (1, 2, 2, 3)), 215: array('i', (3, 2, 2, 3)), - 216: array('i', (0, 1, 2, 3)), 217: array('i', (2, 1, 2, 3)), 218: array('i', (1, 1, 2, 3)), 219: array('i', (3, 1, 2, 3)), - 220: array('i', (0, 3, 2, 3)), 221: array('i', (2, 3, 2, 3)), 222: array('i', (1, 3, 2, 3)), 223: array('i', (3, 3, 2, 3)), - 224: array('i', (0, 0, 1, 3)), 225: array('i', (2, 0, 1, 3)), 226: array('i', (1, 0, 1, 3)), 227: array('i', (3, 0, 1, 3)), - 228: 
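# (Editorial note, not original code) Each byte of a PLINK .bed file packs
# four 2-bit genotype codes. To decode a byte, take its bit pairs from the
# least-significant pair upward and swap the two bits within each pair:
# e.g. 27 = 0b00011011 -> pairs 11,10,01,00 -> swapped 11,01,10,00 ->
# (3, 1, 2, 0), i.e. HOM0, HET, MISS, HOM1 for four consecutive subjects.
# That is exactly the table being built here, and why one marker occupies
# nbytes(n) = 2*ceiling(n, 4)/8 bytes for n subjects.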
array('i', (0, 2, 1, 3)), 229: array('i', (2, 2, 1, 3)), 230: array('i', (1, 2, 1, 3)), 231: array('i', (3, 2, 1, 3)), - 232: array('i', (0, 1, 1, 3)), 233: array('i', (2, 1, 1, 3)), 234: array('i', (1, 1, 1, 3)), 235: array('i', (3, 1, 1, 3)), - 236: array('i', (0, 3, 1, 3)), 237: array('i', (2, 3, 1, 3)), 238: array('i', (1, 3, 1, 3)), 239: array('i', (3, 3, 1, 3)), - 240: array('i', (0, 0, 3, 3)), 241: array('i', (2, 0, 3, 3)), 242: array('i', (1, 0, 3, 3)), 243: array('i', (3, 0, 3, 3)), - 244: array('i', (0, 2, 3, 3)), 245: array('i', (2, 2, 3, 3)), 246: array('i', (1, 2, 3, 3)), 247: array('i', (3, 2, 3, 3)), - 248: array('i', (0, 1, 3, 3)), 249: array('i', (2, 1, 3, 3)), 250: array('i', (1, 1, 3, 3)), 251: array('i', (3, 1, 3, 3)), - 252: array('i', (0, 3, 3, 3)), 253: array('i', (2, 3, 3, 3)), 254: array('i', (1, 3, 3, 3)), 255: array('i', (3, 3, 3, 3)), - } - -GCODE_TO_INT = dict([(tuple(v),k) for (k,v) in INT_TO_GCODE.items()]) - -### Exceptions -class DuplicateMarkerInMapFile(Exception): pass -class MapLineTooShort(Exception): pass -class ThirdAllele(Exception): pass -class PedError(Exception): pass -class BadMagic(Exception): - """ Raised when one of the MAGIC bytes in a bed file does not match - """ - pass -class BedError(Exception): - """ Raised when parsing a bed file runs into problems - """ - pass -class UnknownGenocode(Exception): - """ Raised when we get a 2-bit genotype that is undecipherable (is it possible?) - """ - pass -class UnknownGeno(Exception): pass - -### Utility functions - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -def ceiling(n, k): - ''' Return the least multiple of k which is greater than n - ''' - m = n % k - if m == 0: - return n - else: - return n + k - m - -def nbytes(n): - ''' Return the number of bytes required for n subjects - ''' - return 2*ceiling(n, 4)/8 - -### Primary module functionality -class LPed: - """ The uber-class for processing the Linkage-format *.ped/*.map files - """ - def __init__(self, base): - self.base = base - self._ped = Ped('%s.ped' % (self.base)) - self._map = Map('%s.map' % (self.base)) - - self._markers = {} - self._ordered_markers = [] - self._marker_allele_lookup = {} - self._autosomal_indices = set() - - self._subjects = {} - self._ordered_subjects = [] - - self._genotypes = [] - - def parse(self): - """ - """ - if VERBOSE: print 'plinkbinJZ: Analysis started: %s' % (timenow()) - self._map.parse() - self._markers = self._map._markers - self._ordered_markers = self._map._ordered_markers - self._autosomal_indices = self._map._autosomal_indices - - self._ped.parse(self._ordered_markers) - self._subjects = self._ped._subjects - self._ordered_subjects = self._ped._ordered_subjects - self._genotypes = self._ped._genotypes - self._marker_allele_lookup = self._ped._marker_allele_lookup - - ### Adjust self._markers based on the allele information - ### we got from parsing the ped file - for m, name in enumerate(self._ordered_markers): - a1, a2 = self._marker_allele_lookup[m][HET] - self._markers[name][-2] = a1 - self._markers[name][-1] = a2 - if VERBOSE: print 'plinkbinJZ: Analysis finished: %s' % (timenow()) - - def getSubjectInfo(self, fid, oiid): - """ - """ - return self._subject_info[(fid, oiid)] - - def getSubjectInfoByLine(self, line): - """ - """ - return self._subject_info[self._ordered_subjects[line]] - - def getGenotypesByIndices(self, s, mlist, format): - """ needed for grr if lped - deprecated but.. 
- """ - mlist = dict(zip(mlist,[True,]*len(mlist))) # hash quicker than 'in' ? - raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if mlist.get(m,None)]) - if format == 'raw': - return raw_array - elif format == 'ref': - result = array('i', [0]*len(mlist)) - for m, gcode in enumerate(raw_array): - if gcode == HOM0: - nref = 3 - elif gcode == HET: - nref = 2 - elif gcode == HOM1: - nref = 1 - else: - nref = 0 - result[m] = nref - return result - else: - result = [] - for m, gcode in enumerate(raw_array): - result.append(self._marker_allele_lookup[m][gcode]) - return result - - def writebed(self, base): - """ - """ - dst_name = '%s.fam' % (base) - print 'Writing pedigree information to [ %s ]' % (dst_name) - dst = open(dst_name, 'w') - for skey in self._ordered_subjects: - (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) = self._subjects[skey] - dst.write('%s %s %s %s %s %s\n' % (fid, iid, did, mid, sex, phe)) - dst.close() - - dst_name = '%s.bim' % (base) - print 'Writing map (extended format) information to [ %s ]' % (dst_name) - dst = open(dst_name, 'w') - for m, marker in enumerate(self._ordered_markers): - chrom, name, genpos, abspos, a1, a2 = self._markers[marker] - dst.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (chrom, name, genpos, abspos, a1, a2)) - dst.close() - - bed_name = '%s.bed' % (base) - print 'Writing genotype bitfile to [ %s ]' % (bed_name) - print 'Using (default) SNP-major mode' - bed = open(bed_name, 'w') - - ### Write the 3 header bytes - bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE1)), 2))) - bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE2)), 2))) - bed.write(struct.pack('B', int(''.join(reversed(FORMAT_SNP_MAJOR_BYTE)), 2))) - - ### Calculate how many "pad bits" we should add after the last subject - nsubjects = len(self._ordered_subjects) - nmarkers = len(self._ordered_markers) - total_bytes = nbytes(nsubjects) - nbits = nsubjects * 2 - pad_nibbles = ((total_bytes * 8) - nbits)/2 - pad = array('i', [0]*pad_nibbles) - - ### And now write genotypes to the file - for m in xrange(nmarkers): - geno = self._genotypes[m] - geno.extend(pad) - bytes = len(geno)/4 - for b in range(bytes): - idx = b*4 - gcode = tuple(geno[idx:idx+4]) - try: - byte = struct.pack('B', GCODE_TO_INT[gcode]) - except KeyError: - print m, b, gcode - raise - bed.write(byte) - bed.close() - - def autosomal_indices(self): - """ Return the indices of markers in this ped/map that are autosomal. - This is used by rgGRR so that it can select a random set of markers - from the autosomes (sex chroms screw up the plot) - """ - return self._autosomal_indices - -class Ped: - def __init__(self, path): - self.path = path - self._subjects = {} - self._ordered_subjects = [] - self._genotypes = [] - self._marker_allele_lookup = {} - - def lineCount(self,infile): - """ count the number of lines in a file - efficiently using wget - """ - return int(commands.getoutput('wc -l %s' % (infile)).split()[0]) - - - def parse(self, markers): - """ Parse a given file -- this needs to be memory-efficient so that large - files can be parsed (~1 million markers on ~5000 subjects?). It - should also be fast, if possible. - """ - - ### Find out how many lines are in the file so we can ... - nsubjects = self.lineCount(self.path) - ### ... 
Pre-allocate the genotype arrays - nmarkers = len(markers) - _marker_alleles = [['0', '0'] for _ in xrange(nmarkers)] - self._genotypes = [array('i', [-1]*nsubjects) for _ in xrange(nmarkers)] - - if self.path.endswith('.gz'): - pfile = gzip.open(self.path, 'r') - else: - pfile = open(self.path, 'r') - - for s, line in enumerate(pfile): - line = line.strip() - if not line: - continue - - fid, iid, did, mid, sex, phe, genos = line.split(None, 6) - sid = iid.split('.')[0] - d_sid = did.split('.')[0] - m_sid = mid.split('.')[0] - - skey = (fid, iid) - self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) - self._ordered_subjects.append(skey) - - genotypes = genos.split() - - for m, marker in enumerate(markers): - idx = m*2 - a1, a2 = genotypes[idx:idx+2] # Alleles for subject s, marker m - s1, s2 = seen = _marker_alleles[m] # Alleles seen for marker m - - ### FIXME: I think this can still be faster, and simpler to read - # Two pieces of logic intertwined here: first, we need to code - # this genotype as HOM0, HOM1, HET or MISS. Second, we need to - # keep an ongoing record of the genotypes seen for this marker - if a1 == a2: - if a1 in MISSING_ALLELES: - geno = MISS_GENO - else: - if s1 == '0': - seen[0] = a1 - elif s1 == a1 or s2 == a2: - pass - elif s2 == '0': - seen[1] = a1 - else: - raise ThirdAllele('a1=a2=%s, seen=%s?' % (a1, str(seen))) - - if a1 == seen[0]: - geno = HOM0_GENO - elif a1 == seen[1]: - geno = HOM1_GENO - else: - raise PedError('Cannot assign geno for a1=a2=%s from seen=%s' % (a1, str(seen))) - elif a1 in MISSING_ALLELES or a2 in MISSING_ALLELES: - geno = MISS_GENO - else: - geno = HET_GENO - if s1 == '0': - seen[0] = a1 - seen[1] = a2 - elif s2 == '0': - if s1 == a1: - seen[1] = a2 - elif s1 == a2: - seen[1] = a1 - else: - raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen))) - else: - if sorted(seen) != sorted((a1, a2)): - raise ThirdAllele('a1=%s, a2=%s, seen=%s?' 
% (a1, a2, str(seen))) - - gcode = GENO_TO_GCODE.get(geno, None) - if gcode is None: - raise UnknownGeno(str(geno)) - self._genotypes[m][s] = gcode - - # Build the _marker_allele_lookup table - for m, alleles in enumerate(_marker_alleles): - if len(alleles) == 2: - a1, a2 = alleles - elif len(alleles) == 1: - a1 = alleles[0] - a2 = '0' - else: - print 'All alleles blank for %s: %s' % (m, str(alleles)) - raise - - self._marker_allele_lookup[m] = { - HOM0: (a2, a2), - HOM1: (a1, a1), - HET : (a1, a2), - MISS: ('0','0'), - } - - if VERBOSE: print '%s(%s) individuals read from [ %s ]' % (len(self._subjects), nsubjects, self.path) - -class Map: - def __init__(self, path=None): - self.path = path - self._markers = {} - self._ordered_markers = [] - self._autosomal_indices = set() - - def __len__(self): - return len(self._markers) - - def parse(self): - """ Parse a Linkage-format map file - """ - if self.path.endswith('.gz'): - fh = gzip.open(self.path, 'r') - else: - fh = open(self.path, 'r') - - for i, line in enumerate(fh): - line = line.strip() - if not line: - continue - - fields = line.split() - if len(fields) < 4: - raise MapLineTooShort(MAP_LINE_EXCEPTION_TEXT % (str(line), len(fields))) - else: - chrom, name, genpos, abspos = fields - if name in self._markers: - raise DuplicateMarkerInMapFile('Marker %s was found twice in map file %s' % (name, self.path)) - abspos = int(abspos) - if abspos < 0: - continue - if chrom in AUTOSOMES: - self._autosomal_indices.add(i) - chrom = CHROM_REPLACE.get(chrom, chrom) - self._markers[name] = [chrom, name, genpos, abspos, None, None] - self._ordered_markers.append(name) - fh.close() - if VERBOSE: print '%s (of %s) markers to be included from [ %s ]' % (len(self._ordered_markers), i, self.path) - -class BPed: - """ The uber-class for processing Plink's Binary Ped file format *.bed/*.bim/*.fam - """ - def __init__(self, base): - self.base = base - self._bed = Bed('%s.bed' % (self.base)) - self._bim = Bim('%s.bim' % (self.base)) - self._fam = Fam('%s.fam' % (self.base)) - - self._markers = {} - self._ordered_markers = [] - self._marker_allele_lookup = {} - self._autosomal_indices = set() - - self._subjects = {} - self._ordered_subjects = [] - - self._genotypes = [] - - def parse(self, quick=False): - """ - """ - self._quick = quick - - self._bim.parse() - self._markers = self._bim._markers - self._ordered_markers = self._bim._ordered_markers - self._marker_allele_lookup = self._bim._marker_allele_lookup - self._autosomal_indices = self._bim._autosomal_indices - - self._fam.parse() - self._subjects = self._fam._subjects - self._ordered_subjects = self._fam._ordered_subjects - - self._bed.parse(self._ordered_subjects, self._ordered_markers, quick=quick) - self._bedf = self._bed._fh - self._genotypes = self._bed._genotypes - self.nsubjects = len(self._ordered_subjects) - self.nmarkers = len(self._ordered_markers) - self._bytes_per_marker = nbytes(self.nsubjects) - - def writeped(self, path=None): - """ - """ - path = self.path = path or self.path - - map_name = self.path.replace('.bed', '.map') - print 'Writing map file [ %s ]' % (map_name) - dst = open(map_name, 'w') - for m in self._ordered_markers: - chrom, snp, genpos, abspos, a1, a2 = self._markers[m] - dst.write('%s\t%s\t%s\t%s\n' % (chrom, snp, genpos, abspos)) - dst.close() - - ped_name = self.path.replace('.bed', '.ped') - print 'Writing ped file [ %s ]' % (ped_name) - ped = open(ped_name, 'w') - firstyikes = False - for s, skey in enumerate(self._ordered_subjects): - idx = s*2 - (fid, iid, did, mid, 
sex, phe, oiid, odid, omid) = self._subjects[skey] - ped.write('%s %s %s %s %s %s' % (fid, iid, odid, omid, sex, phe)) - genotypes_for_subject = self.getGenotypesForSubject(s) - for m, snp in enumerate(self._ordered_markers): - #a1, a2 = self.getGenotypeByIndices(s, m) - a1,a2 = genotypes_for_subject[m] - ped.write(' %s %s' % (a1, a2)) - ped.write('\n') - ped.close() - - def getGenotype(self, subject, marker): - """ Retrieve a genotype for a particular subject/marker pair - """ - m = self._ordered_markers.index(marker) - s = self._ordered_subjects.index(subject) - return self.getGenotypeByIndices(s, m) - - def getGenotypesForSubject(self, s, raw=False): - """ Returns list of genotypes for all m markers - for subject s. If raw==True, then an array - of raw integer gcodes is returned instead - """ - if self._quick: - nmarkers = len(self._markers) - raw_array = array('i', [0]*nmarkers) - seek_nibble = s % 4 - for m in xrange(nmarkers): - seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH - self._bedf.seek(seek_byte) - geno = struct.unpack('B', self._bedf.read(1))[0] - quartet = INT_TO_GCODE[geno] - gcode = quartet[seek_nibble] - raw_array[m] = gcode - else: - raw_array = array('i', [row[s] for row in self._genotypes]) - - if raw: - return raw_array - else: - result = [] - for m, gcode in enumerate(raw_array): - result.append(self._marker_allele_lookup[m][gcode]) - return result - - def getGenotypeByIndices(self, s, m): - """ - """ - if self._quick: - # Determine which byte we need to seek to, and - # which nibble within the byte we need - seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH - seek_nibble = s % 4 - self._bedf.seek(seek_byte) - geno = struct.unpack('B', self._bedf.read(1))[0] - quartet = INT_TO_GCODE[geno] - gcode = quartet[seek_nibble] - else: - # Otherwise, just grab the genotypes from the - # list of arrays - genos_for_marker = self._genotypes[m] - gcode = genos_for_marker[s] - - return self._marker_allele_lookup[m][gcode] - - def getGenotypesByIndices(self, s, mlist, format): - """ - """ - if self._quick: - raw_array = array('i', [0]*len(mlist)) - seek_nibble = s % 4 - for i,m in enumerate(mlist): - seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH - self._bedf.seek(seek_byte) - geno = struct.unpack('B', self._bedf.read(1))[0] - quartet = INT_TO_GCODE[geno] - gcode = quartet[seek_nibble] - raw_array[i] = gcode - mlist = set(mlist) - else: - mlist = set(mlist) - raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if m in mlist]) - - if format == 'raw': - return raw_array - elif format == 'ref': - result = array('i', [0]*len(mlist)) - for m, gcode in enumerate(raw_array): - if gcode == HOM0: - nref = 3 - elif gcode == HET: - nref = 2 - elif gcode == HOM1: - nref = 1 - else: - nref = 0 - result[m] = nref - return result - else: - result = [] - for m, gcode in enumerate(raw_array): - result.append(self._marker_allele_lookup[m][gcode]) - return result - - def getSubject(self, s): - """ - """ - skey = self._ordered_subjects[s] - return self._subjects[skey] - - def autosomal_indices(self): - """ Return the indices of markers in this ped/map that are autosomal. 
- This is used by rgGRR so that it can select a random set of markers - from the autosomes (sex chroms screw up the plot) - """ - return self._autosomal_indices - -class Bed: - - def __init__(self, path): - self.path = path - self._genotypes = [] - self._fh = None - - def parse(self, subjects, markers, quick=False): - """ Parse the bed file, indicated either by the path parameter, - or as the self.path indicated in __init__. If quick is - True, then just parse the bim and fam, then genotypes will - be looked up dynamically by indices - """ - self._quick = quick - - ordered_markers = markers - ordered_subjects = subjects - nsubjects = len(ordered_subjects) - nmarkers = len(ordered_markers) - - bed = open(self.path, 'rb') - self._fh = bed - - byte1 = bed.read(1) - byte2 = bed.read(1) - byte3 = bed.read(1) - format_flag = struct.unpack('B', byte3)[0] - - h1 = tuple(INT_TO_GCODE[struct.unpack('B', byte1)[0]]) - h2 = tuple(INT_TO_GCODE[struct.unpack('B', byte2)[0]]) - h3 = tuple(INT_TO_GCODE[format_flag]) - - if h1 != MAGIC1 or h2 != MAGIC2: - raise BadMagic('One or both MAGIC bytes is wrong: %s==%s or %s==%s' % (h1, MAGIC1, h2, MAGIC2)) - if format_flag: - print 'Detected that binary PED file is v1.00 SNP-major mode (%s, "%s")\n' % (format_flag, h3) - else: - raise 'BAD_FORMAT_FLAG? (%s, "%s")\n' % (format_flag, h3) - - print 'Parsing binary ped file for %s markers and %s subjects' % (nmarkers, nsubjects) - - ### If quick mode was specified, we're done ... - self._quick = quick - if quick: - return - - ### ... Otherwise, parse genotypes into an array, and append that - ### array to self._genotypes - ngcodes = ceiling(nsubjects, 4) - bytes_per_marker = nbytes(nsubjects) - for m in xrange(nmarkers): - genotype_array = array('i', [-1]*(ngcodes)) - for byte in xrange(bytes_per_marker): - intval = struct.unpack('B', bed.read(1))[0] - idx = byte*4 - genotype_array[idx:idx+4] = INT_TO_GCODE[intval] - self._genotypes.append(genotype_array) - -class Bim: - def __init__(self, path): - """ - """ - self.path = path - self._markers = {} - self._ordered_markers = [] - self._marker_allele_lookup = {} - self._autosomal_indices = set() - - def parse(self): - """ - """ - print 'Reading map (extended format) from [ %s ]' % (self.path) - bim = open(self.path, 'r') - for m, line in enumerate(bim): - chrom, snp, gpos, apos, a1, a2 = line.strip().split() - self._markers[snp] = (chrom, snp, gpos, apos, a1, a2) - self._marker_allele_lookup[m] = { - HOM0: (a2, a2), - HOM1: (a1, a1), - HET : (a1, a2), - MISS: ('0','0'), - } - self._ordered_markers.append(snp) - if chrom in AUTOSOMES: - self._autosomal_indices.add(m) - bim.close() - print '%s markers to be included from [ %s ]' % (m+1, self.path) - -class Fam: - def __init__(self, path): - """ - """ - self.path = path - self._subjects = {} - self._ordered_subjects = [] - - def parse(self): - """ - """ - print 'Reading pedigree information from [ %s ]' % (self.path) - fam = open(self.path, 'r') - for s, line in enumerate(fam): - fid, iid, did, mid, sex, phe = line.strip().split() - sid = iid.split('.')[0] - d_sid = did.split('.')[0] - m_sid = mid.split('.')[0] - skey = (fid, iid) - self._ordered_subjects.append(skey) - self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) - fam.close() - print '%s individuals read from [ %s ]' % (s+1, self.path) - -### Command-line functionality and testing -def test(arg): - ''' - ''' - - import time - - if arg == 'CAMP_AFFY.ped': - print 'Testing bed.parse(quick=True)' - s = time.time() - bed = Bed(arg.replace('.ped', 
'.bed')) - bed.parse(quick=True) - print bed.getGenotype(('400118', '10300283'), 'rs2000467') - print bed.getGenotype(('400118', '10101384'), 'rs2294019') - print bed.getGenotype(('400121', '10101149'), 'rs2294019') - print bed.getGenotype(('400123', '10200290'), 'rs2294019') - assert bed.getGenotype(('400118', '10101384'), 'rs2294019') == ('4','4') - e = time.time() - print 'e-s = %s\n' % (e-s) - - print 'Testing bed.parse' - s = time.time() - bed = BPed(arg) - bed.parse(quick=False) - e = time.time() - print 'e-s = %s\n' % (e-s) - - print 'Testing bed.writeped' - s = time.time() - outname = '%s_BEDTEST' % (arg) - bed.writeped(outname) - e = time.time() - print 'e-s = %s\n' % (e-s) - del(bed) - - print 'Testing ped.parse' - s = time.time() - ped = LPed(arg) - ped.parse() - e = time.time() - print 'e-s = %s\n' % (e-s) - - print 'Testing ped.writebed' - s = time.time() - outname = '%s_PEDTEST' % (arg) - ped.writebed(outname) - e = time.time() - print 'e-s = %s\n' % (e-s) - del(ped) - -def profile_bed(arg): - """ - """ - bed = BPed(arg) - bed.parse(quick=False) - outname = '%s_BEDPROFILE' % (arg) - bed.writeped(outname) - -def profile_ped(arg): - """ - """ - ped = LPed(arg) - ped.parse() - outname = '%s_PEDPROFILE' % (arg) - ped.writebed(outname) - -if __name__ == '__main__': - """ Run as a command-line, this script should get one or more arguments, - each one a ped file to be parsed with the PedParser (unit tests?) - """ - op = optparse.OptionParser() - op.add_option('--profile-bed', action='store_true', default=False) - op.add_option('--profile-ped', action='store_true', default=False) - opts, args = op.parse_args() - - if opts.profile_bed: - import profile - import pstats - profile.run('profile_bed(args[0])', 'fooprof') - p = pstats.Stats('fooprof') - p.sort_stats('cumulative').print_stats(10) - elif opts.profile_ped: - import profile - import pstats - profile.run('profile_ped(args[0])', 'fooprof') - p = pstats.Stats('fooprof') - p.sort_stats('cumulative').print_stats(10) - else: - for arg in args: - test(arg) - - ### Code used to generate the INT_TO_GCODE dictionary - #print '{\n ', - #for i in range(256): - # b = INT2BIN[i] - # ints = [] - # s = str(i).rjust(3) - # #print b - # for j in range(4): - # idx = j*2 - # #print i, j, idx, b[idx:idx+2], int(b[idx:idx+2], 2) - # ints.append(int(b[idx:idx+2], 2)) - # print '%s: array(\'i\', %s),' % (s,tuple(ints)), - # if i > 0 and (i+1) % 4 == 0: - # print '\n ', - #print '}' - - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/plinkbinJZ.pyc Binary file tools/rgenetics/plinkbinJZ.pyc has changed diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgCaCo.py --- a/tools/rgenetics/rgCaCo.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,271 +0,0 @@ -#!/usr/local/bin/python -# hack to run and process a plink case control association -# expects args as -# bfilepath outname jobname outformat (wig,xls) -# ross lazarus -# for wig files, we need annotation so look for map file or complain -""" -Parameters for wiggle track definition lines -All options are placed in a single line separated by spaces: - - track type=wiggle_0 name=track_label description=center_label \ - visibility=display_mode color=r,g,b altColor=r,g,b \ - priority=priority autoScale=on|off \ - gridDefault=on|off maxHeightPixels=max:default:min \ - graphType=bar|points viewLimits=lower:upper \ - yLineMark=real-value yLineOnOff=on|off \ - windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16 -""" - -import 
sys,math,shutil,subprocess,os,time,tempfile,string
-from os.path import abspath
-from rgutils import timenow, plinke
-imagedir = '/static/rg' # if needed for images
-myversion = 'V000.1 April 2007'
-verbose = False
-
-def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000):
-    """
-    score must be scaled to 0-1000
-
-    Want to make some wig tracks from each analysis
-    Best n -log10(p). Make top hit the window.
-    we use our tab output which has
-    rs chrom offset ADD_stat ADD_p ADD_log10p
-    rs3094315 1 792429 1.151 0.2528 0.597223
-    """
-
-    def is_number(s):
-        try:
-            float(s)
-            return True
-        except ValueError:
-            return False
-
-    header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description)
-    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
-    halfwidth=100
-    resfpath = os.path.join(twd,resf)
-    resf = open(resfpath,'r')
-    resfl = resf.readlines() # dumb but convenient for millions of rows
-    resfl = [x.split() for x in resfl]
-    headl = resfl[0]
-    resfl = resfl[1:]
-    headl = [x.strip().upper() for x in headl]
-    headIndex = dict(zip(headl,range(0,len(headl))))
-    whatwewant = ['CHR','RS','OFFSET','LOG10ARMITAGEP']
-    wewant = [headIndex.get(x,None) for x in whatwewant]
-    if None in wewant: # missing something
-        logf.write('### Error - missing a required header (one of %s) in makeGFF - headIndex=%s\n' % (whatwewant,headIndex))
-        return
-    ppos = wewant[3] # last in list
-    resfl = [x for x in resfl if x[ppos] > '' and x[ppos] <> 'NA']
-    resfl = [(float(x[ppos]),x) for x in resfl] # decorate
-    resfl.sort()
-    resfl.reverse() # using -log10 so larger is better
-    pvals = [x[0] for x in resfl] # need to scale
-    resfl = [x[1] for x in resfl] # drop decoration
-    resfl = resfl[:topn] # truncate
-    maxp = max(pvals) # need to scale
-    minp = min(pvals)
-    prange = abs(maxp-minp) + 0.5 # fudge
-    scalefact = 1000.0/prange
-    logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact))
-    for i,row in enumerate(resfl):
-        row[ppos] = '%d' % (int(scalefact*pvals[i]))
-        resfl[i] = row # replace
-    outf = file(outfname,'w')
-    outf.write(header)
-    outres = [] # need to resort into chrom offset order
-    for i,lrow in enumerate(resfl):
-        chrom,snp,offset,p, = [lrow[x] for x in wewant]
-        gff = ('chr%s' % chrom,'rgCaCo','variation','%d' % (int(offset)-halfwidth),
-               '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i]))
-        outres.append(gff)
-    outres = [(x[0],int(x[3]),x) for x in outres] # decorate
-    outres.sort() # into chrom offset
-    outres=[x[2] for x in outres] # undecorate
-    outres = ['\t'.join(x) for x in outres]
-    outf.write('\n'.join(outres))
-    outf.write('\n')
-    outf.close()
-
-
-def plink_assocToGG(plinkout="hm",tag='test'):
-    """ plink --assoc output looks like this
-    # CHR SNP A1 F_A F_U A2 CHISQ P OR
-    # 1 rs3094315 G 0.6685 0.1364 A 104.1 1.929e-24 12.77
-    # write as a genegraph input file
-    """
-    inf = file('%s.assoc' % plinkout,'r')
-    outf = file('%sassoc.xls' % plinkout,'w')
-    res = ['rs\tlog10p%s\tFakeInvOR%s\tRealOR%s' % (tag,tag,tag),] # output header for ucsc genome graphs
-    head = inf.next()
-    for l in inf:
-        ll = l.split()
-        if len(ll) >= 8:
-            p = ll[7] # test for 'NA' before converting - float('NA') would blow up
-            if p <> 'NA': # eesh
-                logp = '%9.9f' % -math.log10(float(p))
-            else:
-                logp = 'NA'
-            try:
-                orat = ll[8]
-            except:
-                orat = 'NA'
-            orat2 = orat
-            # invert odds ratios below 1 - guard against 'NA' here too
-            if orat <> 'NA' and float(orat) < 1 and float(orat) > 0.0:
-                orat2 = '%9.9f' % (1.0/float(orat))
-            outl = [ll[1],logp, orat2, orat]
-            res.append('\t'.join(outl))
-    outf.write('\n'.join(res))
-    outf.write('\n')
-    outf.close()
-    inf.close()
-
-def xformModel(infname='',resf='',outfname='',
-               name='foo',mapf='/usr/local/galaxy/data/rg/ped/x.bim',flog=None):
-    """munge a plink .model file into either a ucsc track or an xls file
-    rerla@meme ~/plink]$ head hmYRI_CEU.model
-    CHR SNP TEST AFF UNAFF CHISQ DF P
-    1 rs3094315 GENO 41/37/11 0/24/64 NA NA NA
-    1 rs3094315 TREND 119/59 24/152 81.05 1 2.201e-19
-    1 rs3094315 ALLELIC 119/59 24/152 104.1 1 1.929e-24
-    1 rs3094315 DOM 78/11 24/64 NA NA NA
-
-    bim file has
-    [rerla@beast pbed]$ head plink_wgas1_example.bim
-    1 rs3094315 0.792429 792429 G A
-    1 rs6672353 0.817376 817376 A G
-    """
-    if verbose:
-        print 'Rgenetics rgCaCo.xformModel got resf=%s, outfname=%s' % (resf,outfname)
-    res = []
-    rsdict = {}
-    map = file(mapf,'r')
-    for l in map: # plink map
-        ll = l.strip().split()
-        if len(ll) >= 3:
-            rs=ll[1].strip()
-            chrom = ll[0]
-            # keep chromosome codes as strings - they are tab-joined below
-            if chrom.lower() == 'x':
-                chrom='23'
-            elif chrom.lower() == 'y':
-                chrom = '24'
-            elif chrom.lower() == 'mito':
-                chrom = '25'
-            offset = ll[3]
-            rsdict[rs] = (chrom,offset)
-    res.append('rs\tChr\tOffset\tGenop\tlog10Genop\tArmitagep\tlog10Armitagep\tAllelep\tlog10Allelep\tDomp\tlog10Domp')
-    f = open(resf,'r')
-    headl = f.readline()
-    if headl.find('\t') <> -1:
-        headl = headl.split('\t')
-        delim = '\t'
-    else:
-        headl = headl.split()
-        delim = None
-    whatwewant = ['CHR','SNP','TEST','AFF','UNAFF','CHISQ','P']
-    wewant = [headl.index(x) for x in whatwewant]
-    llen = len(headl)
-    lnum = anum = 0
-    lastsnp = None # so we know when to write out a gg line
-    outl = {}
-    f.seek(0)
-    for lnum,l in enumerate(f):
-        if lnum == 0:
-            continue
-        ll = l.split()
-        if delim:
-            ll = l.split(delim)
-        if len(ll) >= llen: # valid line
-            chr,snp,test,naff,nuaff,chi,p = [ll[x] for x in wewant]
-            snp = snp.strip()
-            chrom,offset = rsdict.get(snp,(None,None))
-            anum += 1
-            fp = 1.0 # if NA
-            lp = 0.0
-            try:
-                fp = float(p)
-                if fp > 0:
-                    lp = -math.log10(fp)
-                else:
-                    fp = 9e-100
-                    flog.write('### WARNING - Plink calculated %s for %s p value!!! 9e-100 substituted!\n' % (p,test))
-                    flog.write('### offending line #%d in %s = %s' % (lnum,resf,l))
-            except:
-                pass
-            if snp <> lastsnp:
-                if len(outl.keys()) > 3:
-                    sl = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')]
-                    res.append('\t'.join(sl)) # last snp line
-                outl = {'snp':snp,'chrom':chrom,'offset':offset} # first 3 cols for gg line
-                lastsnp = snp # reset for next marker
-            #if p == 'NA':
-            #    p = 1.0
-            # let's pass downstream for handling R is fine?
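-            # (an unparseable p such as NA is harmless here: fp and lp keep the
-            # defaults of 1.0 and 0.0 set above, and p is written out unchanged)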
-            outl[test] = '%s\t%f' % (p,lp)
-    if len(outl.keys()) > 3:
-        l = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')]
-        res.append('\t'.join(l)) # last snp line
-    f = file(outfname,'w')
-    res.append('')
-    f.write('\n'.join(res))
-    f.close()
-
-
-if __name__ == "__main__":
-    """
-    # called as
-    <command interpreter="python">
-        rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$name"
-        '$out_file1' '$logf' '$logf.files_path' '$gffout'
-    </command>
-    """
-    if len(sys.argv) < 7:
-        s = 'rgCaCo.py needs 6 params - got %s \n' % (sys.argv)
-        print >> sys.stdout, s
-        sys.exit(1) # a parameter error is a failure, not a clean exit
-    bfname = sys.argv[1]
-    name = sys.argv[2]
-    killme = string.punctuation + string.whitespace
-    trantab = string.maketrans(killme,'_'*len(killme))
-    name = name.translate(trantab)
-    outfname = sys.argv[3]
-    logf = sys.argv[4]
-    logoutdir = sys.argv[5]
-    gffout = sys.argv[6]
-    topn = 1000
-    try:
-        os.makedirs(logoutdir)
-    except:
-        pass
-    map_file = None
-    me = sys.argv[0]
-    amapf = '%s.bim' % bfname # to decode map in xformModel
-    flog = file(logf,'w')
-    logme = []
-    cdir = os.getcwd()
-    s = 'Rgenetics %s http://rgenetics.org Galaxy Tools, rgCaCo.py started %s\n' % (myversion,timenow())
-    print >> sys.stdout, s # so will appear as blurb for file
-    logme.append(s)
-    if verbose:
-        s = 'rgCaCo.py: bfname=%s, logf=%s, argv = %s\n' % (bfname, logf, sys.argv)
-        print >> sys.stdout, s # so will appear as blurb for file
-        logme.append(s)
-    twd = tempfile.mkdtemp(suffix='rgCaCo') # make sure plink doesn't spew log file into the root!
-    tname = os.path.join(twd,name)
-    vcl = [plinke,'--noweb','--bfile',bfname,'--out',name,'--model']
-    p=subprocess.Popen(' '.join(vcl),shell=True,stdout=flog,cwd=twd)
-    retval = p.wait()
-    resf = '%s.model' % tname # plink output is here we hope
-    xformModel(bfname,resf,outfname,name,amapf,flog) # leaves the desired summary file
-    makeGFF(resf=outfname,outfname=gffout,logf=flog,twd=twd,name='rgCaCo_TopTable',description=name,topn=topn)
-    flog.write('\n'.join(logme))
-    flog.close() # close the log used
-    #shutil.copytree(twd,logoutdir)
-    shutil.rmtree(twd) # clean up
diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgCaCo.xml
--- a/tools/rgenetics/rgCaCo.xml	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,103 +0,0 @@
-<tool id="rgCaCo1" name="Case Control:">
-    <description>for unrelated subjects</description>
-    <command interpreter="python">
-        rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$title" '$out_file1' '$logf' '$logf.files_path' '$gffout'
-    </command>
-    <inputs>
-       <param name="i" type="data" label="RGenetics genotype data from your current history" format="pbed" />
-       <param name='title' type='text' size="132" value='CaseControl' label="Title for this job"/>
-    </inputs>
-
-    <outputs>
-       <data format="tabular" name="out_file1" label="${title}_rgCaCo.xls" />
-       <data format="txt" name="logf" label="${title}_rgCaCo.log"/>
-       <data format="gff" name="gffout" label="${title}_rgCaCoTop.gff" />
-    </outputs>
-<tests>
- <test>
-   <param name='i' value='tinywga' ftype='pbed' >
-     <metadata name='base_name' value='tinywga' />
-     <composite_data value='tinywga.bim' />
-     <composite_data value='tinywga.bed' />
-     <composite_data value='tinywga.fam' />
-     <edit_attributes type='name' value='tinywga' />
-   </param>
-   <param name='title' value='rgCaCotest1' />
-   <output name='out_file1' file='rgCaCotest1_CaCo.xls' ftype='tabular' compare='diff' />
-   <output name='logf' file='rgCaCotest1_CaCo_log.txt' ftype='txt' compare='diff' lines_diff='20' />
-   <output name='gffout' file='rgCaCotest1_CaCo_topTable.gff' ftype='gff' compare='diff' />
- </test>
-</tests>
-<help>
-
-.. class:: infomark
-
-**Syntax**
-
-- **Genotype file** is the input case control data chosen from available library Plink binary files
-- **Map file** is the linkage format .map file corresponding to the genotypes in the Genotype file
-- **Type of test** is the kind of test statistic to report such as Armitage trend test or genotype test
-- **Format** determines how your data will be returned to your Galaxy workspace
-
------
-
-**Summary**
-
-This tool will perform some standard statistical tests comparing subjects designated as
-affected (cases) and unaffected subjects (controls). To avoid bias, it is important that
-controls who had been affected would have been eligible for sampling as cases. This may seem
-odd, but it requires that the cases and controls are drawn from the same sampling frame.
-
-The Armitage trend test is robust to departure from HWE and so very attractive - after all, a real disease
-mutation may well result in distorted HWE, at least in cases. All the others are susceptible to
-bias in the presence of HWE departures.
-
-All of these tests are exquisitely sensitive to differential population stratification in cases
-compared to controls, and this must be tested before believing any results here. Use the PCA method for
-100k markers or more.
-
-If you don't see the genotype data set you want here, it can be imported using one of the methods available from
-the Galaxy Get Data tool page.
-
-Output format can be UCSC .bed if you want to see your
-results as a fully fledged UCSC track. A map file containing the chromosome and offset for each marker is required for
-writing this kind of output.
-Alternatively you can use .gg for the UCSC Genome Graphs tool, which has all of the advantages
-of the .bed track, plus a neat, visual front end that displays a lot of useful clues.
-Either of these is a very useful way of quickly getting a look
-at your data in full genomic context.
-
-Finally, if you can't live without
-spreadsheet data, choose the .xls tab delimited format. It's not a stupid binary Excel file. Just a plain old tab delimited
-one with a header. Fortunately Excel is dumb enough to open these without much protest.
-
------
-
-.. class:: infomark
-
-**Attribution**
-
-This Galaxy tool relies on Plink (see Plinksrc_) to test Case Control association models.
-
-So, we rely on the author (Shaun Purcell) for the documentation you need specific to those settings - they are very nicely documented - see
-DOC_
-
-Tool and Galaxy datatypes originally designed and written for the Rgenetics
-series of whole genome scale statistical genetics tools by ross lazarus (ross.lazarus@gmail.com)
-
-Copyright Ross Lazarus March 2007
-This Galaxy wrapper is released under the LGPL_ but is about as useful as a chocolate teapot without Plink, which is GPL.
-
-I'm no lawyer, but it looks like you got GPL if you use this software. Good luck.
-
-.. _Plinksrc: http://pngu.mgh.harvard.edu/~purcell/plink/
-
-.. _LGPL: http://www.gnu.org/copyleft/lesser.html
-
-.. 
_DOC: http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#cc - -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgClean.py --- a/tools/rgenetics/rgClean.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,160 +0,0 @@ -""" -# galaxy tool xml files can define a galaxy supplied output filename -# that must be passed to the tool and used to return output -# here, the plink log file is copied to that file and removed -# took a while to figure this out! -# use exec_before_job to give files sensible names -# -# ross april 14 2007 -# plink cleanup script -# ross lazarus March 2007 for camp illumina whole genome data -# note problems with multiple commands being ignored - eg --freq --missing --mendel -# only the first seems to get done... -# -##Summary statistics versus inclusion criteria -## -##Feature As summary statistic As inclusion criteria -##Missingness per individual --missing --mind N -##Missingness per marker --missing --geno N -##Allele frequency --freq --maf N -##Hardy-Weinberg equilibrium --hardy --hwe N -##Mendel error rates --mendel --me N M -# -# call as plinkClean.py $i $o $mind $geno $hwe $maf $mef $mei $outfile -# note plinkClean_code.py does some renaming before the job starts - - - <command interpreter="python2.4"> - rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind' '$geno' '$hwe' '$maf' - '$mef' '$mei' '$out_file1' '$out_file1.files_path' '$userId' - - -""" -import sys,shutil,os,subprocess, glob, string, tempfile, time -from rgutils import galhtmlprefix, timenow, plinke -prog = os.path.split(sys.argv[0])[-1] -myversion = 'January 4 2010' -verbose=False - - -def fixoutaff(outpath='',newaff='1'): - """ quick way to create test data sets - set all aff to 1 or 2 for - some hapmap data and then merge - [rerla@beast galaxy]$ head tool-data/rg/library/pbed/affyHM_CEU.fam - 1341 14 0 0 2 1 - 1341 2 13 14 2 1 - 1341 13 0 0 1 1 - 1340 9 0 0 1 1 - 1340 10 0 0 2 1 - """ - nchanged = 0 - fam = '%s.fam' % outpath - famf = open(fam,'r') - fl = famf.readlines() - famf.close() - for i,row in enumerate(fl): - lrow = row.split() - if lrow[-1] <> newaff: - lrow[-1] = newaff - fl[i] = ' '.join(lrow) - fl[i] += '\n' - nchanged += 1 - fo = open(fam,'w') - fo.write(''.join(fl)) - fo.close() - return nchanged - - - -def clean(): - """ - """ - if len(sys.argv) < 16: - print >> sys.stdout, '## %s expected 12 params in sys.argv, got %d - %s' % (prog,len(sys.argv),sys.argv) - print >> sys.stdout, """this script will filter a linkage format ped - and map file containing genotypes. 
It takes 15 parameters - the plink --bfile input file root and
-        a new filename root for the output clean data, followed by the mind, geno, hwe, maf, mef and mei thresholds
-        documented in the plink docs, the file to be returned to Galaxy, and the filter settings,
-        called as:
-        <command interpreter="python">
-        rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind'
-        '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path'
-        '$relfilter' '$afffilter' '$sexfilter' '$fixaff'
-        </command>
-
-        """
-        sys.exit(1)
-    plog = []
-    inpath = sys.argv[1]
-    inbase = sys.argv[2]
-    killme = string.punctuation + string.whitespace
-    trantab = string.maketrans(killme,'_'*len(killme))
-    title = sys.argv[3].translate(trantab)
-    mind = sys.argv[4]
-    geno = sys.argv[5]
-    hwe = sys.argv[6]
-    maf = sys.argv[7]
-    me1 = sys.argv[8]
-    me2 = sys.argv[9]
-    outfname = sys.argv[10]
-    outfpath = sys.argv[11]
-    relf = sys.argv[12]
-    afff = sys.argv[13]
-    sexf = sys.argv[14]
-    fixaff = sys.argv[15]
-    output = os.path.join(outfpath,outfname)
-    outpath = os.path.join(outfpath,title)
-    outprunepath = os.path.join(outfpath,'ldprune_%s' % title)
-    try:
-        os.makedirs(outfpath)
-    except:
-        pass
-    bfile = os.path.join(inpath,inbase)
-    outf = file(outfname,'w')
-    vcl = [plinke,'--noweb','--bfile',bfile,'--make-bed','--out',
-           outpath,'--set-hh-missing','--mind',mind,
-           '--geno',geno,'--maf',maf,'--hwe',hwe,'--me',me1,me2]
-    # yes - the --me parameter takes 2 values - mendels per snp and per family
-    if relf == 'oo': # plink filters are what they leave...
-        vcl.append('--filter-nonfounders') # leave only offspring
-    elif relf == 'fo':
-        vcl.append('--filter-founders')
-    if afff == 'affonly':
-        vcl.append('--filter-controls')
-    elif afff == 'unaffonly':
-        vcl.append('--filter-cases')
-    if sexf == 'fsex':
-        vcl.append('--filter-females')
-    elif sexf == 'msex':
-        vcl.append('--filter-males')
-    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath)
-    retval = p.wait()
-    plog.append('%s started, called as %s' % (prog,' '.join(sys.argv)))
-    outf.write(galhtmlprefix % prog)
-    outf.write('<ul>\n')
-    plogf = '%s.log' % os.path.join(outfpath,title)
-    try:
-        plogl = file(plogf,'r').readlines()
-        plog += [x.strip() for x in plogl]
-    except:
-        plog += ['###Cannot open plink log file %s' % plogf,]
-    # if fixaff, want to 'fix' the fam file
-    if fixaff <> '0':
-        nchanged = fixoutaff(outpath=outpath,newaff=fixaff)
-        plog += ['## fixaff was requested - %d subjects affection status changed to %s' % (nchanged,fixaff)]
-    pf = file(plogf,'w')
-    pf.write('\n'.join(plog))
-    pf.close()
-    globme = os.path.join(outfpath,'*')
-    flist = glob.glob(globme)
-    flist.sort()
-    for i, data in enumerate( flist ):
-        outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
-    outf.write('</ul>\n')
-    outf.write("</ul></br></div></body></html>")
-    outf.close()
-
-
-if __name__ == "__main__":
-    clean()
-
diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgClean.xml
--- a/tools/rgenetics/rgClean.xml	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,154 +0,0 @@
-<tool id="rgClean1" name="Clean genotypes:">
-    <description>filter markers, subjects</description>
-
-    <command interpreter="python">
-        rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind'
-        '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path'
-        '$relfilter' '$afffilter' '$sexfilter' '$fixaff'
-    </command>
-
-    <inputs>
-       <param name="input_file" type="data" label="RGenetics 
genotype library file in compressed Plink format" - size="120" format="pbed" /> - <param name="title" type="text" size="80" label="Descriptive title for cleaned genotype file" value="Cleaned_data"/> - <param name="geno" type="text" label="Maximum Missing Fraction: Markers" value="0.05" /> - <param name="mind" type="text" value="0.1" label="Maximum Missing Fraction: Subjects"/> - <param name="mef" type="text" label="Maximum Mendel Error Rate: Family" value="0.05"/> - <param name="mei" type="text" label="Maximum Mendel Error Rate: Marker" value="0.05"/> - <param name="hwe" type="text" value="0" label="Smallest HWE p value (set to 0 for all)" /> - <param name="maf" type="text" value="0.01" - label="Smallest Minor Allele Frequency (set to 0 for all)"/> - <param name='relfilter' label = "Filter on pedigree relatedness" type="select" - optional="false" size="132" - help="Optionally remove related subjects if pedigree identifies founders and their offspring"> - <option value="all" selected='true'>No filter on relatedness</option> - <option value="fo" >Keep Founders only (pedigree m/f ID = "0")</option> - <option value="oo" >Keep Offspring only (one randomly chosen if >1 sibs in family)</option> - </param> - <param name='afffilter' label = "Filter on affection status" type="select" - optional="false" size="132" - help="Optionally remove affected or non affected subjects"> - <option value="allaff" selected='true'>No filter on affection status</option> - <option value="affonly" >Keep Controls only (affection='1')</option> - <option value="unaffonly" >Keep Cases only (affection='2')</option> - </param> - <param name='sexfilter' label = "Filter on gender" type="select" - optional="false" size="132" - help="Optionally remove all male or all female subjects"> - <option value="allsex" selected='true'>No filter on gender status</option> - <option value="msex" >Keep Males only (pedigree gender='1')</option> - <option value="fsex" >Keep Females only (pedigree gender='2')</option> - </param> - <param name="fixaff" type="text" value="0" - label = "Change ALL subjects affection status to (0=no change,1=unaff,2=aff)" - help="Use this option to switch the affection status to a new value for all output subjects" /> - </inputs> - - <outputs> - <data format="pbed" name="out_file1" metadata_source="input_file" label="${title}_rgClean.pbed" /> - </outputs> - -<tests> - <test> - <param name='input_file' value='tinywga' ftype='pbed' > - <metadata name='base_name' value='tinywga' /> - <composite_data value='tinywga.bim' /> - <composite_data value='tinywga.bed' /> - <composite_data value='tinywga.fam' /> - <edit_attributes type='name' value='tinywga' /> - </param> - <param name='title' value='rgCleantest1' /> - <param name="geno" value="1" /> - <param name="mind" value="1" /> - <param name="mef" value="0" /> - <param name="mei" value="0" /> - <param name="hwe" value="0" /> - <param name="maf" value="0" /> - <param name="relfilter" value="all" /> - <param name="afffilter" value="allaff" /> - <param name="sexfilter" value="allsex" /> - <param name="fixaff" value="0" /> - <output name='out_file1' file='rgtestouts/rgClean/rgCleantest1.pbed' compare="diff" lines_diff="25" > - <extra_files type="file" name='rgCleantest1.bim' value="rgtestouts/rgClean/rgCleantest1.bim" compare="diff" /> - <extra_files type="file" name='rgCleantest1.fam' value="rgtestouts/rgClean/rgCleantest1.fam" compare="diff" /> - <extra_files type="file" name='rgCleantest1.bed' value="rgtestouts/rgClean/rgCleantest1.bed" compare="diff" /> - </output> - 
</test>
-</tests>
-<help>
-
-.. class:: infomark
-
-**Syntax**
-
-- **Genotype data** is the input genotype file chosen from your current history
-- **Descriptive title** is the name to use for the filtered output file
-- **Missfrac threshold: subjects** is the threshold for missingness by subject. Subjects with more than this fraction missing will be excluded from the import
-- **Missfrac threshold: markers** is the threshold for missingness by marker. Markers with more than this fraction missing will be excluded from the import
-- **MaxMendel Individuals** is the fraction of Mendelian errors in transmission above which subjects will be excluded (for family data only)
-- **MaxMendel Families** is the fraction of Mendelian errors in transmission above which families will be excluded (for family data only)
-- **HWE** is the threshold for HWE test p values below which the marker will not be imported. Set this to 0 and all markers will be imported regardless of HWE p value
-- **MAF** is the threshold for minor allele frequency - SNPs with lower MAF will be excluded
-- **Filters** for founders/offspring or affected/unaffected or males/females are optionally available if needed
-- **Change Affection** is only needed if you want to change the affection status for creating new analysis datasets
-
------
-
-**Attribution**
-
-This tool relies on the work of many people. It uses Plink http://pngu.mgh.harvard.edu/~purcell/plink/,
-and the R http://cran.r-project.org/ and
-Bioconductor http://www.bioconductor.org/ projects.
-
-In particular, http://pngu.mgh.harvard.edu/~purcell/plink/
-has excellent documentation describing the parameters you can set here.
-
-This implementation is a Galaxy tool wrapper around these third party applications.
-It was originally designed and written for family based data from the CAMP Illumina run of 2007 by
-ross lazarus (ross.lazarus@gmail.com) and incorporated into the rgenetics toolkit.
-
-Rgenetics merely exposes them, wrapping Plink so you can use it in Galaxy.
-
------
-
-**Summary**
-
-Reliable statistical inference depends on reliable data. Poor quality samples and markers
-may add more noise than signal, decreasing statistical power. Removing the worst of them
-can be done by setting thresholds for some of the commonly used technical quality measures
-for genotype data. Of course discordant replicate calls are also very informative but are not
-in scope here.
-
-Marker cleaning: Filters are available to remove markers below a minor allele
-frequency threshold, beyond a Hardy-Weinberg threshold,
-or above a threshold for missingness. If family data are available, thresholds for Mendelian
-error can be set.
-
-Subject cleaning: Filters are available to remove subjects with many missing calls. Subjects and markers for family data can be filtered by proportions
-of Mendelian errors in observed transmission. Use the QC reporting tool to
-generate a comprehensive series of reports for quality control.
-
-Note that ancestry and cryptic relatedness should also be checked using the relevant tools.
-
------
-
-.. class:: infomark
-
-**Tip**
-
-You can check that you got what you asked for by running the QC tool to ensure that the distributions
-are truncated the way you expect. 
Note that you do not expect that the thresholds will be exactly -what you set - some bad assays and subjects are out in multiple QC measures, so you sometimes have -more samples or markers than you exactly set for each threshold. Finally, the ordering of -operations matters and Plink is somewhat restrictive about what it will do on each pass -of the data. At least it's fixed. - ------ - -This Galaxy tool was written by Ross Lazarus for the Rgenetics project -It uses Plink for most calculations - for full Plink attribution, source code and documentation, -please see http://pngu.mgh.harvard.edu/~purcell/plink/ plus some custom python code - -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgClustalw.py --- a/tools/rgenetics/rgClustalw.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ -""" -rgclustalw.py -wrapper for clustalw necessitated by bad choice of output path for .dnd file based on input file. Naughty. -Copyright ross lazarus march 2011 -All rights reserved -Licensed under the LGPL -""" - -import sys,optparse,os,subprocess,tempfile,shutil - -class Clustrunner: - """ - """ - def __init__(self,opts=None): - self.opts = opts - self.iname = 'infile_copy' - shutil.copy(self.opts.input,self.iname) - - def run(self): - tlf = open(self.opts.outlog,'w') - cl = ['clustalw2 -INFILE=%s -OUTFILE=%s -OUTORDER=%s -TYPE=%s -OUTPUT=%s' % (self.iname,self.opts.output,self.opts.out_order,self.opts.dnarna,self.opts.outform)] - if self.opts.seq_range_end <> None and self.opts.seq_range_start <> None: - cl.append('-RANGE=%s,%s' % (self.opts.seq_range_start,self.opts.seq_range_end)) - if self.opts.outform=='CLUSTAL' and self.opts.outseqnos <> None: - cl.append('-SEQNOS=ON') - process = subprocess.Popen(' '.join(cl), shell=True, stderr=tlf, stdout=tlf) - rval = process.wait() - dndf = '%s.dnd' % self.iname - if os.path.exists(dndf): - tlf.write('\nClustal created the following dnd file for your information:\n') - dnds = open('%s.dnd' % self.iname,'r').readlines() - for row in dnds: - tlf.write(row) - tlf.write('\n') - tlf.close() - os.unlink(self.iname) - - - -if __name__ == "__main__": - op = optparse.OptionParser() - op.add_option('-i', '--input', default=None) - op.add_option('-o', '--output', default=None) - op.add_option('-t', '--outname', default="rgClustal") - op.add_option('-s', '--out_order', default='ALIGNMENT') - op.add_option('-f', '--outform', default='CLUSTAL') - op.add_option('-e', '--seq_range_end',default=None) - op.add_option('-b', '--seq_range_start',default=None) - op.add_option('-l','--outlog',default='rgClustalw.log') - op.add_option('-q', '--outseqnos',default=None) - op.add_option('-d', '--dnarna',default='DNA') - - opts, args = op.parse_args() - assert opts.input <> None - assert os.path.isfile(opts.input) - c = Clustrunner(opts) - c.run() - - - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgClustalw.xml --- a/tools/rgenetics/rgClustalw.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,128 +0,0 @@ -<tool id="clustalw" name="ClustalW" version="0.1"> - <description>multiple sequence alignment program for DNA or proteins</description> - <command interpreter="python"> - rgClustalw.py -i "$input" -o "$output" -s "$out_order" -l "$outlog" -t "$outname" -d "$dnarna" - #if ($range.mode=="part") --b "$range.seq_range_start" -e "$range.seq_range_end" - #end if - #if ($outcontrol.outform=="clustal") --f "CLUSTAL" - #if ($outcontrol.out_seqnos=="ON") --q "ON" - #end if - #end if - #if 
($outcontrol.outform=="phylip") --f "PHYLIP" - #end if - #if ($outcontrol.outform=="fasta") --f "FASTA" - #end if - </command> - <inputs> - <page> - <param format="fasta" name="input" type="data" label="Fasta File" /> - <param name="outname" label="Name for output files to make it easy to remember what you did" type="text" size="50" value="Clustal_run" /> - <param name="dnarna" type="select" label="Data Type"> - <option value="DNA" selected="True">DNA nucleotide sequences</option> - <option value="PROTEIN">Protein sequences</option> - </param> - <conditional name="outcontrol"> - <param name="outform" type="select" label="Output alignment format"> - <option value="clustal" selected="True">Native Clustal output format</option> - <option value="phylip">Phylip format</option> - <option value="fasta">Fasta format</option> - </param> - <when value="fasta" /> - <when value="phylip" /> - <when value="clustal"> - <param name="out_seqnos" type="select" label="Show residue numbers in clustal format output"> - <option value="ON">yes</option> - <option value="OFF" selected="true">no</option> - </param> - </when> - </conditional> - <param name="out_order" type="select" label="Output Order"> - <option value="ALIGNED">aligned</option> - <option value="INPUT">same order as input file</option> - </param> - - <conditional name="range"> - <param name="mode" type="select" label="Output complete alignment (or specify part to output)"> - <option value="complete">complete alignment</option> - <option value="part">only part of the alignment</option> - </param> - <when value="complete"> - </when> - <when value="part"> - <param name="seq_range_start" size="5" type="integer" value="1" label="start point" help="sequence range to write"> - </param> - <param name="seq_range_end" size="5" type="integer" value="99999" label="end point" > - </param> - </when> - </conditional> - </page> - </inputs> - <outputs> - <data format="clustal" name="output" label="${outname}_output.${outcontrol.outform}"> - <change_format> - <when input="outcontrol.outform" value="phylip" format="phylip" /> - <when input="outcontrol.outform" value="fasta" format="fasta" /> - </change_format> - </data> - <data format="txt" name="outlog" label="${outname}_clustal_log.txt"/> - </outputs> - <tests> - <test> - <param name="input" value="rgClustal_testin.fasta" /> - <param name = "outname" value="" /> - <param name = "outform" value="fasta" /> - <param name = "dnarna" value="DNA" /> - <param name = "mode" value="complete" /> - <param name = "out_order" value="ALIGNED" /> - <output name="output" file="rgClustal_testout.fasta" ftype="fasta" /> - <output name="outlog" file="rgClustal_testout.log" ftype="txt" lines_diff="5" /> - </test> - </tests> - <help> - -**Note** - -This tool allows you to run a multiple sequence alignment with ClustalW2 (see Clustsrc_) using the default options. - -For a tutorial introduction, see ClustalW2_ - -You can align DNA or protein sequences in the input file which should be multiple sequences to be aligned in a fasta file - -A log will be output to your history showing the output Clustal would normally write to standard output. - -The alignments will appear as a clustal format file or optionally, as phylip or fasta format files in your history. If you choose fasta as -the output format, you can create a 'Logo' image using the Sequence Logo tool. 
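-
-For orientation, the underlying wrapper script can also be run by hand with the same options this tool form sets
-(a sketch only - the input and output file names here are invented)::
-
-  python rgClustalw.py -i myseqs.fasta -o myseqs_aligned.clustal -t Clustal_run -d DNA -s ALIGNED -f CLUSTAL -l myseqs_clustal_log.txt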
- -If Clustal format is chosen, you have the option of adding basepair counts to the output - -A subsequence of the alignment can be output by setting the Output complete parameter to "Partial" and defining the offset and end of the subsequence to be output - ----- - -**Attribution** - -Clustal attribution and associated documentation are available at Clustsrc_ - -The first iteration of this Galaxy wrapper was written by Hans-Rudolf Hotz - see Clustfirst_ - -It was modified by Ross Lazarus for the rgenetics project - tests and some additional parameters were added - -This wrapper is released licensed under the LGPL_ - -.. _ClustalW2: http://www.ebi.ac.uk/2can/tutorials/protein/clustalw.html - -.. _Clustsrc: http://www.clustal.org - -.. _Clustfirst: http://lists.bx.psu.edu/pipermail/galaxy-dev/2010-November/003732.html - -.. _LGPL: http://www.gnu.org/copyleft/lesser.html - - </help> - -</tool> - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgEigPCA.py --- a/tools/rgenetics/rgEigPCA.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,342 +0,0 @@ -""" -run smartpca - -This uses galaxy code developed by Dan to deal with -arbitrary output files using an html dataset with it's own -subdirectory containing the arbitrary files -We create that html file and add all the links we need - -Note that we execute the smartpca.perl program in the output subdirectory -to avoid having to clear out the job directory after running - -Code to convert linkage format ped files into eigenstratgeno format is left here -in case we decide to autoconvert - -Added a plot in R with better labels than the default eigensoft plot december 26 2007 - -DOCUMENTATION OF smartpca program: - -smartpca runs Principal Components Analysis on input genotype data and - outputs principal components (eigenvectors) and eigenvalues. - The method assumes that samples are unrelated. (However, a small number - of cryptically related individuals is usually not a problem in practice - as they will typically be discarded as outliers.) - -5 different input formats are supported. See ../CONVERTF/README -for documentation on using the convertf program to convert between formats. - -The syntax of smartpca is "../bin/smartpca -p parfile". We illustrate -how parfile works via a toy example (see example.perl in this directory). -This example takes input in EIGENSTRAT format. The syntax of how to take input -in other formats is analogous to the convertf program, see ../CONVERTF/README. - -The smartpca program prints various statistics to standard output. -To redirect this information to a file, change the above syntax to -"../bin/smartpca -p parfile >logfile". For a description of these -statistics, see the documentation file smartpca.info in this directory. - -Estimated running time of the smartpca program is - 2.5e-12 * nSNP * NSAMPLES^2 hours if not removing outliers. - 2.5e-12 * nSNP * NSAMPLES^2 hours * (1+m) if m outlier removal iterations. -Thus, under the default of up to 5 outlier removal iterations, running time is - up to 1.5e-11 * nSNP * NSAMPLES^2 hours. - ------------------------------------------------------------------------- - -DESCRIPTION OF EACH PARAMETER in parfile for smartpca: - -genotypename: input genotype file (in any format: see ../CONVERTF/README) -snpname: input snp file (in any format: see ../CONVERTF/README) -indivname: input indiv file (in any format: see ../CONVERTF/README) -evecoutname: output file of eigenvectors. See numoutevec parameter below. 
-evaloutname: output file of all eigenvalues - -OPTIONAL PARAMETERS: - -numoutevec: number of eigenvectors to output. Default is 10. -numoutlieriter: maximum number of outlier removal iterations. - Default is 5. To turn off outlier removal, set this parameter to 0. -numoutlierevec: number of principal components along which to - remove outliers during each outlier removal iteration. Default is 10. -outliersigmathresh: number of standard deviations which an individual must - exceed, along one of the top (numoutlierevec) principal components, in - order for that individual to be removed as an outlier. Default is 6.0. -outlieroutname: output logfile of outlier individuals removed. If not specified, - smartpca will print this information to stdout, which is the default. -usenorm: Whether to normalize each SNP by a quantity related to allele freq. - Default is YES. (When analyzing microsatellite data, should be set to NO. - See Patterson et al. 2006.) -altnormstyle: Affects very subtle details in normalization formula. - Default is YES (normalization formulas of Patterson et al. 2006) - To match EIGENSTRAT (normalization formulas of Price et al. 2006), set to NO. -missingmode: If set to YES, then instead of doing PCA on # reference alleles, - do PCA on whether each data point is missing or nonmissing. Default is NO. - May be useful for detecting informative missingness (Clayton et al. 2005). -nsnpldregress: If set to a positive integer, then LD correction is turned on, - and input to PCA will be the residual of a regression involving that many - previous SNPs, according to physical location. See Patterson et al. 2006. - Default is 0 (no LD correction). If desiring LD correction, we recommend 2. -maxdistldregress: If doing LD correction, this is the maximum genetic distance - (in Morgans) for previous SNPs used in LD correction. Default is no maximum. -poplistname: If wishing to infer eigenvectors using only individuals from a - subset of populations, and then project individuals from all populations - onto those eigenvectors, this input file contains a list of population names, - one population name per line, which will be used to infer eigenvectors. - It is assumed that the population of each individual is specified in the - indiv file. Default is to use individuals from all populations. -phylipoutname: output file containing an fst matrix which can be used as input - to programs in the PHYLIP package, such as the "fitch" program for - constructing phylogenetic trees. -noxdata: if set to YES, all SNPs on X chr are excluded from the data set. - The smartpca default for this parameter is YES, since different variances - for males vs. females on X chr may confound PCA analysis. -nomalexhet: if set to YES, any het genotypes on X chr for males are changed - to missing data. The smartpca default for this parameter is YES. -badsnpname: specifies a list of SNPs which should be excluded from the data set. - Same format as example.snp. Cannot be used if input is in - PACKEDPED or PACKEDANCESTRYMAP format. -popsizelimit: If set to a positive integer, the result is that only the first - popsizelimit individuals from each population will be included in the - analysis. It is assumed that the population of each individual is specified - in the indiv file. Default is to use all individuals in the analysis. 
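-
-As a concrete illustration, a minimal parfile using only the parameters documented above might read as follows
-(the example.* names echo the toy example mentioned earlier and are placeholders, not files shipped with eigensoft):
-
-    genotypename: example.geno
-    snpname: example.snp
-    indivname: example.ind
-    evecoutname: example.evec
-    evaloutname: example.eval
-    numoutevec: 10
-    numoutlieriter: 5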
- -The next 5 optional parameters allow the user to output genotype, snp and - indiv files which will be identical to the input files except that: - Any individuals set to Ignore in the input indiv file will be - removed from the data set (see ../CONVERTF/README) - Any data excluded or set to missing based on noxdata, nomalexhet and - badsnpname parameters (see above) will be removed from the data set. - The user may decide to output these files in any format. -outputformat: ANCESTRYMAP, EIGENSTRAT, PED, PACKEDPED or PACKEDANCESTRYMAP -genotypeoutname: output genotype file -snpoutname: output snp file -indivoutname: output indiv file -outputgroup: see documentation in ../CONVERTF/README -""" -import sys,os,time,subprocess,string,glob -from rgutils import RRun, galhtmlprefix, galhtmlpostfix, timenow, smartpca, rexe, plinke -verbose = False - -def makePlot(eigpca='test.pca',title='test',pdfname='test.pdf',h=8,w=10,nfp=None,rexe=''): - """ - the eigenvec file has a # row with the eigenvectors, then subject ids, eigenvecs and lastly - the subject class - Rpy not being used here. Write a real R script and run it. Sadly, this means putting numbers - somewhere - like in the code as monster R vector constructor c(99.3,2.14) strings - At least you have the data and the analysis in one single place. Highly reproducible little - piece of research. - """ - debug=False - f = file(eigpca,'r') - R = [] - if debug: - R.append('sessionInfo()') - R.append("print('dir()=:')") - R.append('dir()') - R.append("print('pdfname=%s')" % pdfname) - gvec = [] - pca1 = [] - pca2 = [] - groups = {} - glist = [] # list for legend - ngroup = 1 # increment for each new group encountered for pch vector - for n,row in enumerate(f): - if n > 1: - rowlist = row.strip().split() - group = rowlist[-1] - v1 = rowlist[1] - v2 = rowlist[2] - try: - v1 = float(v1) - except: - v1 = 0.0 - try: - v2 = float(v2) - except: - v2 = 0.0 - if not groups.get(group,None): - groups[group] = ngroup - glist.append(group) - ngroup += 1 # for next group - gvec.append(groups[group]) # lookup group number - pca1.append('%f' % v1) - pca2.append('%f' % v2) - # now have vectors of group,pca1 and pca2 - llist = [x.encode('ascii') for x in glist] # remove label unicode - eesh - llist = ['"%s"' % x for x in llist] # need to quote for R - R.append('llist=c(%s)' % ','.join(llist)) - - plist = range(2,len(llist)+2) # pch - avoid black circles - R.append('glist=c(%s)' % ','.join(['%d' % x for x in plist])) - pgvec = ['%d' % (plist[i-1]) for i in gvec] # plot symbol/colour for each point - R.append("par(lab=c(10,10,10))") # so our grid is denser than the default 5 - R.append("par(mai=c(1,1,1,0.5))") - maint = title - R.append('pdf("%s",h=%d,w=%d)' % (pdfname,h,w)) - R.append("par(lab=c(10,10,10))") - R.append('pca1 = c(%s)' % ','.join(pca1)) - R.append('pca2 = c(%s)' % ','.join(pca2)) - R.append('pgvec = c(%s)' % ','.join(pgvec)) - s = "plot(pca1,pca2,type='p',main='%s', ylab='Second ancestry eigenvector'," % maint - s += "xlab='First ancestry eigenvector',col=pgvec,cex=0.8,pch=pgvec)" - R.append(s) - R.append('legend("top",legend=llist,pch=glist,col=glist,title="Sample")') - R.append('grid(nx = 10, ny = 10, col = "lightgray", lty = "dotted")') - R.append('dev.off()') - R.append('png("%s.png",h=%d,w=%d,units="in",res=72)' % (pdfname,h,w)) - s = "plot(pca1,pca2,type='p',main='%s', ylab='Second ancestry eigenvector'," % maint - s += "xlab='First ancestry eigenvector',col=pgvec,cex=0.8,pch=pgvec)" - R.append(s) - 
R.append('legend("top",legend=llist,pch=glist,col=glist,title="Sample")') - R.append('grid(nx = 10, ny = 10, col = "lightgray", lty = "dotted")') - R.append('dev.off()') - rlog,flist = RRun(rcmd=R,title=title,outdir=nfp) - print >> sys.stdout, '\n'.join(R) - print >> sys.stdout, rlog - - -def getfSize(fpath,outpath): - """ - format a nice file size string - """ - size = '' - fp = os.path.join(outpath,fpath) - if os.path.isfile(fp): - n = float(os.path.getsize(fp)) - if n > 2**20: - size = ' (%1.1f MB)' % (n/2**20) - elif n > 2**10: - size = ' (%1.1f KB)' % (n/2**10) - elif n > 0: - size = ' (%d B)' % (int(n)) - return size - - -def runEigen(): - """ run the smartpca prog - documentation follows - - smartpca.perl -i fakeped_100.eigenstratgeno -a fakeped_100.map -b fakeped_100.ind -p fakeped_100 -e fakeped_100.eigenvals -l - fakeped_100.eigenlog -o fakeped_100.eigenout - -DOCUMENTATION OF smartpca.perl program: - -This program calls the smartpca program (see ../POPGEN/README). -For this to work, the bin directory containing smartpca MUST be in your path. -See ./example.perl for a toy example. - -../bin/smartpca.perl --i example.geno : genotype file in EIGENSTRAT format (see ../CONVERTF/README) --a example.snp : snp file (see ../CONVERTF/README) --b example.ind : indiv file (see ../CONVERTF/README) --k k : (Default is 10) number of principal components to output --o example.pca : output file of principal components. Individuals removed - as outliers will have all values set to 0.0 in this file. --p example.plot : prefix of output plot files of top 2 principal components. - (labeling individuals according to labels in indiv file) --e example.eval : output file of all eigenvalues --l example.log : output logfile --m maxiter : (Default is 5) maximum number of outlier removal iterations. - To turn off outlier removal, set -m 0. --t topk : (Default is 10) number of principal components along which - to remove outliers during each outlier removal iteration. --s sigma : (Default is 6.0) number of standard deviations which an - individual must exceed, along one of topk top principal - components, in order to be removed as an outlier. - - now uses https://www.bx.psu.edu/cgi-bin/trac.cgi/galaxy/changeset/1832 - -All files can be viewed however, by making links in the primary (HTML) history item like: -<img src="display_child?parent_id=2&designation=SomeImage?" alt="Some Image"/> -<a href="display_child?parent_id=2&designation=SomeText?">Some Text</a> - - <command interpreter="python"> - rgEigPCA.py "$i.extra_files_path/$i.metadata.base_name" "$title" "$out_file1" - "$out_file1.files_path" "$k" "$m" "$t" "$s" "$pca" - </command> - - """ - if len(sys.argv) < 9: - print 'Need an input genotype file root, a title, a temp id and the temp file path for outputs,' - print ' and the 4 integer tuning parameters k,m,t and s in order. 
Given that, will run smartpca for eigensoft'
-        sys.exit(1)
-    else:
-        print >> sys.stdout, 'rgEigPCA.py got %s' % (' '.join(sys.argv))
-    skillme = ' %s' % string.punctuation
-    trantab = string.maketrans(skillme,'_'*len(skillme))
-    progname = os.path.basename(sys.argv[0])
-    infile = sys.argv[1]
-    infpath,base_name = os.path.split(infile) # now takes precomputed or autoconverted ldreduced dataset
-    title = sys.argv[2].translate(trantab) # must replace all of these for urls containing title
-    outfile1 = sys.argv[3]
-    newfilepath = sys.argv[4]
-    try:
-        os.makedirs(newfilepath)
-    except:
-        pass
-    op = os.path.split(outfile1)[0]
-    try: # for test - needs this done
-        os.makedirs(op)
-    except:
-        pass
-    eigen_k = sys.argv[5]
-    eigen_m = sys.argv[6]
-    eigen_t = sys.argv[7]
-    eigen_s = sys.argv[8]
-    eigpca = sys.argv[9] # path to new dataset for pca results - for later adjustment
-    eigentitle = os.path.join(newfilepath,title)
-    explanations=['Samples plotted in first 2 eigenvector space','Principal components','Eigenvalues',
-                  'Smartpca log (contents shown below)']
-    rplotname = 'PCAPlot.pdf'
-    eigenexts = [rplotname, "pca.xls", "eval.xls"]
-    newfiles = ['%s_%s' % (title,x) for x in eigenexts] # produced by eigenstrat
-    rplotout = os.path.join(newfilepath,newfiles[0]) # for R plots
-    eigenouts = [x for x in newfiles]
-    eigenlogf = '%s_log.txt' % title
-    newfiles.append(eigenlogf) # so it will also appear in the links
-    lfname = outfile1
-    lf = file(lfname,'w')
-    lf.write(galhtmlprefix % progname)
-    smartCL = '%s -i %s.bed -a %s.bim -b %s.fam -o %s -p %s -e %s -l %s -k %s -m %s -t %s -s %s' % \
-        (smartpca,infile, infile, infile, eigenouts[1],'%s_eigensoftplot.pdf' % title,eigenouts[2],eigenlogf, \
-         eigen_k, eigen_m, eigen_t, eigen_s)
-    env = os.environ
-    p=subprocess.Popen(smartCL,shell=True,cwd=newfilepath)
-    retval = p.wait()
-    # copy the eigenvector output file needed for adjustment to the user's eigenstrat library directory
-    elog = file(os.path.join(newfilepath,eigenlogf),'r').read()
-    eeigen = os.path.join(newfilepath,'%s.evec' % eigenouts[1]) # need these for adjusting
-    try:
-        eigpcaRes = file(eeigen,'r').read()
-    except:
-        eigpcaRes = ''
-    file(eigpca,'w').write(eigpcaRes)
-    makePlot(eigpca=eigpca,pdfname=newfiles[0],title=title,nfp=newfilepath,rexe=rexe)
-    s = 'Output from %s run at %s<br/>\n' % (progname,timenow())
-    lf.write('<h4>%s</h4>\n' % s)
-    lf.write('newfilepath=%s, rexe=%s' % (newfilepath,rexe))
-    lf.write('(click on the image below to see a much higher quality PDF version)')
-    thumbnail = '%s.png' % newfiles[0] # foo.pdf.png - who cares? 
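-    # (the R script in makePlot writes the PDF plus a companion '<pdfname>.png',
-    # so the doubled .pdf.png extension above is expected, not an error)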
- if os.path.exists(os.path.join(newfilepath,thumbnail)): - lf.write('<table border="0" cellpadding="10" cellspacing="10"><tr><td>\n') - lf.write('<a href="%s"><img src="%s" alt="%s" hspace="10" align="left" /></a></td></tr></table><br/>\n' \ - % (newfiles[0],thumbnail,explanations[0])) - allfiles = os.listdir(newfilepath) - allfiles.sort() - sizes = [getfSize(x,newfilepath) for x in allfiles] - lallfiles = ['<li><a href="%s">%s %s</a></li>\n' % (x,x,sizes[i]) for i,x in enumerate(allfiles)] # html list - lf.write('<div class="document">All Files:<ol>%s</ol></div>' % ''.join(lallfiles)) - lf.write('<div class="document">Log %s contents follow below<p/>' % eigenlogf) - lf.write('<pre>%s</pre></div>' % elog) # the eigenlog - s = 'If you need to rerun this analysis, the command line used was\n%s\n<p/>' % (smartCL) - lf.write(s) - lf.write(galhtmlpostfix) # end galhtmlprefix div - lf.close() - - -if __name__ == "__main__": - runEigen() diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgEigPCA.xml --- a/tools/rgenetics/rgEigPCA.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,167 +0,0 @@ -<tool id="rgEigPCA1" name="Eigensoft:"> - <description>PCA Ancestry using SNP</description> - - <command interpreter="python"> - rgEigPCA.py "$i.extra_files_path/$i.metadata.base_name" "$title" "$out_file1" - "$out_file1.files_path" "$k" "$m" "$t" "$s" "$pca" - </command> - - <inputs> - - <param name="i" type="data" label="Input genotype data file" - size="120" format="ldindep" /> - <param name="title" type="text" value="Ancestry PCA" label="Title for outputs from this run" - size="80" /> - <param name="k" type="integer" value="4" label="Number of principal components to output" - size="3" /> - <param name="m" type="integer" value="0" label="Max. outlier removal iterations" - help="To turn on outlier removal, set m=5 or so. Do this if you plan on adjusting any analyses" - size="3" /> - <param name="t" type="integer" value="5" label="# principal components used for outlier removal" - size="3" /> - <param name="s" type="integer" value="6" label="#SDs for outlier removal" - help = "Any individual with SD along one of k top principal components > s will be removed as an outlier." - size="3" /> - - </inputs> - - <outputs> - <data name="out_file1" format="html" label="${title}_rgEig.html"/> - <data name="pca" format="txt" label="${title}_rgEig.txt"/> - </outputs> - -<tests> - <test> - <param name='i' value='tinywga' ftype='ldindep' > - <metadata name='base_name' value='tinywga' /> - <composite_data value='tinywga.bim' /> - <composite_data value='tinywga.bed' /> - <composite_data value='tinywga.fam' /> - <edit_attributes type='name' value='tinywga' /> - </param> - <param name='title' value='rgEigPCAtest1' /> - <param name="k" value="4" /> - <param name="m" value="2" /> - <param name="t" value="2" /> - <param name="s" value="2" /> - <output name='out_file1' file='rgtestouts/rgEigPCA/rgEigPCAtest1.html' ftype='html' compare='diff' lines_diff='195'> - <extra_files type="file" name='rgEigPCAtest1_PCAPlot.pdf' value="rgtestouts/rgEigPCA/rgEigPCAtest1_PCAPlot.pdf" compare="sim_size" delta="3000"/> - </output> - <output name='pca' file='rgtestouts/rgEigPCA/rgEigPCAtest1.txt' compare='diff'/> - </test> -</tests> - -<help> - - -**Syntax** - -- **Genotype data** is an input genotype dataset in Plink lped (http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml) format. 
See below for notes -- **Title** is used to name the output files so you can remember what the outputs are for -- **Tuning parameters** are documented in the Eigensoft (http://genepath.med.harvard.edu/~reich/Software.htm) documentation - see below - - ------ - -**Summary** - -Eigensoft requires ld-reduced genotype data. -Galaxy has an automatic converter for genotype data in Plink linkage pedigree (lped) format. -For details of this generic genotype format, please see the Plink documentation at -http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml - -Reading that documentation, you'll see that the linkage pedigree format is really two related files with the same -file base name - a map and ped file - eg 'mygeno.ped' and 'mygeno.map'. -The map file has the chromosome, offset, genetic offset and snp name corresponding to each -genotype stored as separate alleles in the ped file. The ped file has family id, individual id, father id (or 0), mother id -(or 0), gender (1=male, 2=female, 0=unknown) and affection (1=unaffected, 2=affected, 0=unknown), -then two separate allele columns for each genotype. - -Once you have your data in the right format, you can upload those into your Galaxy history using the "upload" tool. - -To upload your lped data in the upload tool, choose 'lped' as the 'file format'. The tool form will change to -allow you to navigate to and select each member of the pair of ped and map files stored on your local computer -(or available at a public URL for Galaxy to grab). -Give the dataset a meaningful name (replace rgeneticsData with something more useful!) and click execute. - -When the upload is done, your new lped format dataset will appear in your history and then, -when you choose the ancestry tool, that history dataset will be available as input. - -**Warning for the Impatient** - -When you execute the tool, it will look like it has not started running for a while as the automatic converter -reduces the amount of LD - otherwise eigenstrat gives biased results. - - -**Attribution** - -This tool runs and relies on the work of many others, including the -maintainers of the Eigensoft program, and the R and -Bioconductor projects. For full attribution, source code and documentation, please see -http://genepath.med.harvard.edu/~reich/Software.htm, http://cran.r-project.org/ -and http://www.bioconductor.org/ respectively - -This implementation is a Galaxy tool wrapper around these third party applications. -It was originally designed and written for family based data from the CAMP Illumina run of 2007 by -ross lazarus (ross.lazarus@gmail.com) and incorporated into the rgenetics toolkit. - -copyright Ross Lazarus 2007 -Licensed under the terms of the LGPL as documented http://www.gnu.org/licenses/lgpl.html -but is about as useful as a sponge boat without EIGENSOFT pca code. - -**README from eigensoft2 distribution at http://genepath.med.harvard.edu/~reich/Software.htm** - -[rerla@beast eigensoft2]$ cat README -EIGENSOFT version 2.0, January 2008 (for Linux only) - -This is the same as our EIGENSOFT 2.0 BETA release with a few recent changes -as described at http://genepath.med.harvard.edu/~reich/New_In_EIGENSOFT.htm. - -Features of EIGENSOFT version 2.0 include: --- Keeping track of ref/var alleles in all file formats: see CONVERTF/README --- Handling data sets up to 8 billion genotypes: see CONVERTF/README --- Output SNP weightings of each principal component: see POPGEN/README - -The EIGENSOFT package implements methods from the following 2 papers: -Patterson N. et al. 
2006 PLoS Genetics in press (population structure) -Price A.L. et al. 2006 NG 38:904-9 (EIGENSTRAT stratification correction) - -See POPGEN/README for documentation of population structure programs. - -See EIGENSTRAT/README for documentation of EIGENSTRAT programs. - -See CONVERTF/README for documentation of programs for converting file formats. - - -Executables and source code: ----------------------------- -All C executables are in the bin/ directory. - -We have placed source code for all C executables in the src/ directory, -for users who wish to modify and recompile our programs. For example, to -recompile the eigenstrat program, type -"cd src" -"make eigenstrat" -"mv eigenstrat ../bin" - -Note that some of our software will only compile if your system has the -lapack package installed. (This package is used to compute eigenvectors.) -Some users may need to change "blas-3" to "blas" in the Makefile, -depending on how blas and lapack are installed. - -If cc is not available on your system, try "cp Makefile.alt Makefile" -and then recompile. - -If you have trouble compiling and running our code, try compiling and -running the pcatoy program in the src directory: -"cd src" -"make pcatoy" -"./pcatoy" -If you are unable to run the pcatoy program successfully, please contact -your system administrator for help, as this is a systems issue which is -beyond our scope. Your system administrator will be able to troubleshoot -your systems issue using this trivial program. [You can also try running -the pcatoy program in the bin directory, which we have already compiled.] -</help> -</tool> - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgFastQC.py --- a/tools/rgenetics/rgFastQC.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,149 +0,0 @@ -""" -wrapper for fastqc - -called as - <command interpreter="python"> - rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix" - </command> - - - -Current release seems overly intolerant of sam/bam header strangeness -Author notified... - - -""" - -import os,sys,subprocess,optparse,shutil,tempfile -from rgutils import getFileString - -class FastQC(): - """wrapper - """ - - - def __init__(self,opts=None): - assert opts <> None - self.opts = opts - - - def run_fastqc(self): - """ - In batch mode fastqc behaves not very nicely - will write to a new folder in - the same place as the infile called [infilebasename]_fastqc - rlazarus@omics:/data/galaxy/test$ ls FC041_1_sequence_fastqc - duplication_levels.png fastqc_icon.png per_base_n_content.png per_sequence_gc_content.png summary.txt - error.png fastqc_report.html per_base_quality.png per_sequence_quality.png tick.png - fastqc_data.txt per_base_gc_content.png per_base_sequence_content.png sequence_length_distribution.png warning.png - - """ - dummy,tlog = tempfile.mkstemp(prefix='rgFastQClog') - sout = open(tlog, 'w') - fastq = os.path.basename(self.opts.input) - cl = [self.opts.executable,'-o %s' % self.opts.outputdir] - if self.opts.informat in ['sam','bam']: - cl.append('-f %s' % self.opts.informat) - if self.opts.contaminants <> None : - cl.append('-c %s' % self.opts.contaminants) - cl.append(self.opts.input) - p = subprocess.Popen(' '.join(cl), shell=True, stderr=sout, stdout=sout, cwd=self.opts.outputdir) - return_value = p.wait() - sout.close() - runlog = open(tlog,'r').readlines() - os.unlink(tlog) - flist = os.listdir(self.opts.outputdir) # fastqc plays games with its output directory name. 
eesh
        odpath = None
        for f in flist:
            d = os.path.join(self.opts.outputdir,f)
            if os.path.isdir(d):
                if d.endswith('_fastqc'):
                    odpath = d
        hpath = None
        rep = [] # bug fix: rep was unbound further down if the report could not be read
        if odpath <> None:
            try:
                hpath = os.path.join(odpath,'fastqc_report.html')
                rep = open(hpath,'r').readlines() # for our new html file but we need to insert our stuff after the <body> tag
            except:
                hpath = None # bug fix: open failed, so fall through to the error report below
        if hpath == None:
            res = ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath,hpath),]
            res += runlog
            res += ['</pre>\n',
                   'Please read the above for clues<br/>\n',
                   'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
                   'It is also possible that the log shows that fastqc is not installed?<br/>\n',
                   'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
                   'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n',]
            return res
        self.fix_fastqcimages(odpath)
        flist = os.listdir(self.opts.outputdir) # these have now been fixed
        excludefiles = ['tick.png','warning.png','fastqc_icon.png','error.png']
        flist = [x for x in flist if not x in excludefiles]
        for i in range(len(rep)): # need to fix links to Icons and Image subdirectories in latest fastqc code - ugh
            rep[i] = rep[i].replace('Icons/','')
            rep[i] = rep[i].replace('Images/','')
        html = self.fix_fastqc(rep,flist,runlog)
        return html


    def fix_fastqc(self,rep=[],flist=[],runlog=[]):
        """ add some of our stuff to the html
        """
        bs = '</body></html>\n' # hope they don't change this
        try:
            bodyindex = rep.index(bs) # hope they don't change this
        except:
            bodyindex = len(rep) - 1
        res = []
        res.append('<table>\n')
        flist.sort()
        for i,f in enumerate(flist):
            if not(os.path.isdir(f)):
                fn = os.path.split(f)[-1]
                res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,getFileString(fn, self.opts.outputdir)))
        res.append('</table><p/>\n')
        res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
        res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://rgenetics.org for details and licensing\n')
        fixed = rep[:bodyindex] + res + rep[bodyindex:]
        return fixed # with our additions


    def fix_fastqcimages(self,odpath):
        """ Galaxy wants everything in the same files_dir
        """
        icpath = os.path.join(odpath,'Icons')
        impath = os.path.join(odpath,'Images')
        for adir in [icpath,impath,odpath]:
            if os.path.exists(adir):
                flist = os.listdir(adir) # get all files created
                for f in flist:
                    if not os.path.isdir(os.path.join(adir,f)):
                        sauce = os.path.join(adir,f)
                        dest = os.path.join(self.opts.outputdir,f)
                        shutil.move(sauce,dest)
                os.rmdir(adir)



if __name__ == '__main__':
    op = optparse.OptionParser()
    op.add_option('-i', '--input', default=None)
    op.add_option('-o', '--htmloutput', default=None)
    op.add_option('-d', '--outputdir', default="/tmp/shortread")
    op.add_option('-f', '--informat', default='fastq')
    op.add_option('-n', '--namejob', default='rgFastQC')
    op.add_option('-c', '--contaminants', default=None)
    op.add_option('-e', '--executable', default='fastqc')
    opts, args = op.parse_args()
    assert opts.input <> None
    assert os.path.isfile(opts.executable),'##rgFastQC.py error - cannot find executable %s' % opts.executable
    if not os.path.exists(opts.outputdir):
        os.makedirs(opts.outputdir)
    f = FastQC(opts)
    html = f.run_fastqc()
    f = 
open(opts.htmloutput, 'w') - f.write(''.join(html)) - f.close() - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgFastQC.xml --- a/tools/rgenetics/rgFastQC.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,88 +0,0 @@ -<tool name="Fastqc: Fastqc QC" id="fastqc" version="0.1"> - <description>using FastQC from Babraham</description> - <command interpreter="python"> - rgFastQC.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix" -f $input_file.ext -e ${GALAXY_DATA_INDEX_DIR}/shared/jars/FastQC/fastqc -#if $contaminants.dataset and str($contaminants) > '' --c "$contaminants" -#end if - </command> - <requirements> - <requirement type="package">FastQC</requirement> - </requirements> - <inputs> - <param format="fastqsanger,fastq,bam,sam" name="input_file" type="data" label="Short read data from your current history" /> - <param name="out_prefix" value="FastQC" type="text" label="Title for the output file - to remind you what the job was for" size="80" /> - <param name="contaminants" type="data" format="tabular" optional="true" label="Contaminant list" - help="tab delimited file with 2 columns: name and sequence. For example: Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA"/> - </inputs> - <outputs> - <data format="html" name="html_file" label="${out_prefix}.html" /> - </outputs> - <tests> - <test> - <param name="input_file" value="1000gsample.fastq" /> - <param name="out_prefix" value="fastqc_out" /> - <param name="contaminants" value="fastqc_contaminants.txt" ftype="tabular" /> - <output name="html_file" file="fastqc_report.html" ftype="html" lines_diff="100"/> - </test> - </tests> - <help> - -.. class:: infomark - -**Purpose** - -FastQC aims to provide a simple way to do some quality control checks on raw -sequence data coming from high throughput sequencing pipelines. -It provides a modular set of analyses which you can use to give a quick -impression of whether your data has any problems of -which you should be aware before doing any further analysis. - -The main functions of FastQC are: - -- Import of data from BAM, SAM or FastQ files (any variant) -- Providing a quick overview to tell you in which areas there may be problems -- Summary graphs and tables to quickly assess your data -- Export of results to an HTML based permanent report -- Offline operation to allow automated generation of reports without running the interactive application - -**FastQC documentation** - -This is a Galaxy interface to the external package FastQC_. -Specific documentation on FastQC can be found on that site. -FastQC incorporates the Picard-tools_ libraries for sam/bam processing. - - .. _FastQC: http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/ - .. _Picard-tools: http://picard.sourceforge.net/index.shtml - -The contaminants file parameter was borrowed from the independently developed -fastqcwrapper contributed to the Galaxy Community Tool Shed by J. Johnson. - ------ - -.. class:: infomark - -**Inputs and outputs** - -This wrapper will accept any fastq file as well as sam or bam as the primary file to check. -It will also take an optional file containing a list of contaminants information, in the form of -a tab-delimited file with 2 columns, name and sequence. 
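For example, a contaminants file might look like this (two tab separated
columns; the name and sequence pair below is the one quoted in the
parameter help above)::

    Illumina Small RNA RT Primer    CAAGCAGAAGACGGCATACGA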
- -The tool produces a single HTML output file that contains all of the results, including the following: - -- Basic Statistics -- Per base sequence quality -- Per sequence quality scores -- Per base sequence content -- Per base GC content -- Per sequence GC content -- Per base N content -- Sequence Length Distribution -- Sequence Duplication Levels -- Overrepresented sequences -- Kmer Content - -All except Basic Statistics and Overrepresented sequences are plots. - -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgGLM.py --- a/tools/rgenetics/rgGLM.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,287 +0,0 @@ -#!/usr/local/bin/python -""" -# added most of the available options for linear models -# june 2009 rml -# hack to run and process a plink quantitative trait -# - -This is a wrapper for Shaun Purcell's Plink linear/logistic models for -traits, covariates and genotypes. - -It requires some judgement to interpret the findings -We need some better visualizations - manhattan plots are good. -svg with rs numbers for top 1%? - -toptable tools - truncate a gg file down to some low percentile - -intersect with other tables - eg gene expression regressions on snps - - - -""" - -import sys,math,shutil,subprocess,os,string,tempfile,shutil,commands -from rgutils import plinke - -def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000): - """ - score must be scaled to 0-1000 - - Want to make some wig tracks from each analysis - Best n -log10(p). Make top hit the window. - we use our tab output which has - rs chrom offset ADD_stat ADD_p ADD_log10p - rs3094315 1 792429 1.151 0.2528 0.597223 - - """ - - def is_number(s): - try: - float(s) - return True - except ValueError: - return False - header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) - column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] - halfwidth=100 - resfpath = os.path.join(twd,resf) - resf = open(resfpath,'r') - resfl = resf.readlines() # dumb but convenient for millions of rows - resfl = [x.split() for x in resfl] - headl = resfl[0] - resfl = resfl[1:] - headl = [x.strip().upper() for x in headl] - headIndex = dict(zip(headl,range(0,len(headl)))) - chrpos = headIndex.get('CHROM',None) - rspos = headIndex.get('RS',None) - offspos = headIndex.get('OFFSET',None) - ppos = headIndex.get('ADD_LOG10P',None) - wewant = [chrpos,rspos,offspos,ppos] - if None in wewant: # missing something - logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex) - return - resfl = [x for x in resfl if x[ppos] > ''] - resfl = [(float(x[ppos]),x) for x in resfl] # decorate - resfl.sort() - resfl.reverse() # using -log10 so larger is better - resfl = resfl[:topn] # truncate - pvals = [x[0] for x in resfl] # need to scale - resfl = [x[1] for x in resfl] # drop decoration - if len(pvals) == 0: - logf.write('### no pvalues found in resfl - %s' % (resfl[:3])) - sys.exit(1) - maxp = max(pvals) # need to scale - minp = min(pvals) - prange = abs(maxp-minp) + 0.5 # fudge - scalefact = 1000.0/prange - logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) - for i,row in enumerate(resfl): - row[ppos] = '%d' % (int(scalefact*pvals[i])) - resfl[i] = row # replace - outf = file(outfname,'w') - outf.write(header) - outres = [] # need to resort into chrom offset order - for i,lrow in enumerate(resfl): - 
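        # (illustrative numbers, not from the source) the rescaling above maps
        # -log10(p) onto the 0-1000 UCSC track score range: with minp=0 and
        # maxp=7.5 the fudged prange is 8.0 and scalefact=125, so the top hit
        # is written with score int(125*7.5) = 937.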
chrom,snp,offset,p, = [lrow[x] for x in wewant] - gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth), - '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) - outres.append(gff) - outres = [(x[0],int(x[3]),x) for x in outres] # decorate - outres.sort() # into chrom offset - outres=[x[2] for x in outres] # undecorate - outres = ['\t'.join(x) for x in outres] - outf.write('\n'.join(outres)) - outf.write('\n') - outf.close() - - - -def xformQassoc(resf='',outfname='',logf=None,twd='.'): - """ plink.assoc.linear to gg file -from the docs -The output per each SNP might look something like: - - CHR SNP BP A1 TEST NMISS OR STAT P - 5 rs000001 10001 A ADD 664 0.7806 -1.942 0.05216 - 5 rs000001 10001 A DOMDEV 664 0.9395 -0.3562 0.7217 - 5 rs000001 10001 A COV1 664 0.9723 -0.7894 0.4299 - 5 rs000001 10001 A COV2 664 1.159 0.5132 0.6078 - 5 rs000001 10001 A GENO_2DF 664 NA 5.059 0.0797 - need to transform into gg columns for each distinct test - or bed for tracks? - - """ - logf.write('xformQassoc got resf=%s, outfname=%s\n' % (resf,outfname)) - resdict = {} - rsdict = {} - markerlist = [] - # plink is "clever" - will run logistic if only 2 categories such as gender - resfs = resf.split('.') - if resfs[-1] == 'logistic': - resfs[-1] = 'linear' - else: - resfs[-1] = 'logistic' - altresf = '.'.join(resfs) - - altresfpath = os.path.join(twd,altresf) - resfpath = os.path.join(twd,resf) - try: - resf = open(resfpath,'r') - except: - try: - resf = open(altresfpath,'r') - except: - print >> sys.stderr, '## error - no file plink output %s or %s found - cannot continue' % (resfpath, altresfpath) - sys.exit(1) - for lnum,row in enumerate(resf): - if lnum == 0: - headl = row.split() - headl = [x.strip().upper() for x in headl] - headIndex = dict(zip(headl,range(0,len(headl)))) - chrpos = headIndex.get('CHR',None) - rspos = headIndex.get('SNP',None) - offspos = headIndex.get('BP',None) - nmisspos = headIndex.get('NMISS',None) - testpos = headIndex.get('TEST',None) - ppos = headIndex.get('P',None) - coeffpos = headIndex.get('OR',None) - if not coeffpos: - coeffpos = headIndex.get('BETA',None) - apos = headIndex.get('A1',None) - statpos = headIndex.get('STAT',None) - wewant = [chrpos,rspos,offspos,testpos,statpos,ppos,coeffpos,apos] - if None in wewant: # missing something - logf.write('missing a required header in xformQassoc - headIndex=%s\n' % headIndex) - return - llen = len(headl) - else: # no Nones! 
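            # (illustration, using the sample plink output in the docstring
            # above) every distinct TEST value contributes a _stat/_p/_log10p
            # column triple, so the gg header becomes
            # rs chrom offset ADD_stat ADD_p ADD_log10p DOMDEV_stat DOMDEV_p ...
            # with one output row per marker, kept in input order.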
                ll = row.split()
                if len(ll) >= llen: # valid line
                    chrom,snp,offset,test,stat,p,coeff,allele = [ll[x] for x in wewant]
                    snp = snp.strip()
                    if p <> 'NA' :
                        lp = 0.0 # bug fix: lp was unbound below when float(p) == 0
                        try:
                            ffp = float(p)
                            if ffp <> 0:
                                lp = -math.log10(ffp)
                        except:
                            lp = 0.0
                        resdict.setdefault(test,{})
                        resdict[test][snp] = (stat,p,'%f' % lp)
                        if rsdict.get(snp,None) == None:
                            rsdict[snp] = (chrom,offset)
                            markerlist.append(snp)
    # now have various tests indexed by rs
    tk = resdict.keys()
    tk.sort() # tests
    ohead = ['rs','chrom','offset']
    for t in tk: # add headers
        ohead.append('%s_stat' % t)
        ohead.append('%s_p' % t)
        ohead.append('%s_log10p' % t)
    oheads = '\t'.join(ohead)
    res = [oheads,]
    for snp in markerlist: # retain original order
        chrom,offset = rsdict[snp]
        outl = [snp,chrom,offset]
        for t in tk:
            outl += resdict[t][snp] # add stat,p for this test
        outs = '\t'.join(outl)
        res.append(outs)
    f = file(outfname,'w')
    res.append('')
    f.write('\n'.join(res))
    f.close()


if __name__ == "__main__":
    """
    <command interpreter="python">
        rgGLM.py '$i.extra_files_path/$i.metadata.base_name' '$phef.extra_files_path/$phef.metadata.base_name'
        "$title1" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name'
        '$inter' '$cond' '$gender' '$mind' '$geno' '$maf' '$logistic' '$wigout'
    </command>
    """
    topn = 1000
    killme = string.punctuation+string.whitespace
    trantab = string.maketrans(killme,'_'*len(killme))
    if len(sys.argv) < 17:
        s = 'rgGLM.py needs 17 params - got %s \n' % (sys.argv)
        sys.stderr.write(s)
        sys.exit(1) # bug fix: was exit(0); a parameter error should signal failure
    blurb = 'rgGLM.py called with %s' % sys.argv
    print >> sys.stdout,blurb
    bfname = sys.argv[1]
    phename = sys.argv[2]
    title = sys.argv[3]
    title = title.translate(trantab) # bug fix: translate() returns a new string, the result was discarded
    predvar = sys.argv[4]
    covar = sys.argv[5].strip()
    outfname = sys.argv[6]
    logfname = sys.argv[7]
    op = os.path.split(logfname)[0]
    try: # for test - needs this done
        os.makedirs(op)
    except:
        pass
    basename = sys.argv[8].translate(trantab)
    inter = sys.argv[9] == '1'
    cond = sys.argv[10].strip()
    if cond == 'None':
        cond = ''
    gender = sys.argv[11] == '1'
    mind = sys.argv[12]
    geno = sys.argv[13]
    maf = sys.argv[14]
    logistic = sys.argv[15].strip()=='1'
    gffout = sys.argv[16]
    me = sys.argv[0]
    phepath = '%s.pphe' % phename
    twd = tempfile.mkdtemp(suffix='rgGLM') # make sure plink doesn't spew log file into the root!
    tplog = os.path.join(twd,'%s.log' % basename) # should be path to plink log
    vcl = [plinke,'--noweb','--bfile',bfname,'--pheno-name','"%s"' % predvar,'--pheno',
           phepath,'--out',basename,'--mind %s' % mind, '--geno %s' % geno,
           '--maf %s' % maf]
    if logistic:
        vcl.append('--logistic')
        resf = '%s.assoc.logistic' % basename # plink output is here we hope
    else:
        vcl.append('--linear')
        resf = '%s.assoc.linear' % basename # plink output is here we hope
    resf = os.path.join(twd,resf)
    if gender:
        vcl.append('--sex')
    if inter:
        vcl.append('--interaction')
    if covar > 'None':
        vcl += ['--covar',phepath,'--covar-name',covar] # comma sep list of covariates
    tcfile = None
    if len(cond) > 0: # plink wants these in a file..
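    # (illustration) e.g. cond = 'rs1001 rs1002' (hypothetical rs ids) is
    # written one id per line to a temporary file which is handed to plink as
    # --condition-list, because plink reads conditioning SNPs from a file
    # rather than from the command line.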
- dummy,tcfile = tempfile.mkstemp(suffix='condlist') # - f = open(tcfile,'w') - cl = cond.split() - f.write('\n'.join(cl)) - f.write('\n') - f.close() - vcl.append('--condition-list %s' % tcfile) - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=twd) - retval = p.wait() - if tcfile: - os.unlink(tcfile) - plinklog = file(tplog,'r').read() - logf = file(logfname,'w') - logf.write(blurb) - logf.write('\n') - logf.write('vcl=%s\n' % vcl) - xformQassoc(resf=resf,outfname=outfname,logf=logf,twd=twd) # leaves the desired summary file - makeGFF(resf=outfname,outfname=gffout,logf=logf,twd=twd,name='rgGLM_TopTable',description=title,topn=topn) - logf.write('\n') - logf.write(plinklog) - logf.close() - #shutil.rmtree(twd) # clean up - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgGLM.xml --- a/tools/rgenetics/rgGLM.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,146 +0,0 @@ -<tool id="rgGLM1" name="Linear Models:" version="0.2"> - <description>for genotype data</description> - <code file="rgGLM_code.py"/> - <command interpreter="python"> - rgGLM.py '$i.extra_files_path/$i.metadata.base_name' '$phef.extra_files_path/$phef.metadata.base_name' - "$title" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name' - '$inter' '$cond' '$gender' '$mind' '$geno' '$maf' '$logistic' '$gffout' - </command> - - <inputs> - <page> - <param name='title' label='Title for outputs' type='text' value='GLM' size="80" /> - <param name="i" type="data" format="pbed" label="Genotype file" size="80" /> - <param name="phef" type="data" format="pphe" label="Phenotype file" size="80" - help="Dependent variable and covariates will be chosen from this file on the next page"/> - <param name="logistic" type="text" value = "0" label="1=Use a logistic model (trait must be 1/2 coded like affection)" - help="Please read the Plink documentation about this option" /> - <param name="gender" type="text" value = "0" label="1=Add a gender term to model" /> - <param name='inter' label='1=Build an interaction model - please read the docs carefully before using this' - type='text' value='0' size="1" /> - <param name="cond" type="text" area='true' size='15x20' value = "" - label="condition on this whitespace delimited rs (snp id) list" /> - <param name="mind" type="float" value = "0.1" label="Remove subjects with missing genotypes gt (eg 0.1)" - help = "Set to 1 to include all subjects in the input file" /> - <param name="geno" type="float" value = "0.1" label="Remove markers with missing genotypes gt (eg 0.1)" - help = "Set to 1 to include all markers in the input file" /> - <param name="maf" type="float" value = "0.01" label="Remove markers with MAF lt (eg 0.01) " - help = "Set to 0 to include all markers in the input file"/> - </page> - <page> - <param name="predvar" size="80" type="select" label="Dependent Trait" - dynamic_options="get_phecols(phef=phef,selectOne=1)" display="radio" multiple="false" - help="Model this characteristic in terms of subject snp genotypes - eg rare allele dosage for additive model" /> - <param name="covar" size="80" type="select" label="Covariates" - dynamic_options="get_phecols(phef=phef,selectOne=0)" multiple="true" display="checkboxes" - help="Use these phenotypes as covariates in models of snp dosage effects on the dependent trait"/> - </page> - </inputs> - - <outputs> - <data format="tabular" name="out_file1" label="${title}_rgGLM.xls"/> - <data format="txt" name="logf" label="${title}_rgGLMlog.txt" /> - <data format="gff" name="gffout" 
label="${title}_rgGLM.gff"/>
  </outputs>
<tests>
 <test>
 <param name='i' value='tinywga' ftype='pbed' >
   <metadata name='base_name' value='tinywga' />
   <composite_data value='tinywga.bim' />
   <composite_data value='tinywga.bed' />
   <composite_data value='tinywga.fam' />
   <edit_attributes type='name' value='tinywga' />
 </param>
 <param name='phef' value='tinywga' ftype='pphe' >
   <metadata name='base_name' value='tinywga' />
   <composite_data value='tinywga.pphe' />
   <edit_attributes type='name' value='tinywga' />
 </param>
 <param name='title' value='rgGLMtest1' />
 <param name='predvar' value='c1' />
 <param name='covar' value='None' />
 <param name='inter' value='0' />
 <param name='cond' value='' />
 <param name='gender' value='0' />
 <param name='mind' value='1.0' />
 <param name='geno' value='1.0' />
 <param name='maf' value='0.0' />
 <param name='logistic' value='0' />
 <output name='out_file1' file='rgGLMtest1_GLM.xls' ftype='tabular' compare="diff" />
 <output name='logf' file='rgGLMtest1_GLM_log.txt' ftype='txt' compare="diff" lines_diff='36'/>
 <output name='gffout' file='rgGLMtest1_GLM_topTable.gff' compare="diff" ftype='gff' />
 </test>
</tests>
<help>

.. class:: infomark

**Syntax**

Note this is a two page tool - you will choose the dependent trait and covariates
on the second page based on the phenotype file you choose on the first page

- **Genotype file** is the input Plink format compressed genotype (pbed) file
- **Phenotype file** is the input Plink phenotype (pphe) file with FAMID IID followed by phenotypes
- **Dependent variable** is the term on the left of the model and is chosen from the pphe columns on the second page
- **Logistic** if you are (eg) using disease status as the outcome variable (case/control) - otherwise the model is linear.
- **Covariates** are covariate terms on the right of the model, also chosen on the second page
- **Interactions** will add interactions - please be careful how you interpret these - see the Plink documentation.
- **Gender** will add gender as a model term - described in the Plink documentation
- **Condition** will condition the model on one or more specific SNP rs ids as a whitespace delimited sequence
- **Format** determines how your data will be returned to your Galaxy workspace

-----

.. class:: infomark

**Summary**

This tool will test GLM models for SNPs predicting a dependent phenotype
variable with adjustment for specified covariates.

If you don't see the genotype or phenotype data set you want here, it can be imported using
one of the methods available from the rg get data tool group.

Output format can be UCSC .bed if you want to see one column of your
results as a fully fledged UCSC genome browser track. A map file containing the chromosome and offset for each marker is
required for writing this kind of output.
Alternatively you can use .gg for the UCSC Genome Graphs tool which has all of the advantages
of the .bed track, plus a neat, visual front end that displays a lot of useful clues.
Either of these is a very useful way of quickly getting a look
at your data in full genomic context.

Finally, if you can't live without
spreadsheet data, choose the .xls tab delimited format. It's not a stupid binary excel file. Just a plain old tab
delimited
one with a header. Fortunately excel is dumb enough to open these without much protest.

-----

..
class:: infomark - -**Attribution** - -This Galaxy tool relies on Plink (see Plinksrc_) to test GLM models. - -So, we rely on the author (Shaun Purcell) for the documentation you need specific to those settings - they are very nicely documented - see -DOC_ - -Tool and Galaxy datatypes originally designed and written for the Rgenetics -series of whole genome scale statistical genetics tools by ross lazarus (ross.lazarus@gmail.com) - -Copyright Ross Lazarus March 2007 -This Galaxy wrapper is released licensed under the LGPL_ but is about as useful as a chocolate teapot without Plink which is GPL. - -I'm no lawyer, but it looks like you got GPL if you use this software. Good luck. - -.. _Plinksrc: http://pngu.mgh.harvard.edu/~purcell/plink/ - -.. _LGPL: http://www.gnu.org/copyleft/lesser.html - -.. _DOC: http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#glm - -</help> -</tool> - - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgGLM_code.py --- a/tools/rgenetics/rgGLM_code.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -# before running the qc, need to rename various output files -import os,string,time -from galaxy import datatypes - - -def get_phecols(phef='',selectOne=0): - """return column names """ - phepath = phef.extra_files_path - phename = phef.metadata.base_name - phe = os.path.join(phepath,'%s.pphe' % phename) - head = open(phe,'r').next() - c = head.strip().split()[2:] # first are fid,iid - res = [(cname,cname,False) for cname in c] - if len(res) >= 1: - if selectOne: - x,y,z = res[0] # 0,1 = fid,iid - res[0] = (x,y,True) # set second selected - else: - res.insert(0,('None','None',True)) - else: - res = [('None','no phenotype columns found',False),] - return res - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgGRR.py --- a/tools/rgenetics/rgGRR.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1089 +0,0 @@ -""" -# july 2009: Need to see outliers so need to draw them last? -# could use clustering on the zscores to guess real relationships for unrelateds -# but definitely need to draw last -# added MAX_SHOW_ROWS to limit the length of the main report page -# Changes for Galaxy integration -# added more robust knuth method for one pass mean and sd -# no difference really - let's use scipy.mean() and scipy.std() instead... -# fixed labels and changed to .xls for outlier reports so can open in excel -# interesting - with a few hundred subjects, 5k gives good resolution -# and 100k gives better but not by much -# TODO remove non autosomal markers -# TODO it would be best if label had the zmean and zsd as these are what matter for -# outliers rather than the group mean/sd -# mods to rgGRR.py from channing CVS which John Ziniti has rewritten to produce SVG plots -# to make a Galaxy tool - we need the table of mean and SD for interesting pairs, the SVG and the log -# so the result should be an HTML file - -# rgIBS.py -# use a random subset of markers for a quick ibs -# to identify sample dups and closely related subjects -# try snpMatrix and plink and see which one works best for us? -# abecasis grr plots mean*sd for every subject to show clusters -# mods june 23 rml to avoid non-autosomal markers -# we seem to be distinguishing parent-child by gender - 2 clouds! 
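# (added illustrative note: these clusters follow from the 0-2 shared-allele
# scale - a duplicate pair matches at almost every comparable marker, so its
# (mean, sd) lands near (2.0, 0.0); a parent-child pair shares at least one
# allele at every marker, giving a high mean and a small sd; unrelated pairs
# scatter around mean 1.5 with the largest sd, roughly 0.7, as in the example
# tooltip values and POLYGONS regions further down this file.)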
- - -snpMatrix from David Clayton has: -ibs.stats function to calculate the identity-by-state stats of a group of samples -Description -Given a snp.matrix-class or a X.snp.matrix-class object with N samples, calculates some statistics -about the relatedness of every pair of samples within. - -Usage -ibs.stats(x) -8 ibs.stats -Arguments -x a snp.matrix-class or a X.snp.matrix-class object containing N samples -Details -No-calls are excluded from consideration here. -Value -A data.frame containing N(N - 1)/2 rows, where the row names are the sample name pairs separated -by a comma, and the columns are: -Count count of identical calls, exclusing no-calls -Fraction fraction of identical calls comparied to actual calls being made in both samples -Warning -In some applications, it may be preferable to subset a (random) selection of SNPs first - the -calculation -time increases as N(N - 1)M/2 . Typically for N = 800 samples and M = 3000 SNPs, the -calculation time is about 1 minute. A full GWA scan could take hours, and quite unnecessary for -simple applications such as checking for duplicate or related samples. -Note -This is mostly written to find mislabelled and/or duplicate samples. -Illumina indexes their SNPs in alphabetical order so the mitochondria SNPs comes first - for most -purpose it is undesirable to use these SNPs for IBS purposes. -TODO: Worst-case S4 subsetting seems to make 2 copies of a large object, so one might want to -subset before rbind(), etc; a future version of this routine may contain a built-in subsetting facility -""" -import sys,os,time,random,string,copy,optparse - -try: - set -except NameError: - from Sets import Set as set - -from rgutils import timenow,plinke - -import plinkbinJZ - - -opts = None -verbose = False - -showPolygons = False - -class NullDevice: - def write(self, s): - pass - -tempstderr = sys.stderr # save -#sys.stderr = NullDevice() -# need to avoid blather about deprecation and other strange stuff from scipy -# the current galaxy job runner assumes that -# the job is in error if anything appears on sys.stderr -# grrrrr. James wants to keep it that way instead of using the -# status flag for some strange reason. Presumably he doesn't use R or (in this case, scipy) -import numpy -import scipy -from scipy import weave - - -sys.stderr=tempstderr - - -PROGNAME = os.path.split(sys.argv[0])[-1] -X_AXIS_LABEL = 'Mean Alleles Shared' -Y_AXIS_LABEL = 'SD Alleles Shared' -LEGEND_ALIGN = 'topleft' -LEGEND_TITLE = 'Relationship' -DEFAULT_SYMBOL_SIZE = 1.0 # default symbol size -DEFAULT_SYMBOL_SIZE = 0.5 # default symbol size - -### Some colors for R/rpy -R_BLACK = 1 -R_RED = 2 -R_GREEN = 3 -R_BLUE = 4 -R_CYAN = 5 -R_PURPLE = 6 -R_YELLOW = 7 -R_GRAY = 8 - -### ... 
and some point-styles - -### -PLOT_HEIGHT = 600 -PLOT_WIDTH = 1150 - - -#SVG_COLORS = ('black', 'darkblue', 'blue', 'deepskyblue', 'firebrick','maroon','crimson') -#SVG_COLORS = ('cyan','dodgerblue','mediumpurple', 'fuchsia', 'red','gold','gray') -SVG_COLORS = ('cyan','dodgerblue','mediumpurple','forestgreen', 'lightgreen','gold','gray') -# dupe,parentchild,sibpair,halfsib,parents,unrel,unkn -#('orange', 'red', 'green', 'chartreuse', 'blue', 'purple', 'gray') - -OUTLIERS_HEADER_list = ['Mean','Sdev','ZMean','ZSdev','FID1','IID1','FID2','IID2','RelMean_M','RelMean_SD','RelSD_M','RelSD_SD','PID1','MID1','PID2','MID2','Ped'] -OUTLIERS_HEADER = '\t'.join(OUTLIERS_HEADER_list) -TABLE_HEADER='fid1_iid1\tfid2_iid2\tmean\tsdev\tzmean\tzsdev\tgeno\trelcode\tpid1\tmid1\tpid2\tmid2\n' - - -### Relationship codes, text, and lookups/mappings -N_RELATIONSHIP_TYPES = 7 -REL_DUPE, REL_PARENTCHILD, REL_SIBS, REL_HALFSIBS, REL_RELATED, REL_UNRELATED, REL_UNKNOWN = range(N_RELATIONSHIP_TYPES) -REL_LOOKUP = { - REL_DUPE: ('dupe', R_BLUE, 1), - REL_PARENTCHILD: ('parentchild', R_YELLOW, 1), - REL_SIBS: ('sibpairs', R_RED, 1), - REL_HALFSIBS: ('halfsibs', R_GREEN, 1), - REL_RELATED: ('parents', R_PURPLE, 1), - REL_UNRELATED: ('unrelated', R_CYAN, 1), - REL_UNKNOWN: ('unknown', R_GRAY, 1), - } -OUTLIER_STDEVS = { - REL_DUPE: 2, - REL_PARENTCHILD: 2, - REL_SIBS: 2, - REL_HALFSIBS: 2, - REL_RELATED: 2, - REL_UNRELATED: 3, - REL_UNKNOWN: 2, - } -# note now Z can be passed in - -REL_STATES = [REL_LOOKUP[r][0] for r in range(N_RELATIONSHIP_TYPES)] -REL_COLORS = SVG_COLORS -REL_POINTS = [REL_LOOKUP[r][2] for r in range(N_RELATIONSHIP_TYPES)] - -DEFAULT_MAX_SAMPLE_SIZE = 10000 - -REF_COUNT_HOM1 = 3 -REF_COUNT_HET = 2 -REF_COUNT_HOM2 = 1 -MISSING = 0 -MAX_SHOW_ROWS = 100 # framingham has millions - delays showing output page - so truncate and explain -MARKER_PAIRS_PER_SECOND_SLOW = 15000000.0 -MARKER_PAIRS_PER_SECOND_FAST = 70000000.0 - - -galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> -<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> -<head> -<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> -<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" /> -<title> - - - -
"""
# note: the closing title/head/body markup of galhtmlprefix above was lost in extraction

# The original SVG_HEADER was a large inline SVG + ECMAScript template
# ("Developer Works Dynamic Scatter Graph Scaling Example"). Its markup was
# lost in extraction; the recoverable content is: a legend titled
# "Given Pair Relationship", outlier strata labelled "Zscore gt 15",
# "Zscore 4 to 15" and "Zscore lt 4", an x axis "Mean Alleles Shared" with
# ticks 1.0 1.25 1.5 1.75 2.0, a y axis "SD Alleles Shared" with ticks
# 1.0 0.75 0.5 0.25 0.0, and a %s slot for the main title. It is filled in
# below via SVG_HEADER % (21 colour values, mainTitle).
SVG_HEADER = '''...''' # markup lost in extraction

# The original SVG_FOOTER held the example hover-tooltip groups
# (e.g. "unrelated: mean=1.5 +/- 0.04, sdev=0.7 +/- 0.03, npairs=1152,
# ngenos=4783 +/- 24 (min=1000, max=5000)" and "sibpairs: s1=fid1,iid1,
# s2=fid2,iid2, mean=1.82, sdev=0.7, ngeno=4487, relmean=1.85,
# relsdev=0.65") plus the closing svg elements; markup lost in extraction.
SVG_FOOTER = '''...''' # markup lost in extraction


DEFAULT_MAX_SAMPLE_SIZE = 5000 # note: overrides the 10000 assigned earlier in this module

# these re-definitions repeat the values assigned above, unchanged
REF_COUNT_HOM1 = 3
REF_COUNT_HET = 2
REF_COUNT_HOM2 = 1
MISSING = 0

MARKER_PAIRS_PER_SECOND_SLOW = 15000000
MARKER_PAIRS_PER_SECOND_FAST = 70000000

POLYGONS = {
    REL_UNRELATED: ((1.360, 0.655), (1.385, 0.730), (1.620, 0.575), (1.610, 0.505)),
    REL_HALFSIBS: ((1.630, 0.500), (1.630, 0.550), (1.648, 0.540), (1.648, 0.490)),
    REL_SIBS: ((1.660, 0.510), (1.665, 0.560), (1.820, 0.410), (1.820, 0.390)),
    REL_PARENTCHILD: ((1.650, 0.470), (1.650, 0.490), (1.750, 0.440), (1.750, 0.420)),
    REL_DUPE: ((1.970, 0.000), (1.970, 0.150), (2.000, 0.150), (2.000, 0.000)),
    }

import math # needed by distance() below; missing from the imports at the top of this module

def distance(point1, point2):
    """ Calculate the distance between two points
    """
    (x1,y1) = [float(d) for d in point1]
    (x2,y2) = [float(d) for d in point2]
    dx = abs(x1 - x2)
    dy = abs(y1 - y2)
    return math.sqrt(dx**2 + dy**2)

def point_inside_polygon(x, y, poly):
    """ Determine if a point (x,y) is inside a given polygon or not
        poly is a list of (x,y) pairs.

        Taken from: http://www.ariel.com.au/a/python-point-int-poly.html
    """
    n = len(poly)
    inside = False
    p1x,p1y = poly[0]
    for i in range(n+1):
        p2x,p2y = poly[i % n]
        if y > min(p1y,p2y):
            if y <= max(p1y,p2y):
                if x <= max(p1x,p2x):
                    if p1y != p2y:
                        xinters = (y-p1y)*(p2x-p1x)/(p2y-p1y)+p1x
                    if p1x == p2x or x <= xinters:
                        inside = not inside
        p1x,p1y = p2x,p2y
    return inside

def readMap(pedfile):
    """
    """
    mapfile = pedfile.replace('.ped', '.map')
    marker_list = []
    if os.path.exists(mapfile):
        print 'readMap: %s' % (mapfile)
        fh = file(mapfile, 'r')
        for line in fh:
            marker_list.append(line.strip().split())
        fh.close()
    print 'readMap: %s markers' % (len(marker_list))
    return marker_list

def calcMeanSD(useme):
    """
    A numerically stable algorithm is given below. It also computes the mean.
- This algorithm is due to Knuth,[1] who cites Welford.[2] - n = 0 - mean = 0 - M2 = 0 - - foreach x in data: - n = n + 1 - delta = x - mean - mean = mean + delta/n - M2 = M2 + delta*(x - mean) // This expression uses the new value of mean - end for - - variance_n = M2/n - variance = M2/(n - 1) - """ - mean = 0.0 - M2 = 0.0 - sd = 0.0 - n = len(useme) - if n > 1: - for i,x in enumerate(useme): - delta = x - mean - mean = mean + delta/(i+1) # knuth uses n+=1 at start - M2 = M2 + delta*(x - mean) # This expression uses the new value of mean - variance = M2/(n-1) # assume is sample so lose 1 DOF - sd = pow(variance,0.5) - return mean,sd - - -def doIBSpy(ped=None,basename='',outdir=None,logf=None, - nrsSamples=10000,title='title',pdftoo=0,Zcutoff=2.0): - #def doIBS(pedName, title, nrsSamples=None, pdftoo=False): - """ started with snpmatrix but GRR uses actual IBS counts and sd's - """ - repOut = [] # text strings to add to the html display - refallele = {} - tblf = '%s_table.xls' % (title) - tbl = file(os.path.join(outdir,tblf), 'w') - tbl.write(TABLE_HEADER) - svgf = '%s.svg' % (title) - svg = file(os.path.join(outdir,svgf), 'w') - - nMarkers = len(ped._markers) - if nMarkers < 5: - print sys.stderr, '### ERROR - %d is too few markers for reliable estimation in %s - terminating' % (nMarkers,PROGNAME) - sys.exit(1) - nSubjects = len(ped._subjects) - nrsSamples = min(nMarkers, nrsSamples) - if opts and opts.use_mito: - markers = range(nMarkers) - nrsSamples = min(len(markers), nrsSamples) - sampleIndexes = sorted(random.sample(markers, nrsSamples)) - else: - autosomals = ped.autosomal_indices() - nrsSamples = min(len(autosomals), nrsSamples) - sampleIndexes = sorted(random.sample(autosomals, nrsSamples)) - - print '' - print 'Getting random.sample of %s from %s total' % (nrsSamples, nMarkers) - npairs = (nSubjects*(nSubjects-1))/2 # total rows in table - newfiles=[svgf,tblf] - explanations = ['rgGRR Plot (requires SVG)','Mean by SD alleles shared - %d rows' % npairs] - # these go with the output file links in the html file - s = 'Reading genotypes for %s subjects and %s markers\n' % (nSubjects, nrsSamples) - logf.write(s) - minUsegenos = nrsSamples/2 # must have half? - nGenotypes = nSubjects*nrsSamples - stime = time.time() - emptyRows = set() - genos = numpy.zeros((nSubjects, nrsSamples), dtype=int) - for s in xrange(nSubjects): - nValid = 0 - #getGenotypesByIndices(self, s, mlist, format) - genos[s] = ped.getGenotypesByIndices(s, sampleIndexes, format='ref') - nValid = sum([1 for g in genos[s] if g]) - if not nValid: - emptyRows.add(s) - sub = ped.getSubject(s) - print 'All missing for row %d (%s)' % (s, sub) - logf.write('All missing for row %d (%s)\n' % (s, sub)) - rtime = time.time() - stime - if verbose: - print '@@Read %s genotypes in %s seconds' % (nGenotypes, rtime) - - - ### Now the expensive part. For each pair of subjects, we get the mean number - ### and standard deviation of shared alleles over all of the markers where both - ### subjects have a known genotype. Identical subjects should have mean shared - ### alleles very close to 2.0 with a standard deviation very close to 0.0. 
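    ### (illustrative arithmetic, added note) N subjects yield N*(N-1)/2
    ### pairs: 800 subjects -> 319,600 pairs; at 5,000 sampled markers that
    ### is roughly 1.6e9 genotype comparisons, which is why the per-pair
    ### inner loop below is pushed into compiled C via weave.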
- tot = nSubjects*(nSubjects-1)/2 - nprog = tot/10 - nMarkerpairs = tot * nrsSamples - estimatedTimeSlow = nMarkerpairs/MARKER_PAIRS_PER_SECOND_SLOW - estimatedTimeFast = nMarkerpairs/MARKER_PAIRS_PER_SECOND_FAST - - pairs = [] - pair_data = {} - means = [] ## Mean IBS for each pair - ngenoL = [] ## Count of comparable genotypes for each pair - sdevs = [] ## Standard dev for each pair - rels = [] ## A relationship code for each pair - zmeans = [0.0 for x in xrange(tot)] ## zmean score for each pair for the relgroup - zstds = [0.0 for x in xrange(tot)] ## zstd score for each pair for the relgrp - skip = set() - ndone = 0 ## How many have been done so far - - logf.write('Calculating %d pairs...\n' % (tot)) - logf.write('Estimated time is %2.2f to %2.2f seconds ...\n' % (estimatedTimeFast, estimatedTimeSlow)) - - t1sum = 0 - t2sum = 0 - t3sum = 0 - now = time.time() - scache = {} - _founder_cache = {} - C_CODE = """ - #include "math.h" - int i; - int sumibs = 0; - int ssqibs = 0; - int ngeno = 0; - float mean = 0; - float M2 = 0; - float delta = 0; - float sdev=0; - float variance=0; - for (i=0; i 1) { - variance = M2/(ngeno-1); - sdev = sqrt(variance); - //printf("OK: %d %3.2f %3.2f\\n", ngeno, mean, sdev); - } - //printf("%d %d %d %1.2f %1.2f\\n", ngeno, sumibs, ssqibs, mean, sdev); - result[0] = ngeno; - result[1] = mean; - result[2] = sdev; - return_val = ngeno; - """ - started = time.time() - for s1 in xrange(nSubjects): - if s1 in emptyRows: - continue - (fid1,iid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache.setdefault(s1, ped.getSubject(s1)) - - isFounder1 = _founder_cache.setdefault(s1, (did1==mid1)) - g1 = genos[s1] - - for s2 in xrange(s1+1, nSubjects): - if s2 in emptyRows: - continue - t1s = time.time() - - (fid2,iid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache.setdefault(s2, ped.getSubject(s2)) - - g2 = genos[s2] - isFounder2 = _founder_cache.setdefault(s2, (did2==mid2)) - - # Determine the relationship for this pair - relcode = REL_UNKNOWN - if (fid2 == fid1): - if iid1 == iid2: - relcode = REL_DUPE - elif (did2 == did1) and (mid2 == mid1) and did1 != mid1: - relcode = REL_SIBS - elif (iid1 == mid2) or (iid1 == did2) or (iid2 == mid1) or (iid2 == did1): - relcode = REL_PARENTCHILD - elif (str(did1) != '0' and (did2 == did1)) or (str(mid1) != '0' and (mid2 == mid1)): - relcode = REL_HALFSIBS - else: - # People in the same family should be marked as some other - # form of related. In general, these people will have a - # pretty random spread of similarity. This distinction is - # probably not very useful most of the time - relcode = REL_RELATED - else: - ### Different families - relcode = REL_UNRELATED - - t1e = time.time() - t1sum += t1e-t1s - - - ### Calculate sum(2-abs(a1-a2)) and sum((2-abs(a1-a2))**2) and count - ### the number of contributing genotypes. These values are not actually - ### calculated here, but instead are looked up in a table for speed. - ### FIXME: This is still too slow ... 
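    ### (sketch, not in the original) a pure-Python equivalent of the weave
    ### kernel invoked below, using the genotype coding 0=missing, 1..3 otherwise:
    ###   shared = [2 - abs(a - b) for a, b in zip(g1, g2) if a and b]
    ###   ngeno = len(shared)
    ###   mean = scipy.mean(shared)
    ###   sdev = scipy.std(shared)  # note: the C code uses the n-1 sample variance
    ### kept as comments so the compiled path remains the one exercised.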
- result = [0.0, 0.0, 0.0] - ngeno = weave.inline(C_CODE, ['g1', 'g2', 'nrsSamples', 'result']) - if ngeno >= minUsegenos: - _, mean, sdev = result - means.append(mean) - sdevs.append(sdev) - ngenoL.append(ngeno) - pairs.append((s1, s2)) - rels.append(relcode) - else: - skip.add(ndone) # signal no comparable genotypes for this pair - ndone += 1 - t2e = time.time() - t2sum += t2e-t1e - t3e = time.time() - t3sum += t3e-t2e - - logme = [ 'T1: %s' % (t1sum), 'T2: %s' % (t2sum), 'T3: %s' % (t3sum),'TOT: %s' % (t3e-now), - '%s pairs with no (or not enough) comparable genotypes (%3.1f%%)' % (len(skip), - float(len(skip))/float(tot)*100)] - logf.write('%s\n' % '\t'.join(logme)) - ### Calculate mean and standard deviation of scores on a per relationship - ### type basis, allowing us to flag outliers for each particular relationship - ### type - relstats = {} - relCounts = {} - outlierFiles = {} - for relCode, relInfo in REL_LOOKUP.items(): - relName, relColor, relStyle = relInfo - useme = [means[x] for x in xrange(len(means)) if rels[x] == relCode] - relCounts[relCode] = len(useme) - mm = scipy.mean(useme) - ms = scipy.std(useme) - useme = [sdevs[x] for x in xrange(len(sdevs)) if rels[x] == relCode] - sm = scipy.mean(useme) - ss = scipy.std(useme) - relstats[relCode] = {'sd':(sm,ss), 'mean':(mm,ms)} - s = 'Relstate %s (n=%d): mean(mean)=%3.2f sdev(mean)=%3.2f, mean(sdev)=%3.2f sdev(sdev)=%3.2f\n' % \ - (relName,relCounts[relCode], mm, ms, sm, ss) - logf.write(s) - - ### now fake z scores for each subject like abecasis recommends max(|zmu|,|zsd|) - ### within each group, for each pair, z=(groupmean-pairmean)/groupsd - available = len(means) - logf.write('%d pairs are available of %d\n' % (available, tot)) - ### s = '\nOutliers:\nrelationship\tzmean\tzsd\tped1\tped2\tmean\tsd\trmeanmean\trmeansd\trsdmean\trsdsd\n' - ### logf.write(s) - pairnum = 0 - offset = 0 - nOutliers = 0 - cexs = [] - outlierRecords = dict([(r, []) for r in range(N_RELATIONSHIP_TYPES)]) - zsdmax = 0 - for s1 in range(nSubjects): - if s1 in emptyRows: - continue - (fid1,iid1,did1,mid1,sex1,aff1,ok1,d_sid1,m_sid1) = scache[s1] - for s2 in range(s1+1, nSubjects): - if s2 in emptyRows: - continue - if pairnum not in skip: - ### Get group stats for this relationship - (fid2,iid2,did2,mid2,sex2,aff2,ok2,d_sid2,m_sid2) = scache[s2] - try: - r = rels[offset] - except IndexError: - logf.write('###OOPS offset %d available %d pairnum %d len(rels) %d', offset, available, pairnum, len(rels)) - notfound = ('?',('?','0','0')) - relInfo = REL_LOOKUP.get(r,notfound) - relName, relColor, relStyle = relInfo - rmm,rmd = relstats[r]['mean'] # group mean, group meansd alleles shared - rdm,rdd = relstats[r]['sd'] # group sdmean, group sdsd alleles shared - - try: - zsd = (sdevs[offset] - rdm)/rdd # distance from group mean in group sd units - except: - zsd = 1 - if abs(zsd) > zsdmax: - zsdmax = zsd # keep for sort scaling - try: - zmean = (means[offset] - rmm)/rmd # distance from group mean - except: - zmean = 1 - zmeans[offset] = zmean - zstds[offset] = zsd - pid=(s1,s2) - zrad = max(zsd,zmean) - if zrad < 4: - zrad = 2 - elif 4 < zrad < 15: - zrad = 3 # to 9 - else: # > 15 6=24+ - zrad=zrad/4 - zrad = min(zrad,6) # scale limit - zrad = max(2,max(zsd,zmean)) # as > 2, z grows - pair_data[pid] = (zmean,zsd,r,zrad) - if max(zsd,zmean) > Zcutoff: # is potentially interesting - mean = means[offset] - sdev = sdevs[offset] - outlierRecords[r].append((mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd,did1,mid1,did2,mid2)) - nOutliers += 1 - 
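            # (reading note, not in the original source: the assignment
            # zrad = max(2,max(zsd,zmean)) above overwrites the <4 / 4-15
            # radius binning computed just before it, so pair_data stores the
            # raw z magnitude; the per-point drawing code further down
            # re-bins it before sizing the svg circles.)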
                    tbl.write('%s_%s\t%s_%s\t%f\t%f\t%f\t%f\t%d\t%s\t%s\t%s\t%s\t%s\n' % \
                      (fid1, iid1, fid2, iid2, mean, sdev, zmean, zsd, ngeno, relName, did1, mid1, did2, mid2))
                offset += 1
            pairnum += 1
    logf.write('Outliers: %s\n' % (nOutliers))

    ### Write outlier files for each relationship type
    repOut.append('Outliers in tab delimited files linked above are also listed below') # heading markup lost in extraction
    lzsd = round(numpy.log10(zsdmax)) + 1
    scalefactor = 10**lzsd
    for relCode, relInfo in REL_LOOKUP.items():
        relName, _, _ = relInfo
        outliers = outlierRecords[relCode]
        if not outliers:
            continue
        outliers = [(scalefactor*int(abs(x[3])) + int(abs(x[2])), x) for x in outliers] # decorate
        outliers.sort()
        outliers.reverse() # largest deviation first
        outliers = [x[1] for x in outliers] # undecorate
        nrows = len(outliers)
        truncated = 0
        if nrows > MAX_SHOW_ROWS:
            s = '%s outlying pairs (top %d of %d) from %s' % \
              (relName, MAX_SHOW_ROWS, nrows, title) # heading markup lost in extraction
            truncated = nrows - MAX_SHOW_ROWS
        else:
            s = '%s outlying pairs (n=%d) from %s' % (relName, nrows, title) # heading markup lost in extraction
        repOut.append(s)
        fhname = '%s_rgGRR_%s_outliers.xls' % (title, relName)
        fhpath = os.path.join(outdir, fhname)
        fh = open(fhpath, 'w')
        newfiles.append(fhname)
        explanations.append('%s Outlier Pairs %s, N=%d, Cutoff SD=%f' % (relName, title, len(outliers), Zcutoff))
        fh.write(OUTLIERS_HEADER)
        s = ''.join(['%s' % x for x in OUTLIERS_HEADER_list]) # table header cell markup lost in extraction
        repOut.append('%s' % s)
        for n, rec in enumerate(outliers):
            #(mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd) = rec
            s = '%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t' % tuple(rec)
            fh.write('%s%s\n' % (s, relName))
            # the matching html row template for repOut lost its markup in
            # extraction; only the field order (the same 16 values plus
            # relName) is recoverable
            s = '%f %f %f %f %s %s %s %s %f %f %f %f %s %s %s %s' % tuple(rec)
            s = '%s %s' % (s, relName)
            if n < MAX_SHOW_ROWS:
                repOut.append('%s' % s)
        if truncated > 0:
            repOut.append('WARNING: %d rows truncated - see outlier file for all %d rows' % (truncated,
              nrows))
        fh.close()
    repOut.append('') # table-closing markup lost in extraction

    ### Now, draw the plot in jpeg and svg formats, and optionally in the PDF format
    ### if requested
    logf.write('Plotting ...')
    pointColors = [REL_COLORS[rel] for rel in rels]
    pointStyles = [REL_POINTS[rel] for rel in rels]

    mainTitle = '%s (%s subjects, %d snp)' % (title, nSubjects, nrsSamples)
    svg.write(SVG_HEADER % (SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[4],
        SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[0],SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[1],
        SVG_COLORS[2],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[3],SVG_COLORS[4],SVG_COLORS[4],
        SVG_COLORS[5],SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[6],mainTitle))
    #rpy.r.jpeg(filename='%s.jpg' % (title), width=1600, height=1200, pointsize=12, quality=100, bg='white')
    #rpy.r.par(mai=(1,1,1,0.5))
    #rpy.r('par(xaxs="i",yaxs="i")')
    #rpy.r.plot(means, sdevs, main=mainTitle, ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
    #rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
    #rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
    #rpy.r.dev_off()

    ### We will now go through each relationship type to partition plot points
    ### into "bulk" and "outlier" groups. Bulk points will represent common
    ### mean/sdev pairs and will cover the majority of the points in the plot --
    ### they will use generic tooltip information about all of the pairs
    ### represented by that point. "Outlier" points will be uncommon pairs,
    ### with very specific information in their tooltips. It would be nice to
    ### keep the total number of plotted points in the SVG representation to
    ### ~10000 (certainly less than 100000?)
    pointMap = {}
    orderedRels = [y[1] for y in reversed(sorted([(relCounts.get(x, 0), x) for x in REL_LOOKUP.keys()]))]
    # do we really want this?
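    # (illustration, hypothetical counts) orderedRels sorts relationship
    # codes by descending pair count, e.g. {unrelated: 1152, sibpairs: 40,
    # dupe: 2} gives draw order unrelated, sibpairs, dupe - the bulk cloud is
    # painted first and the rarer classes land on top.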
I want out of zone points last and big - for relCode in orderedRels: - svgColor = SVG_COLORS[relCode] - relName, relColor, relStyle = REL_LOOKUP[relCode] - svg.write('\n' % (relName, svgColor, svgColor)) - pMap = pointMap.setdefault(relCode, {}) - nPoints = 0 - rpairs=[] - rgenos=[] - rmeans=[] - rsdevs=[] - rz = [] - for x,rel in enumerate(rels): # all pairs - if rel == relCode: - s1,s2 = pairs[x] - pid=(s1,s2) - zmean,zsd,r,zrad = pair_data[pid][:4] - rpairs.append(pairs[x]) - rgenos.append(ngenoL[x]) - rmeans.append(means[x]) - rsdevs.append(sdevs[x]) - rz.append(zrad) - ### Now add the svg point group for this relationship to the svg file - for x in range(len(rmeans)): - svgX = '%d' % ((rmeans[x] - 1.0) * PLOT_WIDTH) # changed so mean scale is 1-2 - svgY = '%d' % (PLOT_HEIGHT - (rsdevs[x] * PLOT_HEIGHT)) # changed so sd scale is 0-1 - s1, s2 = rpairs[x] - (fid1,uid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache[s1] - (fid2,uid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache[s2] - ngenos = rgenos[x] - nPoints += 1 - point = pMap.setdefault((svgX, svgY), []) - point.append((rmeans[x], rsdevs[x], fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos,rz[x])) - for (svgX, svgY) in pMap: - points = pMap[(svgX, svgY)] - svgX = int(svgX) - svgY = int(svgY) - if len(points) > 1: - mmean,dmean = calcMeanSD([p[0] for p in points]) - msdev,dsdev = calcMeanSD([p[1] for p in points]) - mgeno,dgeno = calcMeanSD([p[-1] for p in points]) - mingeno = min([p[-1] for p in points]) - maxgeno = max([p[-1] for p in points]) - svg.write("""\n""" % (svgX, svgY, relCode, mmean, dmean, msdev, dsdev, len(points), mgeno, dgeno, mingeno, maxgeno)) - else: - mean, sdev, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos, zrad = points[0][:12] - rmean = float(relstats[relCode]['mean'][0]) - rsdev = float(relstats[relCode]['sd'][0]) - if zrad < 4: - zrad = 2 - elif 4 < zrad < 9: - zrad = 3 # to 9 - else: # > 9 5=15+ - zrad=zrad/3 - zrad = min(zrad,5) # scale limit - if zrad <= 3: - svg.write('\n' % (svgX, svgY, zrad, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev)) - else: # highlight pairs a long way from expectation by outlining circle in red - svg.write("""\n""" % \ - (svgX, svgY, zrad, svgColor, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev)) - svg.write('\n') - - ### Create a pdf as well if indicated on the command line - ### WARNING! for framingham share, with about 50M pairs, this is a 5.5GB pdf! 
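    ### (illustrative arithmetic) the pointMap bucketing above collapses
    ### pairs that share a plot pixel: with PLOT_WIDTH=1150 and
    ### PLOT_HEIGHT=600, a pair at mean=1.53, sd=0.70 maps to
    ### svgX = int((1.53-1.0)*1150) = 609 and svgY = int(600 - 0.70*600) = 180;
    ### all pairs landing on (609,180) are drawn as a single circle with
    ### pooled tooltip statistics, keeping the SVG near the ~10000 point
    ### budget mentioned above.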
##    if pdftoo:
##        pdfname = '%s.pdf' % (title)
##        rpy.r.pdf(pdfname, 6, 6)
##        rpy.r.par(mai=(1,1,1,0.5))
##        rpy.r('par(xaxs="i",yaxs="i")')
##        rpy.r.plot(means, sdevs, main='%s, %d snp' % (title, nSamples), ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
##        rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
##        rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
##        rpy.r.dev_off()

    ### Draw polygons
    if showPolygons:
        # the svg group and polygon element markup in the three writes below was lost in extraction
        svg.write('\n')
        for rel, poly in POLYGONS.items():
            points = ' '.join(['%s,%s' % ((p[0]-1.0)*float(PLOT_WIDTH), (PLOT_HEIGHT - p[1]*PLOT_HEIGHT)) for p in poly])
            svg.write('\n' % (points, SVG_COLORS[rel]))
        svg.write('\n')


    svg.write(SVG_FOOTER)
    svg.close()
    return newfiles, explanations, repOut

def doIBS(n=100):
    """parse parameters from galaxy
    expect 'input pbed path' 'basename' 'outpath' 'title' 'logpath' 'n'

    rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
    '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
    """
    u = """
    rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
    '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
    """
    if len(sys.argv) < 7:
        print >> sys.stdout, 'Need pbed inpath, basename, out_htmlname, outpath, title, logpath, nSNP, Zcutoff on command line please'
        print >> sys.stdout, u
        sys.exit(1)
    ts = '%s%s' % (string.punctuation, string.whitespace)
    ptran = string.maketrans(ts, '_'*len(ts))
    inpath = sys.argv[1]
    ldinpath = os.path.split(inpath)[0]
    basename = sys.argv[2]
    outhtml = sys.argv[3]
    newfilepath = sys.argv[4]
    title = sys.argv[5].translate(ptran)
    logfname = 'Log_%s.txt' % title
    logpath = os.path.join(newfilepath, logfname) # log was a child - make part of html extra_files_path zoo
    n = int(sys.argv[6])
    try:
        Zcutoff = float(sys.argv[7])
    except:
        Zcutoff = 2.0
    try:
        os.makedirs(newfilepath)
    except:
        pass
    logf = file(logpath, 'w')
    efp, ibase_name = os.path.split(inpath) # need to use these for outputs in files_path
    ped = plinkbinJZ.BPed(inpath)
    ped.parse(quick=True)
    if ped == None:
        print >> sys.stderr, '## doIBSpy problem - cannot open %s or %s - cannot run' % (inpath, basename) # bug fix: was ldreduced, which is undefined here
        sys.exit(1)
    newfiles, explanations, repOut = doIBSpy(ped=ped, basename=basename, outdir=newfilepath,
        logf=logf, nrsSamples=n, title=title, pdftoo=0, Zcutoff=Zcutoff)
    logf.close()
    logfs = file(logpath, 'r').readlines()
    lf = file(outhtml, 'w')
    lf.write(galhtmlprefix % PROGNAME)
    # this is a mess. todo clean up - should each datatype have its own directory? Yes
    # probably. Then titles are universal - but userId libraries are separate.
    s = 'Output from %s run at %s\n' % (PROGNAME, timenow()) # heading markup lost in extraction
    lf.write('%s\n' % s) # wrapper markup lost in extraction
    fixed = ["'%s'" % x for x in sys.argv] # add quotes just in case
    s = 'If you need to rerun this analysis, the command line was\n%s\n' % (' '.join(fixed)) # markup lost in extraction
    lf.write(s)
    # various ways of displaying svg - experiments related to missing svg mimetype on test (!)
    # (the two embedding templates below lost their object/embed markup and
    # format slots in extraction; each was filled with newfiles[0],
    # PLOT_WIDTH and PLOT_HEIGHT)
    #s = """
    #
    #    """ % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT,newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
    s = """ """ % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
    #s = """