Commit message:
planemo upload commit 81ece2551cea27cbd0e718ef5b7a2fe8d4abd071
added:
data_source/access_libraries.xml data_source/bed_convert.xml data_source/biomart.xml data_source/biomart_test.xml data_source/cbi_rice_mart.xml data_source/data_source.py data_source/ebi_sra.xml data_source/eupathdb.xml data_source/fetch.py data_source/fly_modencode.xml data_source/flymine.xml data_source/flymine_test.xml data_source/genbank.py data_source/genbank.xml data_source/gramene_mart.xml data_source/hapmapmart.xml data_source/hbvar.xml data_source/hbvar_filter.py data_source/import.py data_source/import.xml data_source/metabolicmine.xml data_source/microbial_import.py data_source/microbial_import.xml data_source/microbial_import_code.py data_source/modmine.xml data_source/mousemine.xml data_source/ratmine.xml data_source/ucsc_tablebrowser.xml data_source/ucsc_tablebrowser_archaea.xml data_source/ucsc_tablebrowser_test.xml data_source/upload.py data_source/upload.xml data_source/worm_modencode.xml data_source/wormbase.xml data_source/wormbase_test.xml data_source/yeastmine.xml data_source/zebrafishmine.xml evolution/add_scores.py evolution/add_scores.xml evolution/codingSnps.pl evolution/codingSnps.xml evolution/codingSnps_filter.py extract/extract_genomic_dna.py extract/extract_genomic_dna.xml extract/liftOver_wrapper.py extract/liftOver_wrapper.xml filters/CreateInterval.pl filters/CreateInterval.xml filters/axt_to_concat_fasta.py filters/axt_to_concat_fasta.xml filters/axt_to_fasta.py filters/axt_to_fasta.xml filters/axt_to_lav.py filters/axt_to_lav.xml filters/axt_to_lav_code.py filters/bed2gff.xml filters/bed_to_bigbed.xml filters/bed_to_gff_converter.py filters/catWrapper.py filters/catWrapper.xml filters/changeCase.pl filters/changeCase.xml filters/commWrapper.pl filters/commWrapper.xml filters/compare.xml filters/condense_characters.pl filters/condense_characters.xml filters/convert_characters.pl filters/convert_characters.py filters/convert_characters.xml filters/cutWrapper.pl filters/cutWrapper.xml filters/fileGrep.xml filters/fixedValueColumn.pl filters/fixedValueColumn.xml filters/gff/extract_GFF_Features.py filters/gff/extract_GFF_Features.xml filters/gff/gff_filter_by_attribute.py filters/gff/gff_filter_by_attribute.xml filters/gff/gff_filter_by_feature_count.py filters/gff/gff_filter_by_feature_count.xml filters/gff/gtf_filter_by_attribute_values_list.py filters/gff/gtf_filter_by_attribute_values_list.xml filters/gff2bed.xml filters/gff_to_bed_converter.py filters/grep.py filters/grep.xml filters/gtf2bedgraph.xml filters/gtf_to_bedgraph_converter.py filters/headWrapper.pl filters/headWrapper.xml filters/join.py filters/joinWrapper.pl filters/joinWrapper.py filters/joiner.xml filters/joiner2.xml filters/lav_to_bed.py filters/lav_to_bed.xml filters/lav_to_bed_code.py filters/mergeCols.py filters/mergeCols.xml filters/pasteWrapper.pl filters/pasteWrapper.xml filters/random_lines_two_pass.py filters/randomlines.py filters/randomlines.xml filters/remove_beginning.pl filters/remove_beginning.xml filters/secure_hash_message_digest.py filters/secure_hash_message_digest.xml filters/sff_extract.py filters/sff_extractor.xml filters/sorter.py filters/sorter.xml filters/tailWrapper.pl filters/tailWrapper.xml filters/trimmer.py filters/trimmer.xml filters/ucsc_gene_bed_to_exon_bed.py filters/ucsc_gene_bed_to_exon_bed.xml filters/ucsc_gene_bed_to_intron_bed.py filters/ucsc_gene_bed_to_intron_bed.xml filters/ucsc_gene_table_to_intervals.py filters/ucsc_gene_table_to_intervals.xml filters/uniq.py filters/uniq.xml filters/wc_gnu.xml filters/wig_to_bigwig.xml 
filters/wiggle_to_simple.py filters/wiggle_to_simple.xml genomespace/genomespace_exporter.py genomespace/genomespace_exporter.xml genomespace/genomespace_file_browser.py genomespace/genomespace_file_browser_dev.xml genomespace/genomespace_file_browser_prod.xml genomespace/genomespace_file_browser_test.xml genomespace/genomespace_importer.py genomespace/genomespace_importer.xml maf/genebed_maf_to_fasta.xml maf/interval2maf.py maf/interval2maf.xml maf/interval2maf_pairwise.xml maf/interval_maf_to_merged_fasta.py maf/interval_maf_to_merged_fasta.xml maf/macros.xml maf/maf_by_block_number.py maf/maf_by_block_number.xml maf/maf_filter.py maf/maf_filter.xml maf/maf_limit_size.py maf/maf_limit_size.xml maf/maf_limit_to_species.py maf/maf_limit_to_species.xml maf/maf_reverse_complement.py maf/maf_reverse_complement.xml maf/maf_split_by_species.py maf/maf_split_by_species.xml maf/maf_stats.py maf/maf_stats.xml maf/maf_thread_for_species.py maf/maf_thread_for_species.xml maf/maf_to_bed.py maf/maf_to_bed.xml maf/maf_to_bed_code.py maf/maf_to_fasta.xml maf/maf_to_fasta_concat.py maf/maf_to_fasta_multiple_sets.py maf/maf_to_interval.py maf/maf_to_interval.xml maf/vcf_to_maf_customtrack.py maf/vcf_to_maf_customtrack.xml meme/fimo.xml meme/fimo_wrapper.py meme/meme.xml metag_tools/blat_wrapper.py metag_tools/blat_wrapper.xml metag_tools/shrimp_color_wrapper.py metag_tools/shrimp_color_wrapper.xml metag_tools/shrimp_wrapper.py metag_tools/shrimp_wrapper.xml next_gen_conversion/bwa_solid2fastq_modified.pl next_gen_conversion/fastq_conversions.py next_gen_conversion/fastq_conversions.xml next_gen_conversion/fastq_gen_conv.py next_gen_conversion/fastq_gen_conv.xml next_gen_conversion/solid2fastq.py next_gen_conversion/solid2fastq.xml next_gen_conversion/solid_to_fastq.py next_gen_conversion/solid_to_fastq.xml ngs_simulation/ngs_simulation.py ngs_simulation/ngs_simulation.xml phenotype_association/BEAM2_wrapper.sh phenotype_association/beam.xml phenotype_association/gpass.pl phenotype_association/gpass.xml phenotype_association/ldtools.xml phenotype_association/ldtools_wrapper.sh phenotype_association/linkToDavid.pl phenotype_association/linkToDavid.xml phenotype_association/linkToGProfile.pl phenotype_association/linkToGProfile.xml phenotype_association/lped_to_geno.pl phenotype_association/lps.xml phenotype_association/lps_tool_wrapper.sh phenotype_association/master2gd_snp.pl phenotype_association/master2gd_snp.xml phenotype_association/master2pg.pl phenotype_association/master2pg.xml phenotype_association/mergeSnps.pl phenotype_association/pagetag.py phenotype_association/pass.xml phenotype_association/pass_wrapper.sh phenotype_association/senatag.py phenotype_association/sift.xml phenotype_association/sift_variants_wrapper.sh phenotype_association/vcf2pgSnpMult.pl plotting/bar_chart.py plotting/bar_chart.xml plotting/boxplot.xml solid_tools/maq_cs_wrapper.py solid_tools/maq_cs_wrapper.xml solid_tools/maq_cs_wrapper_code.py solid_tools/qualsolid_boxplot_graph.sh solid_tools/solid_qual_boxplot.xml solid_tools/solid_qual_stats.py solid_tools/solid_qual_stats.xml splicescope/README splicescope/annotation/mm10/Mm.seq.all.devcortex.cass.chrom.can.id2gene2symbol splicescope/annotation/mm10/Mm.seq.devcortex.cass.chrom.can.bed splicescope/annotation/mm10/mm10.conf splicescope/splicescope4maturation.xml splicescope/splicescope_wrapper.amazon.sh splicescope/splicescope_wrapper.intron.sh splicescope/splicescope_wrapper.sh splicescope/test.sh splicescope/test/DGN.cass.mat.txt splicescope/test/DRG1.bed 
splicescope/test/DRG2.bed sr_assembly/velvetg.xml sr_assembly/velvetg_wrapper.py sr_assembly/velveth.xml sr_assembly/velveth_wrapper.py sr_mapping/PerM.xml sr_mapping/bfast_wrapper.py sr_mapping/bfast_wrapper.xml sr_mapping/fastq_statistics.xml sr_mapping/mosaik.xml sr_mapping/srma_wrapper.py sr_mapping/srma_wrapper.xml stats/aggregate_binned_scores_in_intervals.xml stats/aggregate_scores_in_intervals.py stats/filtering.py stats/filtering.xml stats/grouping.py stats/grouping.xml stats/gsummary.py stats/gsummary.xml stats/gsummary.xml.groups stats/r_wrapper.sh visualization/LAJ.py visualization/LAJ.xml visualization/LAJ_code.py
diff -r 000000000000 -r 7621d36a4e9c data_source/access_libraries.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/access_libraries.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<tool name="Access Libraries" id="library_access1" version="1.0.0">
+    <description>stored locally</description>
+    <inputs action="/library/index" method="get" target="_parent">
+        <param name="default_action" type="hidden" value="import_to_histories" />
+    </inputs>
+    <uihints minwidth="800"/>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/bed_convert.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/bed_convert.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,14 @@
+<tool id="BED File Converter1" name="BED File Converter" version="1.0.0">
+    <description>creates a bed or xbed file containing from text query</description>
+    <command>noop</command>
+    <inputs>
+        <display>creates a bed or xbed file containing user assigned input of $input</display>
+        <param format="tabular" name="input" type="data" />
+        <param name="chrom" size="4" type="text" value="all" />
+    </inputs>
+    <outputs>
+        <data format="bed" name="out_file1" />
+    </outputs>
+    <help>User specifies delimiter, header information, and column assignments and the file will be converted to BED or xBED.
+</help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 7621d36a4e9c data_source/biomart.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/biomart.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,46 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+
+    TODO: Hack to get biomart to work - the 'add_to_URL' param can be eliminated when the Biomart team encodes URL prior to sending, meanwhile
+    everything including and beyond the first '&' is truncated from URL.  They said they'll let us know when this is fixed at their end.
+-->
+<tool name="BioMart" id="biomart" tool_type="data_source" version="1.0.1">
+    <description>Ensembl server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://www.ensembl.org/biomart/martview" check_values="false" method="get" target="_top">
+        <display>go to BioMart Ensembl $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner/biomart" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="_export" missing="1" />
+                <value name="GALAXY_URL" missing="0" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="data_type" remote_name="exportView_outputformat" missing="tabular" >
+            <value_translation>
+                <value galaxy_value="tabular" remote_value="TSV" />
+            </value_translation>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="get" />
+        <request_param galaxy_name="dbkey" remote_name="dbkey" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="Biomart query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+    <citations>
+        <citation type="doi">10.1093/database/bar011</citation>
+        <citation type="doi">10.1093/nar/gkv350</citation>
+    </citations>
+</tool>
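Note: the append_param block above rebuilds the URL that data_source.py later fetches, supplying _export=1 and GALAXY_URL=0 when the remote site omits them. A minimal Python sketch of that observable behavior (an illustration only, not Galaxy's actual implementation; the defaults and separators are taken from the XML above):

    # Sketch of the <append_param> logic: first_separator="?", separator="&",
    # join="=", with missing values defaulting to _export=1 and GALAXY_URL=0.
    def append_params(url, params=None):
        defaults = {'_export': '1', 'GALAXY_URL': '0'}
        merged = dict(defaults, **(params or {}))
        first = True
        for name, value in merged.items():
            url += ('?' if first else '&') + name + '=' + value
            first = False
        return url

    print(append_params('http://www.ensembl.org/biomart/martview'))
    # -> http://www.ensembl.org/biomart/martview?_export=1&GALAXY_URL=0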
diff -r 000000000000 -r 7621d36a4e9c data_source/biomart_test.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/biomart_test.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,46 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+
+    TODO: Hack to get biomart to work - the 'add_to_URL' param can be eliminated when the Biomart team encodes URL prior to sending, meanwhile
+    everything including and beyond the first '&' is truncated from URL.  They said they'll let us know when this is fixed at their end.
+-->
+<tool name="BioMart" id="biomart_test" tool_type="data_source" version="1.0.1">
+    <description>Test server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://test.biomart.org/biomart/martview" check_values="false" method="get" target="_top">
+        <display>go to BioMart Central $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner/biomart" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="_export" missing="1" />
+                <value name="GALAXY_URL" missing="0" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="data_type" remote_name="exportView_outputformat" missing="tabular" >
+            <value_translation>
+                <value galaxy_value="tabular" remote_value="TSV" />
+            </value_translation>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="get" />
+        <request_param galaxy_name="dbkey" remote_name="dbkey" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="Biomart test query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+    <citations>
+        <citation type="doi">10.1093/database/bar011</citation>
+        <citation type="doi">10.1093/nar/gkv350</citation>
+    </citations>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/cbi_rice_mart.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/cbi_rice_mart.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,39 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="CBI Rice Mart" id="cbi_rice_mart" tool_type="data_source" version="1.0.1">
+    <description>rice mart</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://ricemart.cbi.edu.cn/biomart/martview/" check_values="false" method="get" target="_top">
+        <display>go to RMap rice mart $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner/biomart" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="_export" missing="1" />
+                <value name="GALAXY_URL" missing="0" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="data_type" remote_name="exportView_outputformat" missing="tabular" >
+            <value_translation>
+                <value galaxy_value="tabular" remote_value="TSV" />
+            </value_translation>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="get" />
+        <request_param galaxy_name="dbkey" remote_name="dbkey" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="Rice mart query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/data_source.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/data_source.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+# Retrieves data from external data source applications and stores in a dataset file.
+# Data source application parameters are temporarily stored in the dataset file.
+import os
+import socket
+import sys
+from json import dumps, loads
+
+from six.moves.urllib.parse import urlencode
+from six.moves.urllib.request import urlopen
+
+from galaxy.datatypes import sniff
+from galaxy.datatypes.registry import Registry
+from galaxy.jobs import TOOL_PROVIDED_JOB_METADATA_FILE
+from galaxy.util import get_charset_from_http_headers
+
+GALAXY_PARAM_PREFIX = 'GALAXY'
+GALAXY_ROOT_DIR = os.path.realpath( os.path.join( os.path.dirname( __file__ ), os.pardir, os.pardir ) )
+GALAXY_DATATYPES_CONF_FILE = os.path.join( GALAXY_ROOT_DIR, 'datatypes_conf.xml' )
+
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+
+def load_input_parameters( filename, erase_file=True ):
+    datasource_params = {}
+    try:
+        json_params = loads( open( filename, 'r' ).read() )
+        datasource_params = json_params.get( 'param_dict' )
+    except:
+        json_params = None
+        for line in open( filename, 'r' ):
+            try:
+                line = line.strip()
+                fields = line.split( '\t' )
+                datasource_params[ fields[0] ] = fields[1]
+            except:
+                continue
+    if erase_file:
+        open( filename, 'w' ).close()  # open file for writing, then close, removes params from file
+    return json_params, datasource_params
+
+
+def __main__():
+    filename = sys.argv[1]
+    try:
+        max_file_size = int( sys.argv[2] )
+    except:
+        max_file_size = 0
+
+    job_params, params = load_input_parameters( filename )
+    if job_params is None:  # using an older tabular file
+        enhanced_handling = False
+        job_params = dict( param_dict=params )
+        job_params[ 'output_data' ] = [ dict( out_data_name='output',
+                                              ext='data',
+                                              file_name=filename,
+                                              extra_files_path=None ) ]
+        job_params[ 'job_config' ] = dict( GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE )
+    else:
+        enhanced_handling = True
+        json_file = open( job_params[ 'job_config' ][ 'TOOL_PROVIDED_JOB_METADATA_FILE' ], 'w' )  # specially named file for output junk to pass onto set metadata
+
+    datatypes_registry = Registry()
+    datatypes_registry.load_datatypes( root_dir=job_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config=job_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] )
+
+    URL = params.get( 'URL', None )  # using exactly URL indicates that only one dataset is being downloaded
+    URL_method = params.get( 'URL_method', None )
+
+    # The Python support for fetching resources from the web is layered. urllib uses the httplib
+    # library, which in turn uses the socket library.  As of Python 2.3 you can specify how long
+    # a socket should wait for a response before timing out. By default the socket module has no
+    # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2
+    # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by
+    # doing the following.
+    socket.setdefaulttimeout( 600 )
+
+    for data_dict in job_params[ 'output_data' ]:
+        cur_filename = data_dict.get( 'file_name', filename )
+        cur_URL = params.get( '%s|%s|URL' % ( GALAXY_PARAM_PREFIX, data_dict[ 'out_data_name' ] ), URL )
+        if not cur_URL:
+            open( cur_filename, 'w' ).write( "" )
+            stop_err( 'The remote data source application has not sent back a URL parameter in the request.' )
+
+        # The following calls to urlopen() will use the above default timeout
+        try:
+            if not URL_method or URL_method == 'get':
+                page = urlopen( cur_URL )
+            elif URL_method == 'post':
+                page = urlopen( cur_URL, urlencode( params ) )
+        except Exception as e:
+            stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) )
+        if max_file_size:
+            file_size = int( page.info().get( 'Content-Length', 0 ) )
+            if file_size > max_file_size:
+                stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, max_file_size ) )
+        # do sniff stream for multi_byte
+        try:
+            cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( cur_filename, os.O_WRONLY | os.O_CREAT ), cur_filename, source_encoding=get_charset_from_http_headers( page.headers ) )
+        except Exception as e:
+            stop_err( 'Unable to fetch %s:\n%s' % ( cur_URL, e ) )
+
+        # here import checks that upload tool performs
+        if enhanced_handling:
+            try:
+                ext = sniff.handle_uploaded_dataset_file( filename, datatypes_registry, ext=data_dict[ 'ext' ], is_multi_byte=is_multi_byte )
+            except Exception as e:
+                stop_err( str( e ) )
+            info = dict( type='dataset',
+                         dataset_id=data_dict[ 'dataset_id' ],
+                         ext=ext)
+
+            json_file.write( "%s\n" % dumps( info ) )
+
+
+if __name__ == "__main__":
+    __main__()
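A note on the parameter file consumed above: load_input_parameters() accepts either a JSON document carrying a 'param_dict' key or, for older tools, tab-separated key/value lines, and truncates the file after reading. A minimal sketch of the two formats, with hypothetical filenames and values:

    # Sketch of the two input formats load_input_parameters() handles.
    # All paths and parameter values here are illustrative only.
    import json

    with open('params.json', 'w') as f:        # newer JSON form
        json.dump({'param_dict': {'URL': 'http://example.org/data',
                                  'URL_method': 'get'}}, f)

    with open('params.tabular', 'w') as f:     # older tabular fallback
        f.write('URL\thttp://example.org/data\n')
        f.write('URL_method\tget\n')
    # In either case the script reads the file and then empties it, so the
    # temporary parameters are not left behind in the dataset file.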
diff -r 000000000000 -r 7621d36a4e9c data_source/ebi_sra.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/ebi_sra.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,15 @@
+<?xml version="1.0"?>
+<tool name="EBI SRA" id="ebi_sra_main" tool_type="data_source" version="1.0.1">
+    <description>ENA SRA</description>
+    <!-- This Python script imports the file into Galaxy -->
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <!-- The URL where Galaxy forwards the user when this tool is accessed from the Get Data menu -->
+    <inputs action="https://www.ebi.ac.uk/ena/data/search" check_values="false" method="get">
+        <display>go to EBI SRA server $GALAXY_URL</display>
+    </inputs>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="auto"/>
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/eupathdb.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/eupathdb.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,13 @@
+<tool name="EuPathDB" id="eupathdb" tool_type="data_source" url_method="post" version="1.0.0">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://eupathdb.org/eupathdb/queries_tools.jsp" check_values="false" method="get">
+        <display>go to EuPathDB server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=eupathdb" />
+    </inputs>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/fetch.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/fetch.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+"""
+Script that fetches a URL and writes its content to a local file.
+"""
+from __future__ import print_function
+
+import sys
+
+from six.moves.urllib.request import urlopen
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+BUFFER = 1048576
+
+url = sys.argv[1]
+out_name = sys.argv[2]
+
+out = open(out_name, 'wt')
+try:
+    page = urlopen(url)
+    while 1:
+        data = page.read(BUFFER)
+        if not data:
+            break
+        out.write(data)
+except Exception as e:
+    print('Error getting the data -> %s' % e)
+out.close()
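fetch.py takes exactly two positional arguments, a URL and an output filename, and copies the resource in BUFFER-sized chunks. A hypothetical invocation (the URL and filename are examples, not values from this changeset):

    # Illustrative call to fetch.py from another Python process.
    import subprocess

    subprocess.check_call(['python', 'data_source/fetch.py',
                           'http://example.org/regions.bed',  # sys.argv[1]: source URL
                           'regions.bed'])                    # sys.argv[2]: local output file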
diff -r 000000000000 -r 7621d36a4e9c data_source/fly_modencode.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/fly_modencode.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<tool name="modENCODE fly" id="modENCODEfly" tool_type="data_source" version="1.0.1">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://gbrowse.modencode.org/fgb2/gbrowse/fly" check_values="false" target="_top">
+        <display>go to modENCODE fly server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=modENCODEfly" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="dbkey" remote_name="dbkey" missing="dm3" >
+            <value_translation>
+                <value galaxy_value="dm3" remote_value="fly" />
+            </value_translation>
+        </request_param>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="d" missing="" />
+                <value name="dbkey" missing="dm3" />
+                <value name="q" missing="" />
+                <value name="s" missing="" />
+                <value name="t" missing="" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" label="${tool.name} on $getVar( 'q', 'unknown position' )"/>
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/flymine.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/flymine.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,35 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="Flymine" id="flymine" tool_type="data_source" version="1.0.0">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://www.flymine.org" check_values="false" method="get">
+        <display>go to Flymine server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=flymine" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="db" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="FlyMine query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" >
+            <value_translation>
+                <value galaxy_value="auto" remote_value="txt" /> <!-- intermine currently always provides 'txt', make this auto detect -->
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
+
diff -r 000000000000 -r 7621d36a4e9c data_source/flymine_test.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/flymine_test.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,31 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="Flymine test" id="flymine_test" tool_type="data_source" version="1.0.0">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://preview.flymine.org/preview/begin.do" check_values="false" method="get">
+        <display>go to Flymine server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=flymine" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="db" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="FlyMine query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="txt" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
+
diff -r 000000000000 -r 7621d36a4e9c data_source/genbank.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/genbank.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+from __future__ import print_function
+
+import sys
+import textwrap
+
+from Bio import GenBank
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+
+def make_fasta(rec):
+    '''Creates fasta format from a record'''
+    gi = rec.annotations.get('gi', '')
+    org = rec.annotations.get('organism', '')
+    date = rec.annotations.get('date', '')
+    head = '>gi:%s, id:%s, org:%s, date:%s\n' % (gi, rec.id, org, date)
+    body = '\n'.join(textwrap.wrap(rec.seq.data, width=80))
+    return head, body
+
+
+if __name__ == '__main__':
+    mode = sys.argv[1]
+    text = sys.argv[2]
+    output_file = sys.argv[3]
+
+    print('Searching for %s <br>' % text)
+
+    # check if inputs are all numbers
+    try:
+        gi_list = text.split()
+        [int(_) for _ in gi_list]
+    except ValueError:
+        gi_list = GenBank.search_for(text, max_ids=10)
+
+    fp = open(output_file, 'wt')
+    record_parser = GenBank.FeatureParser()
+    ncbi_dict = GenBank.NCBIDictionary(mode, 'genbank', parser=record_parser)
+    for gid in gi_list:
+        res = ncbi_dict[gid]
+        head, body = make_fasta(res)
+        fp.write(head + body + '\n')
+        print(head)
+    fp.close()
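The genbank.xml tool below invokes this script as genbank.py $mode "$text" $output. A hypothetical direct invocation mirroring the tool's defaults (the accession 6273291 is the tool's default value; out.fasta is an example output name):

    # Illustrative invocation of genbank.py matching how genbank.xml calls it.
    import subprocess

    subprocess.check_call(['python', 'data_source/genbank.py',
                           'nucleotide',   # $mode: 'nucleotide' or 'protein'
                           '6273291',      # $text: GI/accession or free text
                           'out.fasta'])   # $output: FASTA is written here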
diff -r 000000000000 -r 7621d36a4e9c data_source/genbank.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/genbank.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,25 @@
+<tool id="genbank" name="Connect to Genbank" version="1.0.0">
+<!-- <description>queries genbank</description> -->
+    <command interpreter="python">genbank.py $mode "$text" $output</command>
+    <inputs>
+        <param name="mode" type="select">
+            <option value="nucleotide">nucleotide database</option>
+            <option value="protein">proteins database</option>
+            <label>Get sequences from the</label>
+        </param>
+        <param name="text" size="40" type="text" value="6273291">
+            <label>with accession ID</label>
+        </param>
+    </inputs>
+    <outputs>
+        <data format="fasta" name="output" />
+    </outputs>
+    <help>
+At the moment this tool allows the following simple searches:
+
+- by GI: **51594135**
+- by accession: **CF622840**
+- using text: **human hbb1** (this feature is experimental)
+    </help>
+
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/gramene_mart.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/gramene_mart.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,42 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+
+    TODO: Hack to get biomart to work - the 'add_to_URL' param can be eliminated when the Biomart team encodes URL prior to sending, meanwhile
+    everything including and beyond the first '&' is truncated from URL.  They said they'll let us know when this is fixed at their end.
+-->
+<tool name="GrameneMart" id="gramenemart" tool_type="data_source" version="1.0.1">
+    <description> Central server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://www.gramene.org/biomart/martview" check_values="false" method="get" target="_top">
+        <display>go to GrameneMart Central $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner/biomart" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="_export" missing="1" />
+                <value name="GALAXY_URL" missing="0" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="data_type" remote_name="exportView_outputformat" missing="tabular">
+            <value_translation>
+                <value galaxy_value="tabular" remote_value="TSV" />
+            </value_translation>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="get" />
+        <request_param galaxy_name="dbkey" remote_name="dbkey" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="Biomart query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/hapmapmart.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/hapmapmart.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,46 @@
+<?xml version="1.0"?>
+<!--
+    hacked from biomart.xml - testing hapmap biomart - problem is going to be converting these to lped/pbed
+    the data returned will be in all sorts of different shapes - and the sample ids need to be obtained separately
+    to create reliable pedigrees. eesh...
+
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+
+    TODO: Hack to get biomart to work - the 'add_to_URL' param can be eliminated when the Biomart team encodes URL prior to sending, meanwhile
+    everything including and beyond the first '&' is truncated from URL.  They said they'll let us know when this is fixed at their end.
+-->
+<tool name="HapMapMart" id="hapmapmart" tool_type="data_source" version="0.0.01">
+    <description>HapMap Biomart</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://hapmap.ncbi.nlm.nih.gov/biomart/martview" check_values="false" method="get" target="_top">
+        <display>go to HapMap BioMart $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner/hapmapmart" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="_export" missing="1" />
+                <value name="GALAXY_URL" missing="0" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="data_type" remote_name="exportView_outputformat" missing="tabular" >
+            <value_translation>
+                <value galaxy_value="tabular" remote_value="TSV" />
+            </value_translation>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="get" />
+        <request_param galaxy_name="dbkey" remote_name="dbkey" missing="hg18" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="human" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="HapMap query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/hbvar.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/hbvar.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+<tool name="HbVar" id="hbvar" tool_type="data_source" version="2.0.0">
+
+    <description>Human Hemoglobin Variants and Thalassemias</description>
+
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+
+    <inputs action="http://globin.bx.psu.edu/cgi-bin/hbvar/query_vars3" check_values="false" method="get" target="_top">
+        <display>go to HbVar database $GALAXY_URL $tool_id</display>
+    </inputs>
+
+    <uihints minwidth="800"/>
+
+    <outputs>
+        <data name="output" format="auto" />
+    </outputs>
+
+    <options sanitize="False" refresh="True"/>
+
+</tool>
+
diff -r 000000000000 -r 7621d36a4e9c data_source/hbvar_filter.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/hbvar_filter.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,81 @@
+# TODO: Set dbkey to proper UCSC build, if known
+import shutil
+import tempfile
+
+from six.moves.urllib.request import urlopen
+
+from galaxy import datatypes
+
+
+def exec_before_job( app, inp_data, out_data, param_dict, tool=None):
+    """Sets the name of the data"""
+    data_name = param_dict.get( 'name', 'HbVar query' )
+    data_type = param_dict.get( 'type', 'txt' )
+    if data_type == 'txt':
+        data_type = 'interval'  # All data is TSV, assume interval
+    name, data = next(iter(out_data.items()))
+    data = app.datatypes_registry.change_datatype(data, data_type)
+    data.name = data_name
+    out_data[name] = data
+
+
+def exec_after_process(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None):
+    """Verifies the data after the run"""
+
+    URL = param_dict.get( 'URL', None )
+    URL = URL + '&_export=1&GALAXY_URL=0'
+    if not URL:
+        raise Exception('Datasource has not sent back a URL parameter')
+
+    CHUNK_SIZE = 2**20  # 1Mb
+    MAX_SIZE = CHUNK_SIZE * 100
+
+    try:
+        page = urlopen(URL)
+    except Exception as exc:
+        raise Exception('Problems connecting to %s (%s)' % (URL, exc) )
+
+    data = next(iter(out_data.values()))
+
+    fp = open(data.file_name, 'wb')
+    size = 0
+    while 1:
+        chunk = page.read(CHUNK_SIZE)
+        if not chunk:
+            break
+        if size > MAX_SIZE:
+            raise Exception('----- maximum datasize exceeded ---')
+        size += len(chunk)
+        fp.write(chunk)
+
+    fp.close()
+    # Set meta data, format file to be valid interval type
+    if isinstance(data.datatype, datatypes.interval.Interval):
+        data.set_meta(first_line_is_header=True)
+        # check for missing meta data, if all there, comment first line and process file
+        if not data.missing_meta():
+            line_ctr = -1
+            temp = tempfile.NamedTemporaryFile('w')
+            temp_filename = temp.name
+            temp.close()
+            temp = open(temp_filename, 'w')
+            int(data.metadata.chromCol)
+            int(data.metadata.startCol)
+            int(data.metadata.strandCol)
+
+            for line in open(data.file_name, 'r'):
+                line_ctr += 1
+
+                fields = line.strip().split('\t')
+
+                temp.write("%s\n" % '\t'.join(fields))
+
+            temp.close()
+            shutil.move(temp_filename, data.file_name)
+
+    else:
+        data = app.datatypes_registry.change_datatype(data, 'tabular')
+    data.set_size()
+    data.set_peek()
+    app.model.context.add( data )
+    app.model.context.flush()
diff -r 000000000000 -r 7621d36a4e9c data_source/import.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/import.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+"""
+Script that imports locally stored data as a new dataset for the user
+Usage: import id outputfile
+"""
+from __future__ import print_function
+
+import os
+import sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+BUFFER = 1048576
+
+dataid = sys.argv[1]
+out_name = sys.argv[2]
+
+
+id2name = {
+    'eryth' : 'ErythPreCRMmm3_cusTrk.txt',
+    'cishg16' : 'ReglRegHBBhg16CusTrk.txt',
+    'cishg17' : 'ReglRegHBBhg17CusTrk.txt',
+    'exons' : 'ExonsKnownGenes_mm3.txt',
+    'krhg16' : 'known_regulatory_hg16.bed',
+    'krhg17' : 'known_regulatory_hg17.bed',
+    'tARhg16mmc' : 'hg16.mouse.t_AR.cold.bed',
+    'tARhg16mmm' : 'hg16.mouse.t_AR.medium.bed',
+    'tARhg16mmh' : 'hg16.mouse.t_AR.hot.bed',
+    'tARhg16rnc' : 'hg16.rat.t_AR.cold.bed',
+    'tARhg16rnm' : 'hg16.rat.t_AR.medium.bed',
+    'tARhg16rnh' : 'hg16.rat.t_AR.hot.bed',
+    'phastConsHg16' : 'phastConsMost_hg16.bed',
+    'omimhg16' : 'omimDisorders_hg16.tab',
+    'omimhg17' : 'omimDisorders_hg17.tab',
+}
+
+fname = id2name.get(dataid, '')
+if not fname:
+    print('Importing invalid data %s' % dataid)
+    sys.exit()
+else:
+    print('Imported %s' % fname)
+
+# this path is hardcoded
+inp_name = os.path.join('database', 'import', fname)
+
+try:
+    inp = open(inp_name, 'rt')
+except:
+    print('Could not find file %s' % inp_name)
+    sys.exit()
+
+out = open(out_name, 'wt')
+
+while 1:
+    data = inp.read(BUFFER)
+    if not data:
+        break
+    out.write(data)
+
+inp.close()
+out.close()
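import.py resolves its first argument through the id2name table above and copies the matching file from the hardcoded database/import/ directory. A hypothetical invocation ('eryth' is one of the valid ids above; output.bed is an example target path):

    # Illustrative call to import.py; assumes database/import/ is populated.
    import subprocess

    subprocess.check_call(['python', 'data_source/import.py',
                           'eryth',        # dataset id, looked up in id2name
                           'output.bed'])  # destination dataset file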
diff -r 000000000000 -r 7621d36a4e9c data_source/import.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/import.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,27 @@
+<tool id="Featured datasets4" name="Featured datasets" version="1.0.0">
+    <description>(PSU prepared queries)</description>
+    <command interpreter="python">import.py $data $output</command>
+    <inputs>
+        <display>$data</display>
+        <param name="data" type="select" display="radio">
+            <option value="eryth">Erythroid predicted cis-regulatory modules</option>
+            <option value="exons">Exons of protein-coding genes in the mouse genome, assembly mm3</option>
+            <option value="cishg16 ">Known cis-regulatory modules in the human HBB gene complex (hg16)</option>
+            <option value="cishg17">Known cis-regulatory modules in the human HBB gene complex (hg17)</option>
+            <option value="krhg16">Known regulatory regions (hg16)</option>
+            <option value="krhg17">Known regulatory regions (hg17)</option>
+            <option value="tARhg16mmc">Human (hg16) evolutionary cold region (vs mouse)</option>
+            <option value="tARhg16mmm">Human (hg16) evolutionary medium region (vs mouse)</option>
+            <option value="tARhg16mmh">Human (hg16) evolutionary hot region (vs mouse)</option>
+            <option value="tARhg16rnc">Human (hg16) evolutionary cold region (vs rat)</option>
+            <option value="tARhg16rnm">Human (hg16) evolutionary medium region (vs rat)</option>
+            <option value="tARhg16rnh">Human (hg16) evolutionary hot region (vs rat)</option>
+            <option value="phastConsHg16">phastCons hg16 (stringent, top ~5%) from UCSC</option>
+            <option value="omimhg16">OMIM disorders (hg16)</option>
+            <option value="omimhg17">OMIM disorders (hg17)</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data format="bed" name="output" />
+    </outputs>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/metabolicmine.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/metabolicmine.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<tool name="metabolicMine" id="metabolicmine" tool_type="data_source" version="1.0.0">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://www.metabolicmine.org/beta/begin.do" check_values="false" method="get">
+        <display>go to metabolicMine server $GALAXY_URL</display>
+    </inputs>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/microbial_import.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/microbial_import.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+"""
+Script that imports locally stored data as a new dataset for the user
+Usage: import id outputfile
+"""
+from __future__ import print_function
+
+import sys
+from shutil import copyfile
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+BUFFER = 1048576
+
+uids = sys.argv[1].split(",")
+out_file1 = sys.argv[2]
+
+# remove NONE from uids
+have_none = True
+while have_none:
+    try:
+        uids.remove('None')
+    except:
+        have_none = False
+
+
+# create dictionary keyed by uid of tuples of (displayName,filePath,build) for all files
+available_files = {}
+try:
+    filename = sys.argv[-1]
+    for i, line in enumerate( open( filename ) ):
+        if not line or line[0:1] == "#":
+            continue
+        fields = line.split('\t')
+        try:
+            info_type = fields.pop(0)
+
+            if info_type.upper() == "DATA":
+                uid = fields.pop(0)
+                org_num = fields.pop(0)
+                chr_acc = fields.pop(0)
+                feature = fields.pop(0)
+                filetype = fields.pop(0)
+                path = fields.pop(0).replace("\r", "").replace("\n", "")
+
+                file_type = filetype
+                build = org_num
+                description = uid
+            else:
+                continue
+        except:
+            continue
+
+        available_files[uid] = (description, path, build, file_type, chr_acc)
+except:
+    print("It appears that the configuration file for this tool is missing.", file=sys.stderr)
+
+# create list of tuples of (displayName,FileName,build) for desired files
+desired_files = []
+for uid in uids:
+    try:
+        desired_files.append(available_files[uid])
+    except:
+        continue
+
+# copy first file to contents of given output file
+file1_copied = False
+while not file1_copied:
+    try:
+        first_file = desired_files.pop(0)
+    except:
+        print("There were no valid files requested.", file=sys.stderr)
+        sys.exit()
+    file1_desc, file1_path, file1_build, file1_type, file1_chr_acc = first_file
+    try:
+        copyfile(file1_path, out_file1)
+        print("#File1\t" + file1_desc + "\t" + file1_chr_acc + "\t" + file1_build + "\t" + file1_type)
+        file1_copied = True
+    except:
+        print("The file specified is missing.", file=sys.stderr)
+        continue
+
+# Tell post-process filter where remaining files reside
+for extra_output in desired_files:
+    file_desc, file_path, file_build, file_type, file_chr_acc = extra_output
+    print("#NewFile\t" + file_desc + "\t" + file_chr_acc + "\t" + file_build + "\t" + file_path + "\t" + file_type)
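microbial_import.py hands extra outputs to the post-job hook (exec_after_process() in microbial_import_code.py, below) through specially prefixed stdout lines, as the two print() calls above show. A sketch of the two line formats and how the consumer unpacks them (the uid, accession, and path values are hypothetical, patterned on the loc-file examples in the code below):

    # Sketch of the stdout protocol between microbial_import.py (producer)
    # and exec_after_process() in microbial_import_code.py (consumer).
    file1 = '\t'.join(['#File1', '12521_12521_CDS', 'CP000315', '12521', 'bed'])
    newfile = '\t'.join(['#NewFile', '12521_12521_tRNA', 'CP000315', '12521',
                         '/path/to/CP000315.tRNA.bed', 'bed'])
    for line in (file1, newfile):
        fields = line.split('\t')
        if fields[0] == '#File1':      # primary output: already copied to $output
            description, chr_acc, dbkey, file_type = fields[1:5]
        elif fields[0] == '#NewFile':  # extra output: hook copies it from 'filepath'
            description, chr_acc, dbkey, filepath, file_type = fields[1:6]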
diff -r 000000000000 -r 7621d36a4e9c data_source/microbial_import.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/microbial_import.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,114 @@
+<tool id="microbial_import1" name="Get Microbial Data" version="1.0.0">
+    <command interpreter="python">microbial_import.py $CDS,$tRNA,$rRNA,$sequence,$GeneMark,$GeneMarkHMM,$Glimmer3 $output ${GALAXY_DATA_INDEX_DIR}/microbial_data.loc</command>
+    <inputs>
+        <param name="kingdom" type="select" label="Select the Desired Kingdom">
+            <options from_file="microbial_data.loc" startswith="ORG">
+                <column name="name" index="3"/>
+                <column name="value" index="3"/>
+                <filter type="unique_value" name="unique" column="3"/>
+            </options>
+        </param>
+        <param name="org" type="select" label="Select the Desired Organism">
+            <options from_file="microbial_data.loc" startswith="ORG">
+                <column name="name" index="2"/>
+                <column name="value" index="1"/>
+                <filter type="param_value" ref="kingdom" name="kingdom" column="3"/>
+                <filter type="sort_by" column="2"/>
+            </options>
+        </param>
+        <param name="CDS" type="select" label="Select Desired Coding Sequences" display="checkboxes" multiple="True">
+            <options from_file="microbial_data.loc" startswith="DATA">
+                <column name="name" index="3"/>
+                <column name="value" index="1"/>
+                <column name="feature" index="4"/>
+                <filter type="param_value" ref="org" name="kingdom" column="2"/>
+                <filter type="static_value" name="feature" value="CDS" column="4"/>
+            </options>
+        </param>
+        <param name="tRNA" type="select" label="Select Desired tRNA" display="checkboxes" multiple="True">
+            <options from_file="microbial_data.loc" startswith="DATA">
+                <column name="name" index="3"/>
+                <column name="value" index="1"/>
+                <column name="feature" index="4"/>
+                <filter type="param_value" ref="org" name="kingdom" column="2"/>
+                <filter type="static_value" name="feature" value="tRNA" column="4"/>
+            </options>
+        </param>
+        <param name="rRNA" type="select" label="Select Desired rRNA" display="checkboxes" multiple="True">
+            <options from_file="microbial_data.loc" startswith="DATA">
+                <column name="name" index="3"/>
+                <column name="value" index="1"/>
+                <column name="feature" index="4"/>
+                <filter type="param_value" ref="org" name="kingdom" column="2"/>
+                <filter type="static_value" name="feature" value="rRNA" column="4"/>
+            </options>
+        </param>
+        <param name="sequence" type="select" label="Select Desired DNA Sequences" display="checkboxes" multiple="True">
+            <options from_file="microbial_data.loc" startswith="DATA">
+                <column name="name" index="3"/>
+                <column name="value" index="1"/>
+                <column name="feature" index="4"/>
+                <filter type="param_value" ref="org" name="kingdom" column="2"/>
+                <filter type="static_value" name="feature" value="sequence" column="4"/>
+            </options>
+        </param>
+        <param name="GeneMark" type="select" label="Select Desired GeneMark Annotations" display="checkboxes" multiple="True">
+            <options from_file="microbial_data.loc" startswith="DATA">
+                <column name="name" index="3"/>
+                <column name="value" index="1"/>
+                <column name="feature" index="4"/>
+                <filter type="param_value" ref="org" name="kingdom" column="2"/>
+                <filter type="static_value" name="feature" value="GeneMark" column="4"/>
+            </options>
+        </param>
+        <param name="GeneMarkHMM" type="select" label="Select Desired GeneMarkHMM Annotations" display="checkboxes" multiple="True">
+            <options from_file="microbial_data.loc" startswith="DATA">
+                <column name="name" index="3"/>
+                <column name="value" index="1"/>
+                <column name="feature" index="4"/>
+                <filter type="param_value" ref="org" name="kingdom" column="2"/>
+                <filter type="static_value" name="feature" value="GeneMarkHMM" column="4"/>
+            </options>
+        </param>
+        <param name="Glimmer3" type="select" label="Select Desired Glimmer3 Annotations" display="checkboxes" multiple="True">
+            <options from_file="microbial_data.loc" startswith="DATA">
+                <column name="name" index="3"/>
+                <column name="value" index="1"/>
+                <column name="feature" index="4"/>
+                <filter type="param_value" ref="org" name="kingdom" column="2"/>
+                <filter type="static_value" name="feature" value="Glimmer3" column="4"/>
+            </options>
+        </param>
+    </inputs>
+    <outputs>
+        <data format="bed" name="output"/>
+    </outputs>
+    <code file="microbial_import_code.py"/>
+    <help>
+
+This tool will allow you to obtain various genomic datasets for any completed Microbial Genome Project as listed at NCBI_.
+
+.. _NCBI: http://www.ncbi.nlm.nih.gov/genomes/lproks.cgi?view=1
+
+Current datasets available include
+  1. CDS
+  2. tRNA
+  3. rRNA
+  4. FASTA Sequences
+  5. GeneMark Annotations
+  6. GeneMarkHMM Annotations
+  7. Glimmer3 Annotations
+
+-----
+
+Organisms in **bold** are available at the UCSC Browser.
+
+-----
+
+.. class:: infomark
+
+**Note:** Having trouble locating your organism?  Click here_ for a list of available species and their location.
+
+.. _here: https://wiki.galaxyproject.org/Main/Data%20Libraries/Microbes
+    </help>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/microbial_import_code.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/microbial_import_code.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,158 @@
+from __future__ import print_function
+
+from shutil import copyfile
+
+from galaxy import tools
+
+
+def load_microbial_data( GALAXY_DATA_INDEX_DIR, sep='\t' ):
+    # FIXME: this function is duplicated in the DynamicOptions class.  It is used here only to
+    # set data.name in exec_after_process().
+    microbe_info = {}
+    orgs = {}
+
+    filename = "%s/microbial_data.loc" % GALAXY_DATA_INDEX_DIR
+    for i, line in enumerate( open( filename ) ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ):
+            fields = line.split( sep )
+            # read each line, if not enough fields, go to next line
+            try:
+                info_type = fields.pop(0)
+                if info_type.upper() == "ORG":
+                    # ORG 12521 Clostridium perfringens SM101 bacteria Firmicutes CP000312,CP000313,CP000314,CP000315 http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids=12521
+                    org_num = fields.pop(0)
+                    name = fields.pop(0)
+                    kingdom = fields.pop(0)
+                    group = fields.pop(0)
+                    chromosomes = fields.pop(0)
+                    info_url = fields.pop(0)
+                    link_site = fields.pop(0)
+                    if org_num not in orgs:
+                        orgs[ org_num ] = {}
+                        orgs[ org_num ][ 'chrs' ] = {}
+                    orgs[ org_num ][ 'name' ] = name
+                    orgs[ org_num ][ 'kingdom' ] = kingdom
+                    orgs[ org_num ][ 'group' ] = group
+                    orgs[ org_num ][ 'chromosomes' ] = chromosomes
+                    orgs[ org_num ][ 'info_url' ] = info_url
+                    orgs[ org_num ][ 'link_site' ] = link_site
+                elif info_type.upper() == "CHR":
+                    # CHR 12521 CP000315 Clostridium perfringens phage phiSM101, complete genome 38092 110684521 CP000315.1
+                    org_num = fields.pop(0)
+                    chr_acc = fields.pop(0)
+                    name = fields.pop(0)
+                    length = fields.pop(0)
+                    gi = fields.pop(0)
+                    gb = fields.pop(0)
+                    info_url = fields.pop(0)
+                    chr = {}
+                    chr[ 'name' ] = name
+                    chr[ 'length' ] = length
+                    chr[ 'gi' ] = gi
+                    chr[ 'gb' ] = gb
+                    chr[ 'info_url' ] = info_url
+                    if org_num not in orgs:
+                        orgs[ org_num ] = {}
+                        orgs[ org_num ][ 'chrs' ] = {}
+                    orgs[ org_num ][ 'chrs' ][ chr_acc ] = chr
+                elif info_type.upper() == "DATA":
+                    # DATA 12521_12521_CDS 12521 CP000315 CDS bed /home/djb396/alignments/playground/bacteria/12521/CP000315.CDS.bed
+                    uid = fields.pop(0)
+                    org_num = fields.pop(0)
+                    chr_acc = fields.pop(0)
+                    feature = fields.pop(0)
+                    filetype = fields.pop(0)
+                    path = fields.pop(0)
+                    data = {}
+                    data[ 'filetype' ] = filetype
+                    data[ 'path' ] = path
+                    data[ 'feature' ] = feature
+
+                    if org_num not in orgs:
+                        orgs[ org_num ] = {}
+                        orgs[ org_num ][ 'chrs' ] = {}
+                    if 'data' not in orgs[ org_num ][ 'chrs' ][ chr_acc ]:
+                        orgs[ org_num ][ 'chrs' ][ chr_acc ][ 'data' ] = {}
+                    orgs[ org_num ][ 'chrs' ][ chr_acc ][ 'data' ][ uid ] = data
+                else:
+                    continue
+            except:
+                continue
+    for org_num in orgs:
+        org = orgs[ org_num ]
+        if org[ 'kingdom' ] not in microbe_info:
+            microbe_info[ org[ 'kingdom' ] ] = {}
+        if org_num not in microbe_info[ org[ 'kingdom' ] ]:
+            microbe_info[ org[ 'kingdom' ] ][org_num] = org
+    return microbe_info
+
+
+# post processing, set build for data and add additional data to history
+def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
+    base_dataset = next(iter(out_data.values()))
+    history = base_dataset.history
+    if history is None:
+        print("unknown history!")
+        return
+    kingdom = param_dict.get( 'kingdom', None )
+    org = param_dict.get( 'org', None )
+
+    # if not (kingdom or group or org):
+    if not (kingdom or org):
+        print("Parameters are not available.")
+    # workflow passes galaxy.tools.parameters.basic.UnvalidatedValue instead of values
+    if isinstance( kingdom, tools.parameters.basic.UnvalidatedValue ):
+        kingdom = kingdom.value
+    if isinstance( org, tools.parameters.basic.UnvalidatedValue ):
+        org = org.value
+
+    GALAXY_DATA_INDEX_DIR = app.config.tool_data_path
+    microbe_info = load_microbial_data( GALAXY_DATA_INDEX_DIR, sep='\t' )
+    split_stdout = stdout.split("\n")
+    basic_name = ""
+    for line in split_stdout:
+        fields = line.split("\t")
+        if fields[0] == "#File1":
+            description = fields[1]
+            chr = fields[2]
+            dbkey = fields[3]
+            file_type = fields[4]
+            data = next(iter(out_data.values()))
+            data.set_size()
+            basic_name = data.name
+            data.name = data.name + " (" + microbe_info[kingdom][org]['chrs'][chr]['data'][description]['feature'] + " for " + microbe_info[kingdom][org]['name'] + ":" + chr + ")"
+            data.dbkey = dbkey
+            data.info = data.name
+            data = app.datatypes_registry.change_datatype( data, file_type )
+            data.init_meta()
+            data.set_peek()
+            app.model.context.add( data )
+            app.model.context.flush()
+        elif fields[0] == "#NewFile":
+            description = fields[1]
+            chr = fields[2]
+            dbkey = fields[3]
+            filepath = fields[4]
+            file_type = fields[5]
+            newdata = app.model.HistoryDatasetAssociation( create_dataset=True, sa_session=app.model.context )  # This import should become a library
+            newdata.set_size()
+            newdata.extension = file_type
+            newdata.name = basic_name + " (" + microbe_info[kingdom][org]['chrs'][chr]['data'][description]['feature'] + " for " + microbe_info[kingdom][org]['name'] + ":" + chr + ")"
+            app.model.context.add( newdata )
+            app.model.context.flush()
+            app.security_agent.copy_dataset_permissions( base_dataset.dataset, newdata.dataset )
+            history.add_dataset( newdata )
+            app.model.context.add( history )
+            app.model.context.flush()
+            try:
+                copyfile(filepath, newdata.file_name)
+                newdata.info = newdata.name
+                newdata.state = newdata.states.OK
+            except:
+                newdata.info = "The requested file is missing from the system."
+                newdata.state = newdata.states.ERROR
+            newdata.dbkey = dbkey
+            newdata.init_meta()
+            newdata.set_peek()
+            app.model.context.flush()
diff -r 000000000000 -r 7621d36a4e9c data_source/modmine.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/modmine.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="modENCODE modMine" id="modmine" tool_type="data_source" version="1.0.0">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://intermine.modencode.org/" check_values="false" method="get">
+        <display>go to modENCODE modMine server $GALAXY_URL</display>
+    </inputs>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
+
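The URL_method contract described in the comment above can be sketched as follows; this is an illustration of the described behavior only, not Galaxy's actual data_source.py::

    # Sketch only: 'url' and 'params' come back in the remote server's
    # initial response; with 'get' the URL is fetched as-is, with 'post'
    # the extra params are form-encoded and posted to it.
    from six.moves.urllib.parse import urlencode
    from six.moves.urllib.request import urlopen

    def fetch(url, url_method, params):
        if url_method == 'get':
            return urlopen(url)
        return urlopen(url, urlencode(params).encode('utf-8'))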
diff -r 000000000000 -r 7621d36a4e9c data_source/mousemine.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/mousemine.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,35 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="MouseMine" id="mousemine" tool_type="data_source" version="1.0.0">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://www.mousemine.org/mousemine/begin.do" check_values="false" method="get">
+        <display>go to MouseMine server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=mousemine" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="db" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="MouseMine query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" >
+            <value_translation>
+                <value galaxy_value="auto" remote_value="txt" /> <!-- intermine currently always provides 'txt', make this auto detect -->
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
+
diff -r 000000000000 -r 7621d36a4e9c data_source/ratmine.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/ratmine.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,34 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="Ratmine" id="ratmine" tool_type="data_source" version="1.0.0">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://ratmine.mcw.edu/ratmine/begin.do" check_values="false" method="get">
+        <display>go to Ratmine server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=ratmine" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="db" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="Ratmine query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" >
+            <value_translation>
+                <value galaxy_value="auto" remote_value="txt" /> <!-- intermine currently always provides 'txt', make this auto detect -->
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/ucsc_tablebrowser.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/ucsc_tablebrowser.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,46 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="UCSC Main" id="ucsc_table_direct1" tool_type="data_source" version="1.0.0">
+    <description>table browser</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="https://genome.ucsc.edu/cgi-bin/hgTables" check_values="false" method="get">
+        <display>go to UCSC Table Browser $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner" />
+        <param name="tool_id" type="hidden" value="ucsc_table_direct1" />
+        <param name="sendToGalaxy" type="hidden" value="1" />
+        <param name="hgta_compressType" type="hidden" value="none" />
+        <param name="hgta_outputType" type="hidden" value="bed" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="db" missing="?" />
+        <request_param galaxy_name="organism" remote_name="org" missing="unknown species" />
+        <request_param galaxy_name="table" remote_name="hgta_table" missing="unknown table" />
+        <request_param galaxy_name="description" remote_name="hgta_regionType" missing="no description" />
+        <request_param galaxy_name="data_type" remote_name="hgta_outputType" missing="auto" >
+            <value_translation>
+                <value galaxy_value="auto" remote_value="primaryTable" />
+                <value galaxy_value="auto" remote_value="selectedFields" />
+                <value galaxy_value="wig" remote_value="wigData" />
+                <value galaxy_value="interval" remote_value="tab" />
+                <value galaxy_value="html" remote_value="hyperlinks" />
+                <value galaxy_value="fasta" remote_value="sequence" />
+                <value galaxy_value="gtf" remote_value="gff" />
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" label="${tool.name} on ${organism}: ${table} (#if $description == 'range' then $getVar( 'position', 'unknown position' ) else $description#)"/>
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+    <citations>
+        <citation type="doi">10.1093/database/bar011</citation>
+        <citation type="doi">10.1101/gr.229102</citation>
+    </citations>
+</tool>
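The data_type translation above is a plain remote-to-Galaxy format map; restated in Python for reference (it mirrors the <value_translation> block exactly, with values outside the map falling back to 'auto' sniffing)::

    # Same mapping as the XML above: UCSC hgta_outputType -> Galaxy datatype.
    UCSC_TO_GALAXY = {
        'primaryTable': 'auto', 'selectedFields': 'auto', 'wigData': 'wig',
        'tab': 'interval', 'hyperlinks': 'html', 'sequence': 'fasta', 'gff': 'gtf',
    }
    assert UCSC_TO_GALAXY.get('wigData', 'auto') == 'wig'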
diff -r 000000000000 -r 7621d36a4e9c data_source/ucsc_tablebrowser_archaea.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/ucsc_tablebrowser_archaea.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,47 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="UCSC Archaea" id="ucsc_table_direct_archaea1" tool_type="data_source" version="1.0.0">
+    <description>table browser</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://archaea.ucsc.edu/cgi-bin/hgTables" check_values="false" method="get">
+        <display>go to UCSC Table Browser $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner" />
+        <param name="tool_id" type="hidden" value="ucsc_table_direct_archaea1" />
+        <param name="sendToGalaxy" type="hidden" value="1" />
+        <param name="hgta_compressType" type="hidden" value="none" />
+        <param name="hgta_outputType" type="hidden" value="bed" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="db" missing="?" />
+        <request_param galaxy_name="organism" remote_name="org" missing="unknown species" />
+        <request_param galaxy_name="table" remote_name="hgta_table" missing="unknown table" />
+        <request_param galaxy_name="description" remote_name="hgta_regionType" missing="no description" />
+        <request_param galaxy_name="data_type" remote_name="hgta_outputType" missing="auto" >
+            <value_translation>
+                <value galaxy_value="auto" remote_value="primaryTable" />
+                <value galaxy_value="auto" remote_value="selectedFields" />
+                <value galaxy_value="wig" remote_value="wigData" />
+                <value galaxy_value="interval" remote_value="tab" />
+                <value galaxy_value="html" remote_value="hyperlinks" />
+                <value galaxy_value="fasta" remote_value="sequence" />
+                <value galaxy_value="gtf" remote_value="gff" />
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" label="${tool.name} on ${organism}: ${table} (#if $description == 'range' then $getVar( 'position', 'unknown position' ) else $description#)"/>
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+    <citations>
+        <citation type="doi">10.1093/database/bar011</citation>
+        <citation type="doi">10.1101/gr.229102</citation>
+        <citation type="doi">10.1093/nar/gkj134</citation>
+    </citations>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/ucsc_tablebrowser_test.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/ucsc_tablebrowser_test.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,46 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="UCSC Test" id="ucsc_table_direct_test1" tool_type="data_source" version="1.0.0">
+    <description>table browser</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://genome-test.cse.ucsc.edu/cgi-bin/hgTables" check_values="false" method="get">
+        <display>go to UCSC Table Browser $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner" />
+        <param name="tool_id" type="hidden" value="ucsc_table_direct_test1" />
+        <param name="sendToGalaxy" type="hidden" value="1" />
+        <param name="hgta_compressType" type="hidden" value="none" />
+        <param name="hgta_outputType" type="hidden" value="bed" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="db" missing="?" />
+        <request_param galaxy_name="organism" remote_name="org" missing="unknown species" />
+        <request_param galaxy_name="table" remote_name="hgta_table" missing="unknown table" />
+        <request_param galaxy_name="description" remote_name="hgta_regionType" missing="no description" />
+        <request_param galaxy_name="data_type" remote_name="hgta_outputType" missing="auto" >
+            <value_translation>
+                <value galaxy_value="auto" remote_value="primaryTable" />
+                <value galaxy_value="auto" remote_value="selectedFields" />
+                <value galaxy_value="wig" remote_value="wigData" />
+                <value galaxy_value="interval" remote_value="tab" />
+                <value galaxy_value="html" remote_value="hyperlinks" />
+                <value galaxy_value="fasta" remote_value="sequence" />
+                <value galaxy_value="gtf" remote_value="gff" />
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" label="${tool.name} on ${organism}: ${table} (#if $description == 'range' then $getVar( 'position', 'unknown position' ) else $description#)"/>
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+    <citations>
+        <citation type="doi">10.1093/database/bar011</citation>
+        <citation type="doi">10.1101/gr.229102</citation>
+    </citations>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/upload.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/upload.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,425 @@
+#!/usr/bin/env python
+# Processes uploads from the user.
+
+# WARNING: Changes in this tool (particularly as related to parsing) may need
+# to be reflected in galaxy.web.controllers.tool_runner and galaxy.tools
+from __future__ import print_function
+
+import codecs
+import gzip
+import os
+import shutil
+import sys
+import tempfile
+import zipfile
+from json import dumps, loads
+
+from six.moves.urllib.request import urlopen
+
+from galaxy import util
+from galaxy.datatypes import sniff
+from galaxy.datatypes.binary import Binary
+from galaxy.datatypes.registry import Registry
+from galaxy.util import multi_byte
+from galaxy.util.checkers import check_binary, check_bz2, check_gzip, check_html, check_zip
+from galaxy.util.image_util import get_image_ext
+
+
+try:
+    import bz2
+except:
+    bz2 = None
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+
+def stop_err( msg, ret=1 ):
+    sys.stderr.write( msg )
+    sys.exit( ret )
+
+
+def file_err( msg, dataset, json_file ):
+    json_file.write( dumps( dict( type='dataset',
+                                  ext='data',
+                                  dataset_id=dataset.dataset_id,
+                                  stderr=msg ) ) + "\n" )
+    # never remove a server-side upload
+    if dataset.type in ( 'server_dir', 'path_paste' ):
+        return
+    try:
+        os.remove( dataset.path )
+    except:
+        pass
+
+
+def safe_dict(d):
+    """
+    Recursively clone json structure with UTF-8 dictionary keys
+    http://mellowmachines.com/blog/2009/06/exploding-dictionary-with-unicode-keys-as-python-arguments/
+    """
+    if isinstance(d, dict):
+        return dict([(k.encode('utf-8'), safe_dict(v)) for k, v in d.items()])
+    elif isinstance(d, list):
+        return [safe_dict(x) for x in d]
+    else:
+        return d
+
+
+def parse_outputs( args ):
+    rval = {}
+    for arg in args:
+        id, files_path, path = arg.split( ':', 2 )
+        rval[int( id )] = ( path, files_path )
+    return rval
+
+
+def add_file( dataset, registry, json_file, output_path ):
+    data_type = None
+    line_count = None
+    converted_path = None
+    stdout = None
+    link_data_only = dataset.get( 'link_data_only', 'copy_files' )
+    in_place = dataset.get( 'in_place', True )
+    purge_source = dataset.get( 'purge_source', True )
+    try:
+        ext = dataset.file_type
+    except AttributeError:
+        file_err( 'Unable to process uploaded file, missing file_type parameter.', dataset, json_file )
+        return
+
+    if dataset.type == 'url':
+        try:
+            page = urlopen( dataset.path )  # page will be .close()ed by sniff methods
+            temp_name, dataset.is_multi_byte = sniff.stream_to_file( page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers( page.headers ) )
+        except Exception as e:
+            file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
+            return
+        dataset.path = temp_name
+    # See if we have an empty file
+    if not os.path.exists( dataset.path ):
+        file_err( 'Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file )
+        return
+    if not os.path.getsize( dataset.path ) > 0:
+        file_err( 'The uploaded file is empty', dataset, json_file )
+        return
+    if not dataset.type == 'url':
+        # Already set is_multi_byte above if type == 'url'
+        try:
+            dataset.is_multi_byte = multi_byte.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) )
+        except UnicodeDecodeError as e:
+            dataset.is_multi_byte = False
+    # Is dataset an image?
+    i_ext = get_image_ext( dataset.path )
+    if i_ext:
+        ext = i_ext
+        data_type = ext
+    # Is dataset content multi-byte?
+    elif dataset.is_multi_byte:
+        data_type = 'multi-byte char'
+        ext = sniff.guess_ext( dataset.path, registry.sniff_order, is_multi_byte=Tr
[... middle of diff truncated in source ...]
[...]t.get('uuid')
+    json_file.write( dumps( info ) + "\n" )
+
+    if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ):
+        # Groom the dataset content if necessary
+        datatype.groom_dataset_content( output_path )
+
+
+def add_composite_file( dataset, json_file, output_path, files_path ):
+    if dataset.composite_files:
+        os.mkdir( files_path )
+        for name, value in dataset.composite_files.items():
+            value = util.bunch.Bunch( **value )
+            if dataset.composite_file_paths[ value.name ] is None and not value.optional:
+                file_err( 'A required composite data file was not provided (%s)' % name, dataset, json_file )
+                break
+            elif dataset.composite_file_paths[value.name] is not None:
+                dp = dataset.composite_file_paths[value.name][ 'path' ]
+                isurl = dp.find('://') != -1  # todo fixme
+                if isurl:
+                    try:
+                        temp_name, dataset.is_multi_byte = sniff.stream_to_file( urlopen( dp ), prefix='url_paste' )
+                    except Exception as e:
+                        file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file )
+                        return
+                    dataset.path = temp_name
+                    dp = temp_name
+                if not value.is_binary:
+                    tmpdir = output_adjacent_tmpdir( output_path )
+                    tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
+                    if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ):
+                        sniff.convert_newlines_sep2tabs( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
+                    else:
+                        sniff.convert_newlines( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
+                shutil.move( dp, os.path.join( files_path, name ) )
+        # Move the dataset to its "real" path
+        shutil.move( dataset.primary_file, output_path )
+        # Write the job info
+        info = dict( type='dataset',
+                     dataset_id=dataset.dataset_id,
+                     stdout='uploaded %s file' % dataset.file_type )
+        json_file.write( dumps( info ) + "\n" )
+
+
+def output_adjacent_tmpdir( output_path ):
+    """ For temp files that will ultimately be moved to output_path anyway
+    just create the file directly in output_path's directory so shutil.move
+    will work optimially.
+    """
+    return os.path.dirname( output_path )
+
+
+def __main__():
+
+    if len( sys.argv ) < 4:
+        print('usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...', file=sys.stderr)
+        sys.exit( 1 )
+
+    output_paths = parse_outputs( sys.argv[4:] )
+    json_file = open( 'galaxy.json', 'w' )
+
+    registry = Registry()
+    registry.load_datatypes( root_dir=sys.argv[1], config=sys.argv[2] )
+
+    for line in open( sys.argv[3], 'r' ):
+        dataset = loads( line )
+        dataset = util.bunch.Bunch( **safe_dict( dataset ) )
+        try:
+            output_path = output_paths[int( dataset.dataset_id )][0]
+        except:
+            print('Output path for dataset %s not found on command line' % dataset.dataset_id, file=sys.stderr)
+            sys.exit( 1 )
+        if dataset.type == 'composite':
+            files_path = output_paths[int( dataset.dataset_id )][1]
+            add_composite_file( dataset, json_file, output_path, files_path )
+        else:
+            add_file( dataset, registry, json_file, output_path )
+
+    # clean up paramfile
+    # TODO: this will not work when running as the actual user unless the
+    # parent directory is writable by the user.
+    try:
+        os.remove( sys.argv[3] )
+    except:
+        pass
+
+
+if __name__ == '__main__':
+    __main__()
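Each trailing <output spec> argument consumed by parse_outputs() above is a colon-separated triple, split on the first two colons only; a sketch of the round trip (the paths are made up for illustration)::

    # 'dataset_id:files_path:path', as parsed by parse_outputs() above.
    spec = '7:/jobs/7/extra_files:/jobs/7/dataset_7.dat'
    dataset_id, files_path, path = spec.split(':', 2)
    outputs = {int(dataset_id): (path, files_path)}  # keyed by dataset id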
diff -r 000000000000 -r 7621d36a4e9c data_source/upload.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/upload.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,232 @@
+<?xml version="1.0"?>
+
+<tool name="Upload File" id="upload1" version="1.1.4" workflow_compatible="false">
+  <description>
+    from your computer
+  </description>
+  <action module="galaxy.tools.actions.upload" class="UploadToolAction"/>
+  <requirements>
+    <requirement type="package">samtools</requirement>
+  </requirements>
+  <command interpreter="python">
+    upload.py $GALAXY_ROOT_DIR $GALAXY_DATATYPES_CONF_FILE $paramfile
+    #set $outnum = 0
+    #while $varExists('output%i' % $outnum):
+        #set $output = $getVar('output%i' % $outnum)
+        #set $outnum += 1
+        #set $file_name = $output.file_name
+        ## FIXME: This is not future-proof for other uses of external_filename (other than for use by the library upload's "link data" feature)
+        #if $output.dataset.dataset.external_filename:
+            #set $file_name = "None"
+        #end if
+        ${output.dataset.dataset.id}:${output.files_path}:${file_name}
+    #end while
+  </command>
+  <inputs nginx_upload="true">
+    <param name="file_type" type="select" label="File Format" help="Which format? See help below">
+      <options from_parameter="tool.app.datatypes_registry.upload_file_formats" transform_lines="[ &quot;%s%s%s&quot; % ( line, self.separator, line ) for line in obj ]">
+        <column name="value" index="1"/>
+        <column name="name" index="0"/>
+        <filter type="sort_by" column="0"/>
+        <filter type="add_value" name="Auto-detect" value="auto" index="0"/>
+      </options>
+    </param>
+    <param name="async_datasets" type="hidden" value="None"/>
+    <upload_dataset name="files" title="Specify Files for Dataset" file_type_name="file_type" metadata_ref="files_metadata">
+      <param name="file_data" type="file" size="30" label="File" ajax-upload="true" help="TIP: Due to browser limitations, uploading files larger than 2GB is guaranteed to fail. To upload large files, use the URL method (below) or FTP (if enabled by the site administrator).">
+      </param>
+      <param name="url_paste" type="text" area="true" size="5x35" label="URL/Text" help="Here you may specify a list of URLs (one per line) or paste the contents of a file."/>
+      <param name="ftp_files" type="ftpfile" label="Files uploaded via FTP"/>
+      <!-- Swap the following parameter for the select one that follows to
+           enable the to_posix_lines option in the Web GUI. See Bitbucket
+           Pull Request 171 for more information. -->
+      <param name="uuid" type="hidden" required="False" />
+      <param name="to_posix_lines" type="hidden" value="Yes" />
+      <!--
+      <param name="to_posix_lines" type="select" display="checkboxes" multiple="True" label="Convert universal line endings to Posix line endings" help="Turn this option off if you upload a gzip, bz2 or zip archive which contains a binary file." value="Yes">
+        <option value="Yes" selected="true">Yes</option>
+      </param>
+      -->
+      <param name="space_to_tab" type="select" display="checkboxes" multiple="True" label="Convert spaces to tabs" help="Use this option if you are entering intervals by hand.">
+        <option value="Yes">Yes</option>
+      </param>
+      <param name="NAME" type="hidden" help="Name for dataset in upload"></param>
+    </upload_dataset>
+    <param name="dbkey" type="genomebuild" label="Genome" />
+    <conditional name="files_metadata" value_from="self:app.datatypes_registry.get_upload_metadata_params" value_ref="file_type" value_ref_in_group="False" />
+    <!-- <param name="other_dbkey" type="text" label="Or user-defined Genome" /> -->
+  </inputs>
+  <help>
+
+**Auto-detect**
+
+The system will attempt to detect Axt, Fasta, Fastqsolexa, Gff, Gff3, Html, Lav, Maf, Tabular, Wiggle, Bed and Interval (Bed with headers) formats. If your file is not detected properly as one of the known formats, it most likely means that it has some format problems (e.g., different number of columns on di
[... middle of diff truncated in source ...]
[...]eme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser.
+  - blockCount - The number of blocks (exons) in the BED line.
+  - blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
+  - blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
+
+* Example::
+
+    chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512
+    chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601
+
+-----
+
+**Fasta**
+
+A sequence in FASTA format consists of a single-line description, followed by lines of sequence data. The first character of the description line is a greater-than (">") symbol in the first column. All lines should be shorter than 80 characters::
+
+    >sequence1
+    atgcgtttgcgtgc
+    gtcggtttcgttgc
+    >sequence2
+    tttcgtgcgtatag
+    tggcgcggtga
+
+-----
+
+**FastqSolexa**
+
+FastqSolexa is the Illumina (Solexa) variant of the Fastq format, which stores sequences and quality scores in a single file::
+
+    @seq1
+    GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT
+    +seq1
+    hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh
+    @seq2
+    GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG
+    +seq2
+    hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO
+
+Or::
+
+    @seq1
+    GAATTGATCAGGACATAGGACAACTGTAGGCACCAT
+    +seq1
+    40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4
+    @seq2
+    GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG
+    +seq2
+    40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9
+
+-----
+
+**Gff**
+
+GFF lines have nine required fields that must be tab-separated.
+
+-----
+
+**Gff3**
+
+The GFF3 format addresses the most common extensions to GFF, while preserving backward compatibility with previous formats.
+
+-----
+
+**Interval (Genomic Intervals)**
+
+- Tab delimited format (tabular)
+- File must start with definition line in the following format (columns may be in any order).::
+
+    #CHROM START END STRAND
+
+- CHROM - The name of the chromosome (e.g. chr3, chrY, chr2_random) or contig (e.g. ctgY1).
+- START - The starting position of the feature in the chromosome or contig. The first base in a chromosome is numbered 0.
+- END - The ending position of the feature in the chromosome or contig. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
+- STRAND - Defines the strand - either '+' or '-'.
+
+- Example::
+
+    #CHROM START END STRAND NAME COMMENT
+    chr1 10 100 + exon myExon
+    chrX 1000 10050 - gene myGene
+
+-----
+
+**Lav**
+
+Lav is the primary output format for BLASTZ. The first line of a .lav file begins with #:lav..
+
+-----
+
+**MAF**
+
+TBA and multiz multiple alignment format. The first line of a .maf file begins with ##maf. This word is followed by white-space-separated "variable=value" pairs. There should be no white space surrounding the "=".
+
+-----
+
+**Scf**
+
+A binary sequence file in 'scf' format with a '.scf' file extension. You must manually select this 'File Format' when uploading the file.
+
+-----
+
+**Sff**
+
+A binary file in 'Standard Flowgram Format' with a '.sff' file extension.
+
+-----
+
+**Tabular (tab delimited)**
+
+Any data in tab delimited format (tabular)
+
+-----
+
+**Table (delimiter-separated)**
+
+Any delimiter-separated tabular data (CSV or TSV).
+
+-----
+
+**Wig**
+
+The wiggle format is line-oriented. Wiggle data is preceded by a track definition line, which adds a number of options for controlling the default display of this track.
+
+-----
+
+**Other text type**
+
+Any text file
+
+  </help>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/worm_modencode.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/worm_modencode.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<tool name="modENCODE worm" id="modENCODEworm" tool_type="data_source" version="1.0.1">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://gbrowse.modencode.org/fgb2/gbrowse/worm" check_values="false" target="_top">
+        <display>go to modENCODE worm server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=modENCODEworm" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="dbkey" remote_name="dbkey" missing="ce10" >
+            <value_translation>
+                <value galaxy_value="ce10" remote_value="worm" />
+            </value_translation>
+        </request_param>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="d" missing="" />
+                <value name="dbkey" missing="ce10" />
+                <value name="q" missing="" />
+                <value name="s" missing="" />
+                <value name="t" missing="" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" label="${tool.name} on $getVar( 'q', 'unknown position' )"/>
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
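The <append_param> block above reassembles selected remote fields into the callback URL's query string (separator '&amp;', first separator '?', name and value joined by '='); roughly, with illustrative values::

    # Rough shape of the URL assembled by <append_param> above; values are examples.
    base = 'http://gbrowse.modencode.org/fgb2/gbrowse/worm'
    fields = [('d', 'track'), ('dbkey', 'ce10'), ('q', 'chrI:1..1000'), ('s', ''), ('t', '')]
    url = base + '?' + '&'.join('%s=%s' % (k, v) for k, v in fields)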
diff -r 000000000000 -r 7621d36a4e9c data_source/wormbase.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/wormbase.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<tool name="WormBase" id="wormbase" tool_type="data_source" version="1.0.1">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://www.wormbase.org/tools/genome/gbrowse/c_elegans/" check_values="false" target="_top">
+        <display>go to Wormbase server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=wormbase" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="d" missing="" />
+                <value name="dbkey" missing="" />
+                <value name="q" missing="" />
+                <value name="s" missing="" />
+                <value name="t" missing="" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" label="${tool.name} on $getVar( 'q', 'unknown position' )"/>
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/wormbase_test.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/wormbase_test.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<tool name="Wormbase" id="wormbase_test" tool_type="data_source" version="1.0.0">
+    <description>test server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://dev.wormbase.org/db/seq/gbrowse/c_elegans/" check_values="false" target="_top">
+        <display>go to Wormbase test server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=wormbase_test" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="d" missing="" />
+                <value name="dbkey" missing="" />
+                <value name="q" missing="" />
+                <value name="s" missing="" />
+                <value name="t" missing="" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" label="${tool.name} on $getVar( 'q', 'unknown position' )"/>
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/yeastmine.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/yeastmine.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<tool name="YeastMine" id="yeastmine" tool_type="data_source" version="1.0.0">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://yeastmine.yeastgenome.org/yeastmine/begin.do" check_values="false" method="get">
+        <display>go to yeastMine server $GALAXY_URL</display>
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" >
+            <value_translation>
+                <value galaxy_value="auto" remote_value="txt" /> <!-- intermine currently always provides 'txt', make this auto detect -->
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c data_source/zebrafishmine.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_source/zebrafishmine.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<tool name="ZebrafishMine" id="zebrafishmine" tool_type="data_source" version="1.0.0">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://zebrafishmine.org/begin.do" check_values="false" method="get">
+        <display>go to ZebrafishMine server $GALAXY_URL</display>
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" >
+            <value_translation>
+                <value galaxy_value="auto" remote_value="txt" /> <!-- make txt auto detect -->
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False"/>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c evolution/add_scores.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/evolution/add_scores.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+from __future__ import print_function
+
+import sys
+
+from bx.bbi.bigwig_file import BigWigFile
+
+
+def die( message ):
+    print(message, file=sys.stderr)
+    sys.exit(1)
+
+
+def open_or_die( filename, mode='r', message=None ):
+    if message is None:
+        message = 'Error opening %s' % filename
+    try:
+        fh = open( filename, mode )
+    except IOError as err:
+        die( '%s: %s' % ( message, err.strerror ) )
+    return fh
+
+
+class LocationFile( object ):
+    def __init__( self, filename, comment_chars=None, delimiter='\t', key_column=0 ):
+        self.filename = filename
+        if comment_chars is None:
+            self.comment_chars = ( '#' )
+        else:
+            self.comment_chars = tuple( comment_chars )
+        self.delimiter = delimiter
+        self.key_column = key_column
+        self._map = {}
+        self._populate_map()
+
+    def _populate_map( self ):
+        try:
+            with open( self.filename ) as fh:
+                line_number = 0
+                for line in fh:
+                    line_number += 1
+                    line = line.rstrip( '\r\n' )
+                    if not line.startswith( self.comment_chars ):
+                        elems = line.split( self.delimiter )
+                        if len( elems ) <= self.key_column:
+                            die( 'Location file %s line %d: less than %d columns' % ( self.filename, line_number, self.key_column + 1 ) )
+                        else:
+                            key = elems.pop( self.key_column )
+                            if key in self._map:
+                                if self._map[key] != elems:
+                                    die( 'Location file %s line %d: duplicate key "%s"' % ( self.filename, line_number, key ) )
+                            else:
+                                self._map[key] = elems
+        except IOError as err:
+            die( 'Error opening location file %s: %s' % ( self.filename, err.strerror ) )
+
+    def get_values( self, key ):
+        if key in self._map:
+            rval = self._map[key]
+            if len( rval ) == 1:
+                return rval[0]
+            else:
+                return rval
+        else:
+            die( 'key "%s" not found in location file %s' % ( key, self.filename ) )
+
+
+def main():
+    input_filename, output_filename, loc_filename, loc_key, chrom_col, start_col = sys.argv[1:]
+
+    # open input, output, and bigwig files
+    location_file = LocationFile( loc_filename )
+    bigwig_filename = location_file.get_values( loc_key )
+    bwfh = open_or_die( bigwig_filename, message='Error opening BigWig file %s' % bigwig_filename )
+    bw = BigWigFile( file=bwfh )
+    ifh = open_or_die( input_filename, message='Error opening input file %s' % input_filename )
+    ofh = open_or_die( output_filename, mode='w', message='Error opening output file %s' % output_filename )
+
+    # make column numbers 0-based
+    chrom_col = int( chrom_col ) - 1
+    start_col = int( start_col ) - 1
+    min_cols = max( chrom_col, start_col )
+
+    # add score column to imput file
+    line_number = 0
+    for line in ifh:
+        line_number += 1
+        line = line.rstrip( '\r\n' )
+        elems = line.split( '\t' )
+        if len( elems ) > min_cols:
+            chrom = elems[chrom_col].strip()
+            # base-0 position in chrom
+            start = int( elems[start_col] )
+            score_list = bw.get( chrom, start, start + 1 )
+            score_list_len = len( score_list )
+            if score_list_len == 1:
+                beg, end, score = score_list[0]
+                score_val = '%1.3f' % score
+            elif score_list_len == 0:
+                score_val = 'NA'
+            else:
+                die( '%s line %d: chrom=%s, start=%d, score_list_len = %d' % ( input_filename, line_number, chrom, start, score_list_len ) )
+            print('\t'.join( [line, score_val] ), file=ofh)
+        else:
+            print(line, file=ofh)
+
+    bwfh.close()
+    ifh.close()
+    ofh.close()
+
+
+if __name__ == "__main__":
+    main()
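add_scores.py takes six positional arguments, unpacked at the top of main() above; a hypothetical invocation (all file names illustrative)::

    # Order from main(): input, output, loc file, loc key (dbkey),
    # then 1-based chromosome and start column numbers.
    import subprocess
    subprocess.check_call([
        'python', 'add_scores.py',
        'snps.interval', 'snps_scored.interval',
        'add_scores.loc', 'hg18', '1', '2',
    ])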
diff -r 000000000000 -r 7621d36a4e9c evolution/add_scores.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/evolution/add_scores.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,96 @@
+<tool id="hgv_add_scores" name="phyloP" version="1.0.0">
+  <description>interspecies conservation scores</description>
+  <requirements>
+    <requirement type="package">add_scores</requirement>
+  </requirements>
+  <command>
+    python '$__tool_directory__/add_scores.py' '$input1' '$out_file1' '${GALAXY_DATA_INDEX_DIR}/add_scores.loc' '${input1.metadata.dbkey}' '${input1.metadata.chromCol}' '${input1.metadata.startCol}'
+  </command>
+
+  <inputs>
+    <param format="interval" name="input1" type="data" label="Dataset">
+      <validator type="unspecified_build"/>
+      <validator type="dataset_metadata_in_file" filename="add_scores.loc" metadata_name="dbkey" metadata_column="0" message="Data is currently not available for the specified build."/>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data format_source="input1" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="add_scores_input1.interval" ftype="interval" dbkey="hg18" />
+      <output name="output" file="add_scores_output1.interval" />
+    </test>
+    <test>
+      <param name="input1" value="add_scores_input2.bed" ftype="interval" dbkey="hg18" />
+      <output name="output" file="add_scores_output2.interval" />
+    </test>
+  </tests>
+
+  <help>
+.. class:: warningmark
+
+This currently works only for builds hg18 and hg19.
+
+-----
+
+**Dataset formats**
+
+The input can be any interval_ format dataset. The output is also in interval format.
+(`Dataset missing?`_)
+
+.. _interval: ${static_path}/formatHelp.html#interval
+.. _Dataset missing?: ${static_path}/formatHelp.html
+
+-----
+
+**What it does**
+
+This tool adds a column that measures interspecies conservation at each SNP
+position, using conservation scores for primates pre-computed by the
+phyloP program. PhyloP performs an exact P-value computation under a
+continuous Markov substitution model.
+
+The chromosome and start position
+are used to look up the scores, so if a larger interval is in the input,
+only the score for the first nucleotide is returned.
+
+-----
+
+**Example**
+
+- input file, with SNPs::
+
+    chr22 16440426 14440427 C/T
+    chr22 15494851 14494852 A/G
+    chr22 14494911 14494912 A/T
+    chr22 14550435 14550436 A/G
+    chr22 14611956 14611957 G/T
+    chr22 14612076 14612077 A/G
+    chr22 14668537 14668538 C
+    chr22 14668703 14668704 A/T
+    chr22 14668775 14668776 G
+    chr22 14680074 14680075 A/T
+    etc.
+
+- output file, showing conservation scores for primates::
+
+    chr22 16440426 14440427 C/T 0.509
+    chr22 15494851 14494852 A/G 0.427
+    chr22 14494911 14494912 A/T NA
+    chr22 14550435 14550436 A/G NA
+    chr22 14611956 14611957 G/T -2.142
+    chr22 14612076 14612077 A/G 0.369
+    chr22 14668537 14668538 C 0.419
+    chr22 14668703 14668704 A/T -1.462
+    chr22 14668775 14668776 G 0.470
+    chr22 14680074 14680075 A/T 0.303
+    etc.
+
+  "NA" means that the phyloP score was not available.
+  </help>
+  <citations>
+    <citation type="doi">10.1007/11732990_17</citation>
+  </citations>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c evolution/codingSnps.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/evolution/codingSnps.pl	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,571 @@
+#!/usr/bin/perl -w
+use strict;
+
+#########################################################################
+#	codingSnps.pl
+#	This takes a bed file with the names being / separated nts
+#	and a gene bed file with cds start and stop.
+#	It then checks for changes in coding regions, reporting
+#	those that cause a frameshift or substitution in the amino acid.
+#	Output columns:
+#		chrom, start, end, allele as given (amb code translated)
+#		Gene ID from genes file, ref amino acid:variant amino acids,
+#		codon number, (in strand of gene)ref nt, refCodon:variantCodons
+#########################################################################
+
+my $seqFlag = "2bit"; #flag to set sequence type 2bit|nib
+if (!@ARGV or scalar @ARGV < 3) {
+  print "Usage: codingSnps.pl snps.bed genes.bed (/dir/*$seqFlag|Galaxy build= loc=) [chr=# start=# end=# snp=# strand=#|-|+ keepColumns=1 synon=1 unique=1] > codingSnps.txt\n";
+  exit;
+}
+my $uniq = 0; #flag for whether want uniq positions
+my $syn = 0;  #flag for if want synonomous changes rather than non-syn
+my $keep = 0; #keep old columns and append new ones
+my $snpFile = shift @ARGV;
+my $geneFile = shift @ARGV;
+my $nibDir = shift @ARGV;  #2bit or nib, depending on flag above
+if ($nibDir eq 'Galaxy') { getGalaxyInfo(); }
+my $col0 = 0; #bed like columns in default positions
+my $col1 = 1;
+my $col2 = 2;
+my $col3 = 3;
+my $strand = -1;
+#column positions 1 based coming in (for Galaxy)
+foreach (@ARGV) {
+  if (/chr=(\d+)/) { $col0 = $1 -1; }
+  elsif (/start=(\d+)/) { $col1 = $1 -1; }
+  elsif (/end=(\d+)/) { $col2 = $1 -1; }
+  elsif (/snp=(\d+)/) { $col3 = $1 -1; }
+  elsif (/keepColumns=1/) { $keep = 1; }
+  elsif (/synon=1/) { $syn = 1; }
+  elsif (/unique=1/) { $uniq = 1; }
+  elsif (/strand=(\d+)/) { $strand = $1 -1; } #0 based column
+  elsif (/strand=-/) { $strand = -99; } #special case of all minus
+}
+if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) {
+  print STDERR "ERROR column numbers are given with origin 1\n";
+  exit 1;
+}
+my @genes; #bed lines for genes, sorted by chrom and start
+my %chrSt; #index in array where each chrom starts
+my %codon; #hash of codon amino acid conversions
+my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom
+my $ignoreN = 1; #skip N
+my $origAll; #alleles from input file (before changes for strand)
+
+my %amb = (
+"R" => "A/G",
+"Y" => "C/T",
+"S" => "C/G",
+"W" => "A/T",
+"K" => "G/T",
+"M" => "A/C",
+"B" => "C/G/T",
+"D" => "A/G/T",
+"H" => "A/C/T",
+"V" => "A/C/G",
+"N" => "A/C/G/T"
+);
+fill_codon();
+open(FH, "cat $geneFile | sort -k1,1 -k2,2n |")
+  or die "Couldn't open and sort $geneFile, $!\n";
+my $i = 0;
+while(<FH>) {
+  chomp;
+  if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; }
+  push(@genes, "$_");
+  my @f = split(/\t/);
+  if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; }
+  $i++;
+}
+close FH or die "Couldn't close $geneFile, $!\n";
+
+if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; }
+
+#open snps sorted as well
+my $s1 = $col0 + 1; #sort order is origin 1
+my $s2 = $col1 + 1;
+open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |")
+  or die "Couldn't open and sort $snpFile, $!\n";
+$i = 0;
+my @g; #one genes fields, should be used repeatedly
+my %done;
+while(<FH>) {
+  chomp;
+  if (/^\s*#/) { next; } #comment
+  my @s = split(/\t/); #SNP fields
+  if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; }
+  my $size = $#s;
+  if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) {
+    print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n";
+    exit 1;
+  }
+  if ($strand >= 0 && $strand > $size) {
+    print STDERR "ERROR file has fewer columns than requested, requested strand in $strand (0 based), file has $size\n";
+    exit 1;
+  }
+  if ($s[$col1] =~ /\D/) {
+    print STDERR "ERROR th
[... middle of diff truncated in source ...]
[...]$seq .= uc($_);
+  }
+  close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n";
+  return $seq;
+}
+
+sub fetchSeqNib {
+  my $chr = shift;
+  my $st = shift;
+  my $end = shift;
+  my $strand = '+';
+  $st--; #change to UCSC numbering
+  open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n";
+  my $seq = '';
+  while (<NIB>) {
+    chomp;
+    if (/^>/) { next; } #header
+    $seq .= $_;
+  }
+  close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n";
+  return $seq;
+}
+
+sub compl {
+  my $nts = shift;
+  my $comp = '';
+  if (!$nts) { die "ERROR called compl with nts undefined"; }
+  foreach my $n (split(/ */, $nts)) {
+    if ($n eq 'A') { $comp .= 'T'; }
+    elsif ($n eq 'T') { $comp .= 'A'; }
+    elsif ($n eq 'C') { $comp .= 'G'; }
+    elsif ($n eq 'G') { $comp .= 'C'; }
+    elsif ($n eq 'N') { $comp .= 'N'; }
+    elsif ($n eq '-') { $comp .= '-'; } #deletion
+    else { $comp = undef; }
+  }
+  return $comp;
+}
+
+sub reverseCompAlleles {
+  my $all = shift;
+  my @nt = split(/\//, $all);
+  my $rv = '';
+  foreach my $n (@nt) {
+    $n = reverse(split(/ */, $n)); #needed for indels
+    $n = compl($n);
+    $rv .= "$n/";
+  }
+  $rv =~ s/\/$//;
+  return $rv;
+}
+
+sub getaa {
+  my $nts = shift;  #in multiples of 3
+  my $aa = '';
+  my @n = split(/ */, $nts);
+  while (@n) {
+    my @t = splice(@n, 0, 3);
+    my $n = uc(join("", @t));
+    if (!exists $codon{$n}) { $aa .= 'N'; next; }
+    $aa .= $codon{$n};
+  }
+  return $aa;
+}
+
+sub fill_codon {
+$codon{GCA} = 'Ala';
+$codon{GCC} = 'Ala';
+$codon{GCG} = 'Ala';
+$codon{GCT} = 'Ala';
+$codon{CGG} = 'Arg';
+$codon{CGT} = 'Arg';
+$codon{CGC} = 'Arg';
+$codon{AGA} = 'Arg';
+$codon{AGG} = 'Arg';
+$codon{CGA} = 'Arg';
+$codon{AAC} = 'Asn';
+$codon{AAT} = 'Asn';
+$codon{GAC} = 'Asp';
+$codon{GAT} = 'Asp';
+$codon{TGC} = 'Cys';
+$codon{TGT} = 'Cys';
+$codon{CAG} = 'Gln';
+$codon{CAA} = 'Gln';
+$codon{GAA} = 'Glu';
+$codon{GAG} = 'Glu';
+$codon{GGG} = 'Gly';
+$codon{GGA} = 'Gly';
+$codon{GGC} = 'Gly';
+$codon{GGT} = 'Gly';
+$codon{CAC} = 'His';
+$codon{CAT} = 'His';
+$codon{ATA} = 'Ile';
+$codon{ATT} = 'Ile';
+$codon{ATC} = 'Ile';
+$codon{CTA} = 'Leu';
+$codon{CTC} = 'Leu';
+$codon{CTG} = 'Leu';
+$codon{CTT} = 'Leu';
+$codon{TTG} = 'Leu';
+$codon{TTA} = 'Leu';
+$codon{AAA} = 'Lys';
+$codon{AAG} = 'Lys';
+$codon{ATG} = 'Met';
+$codon{TTC} = 'Phe';
+$codon{TTT} = 'Phe';
+$codon{CCT} = 'Pro';
+$codon{CCA} = 'Pro';
+$codon{CCC} = 'Pro';
+$codon{CCG} = 'Pro';
+$codon{TCA} = 'Ser';
+$codon{AGC} = 'Ser';
+$codon{AGT} = 'Ser';
+$codon{TCC} = 'Ser';
+$codon{TCT} = 'Ser';
+$codon{TCG} = 'Ser';
+$codon{TGA} = 'Stop';
+$codon{TAG} = 'Stop';
+$codon{TAA} = 'Stop';
+$codon{ACT} = 'Thr';
+$codon{ACA} = 'Thr';
+$codon{ACC} = 'Thr';
+$codon{ACG} = 'Thr';
+$codon{TGG} = 'Trp';
+$codon{TAT} = 'Tyr';
+$codon{TAC} = 'Tyr';
+$codon{GTC} = 'Val';
+$codon{GTA} = 'Val';
+$codon{GTG} = 'Val';
+$codon{GTT} = 'Val';
+}
+
+sub getGalaxyInfo {
+  my $build;
+  my $locFile;
+  foreach (@ARGV) {
+    if (/build=(.*)/) { $build = $1; }
+    elsif (/loc=(.*)/) { $locFile = $1; }
+  }
+  if (!$build or !$locFile) {
+    print STDERR "ERROR missing build or locfile for Galaxy input\n";
+    exit 1;
+  }
+  # read $locFile to get $nibDir (ignoring commets)
+  open(LF, "< $locFile") || die "open($locFile): $!\n";
+  while(<LF>) {
+    s/#.*$//;
+    s/(?:^\s+|\s+$)//g;
+    next if (/^$/);
+
+    my @t = split(/\t/);
+    if ($t[0] eq $build) { $nibDir = $t[1]; }
+  }
+  close(LF);
+  if ($nibDir eq 'Galaxy') {
+    print STDERR "Failed to find sequence directory in locfile $locFile\n";
+  }
+  # lparsons: allow specification of full filename in loc file for greater felxibility
+  unless ($nibDir =~ /(.*)\.2bit$/) { $nibDir .= "/$build.2bit"; }
+  #$nibDir .= "/$build.2bit"; #we want full path and filename
+}
+
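The %amb hash above expands IUPAC ambiguity codes into '/'-separated allele lists; the same table in Python for reference::

    # Same IUPAC expansion as the Perl %amb hash above.
    AMB = {
        'R': 'A/G', 'Y': 'C/T', 'S': 'C/G', 'W': 'A/T', 'K': 'G/T', 'M': 'A/C',
        'B': 'C/G/T', 'D': 'A/G/T', 'H': 'A/C/T', 'V': 'A/C/G', 'N': 'A/C/G/T',
    }
    assert AMB['R'] == 'A/G'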
b |
diff -r 000000000000 -r 7621d36a4e9c evolution/codingSnps.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/evolution/codingSnps.xml Mon Apr 30 01:37:51 2018 -0400 |
b |
b'@@ -0,0 +1,177 @@\n+<tool id="hgv_codingSnps" name="aaChanges" version="1.0.0">\n+ <description>amino-acid changes caused by a set of SNPs</description>\n+\n+ <command interpreter="perl">\n+ codingSnps.pl $input1 $input2 Galaxy build=${input1.metadata.dbkey} loc=${GALAXY_DATA_INDEX_DIR}/codingSnps.loc chr=${input1.metadata.chromCol} start=${input1.metadata.startCol} end=${input1.metadata.endCol} snp=$col1 keepColumns=$keep strand=${strand_source.strand_col} unique=$uniqpos > $out_file1\n+ </command>\n+\n+ <inputs>\n+ <param format="interval" name="input1" type="data" label="SNP dataset">\n+ <validator type="dataset_metadata_in_file" filename="codingSnps.loc" metadata_name="dbkey" metadata_column="0" message="Sequences are not currently available for the specified build." split="\\t" />\n+ </param>\n+ <param name="col1" type="data_column" data_ref="input1" label="Column with SNPs" />\n+ <param format="interval" name="input2" type="data" label="Gene dataset">\n+ <validator type="dataset_metadata_in_file" filename="codingSnps.loc" metadata_name="dbkey" metadata_column="0" message="Sequences are not currently available for the specified build." split="\\t" />\n+ </param>\n+ <param name="keep" type="select" label="Keep columns from SNP dataset">\n+ <option value="0" selected="true">No</option>\n+ <option value="1">Yes</option>\n+ </param>\n+ <param name="uniqpos" type="select" label="Only report each SNP position once">\n+ <option value="1" selected="true">Yes</option>\n+ <option value="0">No</option>\n+ </param>\n+ <conditional name="strand_source">\n+ <param name="strand_choice" type="select" label="Strand info">\n+ <option value="data_column">a column in the dataset</option>\n+ <option value="all_pos" selected="true">all on sense/forward/+ strand</option>\n+ <option value="all_neg">all on antisense/reverse/- strand</option>\n+ </param>\n+ <when value="data_column">\n+ <param name="strand_col" type="data_column" data_ref="input1" label="Column with strand"/>\n+ </when>\n+ <when value="all_pos">\n+ <param name="strand_col" type="hidden" value="+"/>\n+ </when>\n+ <when value="all_neg">\n+ <param name="strand_col" type="hidden" value="-"/>\n+ </when>\n+ </conditional>\n+ </inputs>\n+\n+ <outputs>\n+ <data format="interval" name="out_file1" />\n+ </outputs>\n+\n+ <code file="codingSnps_filter.py"></code>\n+\n+ <requirements>\n+ <requirement type="package">gnu_coreutils</requirement>\n+ <requirement type="package">ucsc_tools</requirement>\n+ </requirements>\n+\n+ <tests>\n+ <test>\n+ <param name="input1" ftype="interval" value="codingSnps_input1.interval" dbkey="hg18" />\n+ <param name="col1" value="6" />\n+ <param name="input2" ftype="interval" value="codingSnps_inputGenes1.bed" dbkey="hg18" />\n+ <param name="strand_choice" value="all_pos" />\n+ <param name="strand_col" value="+" />\n+ <param name="uniqpos" value="0" />\n+ <output name="output" file="codingSnps_output1.interval" />\n+ </test>\n+ <test>\n+ <param name="input1" ftype="interval" value="codingSnps_input2.interval" dbkey="hg18" />\n+ <param name="input2" ftype="interval" value="codingSnps_inputGenes2.bed" dbkey="hg18" />\n+ <param name="col1" value="4" />\n+ <param name="strand_choice" value="all_pos" />\n+ <param name="strand_col" value="+" />\n+ <param name="uniqpos" value="0" />\n+ <output name="output" file="codingSnps_output2.interval" />\n+ </test>\n+ <test>\n+ <param name="input1" ftype="interval" value="codingSnps_input2.interval" dbkey="hg18" />\n+ <param name="input2" ftype="interval" value="codingSnps_inputGenes2.bed" 
dbkey="hg18" />\n+ <param name="col1" value="4" />\n+ <param name="strand_choice" value="all_neg" />\n+ <param name="strand_col" value="-" />\n+ <output name="output" file="codingSnps_output3.interval" />\n+ </test>\n+ </tests>\n+\n+ <help>\n+.. '..b"The SNP dataset is in interval_ format, with a column of SNPs as described below.\n+The gene dataset is in BED_ format with 12 columns. The output dataset is also interval.\n+(`Dataset missing?`_)\n+\n+.. _interval: ${static_path}/formatHelp.html#interval\n+.. _BED: ${static_path}/formatHelp.html#bed\n+.. _Dataset missing?: ${static_path}/formatHelp.html\n+\n+-----\n+\n+**What it does**\n+\n+This tool identifies which SNPs create amino-acid changes in the specified \n+coding regions. The first input file contains the SNPs and must be an interval file.\n+It needs the chromosome, start, and end position as well as the SNP. The \n+SNP can be given using ambiguous-nucleotide symbols or a list of two to four\n+alleles \n+separated by '/'. Any other columns in the first input file will not be\n+used but will be kept for the output. The second input file contains the genes\n+to be used for defining the coding regions. This file must be a BED file with\n+the first 12 columns standard BED columns. The output is the same as the\n+first input file with\n+several columns added: the name field from the line of the gene input file\n+used, the amino acids, the codon number, the reference nucleotide that \n+changed in the amino acid (in the same strand as the gene), and the codons \n+that go with the amino acids.\n+The amino acids are listed with the reference amino acid first, then a colon,\n+and then the amino acids for the alleles. If a SNP is not in a coding region\n+or is synonymous then it is not included in the output file.\n+\n+-----\n+\n+**Example**\n+\n+- first input file, with SNPs::\n+\n+ chr22 15660821 15660822 A/G\n+ chr22 15825725 15825726 G/T\n+ chr22 15827035 15827036 G\n+ chr22 15827135 15827136 C/G\n+ chr22 15830928 15830929 A/G\n+ chr22 15830951 15830952 G\n+ chr22 15830955 15830956 C/T\n+ chr22 15848885 15848886 C/T\n+ chr22 15849048 15849049 A/C\n+ chr22 15919711 15919712 A/G\n+ etc.\n+\n+ or, indicating polymorphisms using ambiguous-nucleotide symbols::\n+\n+ chr22 15660821 15660822 R\n+ chr22 15825725 15825726 K\n+ chr22 15827035 15827036 G\n+ chr22 15827135 15827136 S\n+ chr22 15830928 15830929 R\n+ chr22 15830951 15830952 G\n+ chr22 15830955 15830956 Y\n+ chr22 15848885 15848886 Y\n+ chr22 15849048 15849049 M\n+ chr22 15919711 15919712 R\n+ etc.\n+\n+- second input file, with UCSC annotations for human genes::\n+\n+ chr22 15688363 15690225 uc010gqr.1 0 + 15688363 15688363 0 2 587,794, 0,1068,\n+ chr22 15822826 15869112 uc002zlw.1 0 - 15823622 15869004 0 10 940,105,97,91,265,86,251,208,304,282, 0,1788,2829,3241,4163,6361,8006,26023,29936,46004,\n+ chr22 15826991 15869112 uc010gqs.1 0 - 15829218 15869004 0 5 1380,86,157,304,282, 0,2196,21858,25771,41839,\n+ chr22 15897459 15919682 uc002zlx.1 0 + 15897459 15897459 0 4 775,128,103,1720, 0,8303,10754,20503,\n+ chr22 15945848 15971389 uc002zly.1 0 + 15945981 15970710 0 13 271,25,147,113,127,48,164,84,85,12,102,42,2193, 0,12103,12838,13816,15396,17037,17180,18535,19767,20632,20894,22768,23348,\n+ etc.\n+\n+- output file, showing non-synonymous substitutions in coding regions::\n+\n+ chr22 15825725 15825726 G/T uc002zlw.1 Gln:Pro/Gln 469 A CAA:CCA/CAA\n+ chr22 15827035 15827036 G uc002zlw.1 Glu:Asp 414 G GAG:GAC\n+ chr22 15827135 15827136 C/G uc002zlw.1 Gly:Gly/Ala 381 G GGT:GGT/GCT\n+ chr22 
15830928 15830929 A/G uc002zlw.1 Ala:Ser/Pro 281 G GCA:TCA/CCA\n+ chr22 15830951 15830952 G uc002zlw.1 Leu:Pro 273 T CTT:CCT\n+ chr22 15830955 15830956 C/T uc002zlw.1 Ser:Gly/Ser 272 A AGC:GGC/AGC\n+ chr22 15848885 15848886 C/T uc002zlw.1 Ser:Trp/Stop 217 C TCG:TGG/TAG\n+ chr22 15848885 15848886 C/T uc010gqs.1 Ser:Trp/Stop 200 C TCG:TGG/TAG\n+ chr22 15849048 15849049 A/C uc002zlw.1 Gly:Stop/Gly 163 G GGA:TGA/GGA\n+ etc.\n+\n+ </help>\n+</tool>\n" |
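The help text above describes the core synonymy test: expand each SNP (an ambiguity code or a '/'-separated allele list) into alleles, substitute each allele into the reference codon, and report the site only if some allele changes the amino acid. A minimal Python sketch of that idea, with an abridged codon table and a hypothetical helper name; the tool itself is the Perl script codingSnps.pl:

    # Expand a SNP spec into alleles and translate each variant codon (sketch).
    IUPAC = {"R": "AG", "Y": "CT", "S": "CG", "W": "AT", "K": "GT", "M": "AC",
             "A": "A", "C": "C", "G": "G", "T": "T"}
    CODON_TABLE = {"GAG": "Glu", "GAC": "Asp", "CAA": "Gln", "CCA": "Pro"}  # abridged

    def amino_acids_for_snp(codon, offset, snp):
        """Yield (allele, amino acid) for each allele placed at position `offset` of `codon`."""
        alleles = snp.split("/") if "/" in snp else list(IUPAC[snp])
        for allele in alleles:
            variant = codon[:offset] + allele + codon[offset + 1:]
            yield allele, CODON_TABLE.get(variant, "?")

    print(list(amino_acids_for_snp("GAG", 2, "G/C")))  # [('G', 'Glu'), ('C', 'Asp')]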
diff -r 000000000000 -r 7621d36a4e9c evolution/codingSnps_filter.py
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/evolution/codingSnps_filter.py    Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+# runs after the job (and after the default post-filter)
+from galaxy.tools.parameters import DataToolParameter
+# Older py compatibility
+try:
+    set()
+except:
+    from sets import Set as set
+
+
+def validate_input( trans, error_map, param_values, page_param_map ):
+    dbkeys = set()
+    data_param_names = set()
+    data_params = 0
+    for name, param in page_param_map.items():
+        if isinstance( param, DataToolParameter ):
+            # for each dataset parameter
+            if param_values.get(name, None) is not None:
+                dbkeys.add( param_values[name].dbkey )
+                data_params += 1
+                # check meta data
+                try:
+                    param = param_values[name]
+                    int( param.metadata.startCol )
+                    int( param.metadata.endCol )
+                    int( param.metadata.chromCol )
+                    if param.metadata.strandCol is not None:
+                        int( param.metadata.strandCol )
+                except:
+                    error_msg = ("The attributes of this dataset are not properly set. "
+                                 "Click the pencil icon in the history item to set the chrom, start, end and strand columns.")
+                    error_map[name] = error_msg
+            data_param_names.add( name )
+    if len( dbkeys ) > 1:
+        for name in data_param_names:
+            error_map[name] = "All datasets must belong to same genomic build, " \
+                "this dataset is linked to build '%s'" % param_values[name].dbkey
+    if data_params != len(data_param_names):
+        for name in data_param_names:
+            error_map[name] = "A dataset of the appropriate type is required"
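The build-consistency rule at the bottom of validate_input() can be read in isolation. A minimal sketch of that rule using plain dicts in place of Galaxy's parameter and dataset objects (hypothetical stand-ins, for illustration only):

    def check_same_build(datasets):
        """datasets: {param name: {'dbkey': build}}; returns {name: error message}."""
        dbkeys = set(d["dbkey"] for d in datasets.values())
        if len(dbkeys) <= 1:
            return {}
        return dict((name, "All datasets must belong to same genomic build, "
                           "this dataset is linked to build '%s'" % d["dbkey"])
                    for name, d in datasets.items())

    print(check_same_build({"input1": {"dbkey": "hg18"}, "input2": {"dbkey": "mm9"}}))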
diff -r 000000000000 -r 7621d36a4e9c extract/extract_genomic_dna.py
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/extract/extract_genomic_dna.py    Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,316 @@
+#!/usr/bin/env python
+"""
+usage: %prog $input $out_file1
+    -1, --cols=N,N,N,N,N: Columns for chrom, start, end, strand, name in input file
+    -d, --dbkey=N: Genome build of input file
+    -o, --output_format=N: the data type of the output file
+    -g, --GALAXY_DATA_INDEX_DIR=N: the directory containing alignseq.loc or twobit.loc
+    -I, --interpret_features: if true, complete features are interpreted when input is GFF
+    -F, --fasta=<genomic_sequences>: genomic sequences to use for extraction
+    -G, --gff: input and output file, when it is interval, coordinates are treated as GFF format (1-based, closed) rather than 'traditional' 0-based, half-open format.
+"""
+from __future__ import print_function
+
+import os
+import subprocess
+import sys
+import tempfile
+
+import bx.seq.nib
+import bx.seq.twobit
+from bx.cookbook import doc_optparse
+from bx.intervals.io import Comment, Header
+
+from galaxy.datatypes.util import gff_util
+from galaxy.tools.util.galaxyops import parse_cols_arg
+
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+
+def reverse_complement( s ):
+    complement_dna = {"A": "T", "T": "A", "C": "G", "G": "C", "a": "t", "t": "a", "c": "g", "g": "c", "N": "N", "n": "n"}
+    reversed_s = []
+    for i in s:
+        reversed_s.append( complement_dna[i] )
+    reversed_s.reverse()
+    return "".join( reversed_s )
+
+
+def check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR ):
+    # Checks for the presence of *.nib files matching the dbkey within alignseq.loc
+    seq_file = "%s/alignseq.loc" % GALAXY_DATA_INDEX_DIR
+    for line in open(seq_file):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( "#" ) and line.startswith( 'seq' ):
+            fields = line.split( '\t' )
+            if len( fields) >= 3 and fields[1] == dbkey:
+                print("Using *.nib genomic reference files")
+                return fields[2].strip()
+
+    # If no entry in alignseq.loc was found, check for the presence of a *.2bit file in twobit.loc
+    seq_file = "%s/twobit.loc" % GALAXY_DATA_INDEX_DIR
+    for line in open( seq_file ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( "#" ) and line.endswith( '.2bit' ):
+            fields = line.split( '\t' )
+            if len(fields) >= 2 and fields[0] == dbkey:
+                print("Using a *.2bit genomic reference file")
+                return fields[1].strip()
+
+    return ''
+
+
+def __main__():
+    #
+    # Parse options, args.
+    #
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        if len(options.cols.split(',')) == 5:
+            # BED file
+            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg( options.cols )
+        else:
+            # gff file
+            chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
+            name_col = False
+        dbkey = options.dbkey
+        output_format = options.output_format
+        gff_format = options.gff
+        interpret_features = options.interpret_features
+        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
+        fasta_file = options.fasta
+        input_filename, output_filename = args
+    except:
+        doc_optparse.exception()
+
+    includes_strand_col = strand_col >= 0
+    strand = None
+    nibs = {}
+
+    #
+    # Set path to sequence data.
+    #
+    if fasta_file:
+        # Need to create 2bit file from fasta file.
+        try:
+            seq_path = tempfile.NamedTemporaryFile( dir="." ).name
+            cmd = "faToTwoBit %s %s" % ( fasta_file, seq_path )
+
+            tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
+            tmp_stderr = open( tmp_name, 'wb' )
+            proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
+            returncode = proc.wait()
+            tmp_stderr.close()
+
+            # Get stderr, allowing for case where it's very large.
+            tmp
[...]
+            not(twobitfile):
+                twobitfile = bx.seq.twobit.TwoBitFile( open( seq_path ) )
+            try:
+                if options.gff and interpret_features:
+                    # Create sequence from intervals within a feature.
+                    sequence = ''
+                    for interval in feature.intervals:
+                        sequence += twobitfile[interval.chrom][interval.start:interval.end]
+                else:
+                    sequence = twobitfile[chrom][start:end]
+            except:
+                warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % ( start, end - start, chrom )
+                warnings.append( warning )
+                if not invalid_lines:
+                    invalid_lines = get_lines( feature )
+                    first_invalid_line = line_count
+                skipped_lines += len( invalid_lines )
+                continue
+        else:
+            warning = "Chromosome by name '%s' was not found for build '%s'. " % ( chrom, dbkey )
+            warnings.append( warning )
+            if not invalid_lines:
+                invalid_lines = get_lines( feature )
+                first_invalid_line = line_count
+            skipped_lines += len( invalid_lines )
+            continue
+        if sequence == '':
+            warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % \
+                ( chrom, start, end, dbkey )
+            warnings.append( warning )
+            if not invalid_lines:
+                invalid_lines = get_lines( feature )
+                first_invalid_line = line_count
+            skipped_lines += len( invalid_lines )
+            continue
+        if includes_strand_col and strand == "-":
+            sequence = reverse_complement( sequence )
+
+        if output_format == "fasta":
+            l = len( sequence )
+            c = 0
+            if gff_format:
+                start, end = gff_util.convert_bed_coords_to_gff( [ start, end ] )
+            fields = [dbkey, str( chrom ), str( start ), str( end ), strand]
+            meta_data = "_".join( fields )
+            if name.strip():
+                fout.write( ">%s %s\n" % (meta_data, name) )
+            else:
+                fout.write( ">%s\n" % meta_data )
+            while c < l:
+                b = min( c + 50, l )
+                fout.write( "%s\n" % str( sequence[c:b] ) )
+                c = b
+        else:  # output_format == "interval"
+            if gff_format and interpret_features:
+                # TODO: need better GFF Reader to capture all information needed
+                # to produce this line.
+                meta_data = "\t".join(
+                    [feature.chrom, "galaxy_extract_genomic_dna", "interval",
+                     str( feature.start ), str( feature.end ), feature.score, feature.strand,
+                     ".", gff_util.gff_attributes_to_str( feature.attributes, "GTF" ) ] )
+            else:
+                meta_data = "\t".join( fields )
+            if gff_format:
+                format_str = "%s seq \"%s\";\n"
+            else:
+                format_str = "%s\t%s\n"
+            fout.write( format_str % ( meta_data, str( sequence ) ) )
+
+        # Update line count.
+        if isinstance( feature, gff_util.GFFFeature ):
+            line_count += len( feature.intervals )
+        else:
+            line_count += 1
+
+    fout.close()
+
+    if warnings:
+        warn_msg = "%d warnings, 1st is: " % len( warnings )
+        warn_msg += warnings[0]
+        print(warn_msg)
+    if skipped_lines:
+        # Error message includes up to the first 10 skipped lines.
+        print('Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, '\n'.join( invalid_lines[:10] ) ))
+
+    # Clean up temp file.
+    if fasta_file:
+        os.remove( seq_path )
+        os.remove( tmp_name )
+
+
+if __name__ == "__main__":
+    __main__()
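The fetch path in the tail of the script is compact enough to restate on its own. A sketch under stated assumptions: a local mm9.2bit file (hypothetical path), bx-python installed with its TwoBitFile slice-style access, and a table-based reverse complement standing in for the script's reverse_complement() helper:

    import bx.seq.twobit

    COMPLEMENT = str.maketrans("ACGTacgtNn", "TGCAtgcaNn")

    def fetch_dna(seq_path, chrom, start, end, strand="+"):
        # Coordinates are BED-style: 0-based, half-open.
        twobit = bx.seq.twobit.TwoBitFile(open(seq_path, "rb"))
        sequence = twobit[chrom][start:end]
        if strand == "-":
            # Reverse complement for minus-strand intervals, as the tool does.
            sequence = sequence.translate(COMPLEMENT)[::-1]
        return sequence

    # fetch_dna("mm9.2bit", "chr11", 70568380, 70568443, strand="-")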
diff -r 000000000000 -r 7621d36a4e9c extract/extract_genomic_dna.xml
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/extract/extract_genomic_dna.xml   Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,187 @@
+<tool id="Extract genomic DNA 1" name="Extract Genomic DNA" version="2.2.3">
+  <description>using coordinates from assembled/unassembled genomes</description>
+  <requirements>
+    <requirement type="package">ucsc_tools</requirement>
+    <requirement type="binary">faToTwoBit</requirement>
+  </requirements>
+  <command>
+    python '$__tool_directory__/extract_genomic_dna.py' '${input}' '${out_file1}' -o ${out_format} -d '${dbkey}'
+
+    #if str( $interpret_features ) == "yes":
+      -I
+    #end if
+
+    ## Columns to use in input file.
+    #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
+      -1 "1,4,5,7" --gff
+    #else:
+      -1 "${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol},${input.metadata.nameCol}"
+    #end if
+
+    #if $seq_source.index_source == "cached":
+      ## Genomic data from cache.
+      -g '${GALAXY_DATA_INDEX_DIR}'
+    #else:
+      ## Genomic data from history.
+      -F '${seq_source.ref_file}'
+    #end if
+  </command>
+  <inputs>
+    <param format="interval,gff" name="input" type="data" label="Fetch sequences for intervals in"/>
+    <param name="interpret_features" type="select" label="Interpret features when possible" help="Only meaningful for GFF, GTF datasets.">
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <conditional name="seq_source">
+      <param name="index_source" type="select" label="Source for Genomic Data" help="If 'Locally cached' is selected, it will use a genomic reference file that matches the input file's dbkey. First it looks whether there are corresponding *.nib files in alignseq.loc. If that is not available, it searches for a corresponding *.2bit in twobit.loc.">
+        <option value="cached">Locally cached</option>
+        <option value="history">History</option>
+      </param>
+      <when value="cached">
+      </when>
+      <when value="history">
+        <param name="ref_file" type="data" format="fasta" label="Using reference file" />
+      </when>
+    </conditional>
+    <param name="out_format" type="select" label="Output data type">
+      <option value="fasta">FASTA</option>
+      <option value="interval">Interval</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format_source="input" name="out_file1" metadata_source="input">
+      <change_format>
+        <when input="out_format" value="fasta" format="fasta" />
+      </change_format>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="1.bed" dbkey="hg17" ftype="bed" />
+      <param name="interpret_features" value="yes"/>
+      <param name="index_source" value="cached"/>
+      <param name="out_format" value="fasta"/>
+      <output name="out_file1">
+        <assert_contents>
+          <!-- First few lines... -->
+          <has_text text=">hg17_chr1_147962192_147962580_- CCDS989.1_cds_0_0_chr1_147962193_r" />
+          <has_text text="ACTTGATCCTGCTCCCTCGGTGTCTGCATTGACTCCTCATGCTGGGACTG" />
+          <has_text text="GACCCGTCAACCCCCCTGCTCGCTGCTCACGTACCTTCATCACTTTTAGT" />
+          <has_text text="GATGATGCAACTTTCGAGGAATGGTTCCCCCAAGGGCGGCCCCCAAAAGT" />
+          <!-- Last few lines... -->
+          <has_text text="GCTGTGGCACAGAACATGGACTCTGTGTTTAAGGAGCTCTTGGGAAAGAC" />
+          <has_text text="CTCTGTCCGCCAGGGCCTTGGGCCAGCATCTACCACCTCTCCCAGTCCTG" />
+          <has_text text="GGCCCCGAAGCCCAAAGGCCCCGCCCAGCAGCCGCCTGGGCAGGAACAAA" />
+          <has_text text="GGCTTCTCCCGGGGCCCTGGGGCCCCAGCCTCACCCTCAGCTTCCCACCC" />
+          <has_text text="CCAGGGCCTAGACACGACCCCCAAGCCACACTGA" />
+        </assert_contents>
+      </output>
+    </test>
+    <test>
+      <param name="input" value="droPer1.bed" dbkey="droPer1" ftype="bed" />
+      <param name="interpret_features" value="yes"/>
+      <param name="index_source" value="cached"/>
[...]
+out1.gff" dbkey="mm9" ftype="gff" />
+      <param name="interpret_features" value="no"/>
+      <param name="out_format" value="fasta"/>
+      <param name="index_source" value="cached"/>
+      <output name="out_file1" file="extract_genomic_dna_out5.fasta" />
+    </test>
+    <!-- Test custom sequences support and GFF feature interpretation. -->
+    <test>
+      <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
+      <param name="interpret_features" value="no"/>
+      <param name="index_source" value="history"/>
+      <param name="ref_file" value="tophat_in1.fasta"/>
+      <param name="out_format" value="fasta"/>
+      <output name="out_file1" file="extract_genomic_dna_out6.fasta" />
+    </test>
+    <test>
+      <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
+      <param name="interpret_features" value="yes"/>
+      <param name="index_source" value="history"/>
+      <param name="ref_file" value="tophat_in1.fasta"/>
+      <param name="out_format" value="fasta"/>
+      <output name="out_file1" file="extract_genomic_dna_out7.fasta" />
+    </test>
+  </tests>
+  <help>
+.. class:: warningmark
+
+This tool requires interval or gff (special tabular formatted data). If your data is not TAB delimited, first use *Text Manipulation->Convert*.
+
+.. class:: warningmark
+
+Make sure that the genome build is specified for the dataset from which you are extracting sequences (click the pencil icon in the history item if it is not specified).
+
+.. class:: warningmark
+
+All of the following will cause a line from the input dataset to be skipped and a warning generated. The number of warnings and skipped lines is documented in the resulting history item.
+ - Any lines that do not contain at least 3 columns, a chromosome and numerical start and end coordinates.
+ - Sequences that fall outside of the range of a line's start and end coordinates.
+ - Chromosome, start or end coordinates that are invalid for the specified build.
+ - Any lines whose data columns are not separated by a **TAB** character ( other white-space characters are invalid ).
+
+.. class:: infomark
+
+**Extracting genomic DNA using coordinates from ASSEMBLED genomes and UNassembled genomes** was previously achieved by two separate tools.
+
+-----
+
+**What it does**
+
+This tool uses coordinate, strand, and build information to fetch genomic DNAs in FASTA or interval format.
+
+If strand is not defined, the default value is "+".
+
+-----
+
+**Example**
+
+If the input dataset is::
+
+    chr7  127475281  127475310  NM_000230  0  +
+    chr7  127485994  127486166  NM_000230  0  +
+    chr7  127486011  127486166  D49487     0  +
+
+Extracting sequences with **FASTA** output data type returns::
+
+    >hg17_chr7_127475281_127475310_+ NM_000230
+    GTAGGAATCGCAGCGCCAGCGGTTGCAAG
+    >hg17_chr7_127485994_127486166_+ NM_000230
+    GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCG
+    GATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATC
+    CAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAG
+    GATCAATGACATTTCACACACG
+    >hg17_chr7_127486011_127486166_+ D49487
+    TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGG
+    CCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGA
+    CACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCAC
+    ACACG
+
+Extracting sequences with **Interval** output data type returns::
+
+    chr7  127475281  127475310  NM_000230  0  +  GTAGGAATCGCAGCGCCAGCGGTTGCAAG
+    chr7  127485994  127486166  NM_000230  0  +  GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG
+    chr7  127486011  127486166  D49487     0  +  TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG
+  </help>
+</tool>
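The --gff flag passed by the command block above only changes the coordinate arithmetic applied on the way in and out. A one-function sketch of what that conversion amounts to for a start/end pair (the script itself delegates to galaxy's gff_util helper):

    def bed_to_gff_coords(start, end):
        """Interval/BED (0-based, half-open) -> GFF (1-based, inclusive)."""
        return start + 1, end

    def gff_to_bed_coords(start, end):
        """GFF (1-based, inclusive) -> interval/BED (0-based, half-open)."""
        return start - 1, end

    print(bed_to_gff_coords(127475281, 127475310))  # (127475282, 127475310)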
diff -r 000000000000 -r 7621d36a4e9c extract/liftOver_wrapper.py
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/extract/liftOver_wrapper.py   Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+# Guruprasad Ananda
+"""
+Converts coordinates from one build/assembly to another using liftOver binary and mapping files downloaded from UCSC.
+"""
+
+import os
+import re
+import subprocess
+import sys
+import tempfile
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+
+def safe_bed_file(infile):
+    """Make a BED file with track and browser lines ready for liftOver.
+
+    liftOver will fail with track or browser lines. We can make it happy
+    by converting these to comments. See:
+
+    https://lists.soe.ucsc.edu/pipermail/genome/2007-May/013561.html
+    """
+    fix_pat = re.compile("^(track|browser)")
+    (fd, fname) = tempfile.mkstemp()
+    in_handle = open(infile)
+    out_handle = open(fname, "w")
+    for line in in_handle:
+        if fix_pat.match(line):
+            line = "#" + line
+        out_handle.write(line)
+    in_handle.close()
+    out_handle.close()
+    return fname
+
+
+if len( sys.argv ) < 9:
+    stop_err( "USAGE: prog input out_file1 out_file2 input_dbkey output_dbkey infile_type minMatch multiple <minChainT> <minChainQ> <minSizeQ>" )
+
+infile = sys.argv[1]
+outfile1 = sys.argv[2]
+outfile2 = sys.argv[3]
+in_dbkey = sys.argv[4]
+mapfilepath = sys.argv[5]
+infile_type = sys.argv[6]
+gff_option = ""
+if infile_type == "gff":
+    gff_option = "-gff "
+minMatch = sys.argv[7]
+multiple = int(sys.argv[8])
+multiple_option = ""
+if multiple:
+    minChainT = sys.argv[9]
+    minChainQ = sys.argv[10]
+    minSizeQ = sys.argv[11]
+    multiple_option = " -multiple -minChainT=%s -minChainQ=%s -minSizeQ=%s " % (minChainT, minChainQ, minSizeQ)
+
+try:
+    assert float(minMatch)
+except:
+    minMatch = 0.1
+# ensure dbkey is set
+if in_dbkey == "?":
+    stop_err( "Input dataset genome build unspecified, click the pencil icon in the history item to specify it." )
+
+if not os.path.isfile( mapfilepath ):
+    stop_err( "%s mapping is not currently available." % ( mapfilepath.split('/')[-1].split('.')[0] ) )
+
+safe_infile = safe_bed_file(infile)
+cmd_line = "liftOver " + gff_option + "-minMatch=" + str(minMatch) + multiple_option + " " + safe_infile + " " + mapfilepath + " " + outfile1 + " " + outfile2 + " > /dev/null"
+
+try:
+    # have to nest try-except in try-finally to handle 2.4
+    try:
+        proc = subprocess.Popen( args=cmd_line, shell=True, stderr=subprocess.PIPE )
+        returncode = proc.wait()
+        stderr = proc.stderr.read()
+        if returncode != 0:
+            raise Exception(stderr)
+    except Exception as e:
+        raise Exception('Exception caught attempting conversion: ' + str( e ))
+finally:
+    os.remove(safe_infile)
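The wrapper assembles a single shell string and runs it with shell=True. A sketch of the same invocation with list arguments, one possible hardening rather than what the wrapper does: shell quoting problems disappear, and the "> /dev/null" redirection is replaced by discarding stdout in Python. Only documented liftOver flags from the wrapper are used:

    import subprocess

    def run_liftover(infile, mapfile, mapped, unmapped, min_match=0.95, gff=False):
        cmd = ["liftOver", "-minMatch=%s" % min_match]
        if gff:
            cmd.append("-gff")
        cmd += [infile, mapfile, mapped, unmapped]
        proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
        _, stderr = proc.communicate()
        if proc.returncode != 0:
            raise Exception(stderr.decode())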
diff -r 000000000000 -r 7621d36a4e9c extract/liftOver_wrapper.xml
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/extract/liftOver_wrapper.xml  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,144 @@
+<tool id="liftOver1" name="Convert genome coordinates" version="1.0.4">
+  <description> between assemblies and genomes</description>
+  <command interpreter="python">
+    liftOver_wrapper.py
+    $input
+    "$out_file1"
+    "$out_file2"
+    $dbkey
+    $to_dbkey
+    #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__) or isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gtf').__class__):
+      "gff"
+    #else:
+      "interval"
+    #end if
+    $minMatch ${multiple.choice} ${multiple.minChainT} ${multiple.minChainQ} ${multiple.minSizeQ}
+  </command>
+  <inputs>
+    <param format="interval,gff,gtf" name="input" type="data" label="Convert coordinates of">
+      <validator type="unspecified_build" />
+      <validator type="dataset_metadata_in_file" filename="liftOver.loc" metadata_name="dbkey" metadata_column="0" message="Liftover mappings are currently not available for the specified build." />
+    </param>
+    <param name="to_dbkey" type="select" label="To">
+      <options from_data_table="liftOver">
+        <filter type="data_meta" ref="input" key="dbkey" column="0" />
+      </options>
+    </param>
+    <param name="minMatch" size="10" type="float" value="0.95" label="Minimum ratio of bases that must remap" help="Recommended values: same species = 0.95, different species = 0.10" />
+    <conditional name="multiple">
+      <param name="choice" type="select" label="Allow multiple output regions?" help="Recommended values: same species = No, different species = Yes">
+        <option value="0" selected="true">No</option>
+        <option value="1">Yes</option>
+      </param>
+      <when value="0">
+        <param name="minSizeQ" type="hidden" value="0" />
+        <param name="minChainQ" type="hidden" value="0" />
+        <param name="minChainT" type="hidden" value="0" />
+      </when>
+      <when value="1">
+        <param name="minSizeQ" size="10" type="integer" value="0" label="Minimum matching region size in dataset" help="Recommended value: set to >= 300 bases for complete transcripts"/>
+        <param name="minChainQ" size="10" type="integer" value="500" label="Minimum chain size in dataset"/>
+        <param name="minChainT" size="10" type="integer" value="500" label="Minimum chain size in target"/>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" label="${tool.name} on ${on_string} [ MAPPED COORDINATES ]">
+      <actions>
+        <action type="metadata" name="dbkey">
+          <option type="from_data_table" name="liftOver" key="name" column="1" offset="0">
+            <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
+            <filter type="param_value" ref="to_dbkey" column="2"/>
+          </option>
+        </action>
+      </actions>
+    </data>
+    <data format="input" name="out_file2" label="${tool.name} on ${on_string} [ UNMAPPED COORDINATES ]" />
+  </outputs>
+  <requirements>
+    <requirement type="package">ucsc_tools</requirement>
+  </requirements>
+  <tests>
+    <!--
+    <test>
+      <param name="input" value="5.bed" dbkey="hg18" ftype="bed" />
+      <param name="to_dbkey" value="panTro2" />
+      <param name="minMatch" value="0.95" />
+      <param name="choice" value="0" />
+      <output name="out_file1" file="5_liftover_mapped.bed"/>
+      <output name="out_file2" file="5_liftover_unmapped.bed"/>
+    </test>
+    <test>
+      <param name="input" value="5.bed" dbkey="hg18" ftype="bed" />
+      <param name="to_dbkey" value="panTro2" />
+      <param name="minMatch" value="0.10" />
+      <param name="choice" value="1" />
+      <param name="minSizeQ" value="0" />
+      <param name="minChainQ" value="500" />
+      <param name="minChainT" value="500" />
+      <output name="out_file1" file="5_mult_liftover_mapped.bed"/>
+      <output name="out_file2" file="5_mult_liftover_unmapped.bed"/>
+    </test>
+    <test>
+      <param name="input" value="cuffcompare_in1.gtf" dbkey="hg18" ftype="gtf" />
+      <param name="to_dbkey" value="panTro2" />
+      <param name="minMatch" value="0.95" />
+      <param name="choice" value="0" />
+      <output name="out_file1" file="cuffcompare_in1_liftover_mapped.bed"/>
+      <output name="out_file2" file="cuffcompare_in1_liftover_unmapped.bed"/>
+    </test>
+    <test>
+      <param name="input" value="cuffcompare_in1.gtf" dbkey="hg18" ftype="gtf" />
+      <param name="to_dbkey" value="panTro2" />
+      <param name="minMatch" value="0.10" />
+      <param name="choice" value="1" />
+      <param name="minSizeQ" value="0" />
+      <param name="minChainQ" value="500" />
+      <param name="minChainT" value="500" />
+      <output name="out_file1" file="cuffcompare_in1_mult_liftover_mapped.bed"/>
+      <output name="out_file2" file="cuffcompare_in1_mult_liftover_unmapped.bed"/>
+    </test>
+    -->
+  </tests>
+  <help>
+.. class:: warningmark
+
+Make sure that the genome build of the input dataset is specified (click the pencil icon in the history item to set it if necessary).
+
+.. class:: warningmark
+
+This tool can work with interval, GFF, and GTF datasets. It requires the interval datasets to have chromosome in column 1,
+start co-ordinate in column 2 and end co-ordinate in column 3. BED comments
+and track and browser lines will be ignored, but if other non-interval lines
+are present the tool will return empty output datasets.
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool is based on the LiftOver utility and Chain track from `the UC Santa Cruz Genome Browser`__.
+
+It converts coordinates and annotations between assemblies and genomes. It produces 2 files, one containing all the mapped coordinates and the other containing the unmapped coordinates, if any.
+
+ .. __: http://genome.ucsc.edu/
+
+-----
+
+**Example**
+
+Converting the following hg16 intervals to hg18 intervals::
+
+    chrX  85170   112199  AK002185  0  +
+    chrX  110458  112199  AK097346  0  +
+    chrX  112203  121212  AK074528  0  -
+
+will produce the following hg18 intervals::
+
+    chrX  132991  160020  AK002185  0  +
+    chrX  158279  160020  AK097346  0  +
+    chrX  160024  169033  AK074528  0  -
+
+  </help>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c filters/CreateInterval.pl
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/filters/CreateInterval.pl Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,19 @@
+#! /usr/bin/perl -w
+
+# Accepts chrom, start, end, name, and strand
+# If strand is void sets it to plus
+# CreateInterval.pl $chrom $start $end $name $strand $output
+
+my $strand = "+";
+
+die "Not enough arguments\n" unless @ARGV == 6;
+
+open OUT, ">$ARGV[5]" or die "Cannot open $ARGV[5]:$!\n";
+
+$strand = "-" if $ARGV[4] eq "minus";
+$ARGV[3] =~ s/\s+/_/g;
+$ARGV[3] =~ s/\t+/_/g;
+
+print OUT "$ARGV[0]\t$ARGV[1]\t$ARGV[2]\t$ARGV[3]\t0\t$strand\n";
+close OUT;
+
diff -r 000000000000 -r 7621d36a4e9c filters/CreateInterval.xml
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/filters/CreateInterval.xml    Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,56 @@
+<tool id="createInterval" name="Create single interval" version="1.0.0">
+  <description>as a new dataset</description>
+  <command interpreter="perl">CreateInterval.pl $chrom $start $end "$name" $strand $out_file1</command>
+  <inputs>
+    <param name="chrom" size="20" type="text" value="chr7" label="Chromosome"/>
+    <param name="start" size="20" type="integer" value="100" label="Start position"/>
+    <param name="end" size="20" type="integer" value="1000" label="End position"/>
+    <param name="name" size="20" type="text" value="myInterval" label="Name"/>
+    <param name="strand" type="select" label="Strand" help="If your interval is strandless set strand to plus">
+      <option value="plus">plus</option>
+      <option value="minus">minus</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="bed" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="chrom" value="chr7"/>
+      <param name="start" value="100"/>
+      <param name="end" value="1000"/>
+      <param name="name" value="myinterval"/>
+      <param name="strand" value="plus"/>
+      <output name="out_file1" file="eq-createinterval.dat"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+**TIP**. Once your interval appears in history, you must tell Galaxy which genome it belongs to by clicking the pencil icon or the "?" link in the history item.
+
+-----
+
+**What it does**
+
+This tool allows you to create a single genomic interval. The resulting history item will be in the BED format.
+
+-----
+
+**Example**
+
+Typing the following values in the form::
+
+    Chromosome: chrX
+    Start position: 151087187
+    End position: 151370486
+    Name: NM_000808
+    Strand: minus
+
+will create a single interval::
+
+    chrX  151087187  151370486  NM_000808  0  -
+
+</help>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c filters/axt_to_concat_fasta.py
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/filters/axt_to_concat_fasta.py    Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+"""
+Adapted from bx/scripts/axt_to_concat_fasta.py
+"""
+from __future__ import print_function
+
+import sys
+
+import bx.align.axt
+
+
+def usage(s=None):
+    message = """
+axt_to_fasta species1 species2 < axt_file > fasta_file
+"""
+    if s is None:
+        sys.exit(message)
+    else:
+        sys.exit("%s\n%s" % (s, message))
+
+
+def main():
+    # check the command line
+    species1 = sys.argv[1]
+    species2 = sys.argv[2]
+
+    # convert the alignment blocks
+    reader = bx.align.axt.Reader(sys.stdin, support_ids=True,
+                                 species1=species1, species2=species2)
+    sp1text = list()
+    sp2text = list()
+    for a in reader:
+        sp1text.append(a.components[0].text)
+        sp2text.append(a.components[1].text)
+    sp1seq = "".join(sp1text)
+    sp2seq = "".join(sp2text)
+    print_component_as_fasta(sp1seq, species1)
+    print_component_as_fasta(sp2seq, species2)
+
+
+# TODO: this should be moved to a bx.align.fasta module
+def print_component_as_fasta(text, src):
+    header = ">" + src
+    print(header)
+    print(text)
+
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 7621d36a4e9c filters/axt_to_concat_fasta.xml
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/filters/axt_to_concat_fasta.xml   Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,66 @@
+<tool id="axt_to_concat_fasta" name="AXT to concatenated FASTA" version="1.0.0">
+  <description>Converts an AXT formatted file to a concatenated FASTA alignment</description>
+  <edam_operations>
+    <edam_operation>operation_3434</edam_operation>
+  </edam_operations>
+  <command interpreter="python">axt_to_concat_fasta.py $dbkey_1 $dbkey_2 < $axt_input > $out_file1</command>
+  <inputs>
+    <param format="axt" name="axt_input" type="data" label="AXT file"/>
+    <param name="dbkey_1" type="genomebuild" label="Genome"/>
+    <param name="dbkey_2" type="genomebuild" label="Genome"/>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="axt_input" value="1.axt" ftype="axt" />
+      <param name="dbkey_1" value="hg17" />
+      <param name="dbkey_2" value="panTro1" />
+      <output name="out_file1" file="axt_to_concat_fasta.dat" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+**IMPORTANT**: AXT formatted alignments will be phased out from Galaxy in the coming weeks. They will be replaced with pairwise MAF alignments, which are already available. To try pairwise MAF alignments use the "Extract Pairwise MAF blocks" tool in the *Fetch Sequences and Alignments* section.
+
+--------
+
+**Syntax**
+
+This tool converts an AXT formatted file to the FASTA format, and concatenates the results in the same build.
+
+- **AXT format** The alignments are produced from Blastz, an alignment tool available from Webb Miller's lab at Penn State University. The lav format Blastz output, which does not include the sequence, was converted to AXT format with lavToAxt. Each alignment block in an AXT file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines.
+
+- **FASTA format** a text-based format for representing both nucleic and protein sequences, in which base pairs or proteins are represented using a single-letter code.
+
+  - This format contains a one-line header. It starts with a ">" symbol. The first word on this line is the name of the sequence. The rest of the line is a description of the sequence.
+  - The remaining lines contain the sequence itself.
+  - Blank lines in a FASTA file are ignored, and so are spaces or other gap symbols (dashes, underscores, periods) in a sequence.
+  - Fasta files containing multiple sequences are just the same, with one sequence listed right after another. This format is accepted for many multiple sequence alignment programs.
+
+-----
+
+**Example**
+
+- AXT format::
+
+    0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500
+    TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA
+    TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA
+
+    1 chr19 3008279 3008357 chr11 70573976 70574054 - 3900
+    CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA
+    CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA
+
+- Convert the above file to concatenated FASTA format::
+
+    >hg16
+    TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGACACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA
+    >mm5
+    TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGACACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA
+
+  </help>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c filters/axt_to_fasta.py
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/filters/axt_to_fasta.py   Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+"""
+Adapted from bx/scripts/axt_to_fasta.py
+"""
+from __future__ import print_function
+
+import sys
+
+import bx.align.axt
+
+
+def usage(s=None):
+    message = """
+axt_to_fasta species1 species2 < axt_file > fasta_file
+"""
+    if s is None:
+        sys.exit(message)
+    else:
+        sys.exit("%s\n%s" % (s, message))
+
+
+def main():
+    # check the command line
+    species1 = sys.argv[1]
+    species2 = sys.argv[2]
+
+    # convert the alignment blocks
+    reader = bx.align.axt.Reader(sys.stdin, support_ids=True,
+                                 species1=species1, species2=species2)
+
+    for a in reader:
+        if ("id" in a.attributes):
+            id = a.attributes["id"]
+        else:
+            id = None
+        print_component_as_fasta(a.components[0], id)
+        print_component_as_fasta(a.components[1], id)
+        print()
+
+
+# TODO: this should be moved to a bx.align.fasta module
+def print_component_as_fasta(c, id=None):
+    header = ">%s_%s_%s" % (c.src, c.start, c.start + c.size)
+    if id is not None:
+        header += " " + id
+    print(header)
+    print(c.text)
+
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 7621d36a4e9c filters/axt_to_fasta.xml
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/filters/axt_to_fasta.xml  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,72 @@
+<tool id="axt_to_fasta" name="AXT to FASTA" version="1.0.0">
+  <description>Converts an AXT formatted file to FASTA format</description>
+  <edam_operations>
+    <edam_operation>operation_3434</edam_operation>
+  </edam_operations>
+  <command interpreter="python">axt_to_fasta.py $dbkey_1 $dbkey_2 < $axt_input > $out_file1</command>
+  <inputs>
+    <param format="axt" name="axt_input" type="data" label="AXT file"/>
+    <param name="dbkey_1" type="genomebuild" label="Genome"/>
+    <param name="dbkey_2" type="genomebuild" label="Genome"/>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="axt_input" value="1.axt" ftype="axt" />
+      <param name="dbkey_1" value="hg17" />
+      <param name="dbkey_2" value="panTro1" />
+      <output name="out_file1" file="axt_to_fasta.dat" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+**IMPORTANT**: AXT formatted alignments will be phased out from Galaxy in the coming weeks. They will be replaced with pairwise MAF alignments, which are already available. To try pairwise MAF alignments use the "Extract Pairwise MAF blocks" tool in the *Fetch Sequences and Alignments* section.
+
+--------
+
+**Syntax**
+
+This tool converts an AXT formatted file to the FASTA format.
+
+- **AXT format** The alignments are produced from Blastz, an alignment tool available from Webb Miller's lab at Penn State University. The lav format Blastz output, which does not include the sequence, was converted to AXT format with lavToAxt. Each alignment block in an AXT file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines.
+
+- **FASTA format** a text-based format for representing both nucleic and protein sequences, in which base pairs or proteins are represented using a single-letter code.
+
+  - This format contains a one-line header. It starts with a ">" symbol. The first word on this line is the name of the sequence. The rest of the line is a description of the sequence.
+  - The remaining lines contain the sequence itself.
+  - Blank lines in a FASTA file are ignored, and so are spaces or other gap symbols (dashes, underscores, periods) in a sequence.
+  - Fasta files containing multiple sequences are just the same, with one sequence listed right after another. This format is accepted for many multiple sequence alignment programs.
+
+-----
+
+**Example**
+
+- AXT format::
+
+    0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500
+    TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA
+    TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA
+
+    1 chr19 3008279 3008357 chr11 70573976 70574054 - 3900
+    CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA
+    CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA
+
+- Convert the above file to FASTA format::
+
+    >hg16.chr19(+):3001012-3001075|hg16_0
+    TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA
+    >mm5.chr11(-):70568380-70568443|mm5_0
+    TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA
+
+    >hg16.chr19(+):3008279-3008357|hg16_1
+    CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA
+    >mm5.chr11(-):70573976-70574054|mm5_1
+    CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA
+
+  </help>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c filters/axt_to_lav.py
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/filters/axt_to_lav.py Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,180 @@
+#!/usr/bin/env python
+"""
+Application to convert AXT file to LAV file
+-------------------------------------------
+
+:Author: Bob Harris (rsharris@bx.psu.edu)
+:Version: $Revision: $
+
+The application reads an AXT file from standard input and writes a LAV file to
+standard out; some statistics are written to standard error.
+"""
+from __future__ import print_function
+
+import sys
+
+import bx.align.axt
+import bx.align.lav
+
+
+def usage(s=None):
+    message = """
+axt_to_lav primary_spec secondary_spec [--silent] < axt_file > lav_file
+  Each spec is of the form seq_file[:species_name]:lengths_file.
+
+  seq_file should be a format string for the file names for the individual
+  sequences, with %s to be replaced by the alignment's src field. For example,
+  "hg18/%s.nib" would prescribe files named "hg18/chr1.nib", "hg18/chr2.nib",
+  etc.
+
+  species_name is optional. If present, it is prepended to the alignment's src
+  field.
+
+  Lengths files provide the length of each chromosome (lav format needs this
+  information but axt file does not contain it). The format is a series of
+  lines of the form
+    <chromosome name> <length>
+  The chromosome field in each axt block must match some <chromosome name> in
+  the lengths file.
+"""
+    if s is None:
+        sys.exit(message)
+    else:
+        sys.exit("%s\n%s" % (s, message))
+
+
+def main():
+    global debug
+
+    # parse the command line
+    primary = None
+    secondary = None
+    silent = False
+
+    # pick off options
+    args = sys.argv[1:]
+    seq_file2 = open(args.pop(-1), 'w')
+    seq_file1 = open(args.pop(-1), 'w')
+    lav_out = args.pop(-1)
+    axt_in = args.pop(-1)
+    while len(args) > 0:
+        arg = args.pop(0)
+        val = None
+        fields = arg.split("=", 1)
+        if len(fields) == 2:
+            arg = fields[0]
+            val = fields[1]
+            if val == "":
+                usage("missing a value in %s=" % arg)
+
+        if arg == "--silent" and val is None:
+            silent = True
+        elif primary is None and val is None:
+            primary = arg
+        elif secondary is None and val is None:
+            secondary = arg
+        else:
+            usage("unknown argument: %s" % arg)
+
+    if primary is None:
+        usage("missing primary file name and length")
+
+    if secondary is None:
+        usage("missing secondary file name and length")
+
+    try:
+        (primaryFile, primary, primaryLengths) = parse_spec(primary)
+    except:
+        usage("bad primary spec (must be seq_file[:species_name]:lengths_file")
+
+    try:
+        (secondaryFile, secondary, secondaryLengths) = parse_spec(secondary)
+    except:
+        usage("bad secondary spec (must be seq_file[:species_name]:lengths_file")
+
+    # read the lengths
+    speciesToLengths = {}
+    speciesToLengths[primary] = read_lengths(primaryLengths)
+    speciesToLengths[secondary] = read_lengths(secondaryLengths)
+
+    # read the alignments
+    out = bx.align.lav.Writer(open(lav_out, 'w'),
+                              attributes={ "name_format_1": primaryFile,
+                                           "name_format_2": secondaryFile })
+
+    axtsRead = 0
+    axtsWritten = 0
+    for axtBlock in bx.align.axt.Reader(
+            open(axt_in), species_to_lengths=speciesToLengths, species1=primary,
+            species2=secondary, support_ids=True):
+        axtsRead += 1
+        out.write(axtBlock)
+        primary_c = axtBlock.get_component_by_src_start(primary)
+        secondary_c = axtBlock.get_component_by_src_start(secondary)
+
+        print(">%s_%s_%s_%s" % (primary_c.src, secondary_c.strand, primary_c.start, primary_c.start + primary_c.size), file=seq_file1)
+        print(primary_c.text, file=seq_file1)
+        print(file=seq_file1)
+
+        print(">%s_%s_%s_%s" % (secondary_c.src, secondary_c.strand, secondary_c.start, secondary_c.start + secondary_c.size), file=seq_file2)
+        print(secondary_c.text, file=seq_file2)
+        print(file=seq_file2)
+        axtsWritten += 1
+
+    out.close()
+    seq_file1.close()
+    seq_file2.close()
+
+    if not silent:
+        sys.stdout.write("%d blocks read, %d written\n" % (axtsRead, axtsWritten))
+
+
+def parse_spec(spec):
+    """returns (seq_file,species_name,lengths_file)"""
+    fields = spec.split(":")
+    if len(fields) == 2:
+        return (fields[0], "", fields[1])
+    elif len(fields) == 3:
+        return (fields[0], fields[1], fields[2])
+    else:
+        raise ValueError
+
+
+def read_lengths(fileName):
+    chromToLength = {}
+
+    f = open(fileName, "r")
+
+    for lineNumber, line in enumerate(f):
+        line = line.strip()
+        if line == "":
+            continue
+        if line.startswith("#"):
+            continue
+
+        fields = line.split()
+        if len(fields) != 2:
+            raise Exception( "bad lengths line (%s:%d): %s" % (fileName, lineNumber, line) )
+
+        chrom = fields[0]
+        try:
+            length = int(fields[1])
+        except:
+            raise Exception( "bad lengths line (%s:%d): %s" % (fileName, lineNumber, line) )
+
+        if chrom in chromToLength:
+            raise Exception( "%s appears more than once (%s:%d): %s" % (chrom, fileName, lineNumber) )
+
+        chromToLength[chrom] = length
+
+    f.close()
+
+    return chromToLength
+
+
+if __name__ == "__main__":
+    main()
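parse_spec() above accepts either two or three colon-separated fields. A quick check of that contract, assuming parse_spec from the script and hypothetical file names:

    assert parse_spec("hg18/%s.nib:hg18:hg18.len") == ("hg18/%s.nib", "hg18", "hg18.len")
    assert parse_spec("hg18/%s.nib:hg18.len") == ("hg18/%s.nib", "", "hg18.len")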
diff -r 000000000000 -r 7621d36a4e9c filters/axt_to_lav.xml
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/filters/axt_to_lav.xml    Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,97 @@
+<tool id="axt_to_lav_1" name="AXT to LAV" version="1.0.0">
+  <description>Converts an AXT formatted file to LAV format</description>
+  <edam_operations>
+    <edam_operation>operation_3434</edam_operation>
+  </edam_operations>
+  <command interpreter="python">axt_to_lav.py /galaxy/data/$dbkey_1/seq/%s.nib:$dbkey_1:${GALAXY_DATA_INDEX_DIR}/shared/ucsc/chrom/${dbkey_1}.len /galaxy/data/$dbkey_2/seq/%s.nib:$dbkey_2:${GALAXY_DATA_INDEX_DIR}/shared/ucsc/chrom/${dbkey_2}.len $align_input $lav_file $seq_file1 $seq_file2</command>
+  <inputs>
+    <param name="align_input" type="data" format="axt" label="Alignment File" optional="False"/>
+    <param name="dbkey_1" type="genomebuild" label="Genome"/>
+    <param name="dbkey_2" type="genomebuild" label="Genome"/>
+  </inputs>
+  <outputs>
+    <data name="lav_file" format="lav"/>
+    <data name="seq_file1" format="fasta" parent="lav_file"/>
+    <data name="seq_file2" format="fasta" parent="lav_file"/>
+  </outputs>
+  <help>
+
+.. class:: warningmark
+
+**IMPORTANT**: AXT formatted alignments will be phased out from Galaxy in the coming weeks. They will be replaced with pairwise MAF alignments, which are already available. To try pairwise MAF alignments use the "Extract Pairwise MAF blocks" tool in the *Fetch Sequences and Alignments* section.
+
+--------
+
+**Syntax**
+
+This tool converts an AXT formatted file to the LAV format.
+
+- **AXT format** The alignments are produced from Blastz, an alignment tool available from Webb Miller's lab at Penn State University. The lav format Blastz output, which does not include the sequence, was converted to AXT format with lavToAxt. Each alignment block in an AXT file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines.
+
+- **LAV format** LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ.
+
+- **FASTA format** a text-based format for representing both nucleic and protein sequences, in which base pairs or proteins are represented using a single-letter code.
+
+  - This format contains a one-line header. It starts with a ">" symbol. The first word on this line is the name of the sequence. The rest of the line is a description of the sequence.
+  - The remaining lines contain the sequence itself.
+  - Blank lines in a FASTA file are ignored, and so are spaces or other gap symbols (dashes, underscores, periods) in a sequence.
+  - Fasta files containing multiple sequences are just the same, with one sequence listed right after another. This format is accepted for many multiple sequence alignment programs.
+
+-----
+
+**Example**
+
+- AXT format::
+
+    0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500
+    TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA
+    TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA
+
+    1 chr19 3008279 3008357 chr11 70573976 70574054 - 3900
+    CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA
+    CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA
+
+- Convert the above file to LAV format::
+
+    #:lav
+    s {
+      "/galaxy/data/hg16/seq/chr19.nib" 1 63811651 0 1
+      "/galaxy/data/mm5/seq/chr11.nib-" 1 121648857 0 1
+    }
+    h {
+      "> hg16.chr19"
+      "> mm5.chr11 (reverse complement)"
+    }
+    a {
+      s 3500
+      b 3001012 70568380
+      e 3001075 70568443
+      l 3001012 70568380 3001075 70568443 81
+    }
+    a {
+      s 3900
+      b 3008279 70573976
+      e 3008357 70574054
+      l 3008279 70573976 3008357 70574054 78
+    }
+    #:eof
+
+- With two files in the FASTA format::
+
+    >hg16.chr19_-_3001011_3001075
+    TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA
+
+    >hg16.chr19_-_3008278_3008357
+    CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA
+
+  **and**::
+
+    >mm5.chr11_-_70568379_70568443
+    TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA
+
+    >mm5.chr11_-_70573975_70574054
+    CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA
+  </help>
+  <code file="axt_to_lav_code.py"/>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c filters/axt_to_lav_code.py
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/filters/axt_to_lav_code.py    Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,6 @@
+
+def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
+    data = out_data["seq_file2"]
+    data.dbkey = param_dict['dbkey_2']
+    app.model.context.add( data )
+    app.model.context.flush()
diff -r 000000000000 -r 7621d36a4e9c filters/bed2gff.xml
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/filters/bed2gff.xml   Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,92 @@
+<tool id="bed2gff1" name="BED-to-GFF" version="2.0.0">
+  <description>converter</description>
+  <edam_operations>
+    <edam_operation>operation_3434</edam_operation>
+  </edam_operations>
+  <command interpreter="python">bed_to_gff_converter.py $input $out_file1</command>
+  <inputs>
+    <param format="bed" name="input" type="data" label="Convert this dataset"/>
+  </inputs>
+  <outputs>
+    <data format="gff" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="9.bed"/>
+      <output name="out_file1" file="bed2gff_out.gff"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool converts data from BED format to GFF format (scroll down for format description).
+
+--------
+
+**Example**
+
+The following data in BED format::
+
+    chr28  346187  388197  BC114771  0  +  346187  388197  0  9  144,81,115,63,155,96,134,105,112,  0,24095,26190,31006,32131,33534,36994,41793,41898,
+
+Will be converted to GFF (**note** that the start coordinate is incremented by 1)::
+
+    ##gff-version 2
+    ##bed_to_gff_converter.py
+
+    chr28  bed2gff  mRNA  346188  388197  0  +  .  mRNA BC114771;
+    chr28  bed2gff  exon  346188  346331  0  +  .  exon BC114771;
+    chr28  bed2gff  exon  370283  370363  0  +  .  exon BC114771;
+    chr28  bed2gff  exon  372378  372492  0  +  .  exon BC114771;
+    chr28  bed2gff  exon  377194  377256  0  +  .  exon BC114771;
+    chr28  bed2gff  exon  378319  378473  0  +  .  exon BC114771;
+    chr28  bed2gff  exon  379722  379817  0  +  .  exon BC114771;
+    chr28  bed2gff  exon  383182  383315  0  +  .  exon BC114771;
+    chr28  bed2gff  exon  387981  388085  0  +  .  exon BC114771;
+    chr28  bed2gff  exon  388086  388197  0  +  .  exon BC114771;
+
+------
+
+.. class:: infomark
+
+**About formats**
+
+**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones:
+
+The first three BED fields (required) are::
+
+    1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+    2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+    3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+
+The additional BED fields (optional) are::
+
+    4. name - The name of the BED line.
+    5. score - A score between 0 and 1000.
+    6. strand - Defines the strand - either '+' or '-'.
+    7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
+    8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
+    9. reserved - This should always be set to zero.
+    10. blockCount - The number of blocks (exons) in the BED line.
+    11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
+    12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
+    13. expCount - The number of experiments.
+    14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount.
+    15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount.
+
+**GFF format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields::
+
+    1. seqname - Must be a chromosome or scaffold.
+    2. source - The program that generated this feature.
+    3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon".
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. end - The ending position of the feature (inclusive).
+    6. score - A score between 0 and 1000. If there is no score value, enter ".".
+    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+    8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+    9. group - All lines with the same group are linked together into a single item.
+
+</help>
+</tool>
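The exon rows in the example output come straight from blockSizes/blockStarts arithmetic on the 12-column BED line. A standalone sketch of that computation, checked against the BC114771 numbers above:

    def bed_blocks_to_gff_exons(chrom_start, block_sizes, block_starts):
        gff_start = chrom_start + 1  # GFF counts from 1
        for size, rel_start in zip(block_sizes, block_starts):
            exon_start = gff_start + rel_start
            yield exon_start, exon_start + size - 1  # inclusive GFF end

    print(list(bed_blocks_to_gff_exons(346187, [144, 81], [0, 24095])))
    # [(346188, 346331), (370283, 370363)] -- the first two exon lines above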
diff -r 000000000000 -r 7621d36a4e9c filters/bed_to_bigbed.xml
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/filters/bed_to_bigbed.xml Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,58 @@
+<tool id="bed_to_bigBed" name="BED-to-bigBed" version="1.0.0">
+  <description>converter</description>
+  <edam_operations>
+    <edam_operation>operation_3434</edam_operation>
+  </edam_operations>
+  <command>bedToBigBed $input1 $chromInfo $out_file1
+    #if $settings.settingsType == "full":
+      -blockSize=${settings.blockSize} -itemsPerSlot=${settings.itemsPerSlot} ${settings.unc}
+    #end if
+    2>&1 || echo "Error running bedToBigBed." >&2
+  </command>
+  <requirements>
+    <requirement type="package">ucsc_tools</requirement>
+  </requirements>
+  <inputs>
+    <param format="bed" name="input1" type="data" label="Convert">
+      <validator type="unspecified_build" />
+    </param>
+    <conditional name="settings">
+      <param name="settingsType" type="select" label="Converter settings to use" help="Default settings should usually be used.">
+        <option value="preset">Default</option>
+        <option value="full">Full parameter list</option>
+      </param>
+      <when value="preset" />
+      <when value="full">
+        <param name="blockSize" size="4" type="integer" value="256" label="Items to bundle in r-tree" help="Default is 256 (blockSize)" />
+        <param name="itemsPerSlot" size="4" type="integer" value="512" label="Data points bundled at lowest level" help="Default is 512 (itemsPerSlot)" />
+        <param name="unc" type="boolean" truevalue="-unc" falsevalue="" checked="False" label="Do not use compression" help="(unc)"/>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="bigbed" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="7.bed" dbkey="hg17" />
+      <param name="settingsType" value="full" />
+      <param name="blockSize" value="256" />
+      <param name="itemsPerSlot" value="512" />
+      <param name="unc" value="False" />
+      <output name="out_file1" file="7.bigbed"/>
+    </test>
+    <test>
+      <param name="input1" value="7.bed" dbkey="hg17" />
+      <param name="settingsType" value="preset" />
+      <output name="out_file1" file="7.bigbed"/>
+    </test>
+  </tests>
+  <help>
+
+This tool converts a **sorted** BED file into a bigBed file.
+
+Currently, the bedFields option to specify the number of non-standard fields is not supported as an AutoSQL file must be provided, which is a format
+currently not supported by Galaxy.
+
+</help>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c filters/bed_to_gff_converter.py
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/filters/bed_to_gff_converter.py   Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# This code exists in 2 places: ~/datatypes/converters and ~/tools/filters
+from __future__ import print_function
+
+import sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+
+def __main__():
+    input_name = sys.argv[1]
+    output_name = sys.argv[2]
+    skipped_lines = 0
+    first_skipped_line = 0
+    out = open( output_name, 'w' )
+    out.write( "##gff-version 2\n" )
+    out.write( "##bed_to_gff_converter.py\n\n" )
+    i = 0
+    for i, line in enumerate( open( input_name ) ):
+        complete_bed = False
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ) and not line.startswith( 'track' ) and not line.startswith( 'browser' ):
+            try:
+                elems = line.split( '\t' )
+                if len( elems ) == 12:
+                    complete_bed = True
+                chrom = elems[0]
+                if complete_bed:
+                    feature = "mRNA"
+                else:
+                    try:
+                        feature = elems[3]
+                    except:
+                        feature = 'feature%d' % ( i + 1 )
+                start = int( elems[1] ) + 1
+                end = int( elems[2] )
+                try:
+                    score = elems[4]
+                except:
+                    score = '0'
+                try:
+                    strand = elems[5]
+                except:
+                    strand = '+'
+                try:
+                    group = elems[3]
+                except:
+                    group = 'group%d' % ( i + 1 )
+                if complete_bed:
+                    out.write( '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s %s;\n' % ( chrom, feature, start, end, score, strand, feature, group ) )
+                else:
+                    out.write( '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s;\n' % ( chrom, feature, start, end, score, strand, group ) )
+                if complete_bed:
+                    # We have all the info necessary to annotate exons for genes and mRNAs
+                    block_count = int( elems[9] )
+                    block_sizes = elems[10].split( ',' )
+                    block_starts = elems[11].split( ',' )
+                    for j in range( block_count ):
+                        exon_start = int( start ) + int( block_starts[j] )
+                        exon_end = exon_start + int( block_sizes[j] ) - 1
+                        out.write( '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\texon %s;\n' % ( chrom, exon_start, exon_end, score, strand, group ) )
+            except:
+                skipped_lines += 1
+                if not first_skipped_line:
+                    first_skipped_line = i + 1
+        else:
+            skipped_lines += 1
+            if not first_skipped_line:
+                first_skipped_line = i + 1
+    out.close()
+    info_msg = "%i lines converted to GFF version 2. " % ( i + 1 - skipped_lines )
+    if skipped_lines > 0:
+        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % ( skipped_lines, first_skipped_line )
+    print(info_msg)
+
+
+if __name__ == "__main__":
+    __main__()
diff -r 000000000000 -r 7621d36a4e9c filters/catWrapper.py
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/filters/catWrapper.py Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,32 @@ +#!/usr/bin/env python +# By Guruprasad Ananda. +import os +import shutil +import sys + + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + + +def main(): + outfile = sys.argv[1] + infile = sys.argv[2] + + if len(sys.argv) < 4: + shutil.copyfile(infile, outfile) + sys.exit() + + cmdline = "cat %s " % (infile) + for inp in sys.argv[3:]: + cmdline = cmdline + inp + " " + cmdline = cmdline + ">" + outfile + try: + os.system(cmdline) + except: + stop_err("Error encountered with cat.") + + +if __name__ == "__main__": + main() |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/catWrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/catWrapper.xml Mon Apr 30 01:37:51 2018 -0400 |
b |
@@ -0,0 +1,79 @@ +<tool id="cat1" name="Concatenate datasets" version="1.0.0"> + <description>tail-to-head</description> + <command interpreter="python"> + catWrapper.py + $out_file1 + $input1 + #for $q in $queries + ${q.input2} + #end for + </command> + <inputs> + <param name="input1" type="data" label="Concatenate Dataset"/> + <repeat name="queries" title="Dataset"> + <param name="input2" type="data" label="Select" /> + </repeat> + </inputs> + <outputs> + <data name="out_file1" format="input" metadata_source="input1"/> + </outputs> + <tests> + <test> + <param name="input1" value="1.bed"/> + <param name="input2" value="2.bed"/> + <output name="out_file1" file="cat_wrapper_out1.bed"/> + </test> + <!--TODO: if possible, enhance the underlying test code to handle this test + the problem is multiple params with the same name "input2" + <test> + <param name="input1" value="1.bed"/> + <param name="input2" value="2.bed"/> + <param name="input2" value="3.bed"/> + <output name="out_file1" file="cat_wrapper_out2.bed"/> + </test> + --> + </tests> + <help> + +.. class:: warningmark + +**WARNING:** Be careful not to concatenate datasets of different kinds (e.g., sequences with intervals). This tool does not check if the datasets being concatenated are in the same format. + +----- + +**What it does** + +Concatenates datasets + +----- + +**Example** + +Concatenating Dataset:: + + chrX 151087187 151087355 A 0 - + chrX 151572400 151572481 B 0 + + +with Dataset1:: + + chr1 151242630 151242955 X 0 + + chr1 151271715 151271999 Y 0 + + chr1 151278832 151279227 Z 0 - + +and with Dataset2:: + + chr2 100000030 200000955 P 0 + + chr2 100000015 200000999 Q 0 + + +will result in the following:: + + chrX 151087187 151087355 A 0 - + chrX 151572400 151572481 B 0 + + chr1 151242630 151242955 X 0 + + chr1 151271715 151271999 Y 0 + + chr1 151278832 151279227 Z 0 - + chr2 100000030 200000955 P 0 + + chr2 100000015 200000999 Q 0 + + + </help> +</tool> |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/changeCase.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/changeCase.pl Mon Apr 30 01:37:51 2018 -0400 |
[ |
@@ -0,0 +1,58 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; + +my $columns = {}; +my $del = ""; +my @in = (); +my @out = (); +my $command = ""; +my $field = 0; + +# a wrapper for changing the case of columns from within galaxy +# isaChangeCase.pl [filename] [columns] [delim] [casing] [output] + +die "Check arguments: $0 [filename] [columns] [delim] [casing] [output]\n" unless @ARGV == 5; + +# process column input +$ARGV[1] =~ s/\s+//g; +foreach ( split /,/, $ARGV[1] ) { + if (m/^c\d{1,}$/i) { + s/c//ig; + $columns->{$_} = --$_; + } +} + +die "No columns specified, columns are not preceeded with 'c', or commas are not used to separate column numbers: $ARGV[1]\n" if keys %$columns == 0; + +my $column_delimiters_href = { + 'TAB' => q{\t}, + 'COMMA' => ",", + 'DASH' => "-", + 'UNDERSCORE' => "_", + 'PIPE' => q{\|}, + 'DOT' => q{\.}, + 'SPACE' => q{\s+} +}; + +$del = $column_delimiters_href->{$ARGV[2]}; + +open (OUT, ">$ARGV[4]") or die "Cannot create $ARGV[4]:$!\n"; +open (IN, "<$ARGV[0]") or die "Cannot open $ARGV[0]:$!\n"; +while (<IN>) { + chop; + @in = split /$del/; + for ( my $i = 0; $i <= $#in; ++$i) { + if (exists $columns->{$i}) { + push(@out, $ARGV[3] eq 'up' ? uc($in[$i]) : lc($in[$i])); + } else { + push(@out, $in[$i]); + } + } + print OUT join("\t",@out), "\n"; + @out = (); +} +close IN; + +close OUT; |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/changeCase.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/changeCase.xml Mon Apr 30 01:37:51 2018 -0400 |
b |
@@ -0,0 +1,77 @@ +<tool id="ChangeCase" name="Change Case" version="1.0.0"> + <description> of selected columns</description> + <stdio> + <exit_code range="1:" err_level="fatal" /> + </stdio> + <command interpreter="perl">changeCase.pl $input "$cols" $delimiter $casing $out_file1</command> + <inputs> + <param name="input" format="txt" type="data" label="From"/> + <param name="cols" size="10" type="text" value="c1,c2" label="Change case of columns"/> + <param name="delimiter" type="select" label="Delimited by"> + <option value="TAB">Tab</option> + <option value="SPACE">Whitespace</option> + <option value="DOT">Dot</option> + <option value="COMMA">Comma</option> + <option value="DASH">Dash</option> + <option value="UNDERSCORE">Underscore</option> + <option value="PIPE">Pipe</option> + </param> + <param name="casing" type="select" label="To"> + <option value="up">Upper case</option> + <option value="lo">Lower case</option> + </param> + </inputs> + <outputs> + <data format="tabular" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input" value="1.txt" ftype="txt"/> + <param name="cols" value="c1"/> + <param name="delimiter" value="SPACE"/> + <param name="casing" value="up"/> + <output name="out_file1" file="changeCase_out1.tabular"/> + </test> + <test> + <param name="input" value="1.bed" ftype="bed"/> + <param name="cols" value="c1"/> + <param name="delimiter" value="TAB"/> + <param name="casing" value="up"/> + <output name="out_file1" file="changeCase_out2.tabular"/> + </test> + </tests> + <help> + +.. class:: warningmark + +**This tool breaks column assignments.** To re-establish column assignments run the tool and click on the pencil icon in the resulting history item. + +.. class:: warningmark + +The format of the resulting dataset from this tool is always tabular. + +----- + +**What it does** + +This tool selects specified columns from a dataset and converts the values of those columns to upper or lower case. + +- Columns are specified as **c1**, **c2**, and so on. +- Columns can be specified in any order (e.g., **c2,c1,c6**) + +----- + +**Example** + +Changing columns 1 and 3 ( delimited by Comma ) to upper case in:: + + apple,is,good + windows,is,bad + +will result in:: + + APPLE is GOOD + WINDOWS is BAD + + </help> +</tool> |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/commWrapper.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/commWrapper.pl Mon Apr 30 01:37:51 2018 -0400 |
b |
@@ -0,0 +1,19 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; +use File::Temp "tempfile"; +#use POSIX qw(tmpnam); + +my ($input1, $input2, $mode, $out_file1) = @ARGV; + +my ($fh, $file1) = tempfile(); +my ($fh1,$file2) = tempfile(); + +`sort $input1 > $file1`; +`sort $input2 > $file2`; +`comm $mode $file1 $file2 > $out_file1`; +`rm $file1 ; rm $file2`; + + + |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/commWrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/commWrapper.xml Mon Apr 30 01:37:51 2018 -0400 |
b |
@@ -0,0 +1,38 @@ +<tool id="Comm1" name="Find Similarities and Differences" version="1.0.0"> + <description>between two datasets</description> + <command interpreter="perl">commWrapper.pl $input1 $input2 $mode $out_file1</command> + <inputs> + <param format="tabular" name="input1" type="data" label="Compare Dataset1"/> + <param format="tabular" name="input2" type="data" label="with Dataset2"/> + <param name="mode" type="select" label="And find"> + <option value="-23">Lines unique to Dataset1</option> + <option value="-12">Lines shared between Dataset1 and Dataset2</option> + </param> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input1" /> + </outputs> + <help> +This tool is based on UNIX shell command comm. It compares two datasets and returns similarities or differences. For example, if you have two datasets:: + + a 1 + b 2 + c 3 + +and:: + + a 1 + f 6 + h 8 + +Using this tool with **Lines unique to Dataset1** option will return:: + + b 2 + c 3 + +If you use **Lines shared between Dataset1 and Dataset2** option output will look like this:: + + a 1 + +</help> +</tool> \ No newline at end of file |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/compare.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/compare.xml Mon Apr 30 01:37:51 2018 -0400 |
b |
@@ -0,0 +1,79 @@ +<tool id="comp1" name="Compare two Datasets" version="1.0.2"> + <description>to find common or distinct rows</description> + <command interpreter="python">joinWrapper.py $input1 $input2 $field1 $field2 $mode $out_file1</command> + <inputs> + <param format="tabular" name="input1" type="data" label="Compare"/> + <param name="field1" label="Using column" type="data_column" data_ref="input1"> + <validator type="no_options" message="Invalid column choice. Please try again after editing metadata of your input dataset by clicking on the pencil icon next to it."/> + </param> + <param format="tabular" name="input2" type="data" label="against" /> + <param name="field2" label="and column" type="data_column" data_ref="input2"> + <validator type="no_options" message="Invalid column choice. Please try again after editing metadata of your input dataset by clicking on the pencil icon next to it."/> + </param> + <param name="mode" type="select" label="To find" help="See examples below for explanation of these options"> + <option value="N">Matching rows of 1st dataset</option> + <option value="V">Non Matching rows of 1st dataset</option> + </param> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input1" /> + </outputs> + <tests> + <test> + <param name="input1" value="1.bed"/> + <param name="input2" value="2.bed"/> + <param name="field1" value="2"/> + <param name="field2" value="2"/> + <param name="mode" value="N"/> + <output name="out_file1" file="fs-compare.dat"/> + </test> + <!--test case with duplicated key values--> + <test> + <param name="input1" value="1.bed"/> + <param name="input2" value="3.bed"/> + <param name="field1" value="1"/> + <param name="field2" value="1"/> + <param name="mode" value="V"/> + <output name="out_file1" file="fs-compare-2.dat"/> + </test> + </tests> + <help> + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* + +----- + +**Syntax** + +This tool finds lines in one dataset that HAVE or DO NOT HAVE a common field with another dataset. + +----- + +**Example** + +If this is **First dataset**:: + + chr1 10 20 geneA + chr1 50 80 geneB + chr5 10 40 geneL + +and this is **Second dataset**:: + + geneA tumor-suppressor + geneB Foxp2 + geneC Gnas1 + geneE INK4a + +Finding lines of the **First dataset** whose 4th column matches the 1st column of the **Second dataset** yields:: + + chr1 10 20 geneA + chr1 50 80 geneB + +Conversely, using option **Non Matching rows of First dataset** on the same fields will yield:: + + chr5 10 40 geneL + +</help> +</tool> |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/condense_characters.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/condense_characters.pl Mon Apr 30 01:37:51 2018 -0400 |
[ |
@@ -0,0 +1,105 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; + +# condenses all consecutive characters of one type +# convert_characters.pl [input] [character] [output] + +die "Check arguments" unless @ARGV == 3; + +my $inputfile = $ARGV[0]; +my $character = $ARGV[1]; +my $outputfile = $ARGV[2]; + + +my $convert_from; +my $convert_to; + + +if ($character eq "s") +{ + $convert_from = '\s'; +} +elsif ($character eq "T") +{ + $convert_from = '\t'; +} +elsif ($character eq "Sp") +{ + $convert_from = " "; +} +elsif ($character eq "Dt") +{ + $convert_from = '\.'; +} +elsif ($character eq "C") +{ + $convert_from = ","; +} +elsif ($character eq "D") +{ + $convert_from = "-"; +} +elsif ($character eq "U") +{ + $convert_from = "_"; +} +elsif ($character eq "P") +{ + $convert_from = '\|'; +} +else +{ + die "Invalid value specified for convert from\n"; +} + + +if ($character eq "T") +{ + $convert_to = "\t"; +} +elsif ($character eq "Sp") +{ + $convert_to = " "; +} +elsif ($character eq "Dt") +{ + $convert_to = "\."; +} +elsif ($character eq "C") +{ + $convert_to = ","; +} +elsif ($character eq "D") +{ + $convert_to = "-"; +} +elsif ($character eq "U") +{ + $convert_to = "_"; +} +elsif ($character eq "P") +{ + $convert_to = "|"; +} +else +{ + die "Invalid value specified for Convert to\n"; +} + +my $fhIn; +open ($fhIn, "< $inputfile") or die "Cannot open source file"; + +my $fhOut; +open ($fhOut, "> $outputfile"); + +while (<$fhIn>) +{ + my $thisLine = $_; + chomp $thisLine; + $thisLine =~ s/${convert_from}+/$convert_to/g; + print $fhOut $thisLine,"\n"; +} +close ($fhIn) or die "Cannot close source file"; +close ($fhOut) or die "Cannot close output file"; |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/condense_characters.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/condense_characters.xml Mon Apr 30 01:37:51 2018 -0400 |
b |
@@ -0,0 +1,48 @@ +<tool id="Condense characters1" name="Condense" version="1.0.0"> + <description>consecutive characters</description> + <command interpreter="perl">condense_characters.pl $input $character $out_file1</command> + <inputs> +<!-- <display>condense all consecutive $character from $input</display> --> + <param name="character" type="select" label="Condense all consecutive"> + <option value="T">Tabs</option> + <option value="Sp">Spaces</option> + <option value="Dt">Dots</option> + <option value="C">Commas</option> + <option value="D">Dashes</option> + <option value="U">Underscores</option> + <option value="P">Pipes</option> + </param> + <param format="txt" name="input" type="data" label="in this Dataset"/> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input" /> + </outputs> + <tests> + <test> + <param name="character" value="T"/> + <param name="input" value="1.bed"/> + <output name="out_file1" file="eq-condense.dat"/> + </test> + </tests> + <help> + +**What it does** + +This tool condenses all consecutive characters of a specified type. + +----- + +**Example** + +- Input file:: + + geneX,,,10,,,,,20 + geneY,,5,,,,,12,15,9, + +- Condense all consecutive commas. The above file will be converted into:: + + geneX,10,20 + geneY,5,12,15,9 + +</help> +</tool> |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/convert_characters.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/convert_characters.pl Mon Apr 30 01:37:51 2018 -0400 |
[ |
@@ -0,0 +1,101 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; + +# converts all characters of one type into another +# convert_characters.pl [input] [convert_from] [convert_to] [output] + +die "Check argument\n" unless @ARGV == 4; + +my $inputfile = $ARGV[0]; +my $convert_from = $ARGV[1]; +my $convert_to = $ARGV[2]; +my $outputfile = $ARGV[3]; + +if ($convert_from eq "s") +{ + $convert_from = '\s'; +} +elsif ($convert_from eq "T") +{ + $convert_from = '\t'; +} +elsif ($convert_from eq "Sp") +{ + $convert_from = '\s'; +} +elsif ($convert_from eq "Dt") +{ + $convert_from = '\.'; +} +elsif ($convert_from eq "C") +{ + $convert_from = ","; +} +elsif ($convert_from eq "D") +{ + $convert_from = "-"; +} +elsif ($convert_from eq "U") +{ + $convert_from = "_"; +} +elsif ($convert_from eq "P") +{ + $convert_from = '\|'; +} +else +{ + die "Invalid value specified for convert from\n"; +} + + +if ($convert_to eq "T") +{ + $convert_to = "\t"; +} +elsif ($convert_to eq "Sp") +{ + $convert_to = '\s'; +} +elsif ($convert_to eq "Dt") +{ + $convert_to = "\."; +} +elsif ($convert_to eq "C") +{ + $convert_to = ","; +} +elsif ($convert_to eq "D") +{ + $convert_to = "-"; +} +elsif ($convert_to eq "U") +{ + $convert_to = "_"; +} +elsif ($convert_to eq "P") +{ + $convert_to = "|"; +} +else +{ + die "Invalid value specified for convert to\n"; +} + +my $fhIn; +open ($fhIn, "< $inputfile") or die "Cannot open source file"; + +my $fhOut; +open ($fhOut, "> $outputfile"); + +while (<$fhIn>) +{ + my $thisLine = $_; + chomp $thisLine; + $thisLine =~ s/$convert_from{1,}/$convert_to/g; + print $fhOut $thisLine,"\n"; +} +close ($fhIn) or die "Cannot close source file\n"; +close ($fhOut) or die "Cannot close output fil\n"; |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/convert_characters.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/convert_characters.py Mon Apr 30 01:37:51 2018 -0400 |
[ |
@@ -0,0 +1,54 @@ +#!/usr/bin/env python +# By, Guruprasad Ananda. +from __future__ import print_function + +import optparse +import re + + +def __main__(): + parser = optparse.OptionParser() + parser.add_option('--strip', action='store_true', + help='strip leading and trailing whitespaces') + parser.add_option('--condense', action='store_true', + help='condense consecutive delimiters') + (options, args) = parser.parse_args() + if len(args) != 3: + parser.error("usage: convert_characters.py infile from_char outfile") + + char_dict = { + 'T': '\t', + 's': '\s', + 'Dt': '\.', + 'C': ',', + 'D': '-', + 'U': '_', + 'P': '\|', + 'Co': ':', + 'Sc': ';' + } + # regexp to match 1 or more occurences. + from_char = args[1] + from_ch = char_dict[from_char] + if options.condense: + from_ch += '+' + + skipped = 0 + with open(args[0], 'rU') as fin: + with open(args[2], 'w') as fout: + for line in fin: + if options.strip: + line = line.strip() + else: + line = line.rstrip('\n') + try: + fout.write("%s\n" % (re.sub(from_ch, '\t', line))) + except: + skipped += 1 + + if skipped: + print("Skipped %d lines as invalid." % skipped) + + +if __name__ == "__main__": + __main__() |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/convert_characters.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/convert_characters.xml Mon Apr 30 01:37:51 2018 -0400 |
b |
@@ -0,0 +1,77 @@ +<tool id="Convert characters1" name="Convert" version="1.0.0"> + <description>delimiters to TAB</description> + <command interpreter="python"> +convert_characters.py +#if $strip + --strip +#end if +#if $condense + --condense +#end if +$input $convert_from $out_file1 + </command> + <inputs> + <param name="convert_from" type="select" label="Convert all"> + <option value="s">Whitespaces</option> + <option value="T">Tabs</option> + <!--<option value="Sp">Spaces</option>--> + <option value="Dt">Dots</option> + <option value="C">Commas</option> + <option value="D">Dashes</option> + <option value="U">Underscores</option> + <option value="P">Pipes</option> + <option value="Co">Colons</option> + <option value="Sc">Semicolons</option> + </param> + <param format="txt" name="input" type="data" label="in Dataset"/> + <param name="strip" type="boolean" checked="true" label="Strip leading and trailing whitespaces" /> + <param name="condense" type="boolean" checked="true" label="Condense consecutive delimiters in one TAB" /> + </inputs> + <outputs> + <data format="tabular" name="out_file1" /> + </outputs> + <stdio> + <exit_code range="1:" level="fatal" /> + </stdio> + <tests> + <test> + <param name="convert_from" value="s"/> + <param name="input" value="1.bed"/> + <param name="strip" value="true" /> + <param name="condense" value="true" /> + <output name="out_file1" file="eq-convert.dat"/> + </test> + <test> + <param name="convert_from" value="s"/> + <param name="input" value="a.txt"/> + <param name="strip" value="true" /> + <param name="condense" value="true" /> + <output name="out_file1" file="a.tab"/> + </test> + </tests> + <help> + +**What it does** + +Converts all delimiters of a specified type into TABs. Consecutive delimiters can be condensed in a single TAB. + +----- + +**Example** + +- Input file:: + + chrX||151283558|151283724|NM_000808_exon_8_0_chrX_151283559_r|0|- + chrX|151370273|151370486|NM_000808_exon_9_0_chrX_151370274_r|0|- + chrX|151559494|151559583|NM_018558_exon_1_0_chrX_151559495_f|0|+ + chrX|151564643|151564711|NM_018558_exon_2_0_chrX_151564644_f||||0|+ + +- Converting all pipe delimiters of the above file to TABs and condensing delimiters will get:: + + chrX 151283558 151283724 NM_000808_exon_8_0_chrX_151283559_r 0 - + chrX 151370273 151370486 NM_000808_exon_9_0_chrX_151370274_r 0 - + chrX 151559494 151559583 NM_018558_exon_1_0_chrX_151559495_f 0 + + chrX 151564643 151564711 NM_018558_exon_2_0_chrX_151564644_f 0 + + + </help> +</tool> |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/cutWrapper.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/cutWrapper.pl Mon Apr 30 01:37:51 2018 -0400 |
[ |
@@ -0,0 +1,87 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; + +my @columns = (); +my $del = ""; +my @in = (); +my @out = (); +my $command = ""; +my $field = 0; +my $start = 0; +my $end = 0; +my $i = 0; + +# a wrapper for cut for use in galaxy +# cutWrapper.pl [filename] [columns] [delim] [output] + +die "Check arguments\n" unless @ARGV == 4; + +$ARGV[1] =~ s/\s+//g; +foreach ( split /,/, $ARGV[1] ) { + if (m/^c\d{1,}$/i) { + push (@columns, $_); + $columns[@columns-1] =~s/c//ig; + } elsif (m/^c\d{1,}-c\d{1,}$/i) { + ($start, $end) = split(/-/, $_); + $start =~ s/c//ig; + $end =~ s/c//ig; + for $i ($start .. $end) { + push (@columns, $i); + } + } +} + +die "No columns specified, columns are not preceded with 'c', or commas are not used to separate column numbers: $ARGV[1]\n" if @columns == 0; + +my $column_delimiters_href = { + 'T' => q{\t}, + 'C' => ",", + 'D' => "-", + 'U' => "_", + 'P' => q{\|}, + 'Dt' => q{\.}, + 'Sp' => q{\s+} +}; + +$del = $column_delimiters_href->{$ARGV[2]}; + +open (OUT, ">$ARGV[3]") or die "Cannot create $ARGV[2]:$!\n"; +open (IN, "<$ARGV[0]") or die "Cannot open $ARGV[0]:$!\n"; + +while (my $line=<IN>) { + if ($line =~ /^#/) { + #Ignore comment lines + } else { + chop($line); + @in = split(/$del/, $line); + foreach $field (@columns) { + if (defined($in[$field-1])) { + push(@out, $in[$field-1]); + } else { + push(@out, "."); + } + } + print OUT join("\t",@out), "\n"; + @out = (); + } +} + +#while (<IN>) { +# chop; +# @in = split /$del/; +# foreach $field (@columns) { +# if (defined($in[$field-1])) { +# push(@out, $in[$field-1]); +# } else { +# push(@out, "."); +# } +# } +# print OUT join("\t",@out), "\n"; +# @out = (); +#} +close IN; + +close OUT; + |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/cutWrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/cutWrapper.xml Mon Apr 30 01:37:51 2018 -0400 |
b |
b'@@ -0,0 +1,211 @@\n+<tool id="Cut1" name="Cut" version="1.0.2">\n+ <description>columns from a table</description>\n+ <command interpreter="perl">cutWrapper.pl "${input}" "${columnList}" "${delimiter}" "${out_file1}"</command>\n+ <inputs>\n+ <param name="columnList" type="text" value="c1,c2" label="Cut columns"/>\n+ <param name="delimiter" type="select" label="Delimited by">\n+ <option value="T">Tab</option>\n+ <option value="Sp">Whitespace</option>\n+ <option value="Dt">Dot</option>\n+ <option value="C">Comma</option>\n+ <option value="D">Dash</option>\n+ <option value="U">Underscore</option>\n+ <option value="P">Pipe</option>\n+ </param>\n+ <param format="txt" name="input" type="data" label="From"/>\n+ </inputs>\n+ <outputs>\n+ <data format="tabular" name="out_file1" >\n+ <actions>\n+ <conditional name="delimiter">\n+ <when value="T">\n+ <conditional name="input">\n+ <when datatype_isinstance="interval">\n+ <action type="format" default="tabular">\n+ <option type="from_param" name="columnList" column="0" offset="0"> <!-- chromCol is 1-->\n+\n+ <filter type="insert_column" column="0" value="interval"/>\n+\n+ <filter type="insert_column" ref="columnList" /> <!-- startCol -->\n+\n+ <filter type="insert_column" ref="columnList" /> <!-- endCol -->\n+\n+ <filter type="multiple_splitter" column="1" separator=","/>\n+ <filter type="column_strip" column="1"/> <!-- get rid of all external whitespace -->\n+ <filter type="string_function" column="1" name="lower" />\n+ <filter type="param_value" column="1" value="^c\\d{1,}$" compare="re_search" keep="True"/>\n+ <filter type="column_strip" column="1" strip="c"/> <!-- get rid of c\'s -->\n+ <filter type="boolean" column="1" cast="int" />\n+\n+ <filter type="multiple_splitter" column="2" separator=","/>\n+ <filter type="column_strip" column="2"/> <!-- get rid of all external whitespace -->\n+ <filter type="string_function" column="2" name="lower" />\n+ <filter type="param_value" column="2" value="^c\\d{1,}$" compare="re_search" keep="True"/>\n+ <filter type="column_strip" column="2" strip="c"/> <!-- get rid of c\'s -->\n+ <filter type="boolean" column="2" cast="int" />\n+\n+ <filter type="multiple_splitter" column="3" separator=","/>\n+ <filter type="column_strip" column="3"/> <!-- get rid of all external whitespace -->\n+ <filter type="string_function" column="3" name="lower" />\n+ <filter type="param_value" column="3" value="^c\\d{1,}$" compare="re_search" keep="True"/>\n+ <filter type="column_strip" column="3" strip="c"/> <!-- get rid of c\'s -->\n+ <filter type="boolean" column="3" cast="int" />\n+\n+ <filter type="metadata_value" ref="input" name="chromCol" column="1" />\n+ <filter type="metadata_value" ref="input" name="startCol" column="2" />\n+ <filter type="metadata_value" ref="input" name="endCol" column="3" />\n+\n+ </option>\n+ </action>\n+\n+ <conditional name="out_file1">\n+ <when datatype_isinstance="interval">\n+ <action type="metadata" name="chromCol">\n+ <option type="from_param" name="columnList" column="0" offset="0"> <!-- chromCol is 0-->\n+ <filter type="multiple_splitter" column="0" separator=","/>\n+ <filter type="column_strip" column="0"/> <!-- get rid of all external whitespace -->\n+ <filter type="string_function" column="0" name="lower" />\n+ <filter type="param_'..b'="metadata" name="nameCol" default="0">\n+ <option type="from_param" name="columnList" column="0" offset="0"> <!-- nameCol is 0-->\n+ <filter type="multiple_splitter" column="0" separator=","/>\n+ <filter type="column_strip" column="0"/> <!-- get rid of all 
external whitespace -->\n+ <filter type="string_function" column="0" name="lower" />\n+ <filter type="param_value" column="0" value="^c\\d{1,}$" compare="re_search" keep="True"/>\n+ <filter type="column_strip" column="0" strip="c"/> <!-- get rid of c\'s -->\n+ <filter type="insert_column" value="1" iterate="True" column="0"/>\n+ <filter type="boolean" column="1" cast="int" />\n+ <filter type="metadata_value" ref="input" name="nameCol" column="1" />\n+ </option>\n+ </action>\n+\n+ <action type="metadata" name="strandCol" default="0">\n+ <option type="from_param" name="columnList" column="0" offset="0"> <!-- strandCol is 0-->\n+ <filter type="multiple_splitter" column="0" separator=","/>\n+ <filter type="column_strip" column="0"/> <!-- get rid of all external whitespace -->\n+ <filter type="string_function" column="0" name="lower" />\n+ <filter type="param_value" column="0" value="^c\\d{1,}$" compare="re_search" keep="True"/>\n+ <filter type="column_strip" column="0" strip="c"/> <!-- get rid of c\'s -->\n+ <filter type="insert_column" value="1" iterate="True" column="0"/>\n+ <filter type="boolean" column="1" cast="int" />\n+ <filter type="metadata_value" ref="input" name="strandCol" column="1" />\n+ </option>\n+ </action>\n+ </when>\n+ </conditional>\n+\n+ </when>\n+ </conditional>\n+ </when>\n+ </conditional>\n+ </actions>\n+ </data>\n+ </outputs>\n+ <tests>\n+ <test>\n+ <param name="columnList" value="c1,c4,c2,c3"/>\n+ <param name="delimiter" value="T"/>\n+ <param name="input" value="1.bed"/>\n+ <output name="out_file1" file="eq-cut.dat"/>\n+ </test>\n+ <test>\n+ <param name="columnList" value="c1,c4,c2-c3" />\n+ <param name="delimiter" value="T" />\n+ <param name="input" value="1.bed" />\n+ <output name="out_file1" file="eq-cut.dat" />\n+ </test>\n+ </tests>\n+ <help>\n+\n+.. class:: warningmark\n+\n+**WARNING: This tool breaks column assignments.** To re-establish column assignments run the tools and click on the pencil icon in the latest history item.\n+\n+.. class:: infomark\n+\n+The output of this tool is always in tabular format (e.g., if your original delimiters are commas, they will be replaced with tabs). For example:\n+\n+ Cutting columns 1 and 3 from::\n+\n+ apple,is,good\n+ windows,is,bad\n+\n+ will give::\n+\n+ apple good\n+ windows bad\n+\n+-----\n+\n+**What it does**\n+\n+This tool selects (cuts out) specified columns from the dataset.\n+\n+- Columns are specified as **c1**, **c2**, and so on. Column count begins with **1**\n+- Columns can be specified in any order (e.g., **c2,c1,c6**)\n+- If you specify more columns than actually present - empty spaces will be filled with dots\n+\n+-----\n+\n+**Example**\n+\n+Input dataset (six columns: c1, c2, c3, c4, c5, and c6)::\n+\n+ chr1 10 1000 gene1 0 +\n+ chr2 100 1500 gene2 0 +\n+\n+**cut** on columns "**c1,c4,c6**" will return::\n+\n+ chr1 gene1 +\n+ chr2 gene2 +\n+\n+**cut** on columns "**c6,c5,c4,c1**" will return::\n+\n+ + 0 gene1 chr1\n+ + 0 gene2 chr2\n+\n+**cut** on columns "**c1-c3**" will return::\n+\n+ chr1 10 1000\n+ chr2 100 1500\n+\n+\n+**cut** on columns "**c8,c7,c4**" will return::\n+\n+ . . gene1\n+ . . gene2\n+</help>\n+</tool>\n' |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/fileGrep.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/fileGrep.xml Mon Apr 30 01:37:51 2018 -0400 |
b |
@@ -0,0 +1,42 @@ +<tool id="fileGrep1" name="Match" version="1.0.0"> + <description>a column from one Query against another Query</description> + <command>cut -f $col $input1 | grep -f - $match $input2 > $out_file1</command> + <inputs> + <param name="col" size="2" type="text" value="1" label="Match content of column"/> + <param format="tabular" name="input1" type="data" label="From Query1"/> + <param format="tabular" name="input2" type="data" label="Against Query2"/> + <param name="match" type="select" label="and return rows that"> + <option value="">Match</option> + <option value="-v">Do not match</option> + </param> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input2" /> + </outputs> + <help> +This tool is based on UNIX command grep with option -f. It matches content of one query against another. For example, assume you have two queries - one that contains EST accession numbers and some other information:: + + AA001229 12 12 + A001501 7 7 + AA001641 6 6 + AA001842 6 6 + AA002047 6 6 + AA004638 3 3 + +and another that is a typical BED file describing genomic location of some ESTs:: + + chr7 115443235 115443809 CA947954_exon_0_0_chr7_115443236_f 0 + + chr7 115443236 115443347 DB338189_exon_0_0_chr7_115443237_f 0 + + chr7 115443347 115443768 DB338189_exon_1_0_chr7_115443348_f 0 + + chr7 115443239 115443802 AA001842_exon_0_0_chr7_115443240_f 0 + + chr7 115443243 115443347 DB331869_exon_0_0_chr7_115443244_f 0 + + chr7 115443347 115443373 DB331869_exon_1_0_chr7_115443348_f 0 + + +Using this tool you will be able to tell how many ESTs in Query1 are also preset in Query2 and will output this:: + + chr7 115443239 115443802 AA001842_exon_0_0_chr7_115443240_f 0 + +if **Match** option is chosen. + +</help> +</tool> \ No newline at end of file |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/fixedValueColumn.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/fixedValueColumn.pl Mon Apr 30 01:37:51 2018 -0400 |
[ |
@@ -0,0 +1,34 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; + +# fixedValueColumn.pl $input $out_file1 "expression" "iterate [yes|no]" + +my ($input, $out_file1, $expression, $iterate) = @ARGV; +my $i = 0; +my $numeric = 0; + +die "Check arguments\n" unless @ARGV == 4; + +open (DATA, "<$input") or die "Cannot open $input:$!\n"; +open (OUT, ">$out_file1") or die "Cannot create $out_file1:$!\n"; + +if ($expression =~ m/^\d+$/) { + $numeric = 1; + $i = $expression; +} + +while (<DATA>) { + chop; + if ($iterate eq "no") { + print OUT "$_\t$expression\n"; + } else { + print OUT "$_\t$i\n" if $numeric == 1; + print OUT "$_\t$expression-$i\n" if $numeric == 0; + ++$i; + } +} + +close DATA; +close OUT; |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/fixedValueColumn.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/fixedValueColumn.xml Mon Apr 30 01:37:51 2018 -0400 |
b |
@@ -0,0 +1,61 @@ +<tool id="addValue" name="Add column" version="1.0.0"> + <description>to an existing dataset</description> + <command interpreter="perl">fixedValueColumn.pl $input $out_file1 "$exp" $iterate</command> + <inputs> + <param name="exp" size="20" type="text" value="1" label="Add this value"/> + <param format="tabular" name="input" type="data" label="to Dataset" help="Dataset missing? See TIP below" /> + <param name="iterate" type="select" label="Iterate?"> + <option value="no">NO</option> + <option value="yes">YES</option> + </param> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input"/> + </outputs> + <tests> + <test> + <param name="exp" value="1"/> + <param name="input" value="1.bed"/> + <param name="iterate" value="no"/> + <output name="out_file1" file="eq-addvalue.dat"/> + </test> + </tests> + <help> + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* + +----- + +**What it does** + +You can enter any value and it will be added as a new column to your dataset + +----- + +**Example** + +If you original data looks like this:: + + chr1 10 100 geneA + chr2 200 300 geneB + chr2 400 500 geneC + +Typing **+** in the text box will generate:: + + chr1 10 100 geneA + + chr2 200 300 geneB + + chr2 400 500 geneC + + + +You can also add line numbers by selecting **Iterate: YES**. In this case if you enter **1** in the text box you will get:: + + chr1 10 100 geneA 1 + chr2 200 300 geneB 2 + chr2 400 500 geneC 3 + + + +</help> +</tool> |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/gff/extract_GFF_Features.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/gff/extract_GFF_Features.py Mon Apr 30 01:37:51 2018 -0400 |
[ |
@@ -0,0 +1,54 @@ +#!/usr/bin/env python +# Guruprasad Ananda +""" +Extract features from GFF file. + +usage: %prog input1 out_file1 column features +""" +from __future__ import print_function + +import sys + +from bx.cookbook import doc_optparse + + +def stop_err( msg ): + sys.stderr.write( msg ) + sys.exit() + + +def main(): + # Parsing Command Line here + options, args = doc_optparse.parse( __doc__ ) + + try: + inp_file, out_file, column, features = args + except: + stop_err( "One or more arguments is missing or invalid.\nUsage: prog input output column features" ) + try: + column = int( column ) + except: + stop_err( "Column %s is an invalid column." % column ) + + if features is None: + stop_err( "Column %d has no features to display, select another column." % ( column + 1 ) ) + + fo = open( out_file, 'w' ) + for i, line in enumerate( open( inp_file ) ): + line = line.rstrip( '\r\n' ) + if line and line.startswith( '#' ): + # Keep valid comment lines in the output + fo.write( "%s\n" % line ) + else: + try: + if line.split( '\t' )[column] in features.split( ',' ): + fo.write( "%s\n" % line ) + except: + pass + fo.close() + + print('Column %d features: %s' % ( column + 1, features )) + + +if __name__ == "__main__": + main() |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/gff/extract_GFF_Features.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/gff/extract_GFF_Features.xml Mon Apr 30 01:37:51 2018 -0400 |
b |
@@ -0,0 +1,114 @@ +<tool id="Extract_features1" name="Extract features" version="1.0.0"> + <description>from GFF data</description> + <command interpreter="python">extract_GFF_Features.py $input1 $out_file1 ${column_choice.col} ${column_choice.feature}</command> + <inputs> + <param format="gff" name="input1" type="data" label="Select GFF data"/> + <conditional name="column_choice"> + <param name="col" type="select" label="From"> + <option value="0" selected="true">Column 1 / Sequence name</option> + <option value="1">Column 2 / Source</option> + <option value="2">Column 3 / Feature</option> + <option value="6">Column 7 / Strand</option> + <option value="7">Column 8 / Frame</option> + </param> + <when value="0"> + <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple columns"> + <options from_dataset="input1"> + <column name="name" index="0"/> + <column name="value" index="0"/> + <filter type="unique_value" name="unique" column="0"/> + </options> + </param> + </when> + <when value="1"> + <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple columns"> + <options from_dataset="input1"> + <column name="name" index="1"/> + <column name="value" index="1"/> + <filter type="unique_value" name="unique" column="1"/> + </options> + </param> + </when> + <when value="2"> + <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple columns"> + <options from_dataset="input1"> + <column name="name" index="2"/> + <column name="value" index="2"/> + <filter type="unique_value" name="unique" column="2"/> + </options> + </param> + </when> + <when value="6"> + <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple columns"> + <options from_dataset="input1"> + <column name="name" index="6"/> + <column name="value" index="6"/> + <filter type="unique_value" name="unique" column="6"/> + </options> + </param> + </when> + <when value="7"> + <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple columns"> + <options from_dataset="input1"> + <column name="name" index="7"/> + <column name="value" index="7"/> + <filter type="unique_value" name="unique" column="7"/> + </options> + </param> + </when> + </conditional> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input1"/> + </outputs> + <tests> + <test> + <param name="input1" value="5.gff"/> + <param name="col" value="0" /> + <param name="feature" value="chr5,chr6,chr7,chr8" /> + <output name="out_file1" file="Extract_features1_out.gff"/> + </test> + </tests> + <help> + +**What it does** + +This tool extracts selected features from GFF data. + +----- + +**Example** + +Selecting **promoter** from the following GFF data:: + + chr22 GeneA enhancer 10000000 10001000 500 + . TGA + chr22 GeneA promoter 10010000 10010100 900 + . TGA + chr22 GeneB promoter 10020000 10025000 400 - . TGB + chr22 GeneB CCDS2220 10030000 10065000 800 - . TGB + +will produce the following output:: + + chr22 GeneA promoter 10010000 10010100 900 + . TGA + chr22 GeneB promoter 10020000 10025000 400 - . TGB + +---- + +.. 
class:: infomark + +**About formats** + +**GFF format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields:: + + 1. seqname - Must be a chromosome or scaffold. + 2. source - The program that generated this feature. + 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. end - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. group - All lines with the same group are linked together into a single item. + + + </help> +</tool> |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/gff/gff_filter_by_attribute.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/gff/gff_filter_by_attribute.py Mon Apr 30 01:37:51 2018 -0400 |
[ |
b'@@ -0,0 +1,308 @@\n+#!/usr/bin/env python\n+# This tool takes a gff file as input and creates filters on attributes based on certain properties.\n+# The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.\n+# TODO: much of this code is copied from the Filter1 tool (filtering.py in tools/stats/). The commonalities should be\n+# abstracted and leveraged in each filtering tool.\n+from __future__ import division, print_function\n+\n+import sys\n+\n+from ast import Module, parse, walk\n+from json import loads\n+\n+AST_NODE_TYPE_WHITELIST = [\n+ \'Expr\', \'Load\', \'Str\', \'Num\', \'BoolOp\', \'Compare\', \'And\', \'Eq\', \'NotEq\',\n+ \'Or\', \'GtE\', \'LtE\', \'Lt\', \'Gt\', \'BinOp\', \'Add\', \'Div\', \'Sub\', \'Mult\', \'Mod\',\n+ \'Pow\', \'LShift\', \'GShift\', \'BitAnd\', \'BitOr\', \'BitXor\', \'UnaryOp\', \'Invert\',\n+ \'Not\', \'NotIn\', \'In\', \'Is\', \'IsNot\', \'List\', \'Index\', \'Subscript\',\n+ \'Name\',\n+]\n+\n+\n+BUILTIN_AND_MATH_FUNCTIONS = \'abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor\'.split(\'|\')\n+STRING_AND_LIST_METHODS = [ name for name in dir(\'\') + dir([]) if not name.startswith(\'_\') ]\n+VALID_FUNCTIONS = BUILTIN_AND_MATH_FUNCTIONS + STRING_AND_LIST_METHODS\n+# Name blacklist isn\'t strictly needed - but provides extra peace of mind.\n+NAME_BLACKLIST = ["exec", "eval", "globals", "locals", "__import__", "__builtins__"]\n+\n+\n+def __check_name( ast_node ):\n+ name = ast_node.id\n+ return name not in NAME_BLACKLIST\n+\n+\n+def check_simple_name( text ):\n+ """\n+\n+ >>> check_simple_name("col_name")\n+ True\n+ >>> check_simple_name("c1==\'chr1\' and c3-c2>=2000 and c6==\'+\'")\n+ False\n+ >>> check_simple_name("eval(\'1+1\')")\n+ False\n+ >>> check_simple_name("import sys")\n+ False\n+ >>> check_simple_name("[].__str__")\n+ False\n+ >>> check_simple_name("__builtins__")\n+ False\n+ >>> check_simple_name("\'x\' in globals")\n+ False\n+ >>> check_simple_name("\'x\' in [1,2,3]")\n+ False\n+ >>> check_simple_name("c3==\'chr1\' and c5>5")\n+ False\n+ >>> check_simple_name("c3==\'chr1\' and d5>5")\n+ False\n+ >>> check_simple_name("c3==\'chr1\' and c5>5 or exec")\n+ False\n+ >>> check_simple_name("type(c1) != type(1)")\n+ False\n+ >>> check_simple_name("c1.split(\',\')[1] == \'1\'")\n+ False\n+ >>> check_simple_name("exec 1")\n+ False\n+ >>> check_simple_name("str(c2) in [\\\\\\"a\\\\\\",\\\\\\"b\\\\\\"]")\n+ False\n+ >>> check_simple_name("__import__(\'os\').system(\'touch /tmp/OOPS\')")\n+ False\n+ """\n+ try:\n+ module = parse( text )\n+ except SyntaxError:\n+ return False\n+\n+ if not isinstance(module, Module):\n+ return False\n+ statements = module.body\n+ if not len( statements ) == 1:\n+ return False\n+ expression = statements[0]\n+ if expression.__class__.__name__ != \'Expr\':\n+ return False\n+\n+ for ast_node in walk( expression ):\n+ ast_node_class = ast_node.__class__.__name__\n+ if ast_node_class not in ["Expr", "Name", "Load"]:\n+ return False\n+\n+ if ast_node_class == "Name" and not __check_name(ast_node):\n+ return False\n+\n+ return True\n+\n+\n+def check_expression( text ):\n+ """\n+\n+ >>> check_expression("c1==\'chr1\' and c3-c2>=2000 and c6==\'+\'")\n+ True\n+ >>> check_expression("eval(\'1+1\')")\n+ False\n+ >>> check_expression("import sys")\n+ False\n+ >>> check_expression("[].__str__")\n+ False\n+ >>> check_expression("__builtins__")\n+ False\n+ >>> 
check_expression("\'x\' in globals")\n+ False\n+ >>> check_expression("\'x\' in [1,2,3]")\n+ True\n+ >>> check_expression("c3==\'chr1\' and c5>5")\n+ True\n+ >>> check_expression("c3==\'chr1\' and d5>5")\n+ True\n+ >>> check_expression("c3==\'chr1\' and c5>5 or exec")\n+ False\n+ >>> check_expression("type(c1) != type(1)")\n+ False\n+ >>> check_expression("c1.split(\',\')[1] == \'1\'")\n+ False\n+ >>> check_expression("'..b'#\n+# Process inputs.\n+#\n+in_fname = sys.argv[1]\n+out_fname = sys.argv[2]\n+cond_text = sys.argv[3]\n+attribute_types = loads( sys.argv[4] )\n+\n+# Convert types from str to type objects.\n+for name, a_type in attribute_types.items():\n+ check_for_executable(a_type)\n+ if not check_simple_name( a_type ):\n+ stop_err("Problem with attribute type [%s]" % a_type)\n+ attribute_types[ name ] = eval( a_type )\n+\n+# Unescape if input has been escaped\n+mapped_str = {\n+ \'__lt__\': \'<\',\n+ \'__le__\': \'<=\',\n+ \'__eq__\': \'==\',\n+ \'__ne__\': \'!=\',\n+ \'__gt__\': \'>\',\n+ \'__ge__\': \'>=\',\n+ \'__sq__\': \'\\\'\',\n+ \'__dq__\': \'"\',\n+}\n+for key, value in mapped_str.items():\n+ cond_text = cond_text.replace( key, value )\n+\n+# Attempt to determine if the condition includes executable stuff and, if so, exit.\n+check_for_executable( cond_text, \'condition\')\n+\n+if not check_expression(cond_text):\n+ stop_err( "Illegal/invalid in condition \'%s\'" % ( cond_text ) )\n+\n+# Prepare the column variable names and wrappers for column data types. Only\n+# prepare columns up to largest column in condition.\n+attrs, type_casts = [], []\n+for name, attr_type in attribute_types.items():\n+ attrs.append( name )\n+ type_cast = "get_value(\'%(name)s\', attribute_types[\'%(name)s\'], attribute_values)" % ( {\'name\': name} )\n+ type_casts.append( type_cast )\n+\n+attr_str = \', \'.join( attrs ) # \'c1, c2, c3, c4\'\n+type_cast_str = \', \'.join( type_casts ) # \'str(c1), int(c2), int(c3), str(c4)\'\n+wrap = "%s = %s" % ( attr_str, type_cast_str )\n+\n+# Stats\n+skipped_lines = 0\n+first_invalid_line = 0\n+invalid_line = None\n+lines_kept = 0\n+total_lines = 0\n+out = open( out_fname, \'wt\' )\n+\n+\n+# Helper function to safely get and type cast a value in a dict.\n+def get_value(name, a_type, values_dict):\n+ if name in values_dict:\n+ return (a_type)(values_dict[ name ])\n+ else:\n+ return None\n+\n+\n+# Read and filter input file, skipping invalid lines\n+code = \'\'\'\n+for i, line in enumerate( open( in_fname ) ):\n+ total_lines += 1\n+ line = line.rstrip( \'\\\\r\\\\n\' )\n+ if not line or line.startswith( \'#\' ):\n+ skipped_lines += 1\n+ if not invalid_line:\n+ first_invalid_line = i + 1\n+ invalid_line = line\n+ continue\n+ try:\n+ # Place attribute values into variables with attribute\n+ # name; type casting is done as well.\n+ elems = line.split( \'\\t\' )\n+ attribute_values = {}\n+ for name_value_pair in elems[8].split(";"):\n+ pair = name_value_pair.strip().split(" ")\n+ if pair == \'\':\n+ continue\n+ name = pair[0].strip()\n+ if name == \'\':\n+ continue\n+ # Need to strip double quote from value and typecast.\n+ attribute_values[name] = pair[1].strip(" \\\\"")\n+ %s\n+ if %s:\n+ lines_kept += 1\n+ print( line, file=out )\n+ except Exception as e:\n+ print( e )\n+ skipped_lines += 1\n+ if not invalid_line:\n+ first_invalid_line = i + 1\n+ invalid_line = line\n+\'\'\' % ( wrap, cond_text )\n+\n+valid_filter = True\n+try:\n+ exec(code)\n+except Exception as e:\n+ out.close()\n+ if str( e ).startswith( \'invalid syntax\' ):\n+ valid_filter = False\n+ 
stop_err( \'Filter condition "%s" likely invalid. See tool tips, syntax and examples.\' % cond_text )\n+ else:\n+ stop_err( str( e ) )\n+\n+if valid_filter:\n+ out.close()\n+ valid_lines = total_lines - skipped_lines\n+ print(\'Filtering with %s, \' % ( cond_text ))\n+ if valid_lines > 0:\n+ print(\'kept %4.2f%% of %d lines.\' % ( 100.0 * lines_kept / valid_lines, total_lines ))\n+ else:\n+ print(\'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.\' % cond_text)\n+ if skipped_lines > 0:\n+ print(\'Skipped %d invalid lines starting at line #%d: "%s"\' % ( skipped_lines, first_invalid_line, invalid_line ))\n' |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/gff/gff_filter_by_attribute.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/gff/gff_filter_by_attribute.xml Mon Apr 30 01:37:51 2018 -0400 |
b |
@@ -0,0 +1,54 @@ +<tool id="gff_filter_by_attribute" name="Filter GFF data by attribute" version="0.1.1"> + <description>using simple expressions</description> + <command interpreter="python"> + gff_filter_by_attribute.py $input $out_file1 "$cond" '${input.metadata.attribute_types}' + </command> + <inputs> + <param format="gff" name="input" type="data" label="Filter" help="Dataset missing? See TIP below."/> + <param name="cond" size="40" type="text" value="gene_id=='uc002loc.1'" label="With following condition" help="Double equal signs, ==, must be used as shown above. To filter for an arbitrary string, use the Select tool."> + <validator type="empty_field" message="Enter a valid filtering condition, see syntax and examples below."/> + </param> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input"/> + </outputs> + <tests> + <test> + <param name="input" value="gff_filter_attr_in1.gff"/> + <param name="cond" value="conf_lo>0"/> + <output name="out_file1" file="gff_filter_by_attribute_out1.gff"/> + </test> + <test> + <param name="input" value="gff_filter_attr_in1.gff"/> + <param name="cond" value="conf_lo==0 or conf_hi>125"/> + <output name="out_file1" file="gff_filter_by_attribute_out2.gff"/> + </test> + </tests> + + <help> + +.. class:: warningmark + +Double equal signs, ==, must be used as *"equal to"* (e.g., **c1 == 'chr22'**) + +.. class:: infomark + +**TIP:** Attempting to apply a filtering condition may throw exceptions if the data type (e.g., string, integer) in every line of the attribute being filtered is not appropriate for the condition (e.g., attempting certain numerical calculations on strings). If an exception is thrown when applying the condition to a line, that line is skipped as invalid for the filter condition. The number of invalid skipped lines is documented in the resulting history item as a "Condition/data issue". + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* + +----- + +**Syntax** + +The filter tool allows you to restrict the dataset using simple conditional statements. + +- Make sure that multi-character operators contain no white space ( e.g., **<=** is valid while **< =** is not valid ) +- When using 'equal-to' operator **double equal sign '==' must be used** ( e.g., **attribute_name=='chr1'** ) +- Non-numerical values must be included in single or double quotes ( e.g., **attribute_name=='XX22'** ) +- You can combine multiple conditional statements using **and** or **or** ( e.g., **attribute_name=='XX22' or attribute_name=='XX21'** ) + +</help> +</tool> |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/gff/gff_filter_by_feature_count.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/gff/gff_filter_by_feature_count.py Mon Apr 30 01:37:51 2018 -0400 |
[ |
@@ -0,0 +1,183 @@ +#!/usr/bin/env python +""" +Filter a gff file using a criterion based on feature counts for a transcript. + +Usage: +%prog input_name output_name feature_name condition +""" +from __future__ import print_function + +import sys + +from ast import Module, parse, walk + +from bx.intervals.io import GenomicInterval + +from galaxy.datatypes.util.gff_util import GFFReaderWrapper + +AST_NODE_TYPE_WHITELIST = [ + 'Expr', 'Load', 'Str', 'Num', 'BoolOp', 'Compare', 'And', 'Eq', 'NotEq', + 'Or', 'GtE', 'LtE', 'Lt', 'Gt', 'BinOp', 'Add', 'Div', 'Sub', 'Mult', 'Mod', + 'Pow', 'LShift', 'GShift', 'BitAnd', 'BitOr', 'BitXor', 'UnaryOp', 'Invert', + 'Not', 'NotIn', 'In', 'Is', 'IsNot', 'List', 'Index', 'Subscript', + 'Name', +] + + +BUILTIN_AND_MATH_FUNCTIONS = 'abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor'.split('|') +STRING_AND_LIST_METHODS = [ name for name in dir('') + dir([]) if not name.startswith('_') ] +VALID_FUNCTIONS = BUILTIN_AND_MATH_FUNCTIONS + STRING_AND_LIST_METHODS +# Name blacklist isn't strictly needed - but provides extra peace of mind. +NAME_BLACKLIST = ["exec", "eval", "globals", "locals", "__import__", "__builtins__"] + + +def __check_name( ast_node ): + name = ast_node.id + return name not in NAME_BLACKLIST + + +def check_expression( text ): + """ + + >>> check_expression("c1=='chr1' and c3-c2>=2000 and c6=='+'") + True + >>> check_expression("eval('1+1')") + False + >>> check_expression("import sys") + False + >>> check_expression("[].__str__") + False + >>> check_expression("__builtins__") + False + >>> check_expression("'x' in globals") + False + >>> check_expression("'x' in [1,2,3]") + True + >>> check_expression("c3=='chr1' and c5>5") + True + >>> check_expression("c3=='chr1' and d5>5") + True + >>> check_expression("c3=='chr1' and c5>5 or exec") + False + >>> check_expression("type(c1) != type(1)") + False + >>> check_expression("c1.split(',')[1] == '1'") + False + >>> check_expression("exec 1") + False + >>> check_expression("str(c2) in [\\\"a\\\",\\\"b\\\"]") + False + >>> check_expression("__import__('os').system('touch /tmp/OOPS')") + False + """ + try: + module = parse( text ) + except SyntaxError: + return False + + if not isinstance(module, Module): + return False + statements = module.body + if not len( statements ) == 1: + return False + expression = statements[0] + if expression.__class__.__name__ != 'Expr': + return False + + for ast_node in walk( expression ): + ast_node_class = ast_node.__class__.__name__ + + # Toss out everything that is not a "simple" expression, + # imports, error handling, etc... + if ast_node_class not in AST_NODE_TYPE_WHITELIST: + return False + + if ast_node_class == "Name" and not __check_name(ast_node): + return False + + return True + + +# Valid operators, ordered so that complex operators (e.g. '>=') are +# recognized before simple operators (e.g. '>') +ops = [ + '>=', + '<=', + '<', + '>', + '==', + '!=' +] + +# Escape sequences for valid operators. +mapped_ops = { + '__ge__': ops[0], + '__le__': ops[1], + '__lt__': ops[2], + '__gt__': ops[3], + '__eq__': ops[4], + '__ne__': ops[5], +} + + +def __main__(): + # Get args. + input_name = sys.argv[1] + output_name = sys.argv[2] + feature_name = sys.argv[3] + condition = sys.argv[4] + + # Unescape operations in condition str. 
+ for key, value in mapped_ops.items(): + condition = condition.replace( key, value ) + + # Error checking: condition should be of the form <operator><number> + for op in ops: + if op in condition: + empty, number_str = condition.split( op ) + try: + number = float( number_str ) + except: + number = None + if empty != "" or not number: + print("Invalid condition: %s, cannot filter." % condition, file=sys.stderr) + return + break + + # Do filtering. + kept_features = 0 + skipped_lines = 0 + first_skipped_line = 0 + out = open( output_name, 'w' ) + for i, feature in enumerate( GFFReaderWrapper( open( input_name ) ) ): + if not isinstance( feature, GenomicInterval ): + continue + count = 0 + for interval in feature.intervals: + if interval.feature == feature_name: + count += 1 + eval_text = '%s %s' % ( count, condition ) + if not check_expression(eval_text): + print("Invalid condition: %s, cannot filter." % condition, file=sys.stderr) + sys.exit(1) + + if eval(eval_text): + # Keep feature. + for interval in feature.intervals: + out.write( "\t".join(interval.fields) + '\n' ) + kept_features += 1 + + # Needed because i is 0-based but want to display stats using 1-based. + i += 1 + + # Clean up. + out.close() + info_msg = "%i of %i features kept (%.2f%%) using condition %s. " % \ + ( kept_features, i, float(kept_features) / i * 100.0, feature_name + condition ) + if skipped_lines > 0: + info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % ( skipped_lines, first_skipped_line ) + print(info_msg) + + +if __name__ == "__main__": + __main__() |
b |
diff -r 000000000000 -r 7621d36a4e9c filters/gff/gff_filter_by_feature_count.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/gff/gff_filter_by_feature_count.xml Mon Apr 30 01:37:51 2018 -0400 |
b |
@@ -0,0 +1,53 @@ +<tool id="gff_filter_by_feature_count" name="Filter GFF data by feature count" version="0.1.1"> + <description>using simple expressions</description> + <command interpreter="python"> + gff_filter_by_feature_count.py $input_file1 $out_file1 "$feature_name" "$cond" + </command> + <inputs> + <param format="gff" name="input_file1" type="data" label="Filter"/> + <param name="feature_name" type="select" label="Using feature name"> + <options from_dataset="input_file1"> + <column name="name" index="2"/> + <column name="value" index="2"/> + <filter type="unique_value" name="unique" column="2"/> + </options> + </param> + <param name="cond" size="40" type="text" value=">0" label="With following condition"> + <validator type="empty_field" message="Enter a valid filtering condition, see syntax and examples below."/> + </param> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input_file1"/> + </outputs> + <tests> + <!-- Test GTF filtering. --> + <test> + <param name="input_file1" value="gops_subtract_in1.gff"/> + <param name="feature_name" value="exon"/> + <param name="cond" value=">1"/> + <output name="out_file1" file="gff_filter_by_feature_count_out1.gff"/> + </test> + <!-- Test GFF3 filtering. --> + <test> + <param name="input_file1" value="5.gff3"/> + <param name="feature_name" value="HSP"/> + <param name="cond" value=">=5"/> + <output name="out_file1" file="gff_filter_by_feature_count_out2.gff"/> + </test> + </tests> + + <help> + + +.. class:: infomark + +Valid comparison operators are: > < >=, <=, !=, and == + +----- + +**Syntax** + +The filter tool allows you to restrict the dataset based on transcripts' feature counts. + +</help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/gff/gtf_filter_by_attribute_values_list.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/gff/gtf_filter_by_attribute_values_list.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,71 @@ +# +# Filters a GTF/GFF file using a list of attribute values. Attribute values must +# be in the first column of the ids file; subsequent columns are ignored. +# Usage: +# python gtf_filter_by_attribute_values_list.py <gff_file> <attribute_name> <ids_file> <output_file> +# +from __future__ import print_function + +import sys + + +def parse_gff_attributes( attr_str ): + """ + Parses a GFF/GTF attribute string and returns a dictionary of name-value + pairs. The general format for a GFF3 attributes string is + name1=value1;name2=value2 + The general format for a GTF attribute string is + name1 "value1" ; name2 "value2" + The general format for a GFF attribute string is a single string that + denotes the interval's group; in this case, this method returns a dictionary + with a single key-value pair, and the key name is 'group' + """ + attributes_list = attr_str.split(";") + attributes = {} + for name_value_pair in attributes_list: + # Try splitting by space and, if necessary, by '=' sign. + pair = name_value_pair.strip().split(" ") + if len( pair ) == 1: + pair = name_value_pair.strip().split("=") + if len( pair ) == 1: + # Could not split for some reason -- raise exception? + continue + if pair == '': + continue + name = pair[0].strip() + if name == '': + continue + # Need to strip double quote from values + value = pair[1].strip(" \"") + attributes[ name ] = value + + if len( attributes ) == 0: + # Could not split attributes string, so entire string must be + # 'group' attribute. This is the case for strictly GFF files. + attributes['group'] = attr_str + return attributes + + +def gff_filter( gff_file, attribute_name, ids_file, output_file ): + # Put ids in dict for quick lookup. + ids_dict = {} + for line in open( ids_file ): + ids_dict[ line.split('\t')[0].strip() ] = True + + # Filter GFF file using ids. + output = open( output_file, 'w' ) + for line in open( gff_file ): + fields = line.split( '\t' ) + attributes = parse_gff_attributes( fields[8] ) + if ( attribute_name in attributes ) and ( attributes[ attribute_name ] in ids_dict ): + output.write( line ) + output.close() + + +if __name__ == "__main__": + # Handle args. + if len( sys.argv ) != 5: + print("usage: python %s <gff_file> <attribute_name> <ids_file> <output_file>" % sys.argv[0], file=sys.stderr) + sys.exit( -1 ) + gff_file, attribute_name, ids_file, output_file = sys.argv[1:] + gff_filter( gff_file, attribute_name, ids_file, output_file ) |
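Editor's note: since parse_gff_attributes() has to accept three attribute dialects (GTF, GFF3, and plain GFF group strings), the fallback order is easiest to see interactively; the values below are hypothetical::

    # GTF style: space-separated name "value" pairs.
    parse_gff_attributes('gene_id "GeneA"; transcript_id "TrA";')
    # -> {'gene_id': 'GeneA', 'transcript_id': 'TrA'}

    # GFF3 style: '='-separated pairs.
    parse_gff_attributes('ID=mrna0001;Name=sonic')
    # -> {'ID': 'mrna0001', 'Name': 'sonic'}

    # Plain GFF: an unsplittable string becomes the 'group' attribute.
    parse_gff_attributes('GeneA')
    # -> {'group': 'GeneA'}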
diff -r 000000000000 -r 7621d36a4e9c filters/gff/gtf_filter_by_attribute_values_list.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/gff/gtf_filter_by_attribute_values_list.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,42 @@ +<tool id="gtf_filter_by_attribute_values_list" name="Filter GTF data by attribute values_list" version="0.1"> + <description></description> + <command interpreter="python"> + gtf_filter_by_attribute_values_list.py $input $attribute_name $ids $output + </command> + <inputs> + <param format="gtf" name="input" type="data" label="Filter"/> + <param name="attribute_name" type="select" label="Using attribute name"> + <option value="gene_id">gene_id</option> + <option value="transcript_id">transcript_id</option> + <option value="p_id">p_id</option> + <option value="tss_id">tss_id</option> + </param> + <param format="tabular,txt" name="ids" type="data" label="And attribute values"/> + </inputs> + <outputs> + <data format="gtf" name="output"/> + </outputs> + <tests> + <!-- Test filtering with a simple list of values. --> + <test> + <param name="input" value="gops_subtract_in1.gff"/> + <param name="attribute_name" value="gene_id"/> + <param name="ids" value="gtf_filter_by_attribute_values_list_in1.txt"/> + <output name="output" file="gtf_filter_by_attribute_values_list_out1.gtf"/> + </test> + <!-- Test filtering with a more complex tabular file. --> + <test> + <param name="input" value="gtf_filter_by_attribute_values_list_in2.gtf"/> + <param name="attribute_name" value="transcript_id"/> + <param name="ids" value="gtf_filter_by_attribute_values_list_in3.tabular"/> + <output name="output" file="gtf_filter_by_attribute_values_list_out2.gtf"/> + </test> + </tests> + <help> + +This tool filters a GTF file using a list of attribute values. The attribute values are +taken from the first column in the file; additional columns in the file are ignored. An example +use of this tool is to filter a GTF file using a list of transcript_ids or gene_ids obtained from Cuffdiff. + + </help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/gff2bed.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/gff2bed.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,90 @@ +<tool id="gff2bed1" name="GFF-to-BED" version="1.0.1"> + <description>converter</description> + <edam_operations> + <edam_operation>operation_3434</edam_operation> + </edam_operations> + <command interpreter="python">gff_to_bed_converter.py $input $out_file1</command> + <inputs> + <param format="gff" name="input" type="data" label="Convert this dataset"/> + </inputs> + <outputs> + <data format="bed" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input" value="5.gff" ftype="gff"/> + <output name="out_file1" file="gff2bed_out.bed"/> + </test> + <test> + <param name="input" value="gff2bed_in2.gff" ftype="gff"/> + <output name="out_file1" file="gff2bed_out2.bed"/> + </test> + <test> + <!-- Test conversion of gff3 file. --> + <param name="input" value="5.gff3" ftype="gff"/> + <output name="out_file1" file="gff2bed_out3.bed"/> + </test> + </tests> + <help> + +**What it does** + +This tool converts data from GFF format to BED format (scroll down for format description). + +-------- + +**Example** + +The following data in GFF format:: + + chr22 GeneA enhancer 10000000 10001000 500 + . TGA + chr22 GeneA promoter 10010000 10010100 900 + . TGA + +Will be converted to BED (**note** that 1 is subtracted from the start coordinate):: + + chr22 9999999 10001000 enhancer 0 + + chr22 10009999 10010100 promoter 0 + + +------ + +.. class:: infomark + +**About formats** + +**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones: + +The first three BED fields (required) are:: + + 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). + 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) + 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). + +The additional BED fields (optional) are:: + + 4. name - The name of the BED line. + 5. score - A score between 0 and 1000. + 6. strand - Defines the strand - either '+' or '-'. + 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. + 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. + 9. reserved - This should always be set to zero. + 10. blockCount - The number of blocks (exons) in the BED line. + 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. + 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. + 13. expCount - The number of experiments. + 14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount. + 15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount. + +**GFF format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields:: + + 1. seqname - Must be a chromosome or scaffold. + 2. source - The program that generated this feature. + 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". + 4. 
start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. end - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. group - All lines with the same group are linked together into a single item. + +</help> +</tool> |
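Editor's note: the one piece of arithmetic in this conversion is the coordinate shift the note above calls out — GFF is 1-based with inclusive ends, BED is 0-based with exclusive ends, so only the start moves. A one-line illustration with the example's enhancer coordinates::

    gff_start, gff_end = 10000000, 10001000      # GFF: 1-based, end-inclusive
    bed_start, bed_end = gff_start - 1, gff_end  # BED: 0-based, end-exclusive
    assert (bed_start, bed_end) == (9999999, 10001000)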
diff -r 000000000000 -r 7621d36a4e9c filters/gff_to_bed_converter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/gff_to_bed_converter.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,136 @@ +#!/usr/bin/env python +from __future__ import print_function + +import sys + +from galaxy.datatypes.util.gff_util import parse_gff_attributes + + +def get_bed_line( chrom, name, strand, blocks ): + """ Returns a BED line for given data. """ + + if len( blocks ) == 1: + # Use simple BED format if there is only a single block: + # chrom, chromStart, chromEnd, name, score, strand + # + start, end = blocks[0] + return "%s\t%i\t%i\t%s\t0\t%s\n" % ( chrom, start, end, name, strand ) + + # + # Build lists for transcript blocks' starts, sizes. + # + + # Get transcript start, end. + t_start = sys.maxsize + t_end = -1 + for block_start, block_end in blocks: + if block_start < t_start: + t_start = block_start + if block_end > t_end: + t_end = block_end + + # Get block starts, sizes. + block_starts = [] + block_sizes = [] + for block_start, block_end in blocks: + block_starts.append( str( block_start - t_start ) ) + block_sizes.append( str( block_end - block_start ) ) + + # + # Create BED entry. + # Bed format: chrom, chromStart, chromEnd, name, score, strand, \ + # thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts + # + # Render complete feature with thick blocks. There's no clear way to do this unless + # we analyze the block names, but making everything thick makes more sense than + # making everything thin. + # + return "%s\t%i\t%i\t%s\t0\t%s\t%i\t%i\t0\t%i\t%s\t%s\n" % \ + ( chrom, t_start, t_end, name, strand, t_start, t_end, len( block_starts ), + ",".join( block_sizes ), ",".join( block_starts ) ) + + +def __main__(): + input_name = sys.argv[1] + output_name = sys.argv[2] + skipped_lines = 0 + first_skipped_line = 0 + out = open( output_name, 'w' ) + i = 0 + cur_transcript_chrom = None + cur_transcript_id = None + cur_transcript_strand = None + cur_transcripts_blocks = [] # (start, end) for each block. + for i, line in enumerate( open( input_name ) ): + line = line.rstrip( '\r\n' ) + if line and not line.startswith( '#' ): + try: + # GFF format: chrom, source, name, chromStart, chromEnd, score, strand, frame, attributes + elems = line.split( '\t' ) + start = str( int( elems[3] ) - 1 ) + coords = [ int( start ), int( elems[4] ) ] + strand = elems[6] + if strand not in ['+', '-']: + strand = '+' + attributes = parse_gff_attributes( elems[8] ) + t_id = attributes.get( "transcript_id", None ) + + if not t_id: + # + # No transcript ID, so write last transcript and write current line as its own line. + # + + # Write previous transcript. + if cur_transcript_id: + # Write BED entry. + out.write( get_bed_line( cur_transcript_chrom, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) ) + + # Replace any spaces in the name with underscores so UCSC will not complain. + name = elems[2].replace(" ", "_") + out.write( get_bed_line( elems[0], name, strand, [ coords ] ) ) + continue + + # There is a transcript ID, so process line at transcript level. + if t_id == cur_transcript_id: + # Line is element of transcript and will be a block in the BED entry. + cur_transcripts_blocks.append( coords ) + continue + + # + # Line is part of new transcript; write previous transcript and start + # new transcript. + # + + # Write previous transcript. + if cur_transcript_id: + # Write BED entry. + out.write( get_bed_line( cur_transcript_chrom, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) ) + + # Start new transcript. + cur_transcript_chrom = elems[0] + cur_transcript_id = t_id + cur_transcript_strand = strand + cur_transcripts_blocks = [] + cur_transcripts_blocks.append( coords ) + except Exception: + skipped_lines += 1 + if not first_skipped_line: + first_skipped_line = i + 1 + else: + skipped_lines += 1 + if not first_skipped_line: + first_skipped_line = i + 1 + + # Write last transcript. + if cur_transcript_id: + # Write BED entry. + out.write( get_bed_line( cur_transcript_chrom, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) ) + out.close() + info_msg = "%i lines converted to BED. " % ( i + 1 - skipped_lines ) + if skipped_lines > 0: + info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % ( skipped_lines, first_skipped_line ) + print(info_msg) + + +if __name__ == "__main__": + __main__() |
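Editor's note: get_bed_line() turns each transcript's absolute block coordinates into the relative blockSizes/blockStarts that BED12 expects; a minimal sketch of that derivation with hypothetical exon coordinates::

    blocks = [(100, 150), (200, 260), (400, 420)]   # absolute (start, end) per exon

    t_start = min(start for start, end in blocks)   # transcript start: 100
    t_end = max(end for start, end in blocks)       # transcript end: 420

    block_sizes = [end - start for start, end in blocks]        # [50, 60, 20]
    block_starts = [start - t_start for start, end in blocks]   # [0, 100, 300]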
diff -r 000000000000 -r 7621d36a4e9c filters/grep.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/grep.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,137 @@ +# Filename: grep.py +# Author: Ian N. Schenck +# Version: 8/23/2005 +# +# This script accepts regular expressions, as well as an "invert" +# option, and applies the regular expression using grep. This wrapper +# sanitizes its inputs and pipes grep's output to the result file. +# +# Grep is launched based on these inputs: +# -i Input file +# -o Output file +# -pattern RegEx pattern +# -v true or false (output NON-matching lines) +from __future__ import print_function + +import os +import re +import subprocess +import sys +from subprocess import PIPE, Popen +from tempfile import NamedTemporaryFile + + +# This function is exceedingly useful, perhaps package for reuse? +def getopts(argv): + opts = {} + while argv: + if argv[0][0] == '-': + opts[argv[0]] = argv[1] + argv = argv[2:] + else: + argv = argv[1:] + return opts + + +def main(): + args = sys.argv[1:] + + try: + opts = getopts(args) + except IndexError: + print("Usage:") + print(" -i Input file") + print(" -o Output file") + print(" -pattern RegEx pattern") + print(" -v true or false (Invert match)") + return 0 + + outputfile = opts.get("-o") + if outputfile is None: + print("No output file specified.") + return -1 + + inputfile = opts.get("-i") + if inputfile is None: + print("No input file specified.") + return -2 + + invert = opts.get("-v") + if invert is None: + print("Match style (Invert or normal) not specified.") + return -3 + + pattern = opts.get("-pattern") + if pattern is None: + print("RegEx pattern not specified.") + return -4 + + # All inputs have been specified at this point, now validate. + + # Unescape characters that were escaped upstream; these characters + # are allowed in patterns but must travel escaped. + mapped_chars = {'>' : '__gt__', + '<' : '__lt__', + '\'': '__sq__', + '"' : '__dq__', + '[' : '__ob__', + ']' : '__cb__', + '{' : '__oc__', + '}' : '__cc__'} + + # with new sanitizing we only need to replace for single quote, + # but this needs to remain for backwards compatibility + for key, value in mapped_chars.items(): + pattern = pattern.replace(value, key) + + # match filename and invert flag + fileRegEx = re.compile(r"^[A-Za-z0-9./\-_]+$") + invertRegEx = re.compile("(true)|(false)") + + # verify that filename and inversion flag are in the correct format + if not fileRegEx.match(outputfile): + print("Illegal output filename.") + return -5 + if not fileRegEx.match(inputfile): + print("Illegal input filename.") + return -6 + if not invertRegEx.match(invert): + print("Illegal invert option.") + return -7 + + # invert grep search? + if invert == "true": + invertflag = "-v" + print("Not matching pattern: %s" % pattern) + else: + invertflag = "" + print("Matching pattern: %s" % pattern) + + # set version flag + versionflag = "-P" + + # MacOS 10.8.2 does not support -P option for perl-regex anymore + versionmatch = Popen("grep -V | grep 'BSD'", shell=True, stdout=PIPE).communicate()[0] + if versionmatch: + versionflag = "-E" + + # create temp file holding pattern + # by using a file to hold the pattern, we don't have to worry about sanitizing the grep command line and can include single quotes in the pattern + pattern_file_name = NamedTemporaryFile( delete=False ).name # delete=False: keep the file on disk until we unlink it below + open( pattern_file_name, 'w' ).write( pattern ) + + # generate grep command + commandline = "grep %s %s -f %s %s > %s" % ( versionflag, invertflag, pattern_file_name, inputfile, outputfile ) + + # run grep + errorcode = subprocess.call(commandline, shell=True) + + # remove temp pattern file + os.unlink( pattern_file_name ) + + # return error code + return errorcode + + +if __name__ == "__main__": + main() |
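Editor's note: the mapped_chars table mirrors the escape mapping declared in the tool XML's sanitizer — the wrapper receives tokens such as __gt__ and the loop restores the real characters before the pattern is used. A tiny round-trip sketch with a hypothetical pattern (abridged table)::

    mapped_chars = {'>': '__gt__', '<': '__lt__', "'": '__sq__'}

    escaped = 'chr__gt__10'          # what the wrapper receives from Galaxy
    pattern = escaped
    for char, token in mapped_chars.items():
        pattern = pattern.replace(token, char)
    print(pattern)                   # chr>10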
diff -r 000000000000 -r 7621d36a4e9c filters/grep.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/grep.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,82 @@ +<tool id="Grep1" name="Select" version="1.0.1"> + <description>lines that match an expression</description> + <command interpreter="python">grep.py -i $input -o $out_file1 -pattern '$pattern' -v $invert</command> + <inputs> + <param format="txt" name="input" type="data" label="Select lines from"/> + <param name="invert" type="select" label="that"> + <option value="false">Matching</option> + <option value="true">NOT Matching</option> + </param> + <param name="pattern" size="40" type="text" value="^chr([0-9A-Za-z])+" label="the pattern" help="here you can enter text or a regular expression (see the syntax reference in the lower part of this frame)"> + <sanitizer> + <valid initial="string.printable"> + <remove value="'"/> + </valid> + <mapping initial="none"> + <add source="'" target="__sq__"/> + </mapping> + </sanitizer> + </param> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input"/> + </outputs> + <tests> + <test> + <param name="input" value="1.bed"/> + <param name="invert" value="false"/> + <param name="pattern" value="^chr[0-9]*"/> + <output name="out_file1" file="fs-grep.dat"/> + </test> + </tests> + <help> + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* + +----- + +**Syntax** + +The select tool searches the data for lines containing or not containing a match to the given pattern. The pattern may be a regular expression, i.e. a pattern that describes a set of matching strings. + +- **( ) { } [ ] . * ? + \ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for. +- **\\A** matches the beginning of a string (but not an internal line). +- **\\d** matches a digit, same as [0-9]. +- **\\D** matches a non-digit. +- **\\s** matches a whitespace character. +- **\\S** matches anything BUT a whitespace. +- **\\t** matches a tab. +- **\\w** matches an alphanumeric character. +- **\\W** matches anything but an alphanumeric character. +- **(** .. **)** groups a particular pattern. +- **\\Z** matches the end of a string (but not an internal line). +- **{n}**, **{n,}** or **{n,m}** specify an expected number of repetitions of the preceding pattern: + + - **{n}** The preceding item is matched exactly n times. + - **{n,}** The preceding item is matched n or more times. + - **{n,m}** The preceding item is matched at least n times but not more than m times. + +- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**. +- **.** Matches any single character except a newline. +- ***** The preceding item will be matched zero or more times. +- **?** The preceding item is optional and matched at most once. +- **+** The preceding item will be matched one or more times. +- **^** has two meanings: + - matches the beginning of a line or string. + - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets. +- **$** matches the end of a line or string. +- **\|** Separates alternate possibilities. + +----- + +**Example** + +- **^chr([0-9A-Za-z])+** would match lines that begin with chromosomes, such as lines in a BED format file. +- **(ACGT){1,5}** would match at least 1 "ACGT" and at most 5 "ACGT" consecutively. +- **([^,][0-9]{1,3})(,[0-9]{3})\*** would match a large integer that is properly separated with commas such as 23,078,651. 
+- **(abc)|(def)** would match either "abc" or "def". +- **^\\W+#** would match any line that is a comment. +</help> +</tool> |
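Editor's note: the example patterns in the help can be tried directly with Python's re module, whose syntax is close to the flavors the wrapper invokes (grep -P or -E)::

    import re

    assert re.match(r'^chr([0-9A-Za-z])+', 'chr22\t100\t200')
    assert re.search(r'(ACGT){1,5}', 'xxACGTACGTxx')
    assert re.match(r'(abc)|(def)', 'defACGT')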
diff -r 000000000000 -r 7621d36a4e9c filters/gtf2bedgraph.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/gtf2bedgraph.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,84 @@ +<tool id="gtf2bedgraph" name="GTF-to-BEDGraph" version="1.0.0"> + <description>converter</description> + <edam_operations> + <edam_operation>operation_3434</edam_operation> + </edam_operations> + <command interpreter="python">gtf_to_bedgraph_converter.py $input $out_file1 $attribute_name</command> + <inputs> + <param format="gtf" name="input" type="data" label="Convert this query"/> + <param name="attribute_name" type="text" label="Attribute to Use for Value"> + <validator type="empty_field" /> + </param> + </inputs> + <outputs> + <data format="bedgraph" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input" value="gtf2bedgraph_in.gtf" ftype="gtf"/> + <param name="attribute_name" value="FPKM"/> + <output name="out_file1" file="gtf2bedgraph_out.bedgraph" ftype="bedgraph"/> + </test> + </tests> + <help> + +**What it does** + +This tool converts data from GTF format to BEDGraph format (scroll down for format description). + +-------- + +**Example** + +The following data in GTF format:: + + chr22 GeneA enhancer 10000000 10001000 500 + . gene_id "GeneA"; transcript_id "TranscriptAlpha"; FPKM "2.75"; frac "1.000000"; + chr22 GeneA promoter 10010000 10010100 900 + . gene_id "GeneA"; transcript_id "TranscriptAlpha"; FPKM "2.25"; frac "1.000000"; + +using the attribute name 'FPKM' will be converted to BEDGraph (**note** that 1 is subtracted from the start coordinate):: + + + chr22 9999999 10001000 2.75 + chr22 10009999 10010100 2.25 + +------ + +.. class:: infomark + +**About formats** + +**GTF format** Gene Transfer Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GTF lines have nine tab-separated fields:: + + 1. seqname - Must be a chromosome or scaffold. + 2. source - The program that generated this feature. + 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. end - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). + 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. + 9. group - The group field is a list of attributes. Each attribute consists of a type/value pair. Attributes must end in a semi-colon, and be separated from any following attribute by exactly one space. The attribute list must begin with the two mandatory attributes: (i) gene_id value - A globally unique identifier for the genomic source of the sequence and (ii) transcript_id value - A globally unique identifier for the predicted transcript. + +**BEDGraph format** + +The bedGraph format is line-oriented. bedGraph data are preceded by a track definition line, which adds a number of options for controlling the default display of this track. + +For the track definition line, all options are placed in a single line separated by spaces:: + + track type=bedGraph name=track_label description=center_label + visibility=display_mode color=r,g,b altColor=r,g,b + priority=priority autoScale=on|off alwaysZero=on|off + gridDefault=on|off maxHeightPixels=max:default:min + graphType=bar|points viewLimits=lower:upper + yLineMark=real-value yLineOnOff=on|off + windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16 + +The track type is REQUIRED, and must be bedGraph:: + + type=bedGraph + +Following the track definition line are the track data in four column BED format:: + + chromA chromStartA chromEndA dataValueA + chromB chromStartB chromEndB dataValueB + +</help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/gtf_to_bedgraph_converter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/gtf_to_bedgraph_converter.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,87 @@ +#!/usr/bin/env python +from __future__ import print_function + +import os +import sys +import tempfile + +assert sys.version_info[:2] >= ( 2, 4 ) + + +def __main__(): + # Read params. + input_name = sys.argv[1] + output_name = sys.argv[2] + attribute_name = sys.argv[3] + + # Create temp files. + tmp_name1 = tempfile.NamedTemporaryFile().name + tmp_name2 = tempfile.NamedTemporaryFile().name + + # Do conversion. + skipped_lines = 0 + first_skipped_line = 0 + out = open( tmp_name1, 'w' ) + + # Write track data to temporary file. + i = 0 + for i, line in enumerate( open( input_name ) ): + line = line.rstrip( '\r\n' ) + + if line and not line.startswith( '#' ): + try: + elems = line.split( '\t' ) + start = str( int( elems[3] ) - 1 ) # GTF coordinates are 1-based, BedGraph are 0-based. + strand = elems[6] # Normalized for consistency but unused: BedGraph output has no strand column. + if strand not in ['+', '-']: + strand = '+' + attributes_list = elems[8].split(";") + attributes = {} + for name_value_pair in attributes_list: + pair = name_value_pair.strip().split(" ") + name = pair[0].strip() + if name == '': + continue + # Need to strip double quote from values + value = pair[1].strip(" \"") + attributes[name] = value + value = attributes[ attribute_name ] + # GTF format: chrom, source, name, chromStart, chromEnd, score, strand, frame, attributes. + # BedGraph format: chrom, chromStart, chromEnd, value + out.write( "%s\t%s\t%s\t%s\n" % ( elems[0], start, elems[4], value ) ) + except Exception: + skipped_lines += 1 + if not first_skipped_line: + first_skipped_line = i + 1 + else: + skipped_lines += 1 + if not first_skipped_line: + first_skipped_line = i + 1 + out.close() + + # Sort tmp file by chromosome name and chromosome start to create ordered track data. + cmd = "sort -k1,1 -k2,2n < %s > %s" % ( tmp_name1, tmp_name2 ) + try: + os.system(cmd) + os.remove(tmp_name1) + except Exception as ex: + sys.stderr.write( "%s\n" % ex ) + sys.exit(1) + + # Create bedgraph file by combining track definition with ordered track data. + cmd = "echo 'track type=bedGraph' | cat - %s > %s " % ( tmp_name2, output_name ) + try: + os.system(cmd) + os.remove(tmp_name2) + except Exception as ex: + sys.stderr.write( "%s\n" % ex ) + sys.exit(1) + + info_msg = "%i lines converted to BEDGraph. " % ( i + 1 - skipped_lines ) + if skipped_lines > 0: + info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % ( skipped_lines, first_skipped_line ) + print(info_msg) + + +if __name__ == "__main__": + __main__() |
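Editor's note: stripped of the error handling, the per-line transform above only shifts the start coordinate and pulls one attribute value; a compact sketch with a hypothetical GTF line::

    line = 'chr22\tCuff\ttranscript\t10000000\t10001000\t500\t+\t.\tgene_id "GeneA"; FPKM "2.75";'

    elems = line.split('\t')
    attrs = dict(p.strip().split(' ', 1) for p in elems[8].split(';') if p.strip())
    value = attrs['FPKM'].strip(' "')                  # '2.75'

    print('%s\t%d\t%s\t%s' % (elems[0], int(elems[3]) - 1, elems[4], value))
    # chr22  9999999  10001000  2.75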
diff -r 000000000000 -r 7621d36a4e9c filters/headWrapper.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/headWrapper.pl Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,19 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; + +# a wrapper for head for use in galaxy +# headWrapper.pl [filename] [# lines to show] [output] + +die "Check arguments" unless @ARGV == 3; +die "Line number must be an integer\n" unless $ARGV[1]=~ m/^\d+$/; + +open (OUT, ">$ARGV[2]") or die "Cannot create $ARGV[2]:$!\n"; +open (HEAD, "head -n $ARGV[1] $ARGV[0]|") or die "Cannot run head:$!\n"; +while (<HEAD>) { + print OUT; +} +close OUT; +close HEAD; + |
diff -r 000000000000 -r 7621d36a4e9c filters/headWrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/headWrapper.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,42 @@ +<tool id="Show beginning1" name="Select first" version="1.0.0"> + <description>lines from a dataset</description> + <command interpreter="perl">headWrapper.pl $input $lineNum $out_file1</command> + <inputs> + <param name="lineNum" size="5" type="integer" value="10" label="Select first" help="lines"/> + <param format="txt" name="input" type="data" label="from"/> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input"/> + </outputs> + <tests> + <test> + <param name="lineNum" value="10"/> + <param name="input" value="1.bed"/> + <output name="out_file1" file="eq-showbeginning.dat"/> + </test> + </tests> + <help> + +**What it does** + +This tool outputs a specified number of lines from the **beginning** of a dataset + +----- + +**Example** + +Selecting 2 lines from this:: + + chr7 56632 56652 D17003_CTCF_R6 310 + + chr7 56736 56756 D17003_CTCF_R7 354 + + chr7 56761 56781 D17003_CTCF_R4 220 + + chr7 56772 56792 D17003_CTCF_R7 372 + + chr7 56775 56795 D17003_CTCF_R4 207 + + +will produce:: + + chr7 56632 56652 D17003_CTCF_R6 310 + + chr7 56736 56756 D17003_CTCF_R7 354 + + + </help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/join.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/join.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,390 @@ +#!/usr/bin/env python +# Dan Blankenberg +""" +Script to Join Two Files on specified columns. + +Takes two tab delimited files, two column numbers (base 1) and outputs a new tab delimited file with lines joined by tabs. +User can also opt to have non-joining rows of file1 echoed. +""" +from __future__ import print_function + +import json +import optparse +import os +import struct +import sys +import tempfile + +from galaxy.util import stringify_dictionary_keys +from galaxy.util.bunch import Bunch + + +class OffsetList: + def __init__( self, filesize=0, fmt=None ): + self.file = tempfile.NamedTemporaryFile( 'w+b' ) + if fmt: + self.fmt = fmt + elif filesize and filesize <= sys.maxsize * 2: + self.fmt = 'I' + else: + self.fmt = 'Q' + self.fmt_size = struct.calcsize( self.fmt ) + + @property + def size( self ): + self.file.flush() + return self.file_size / self.fmt_size + + @property + def file_size( self ): + self.file.flush() + return os.stat( self.file.name ).st_size + + def add_offset( self, offset ): + if not isinstance( offset, list ): + offset = [offset] + self.file.seek( self.file_size ) + for off in offset: + self.file.write( struct.pack( self.fmt, off ) ) + + def get_offsets( self, start=0 ): + self.file.seek( start * self.fmt_size ) + while True: + packed = self.file.read( self.fmt_size ) + if not packed: + break + yield struct.unpack( self.fmt, packed )[0] + + def get_offset_by_index( self, index ): + self.file.seek( index * self.fmt_size ) + return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0] + + def set_offset_at_index( self, index, offset ): + if not isinstance( offset, list ): + offset = [offset] + if index >= self.size: + self.add_offset( offset ) + else: + temp_file = tempfile.NamedTemporaryFile( 'w+b' ) + self.file.seek( 0 ) + temp_file.write( self.file.read( ( index ) * self.fmt_size ) ) + for off in offset: + temp_file.write( struct.pack( self.fmt, off ) ) + temp_file.write( self.file.read() ) + self.file = temp_file + + +class SortedOffsets( OffsetList ): + def __init__( self, indexed_filename, column, split=None ): + OffsetList.__init__( self, os.stat( indexed_filename ).st_size ) + self.indexed_filename = indexed_filename + self.indexed_file = open( indexed_filename, 'rb' ) + self.column = column + self.split = split + self.last_identifier = None + self.last_identifier_merged = None + self.last_offset_merged = 0 + + def merge_with_dict( self, new_offset_dict ): + if not new_offset_dict: + return # no items to merge in + keys = list(new_offset_dict.keys()) + keys.sort() + identifier2 = keys.pop( 0 ) + + result_offsets = OffsetList( fmt=self.fmt ) + offsets1 = enumerate( self.get_offsets() ) + try: + index1, offset1 = next(offsets1) + identifier1 = self.get_identifier_by_offset( offset1 ) + except StopIteration: + offset1 = None + identifier1 = None + index1 = 0 + + while True: + if identifier1 is None and identifier2 is None: + self.file = result_offsets.file # self is now merged results + return + elif identifier1 is None or ( identifier2 and identifier2 < identifier1 ): + result_offsets.add_offset( new_offset_dict[identifier2] ) + if keys: + identifier2 = keys.pop( 0 ) + else: + identifier2 = None + elif identifier2 is None: + result_offsets.file.seek( resul [...] index = BufferedIndex( filename2, column2, split, buffer, index_depth ) + for line1 in open( filename1, 'rb' ): + identifier = get_identifier_by_line( line1, column1, split ) + if identifier: + written = False + for line2 in index.get_lines_by_identifier( identifier ): + if not fill_options.fill_unjoined_only: + out.write( "%s%s%s\n" % ( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ), split, fill_empty_columns( line2.rstrip( '\r\n' ), split, fill_options.file2_columns ) ) ) + else: + out.write( "%s%s%s\n" % ( line1.rstrip( '\r\n' ), split, line2.rstrip( '\r\n' ) ) ) + written = True + if not written and keep_unmatched: + out.write( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ) ) + if fill_options: + if fill_options.file2_columns: + out.write( "%s%s" % ( split, fill_empty_columns( "", split, fill_options.file2_columns ) ) ) + out.write( "\n" ) + elif keep_partial: + out.write( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ) ) + if fill_options: + if fill_options.file2_columns: + out.write( "%s%s" % ( split, fill_empty_columns( "", split, fill_options.file2_columns ) ) ) + out.write( "\n" ) + out.close() + + +def main(): + parser = optparse.OptionParser() + parser.add_option( + '-b', '--buffer', + dest='buffer', + type='int', default=1000000, + help='Number of lines to buffer at a time. Default: 1,000,000 lines. A buffer of 0 will attempt to use memory only.' + ) + parser.add_option( + '-d', '--index_depth', + dest='index_depth', + type='int', default=3, + help='Depth to use on filebased offset indexing. Default: 3.' + ) + parser.add_option( + '-p', '--keep_partial', + action='store_true', + dest='keep_partial', + default=False, + help='Keep rows in first input which are missing identifiers.') + parser.add_option( + '-u', '--keep_unmatched', + action='store_true', + dest='keep_unmatched', + default=False, + help='Keep rows in first input which are not joined with the second input.') + parser.add_option( + '-f', '--fill_options_file', + dest='fill_options_file', + type='str', default=None, + help='Fill empty columns with values from a JSONified file.') + + options, args = parser.parse_args() + + fill_options = None + if options.fill_options_file is not None: + try: + fill_options = Bunch( **stringify_dictionary_keys( json.load( open( options.fill_options_file ) ) ) ) # json.load( open( options.fill_options_file ) ) + except Exception as e: + print("Warning: Ignoring fill options due to json error (%s)." % e) + if fill_options is None: + fill_options = Bunch() + if 'fill_unjoined_only' not in fill_options: + fill_options.fill_unjoined_only = True + if 'file1_columns' not in fill_options: + fill_options.file1_columns = None + if 'file2_columns' not in fill_options: + fill_options.file2_columns = None + + try: + filename1 = args[0] + filename2 = args[1] + column1 = int( args[2] ) - 1 + column2 = int( args[3] ) - 1 + out_filename = args[4] + except: + print("Error parsing command line.", file=sys.stderr) + sys.exit() + + # Character for splitting fields and joining lines + split = "\t" + + return join_files( filename1, column1, filename2, column2, out_filename, split, options.buffer, options.keep_unmatched, options.keep_partial, options.index_depth, fill_options=fill_options ) + + +if __name__ == "__main__": + main() |
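Editor's note: join.py arrives truncated in this changeset view (the viewer dropped the middle of the file, marked [...] above), but its central technique survives: line offsets are kept on disk, packed with struct, rather than held in memory. A minimal sketch of that offset-index idea, assuming a hypothetical input file data.tsv and fixed-width unsigned offsets::

    import struct
    import tempfile

    fmt = 'Q'                              # 64-bit unsigned offsets
    fmt_size = struct.calcsize(fmt)
    index = tempfile.TemporaryFile()

    # Pass one: record the byte offset of every line.
    with open('data.tsv', 'rb') as handle:  # hypothetical input
        while True:
            offset = handle.tell()
            if not handle.readline():
                break
            index.write(struct.pack(fmt, offset))

    # Later: random access to the i-th line via its packed offset.
    def offset_at(i):
        index.seek(i * fmt_size)
        return struct.unpack(fmt, index.read(fmt_size))[0]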
diff -r 000000000000 -r 7621d36a4e9c filters/joinWrapper.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/joinWrapper.pl Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,51 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; +use File::Temp "tempfile"; + +my ($input1, $input2, $field1, $field2, $mode, $OOption, $out_file1) = @ARGV; + +die "Expected 7 arguments\n" unless @ARGV == 7; + +my ($fh1, $file1) = tempfile(); +my ($fh2, $file2) = tempfile(); + +`sort -k $field1 $input1 > $file1`; +`sort -k $field2 $input2 > $file2`; + +my $option = ""; +my @fields = (); +my $line = ""; + +if ($OOption eq "Y") { + if (defined($fh1)) { + $line = <$fh1>; + } else { + die "Failed to create file $file1\n"; + } + @fields = split /\t/, $line; + die "The field you selected does not exist in the input file" if (@fields < $field1); + my @optionO = (); + my $i = 0; + foreach (@fields) { + ++$i; + push(@optionO, "1.$i"); + } + $option = "-o " . join(",", @optionO); +} else { + $option = ""; +} + +$ENV{'LC_ALL'} = 'POSIX'; + +if ($mode eq "V") { + `join -v 1 $option -1 $field1 -2 $field2 $file1 $file2 | tr " " "\t" > $out_file1`; +} else { + `join $option -1 $field1 -2 $field2 $file1 $file2 | tr " " "\t" > $out_file1`; +} + +`rm $file1 ; rm $file2`; + + + |
diff -r 000000000000 -r 7621d36a4e9c filters/joinWrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/joinWrapper.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,77 @@ +#!/usr/bin/env python +# Guruprasad Ananda +""" +This tool provides the UNIX "join" functionality. +""" +import os +import subprocess +import sys +import tempfile + + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + + +def main(): + infile1 = sys.argv[1] + infile2 = sys.argv[2] + field1 = int(sys.argv[3]) + field2 = int(sys.argv[4]) + mode = sys.argv[5] + outfile = sys.argv[6] + + tmpfile1 = tempfile.NamedTemporaryFile() + tmpfile2 = tempfile.NamedTemporaryFile() + + try: + # Sort the two files based on specified fields + os.system("sort -t ' ' -k %d,%d -o %s %s" % (field1, field1, tmpfile1.name, infile1)) + os.system("sort -t ' ' -k %d,%d -o %s %s" % (field2, field2, tmpfile2.name, infile2)) + except Exception as exc: + stop_err( 'Initialization error -> %s' % str(exc) ) + + option = "" + for line in open(tmpfile1.name): + line = line.strip() + if line: + elems = line.split('\t') + for j in range(1, len(elems) + 1): + if j == 1: + option = "1.1" + else: + option = option + ",1." + str(j) + break + + # Check if join has a --version option. BSD join doesn't have it, while GNU join does. + # The return value will be non-zero in the former case and 0 in the latter. + ret = subprocess.call('join --version 2>/dev/null', shell=True) + # Check if join is version 7 or later. If so, skip order checking, + # since newer join would otherwise raise an error on duplicated items in + # the two files being joined. + if ret == 0: + cl = subprocess.Popen(["join", "--version"], stdout=subprocess.PIPE, universal_newlines=True) # text mode, so stdout is str on Python 3 + (stdout, _) = cl.communicate() + version_line = stdout.split("\n")[0] + (version, _) = version_line.split()[-1].split(".") + if int(version) >= 7: + flags = "--nocheck-order" + else: + flags = "" + else: + flags = "" + + if mode == "V": + cmdline = "join %s -t ' ' -v 1 -o %s -1 %d -2 %d %s %s > %s" % (flags, option, field1, field2, tmpfile1.name, tmpfile2.name, outfile) + else: + cmdline = "join %s -t ' ' -o %s -1 %d -2 %d %s %s > %s" % (flags, option, field1, field2, tmpfile1.name, tmpfile2.name, outfile) + + try: + os.system(cmdline) + except Exception as exj: + stop_err('Error joining the two datasets -> %s' % str(exj)) + + +if __name__ == "__main__": + main() |
diff -r 000000000000 -r 7621d36a4e9c filters/joiner.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/joiner.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,180 @@ +<tool id="join1" name="Join two Datasets" version="2.0.2"> + <description>side by side on a specified field</description> + <command interpreter="python">join.py $input1 $input2 $field1 $field2 $out_file1 $unmatched $partial --index_depth=3 --buffer=50000000 --fill_options_file=$fill_options_file</command> + <inputs> + <param format="tabular" name="input1" type="data" label="Join"/> + <param name="field1" label="using column" type="data_column" data_ref="input1" /> + <param format="tabular" name="input2" type="data" label="with" /> + <param name="field2" label="and column" type="data_column" data_ref="input2" /> + <param name="unmatched" type="select" label="Keep lines of first input that do not join with second input"> + <option value="-u">Yes</option> + <option value="" selected="true">No</option> + </param> + <param name="partial" type="select" label="Keep lines of first input that are incomplete"> + <option value="-p">Yes</option> + <option value="" selected="true">No</option> + </param> + <conditional name="fill_empty_columns"> + <param name="fill_empty_columns_switch" type="select" label="Fill empty columns"> + <option value="no_fill" selected="True">No</option> + <option value="fill_empty">Yes</option> + </param> + <when value="no_fill"> + <!-- do nothing --> + </when> + <when value="fill_empty"> + <param type="select" name="fill_columns_by" label="Only fill unjoined rows"> + <option value="fill_unjoined_only" selected="True">Yes</option> + <option value="fill_all">No</option> + </param> + <conditional name="do_fill_empty_columns"> + <param name="column_fill_type" type="select" label="Fill Columns by"> + <option value="single_fill_value" selected="True">Single fill value</option> + <option value="fill_value_by_column">Values by column</option> + </param> + <when value="single_fill_value"> + <param type="text" name="fill_value" label="Fill value" value="."/> + </when> + <when value="fill_value_by_column"> + <repeat name="column_fill1" title="Fill Column for Input 1"> + <param name="column_number1" label="Column" type="data_column" data_ref="input1" /> + <param type="text" name="fill_value1" value="."/> + </repeat> + <repeat name="column_fill2" title="Fill Column for Input 2"> + <param name="column_number2" label="Column" type="data_column" data_ref="input2" /> + <param type="text" name="fill_value2" value="."/> + </repeat> + </when> + </conditional> + </when> + </conditional> + </inputs> + <configfiles> + <configfile name="fill_options_file"><% +import json +%> +#set $__fill_options = {} +#if $fill_empty_columns['fill_empty_columns_switch'] == 'fill_empty': + #set $__fill_options['fill_unjoined_only'] = $fill_empty_columns['fill_columns_by'].value == 'fill_unjoined_only' + #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'single_fill_value': + #set $__start_fill = $fill_empty_columns['do_fill_empty_columns']['fill_value'].value + #else: + #set $__start_fill = "" + #end if + #set $__fill_options['file1_columns'] = [ __start_fill for i in range( int( $input1.metadata.columns ) ) ] + #set $__fill_options['file2_columns'] = [ __start_fill for i in range( int( $input2.metadata.columns ) ) ] + #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'fill_value_by_column': + #for column_fill1 in $fill_empty_columns['do_fill_empty_columns']['column_fill1']: + #set $__fill_options['file1_columns'][ int( column_fill1['column_number1'].value ) - 1 ] = column_fill1['fill_value1'].value + #end for + #for column_fill2 in 
$fill_empty_columns['do_fill_empty_columns']['column_fill2']: + #set $__fill_options['file2_columns'][ int( column_fill2['column_number2'].value ) - 1 ] = column_fill2['fill_value2'].value + #end for + #end if +#end if +${json.dumps( __fill_options )} + </configfile> + </configfiles> + <outputs> + <data format="input" name="out_file1" metadata_source="input1" /> + </outputs> + <tests> + <test> + <param name="input1" value="1.bed"/> + <param name="input2" value="2.bed"/> + <param name="field1" value="2"/> + <param name="field2" value="2"/> + <param name="unmatched" value=""/> + <param name="partial" value=""/> + <param name="fill_empty_columns_switch" value="no_fill"/> + <output name="out_file1" file="joiner_out1.bed"/> + </test> + <test> + <param name="input1" value="1.bed"/> + <param name="input2" value="2.bed"/> + <param name="field1" value="2"/> + <param name="field2" value="2"/> + <param name="unmatched" value="Yes"/> + <param name="partial" value="Yes"/> + <param name="fill_empty_columns_switch" value="no_fill"/> + <output name="out_file1" file="joiner_out2.bed"/> + </test> + <test> + <param name="input1" value="1.bed"/> + <param name="input2" value="2.bed"/> + <param name="field1" value="2"/> + <param name="field2" value="2"/> + <param name="unmatched" value="Yes"/> + <param name="partial" value="Yes"/> + <param name="fill_empty_columns_switch" value="fill_empty"/> + <param name="fill_columns_by" value="fill_all"/> + <param name="column_fill_type" value="single_fill_value"/> + <param name="fill_value" value="~"/> + <output name="out_file1" file="joiner_out3.bed"/> + </test> + <test> + <param name="input1" value="1.bed"/> + <param name="input2" value="2.bed"/> + <param name="field1" value="2"/> + <param name="field2" value="2"/> + <param name="unmatched" value="Yes"/> + <param name="partial" value="Yes"/> + <param name="fill_empty_columns_switch" value="fill_empty"/> + <param name="fill_columns_by" value="fill_all"/> + <param name="column_fill_type" value="fill_value_by_column"/> + <param name="column_number1" value="6"/> + <param name="fill_value1" value="+"/> + <param name="column_number2" value="1"/> + <param name="fill_value2" value="NoChrom"/> + <output name="out_file1" file="joiner_out4.bed"/> + </test> + </tests> + <help> + +.. class:: warningmark + +**This tool will attempt to reuse the metadata from your first input.** To change metadata assignments click on the "edit attributes" link of the history item generated by this tool. + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* + +----- + +**Syntax** + +This tool joins lines of two datasets on a common field. An empty string ("") is not a valid identifier. +You may choose to include lines of your first input that do not join with your second input. + +- Columns are referenced with a **number**. For example, **3** refers to the 3rd column of a tab-delimited file. + +----- + +**Example** + +Dataset1:: + + chr1 10 20 geneA + chr1 50 80 geneB + chr5 10 40 geneL + +Dataset2:: + + geneA tumor-suppressor + geneB Foxp2 + geneC Gnas1 + geneE INK4a + +Joining the 4th column of Dataset1 with the 1st column of Dataset2 will yield:: + + chr1 10 20 geneA geneA tumor-suppressor + chr1 50 80 geneB geneB Foxp2 + +Joining the 4th column of Dataset1 with the 1st column of Dataset2, while keeping all lines from Dataset1, will yield:: + + chr1 10 20 geneA geneA tumor-suppressor + chr1 50 80 geneB geneB Foxp2 + chr5 10 40 geneL + +</help> +</tool> |
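Editor's note: the Cheetah configfile above serializes the fill choices into the JSON that join.py reads back through --fill_options_file. With a single fill value of "." applied to a 4-column first input and a 2-column second input (hypothetical column counts), the generated file would look roughly like::

    {"fill_unjoined_only": true, "file1_columns": [".", ".", ".", "."], "file2_columns": [".", "."]}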
diff -r 000000000000 -r 7621d36a4e9c filters/joiner2.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/joiner2.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,13 @@ +<tool id="joiner2" name="Relational join 2" version="1.0.0"> + <description>two datasets on a specified column with matching values</description> + <command>sort -k $col1 $input1 > $input1.tmp; sort -k $col2 $input2 > $input2.tmp; join -1 $col1 -2 $col2 $input1.tmp $input2.tmp | tr " " "\t" > $out_file1; rm -rf $input1.tmp $input2.tmp </command> + <inputs> + <param name="input1" label="Combine dataset" format="tabular" type="data" /> + <param name="col1" label="using column" type="data_column" data_ref="input1" /> + <param name="input2" label="with dataset" format="tabular" type="data"/> + <param name="col2" label="and column" type="data_column" data_ref="input2" /> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input1" /> + </outputs> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/lav_to_bed.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/lav_to_bed.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,53 @@ +#!/usr/bin/env python +# Reads a LAV file and writes two BED files. +from __future__ import print_function + +import sys + +import bx.align.lav + + +def stop_err( msg ): + sys.stderr.write( msg ) + sys.exit() + + +def main(): + try: + lav_file = open(sys.argv[1], 'r') + bed_file1 = open(sys.argv[2], 'w') + bed_file2 = open(sys.argv[3], 'w') + except Exception as e: + stop_err( str( e ) ) + + lavsRead = 0 + bedsWritten = 0 + species = {} + # TODO: this is really bad since everything is read into memory. Can we eliminate this tool? + for lavBlock in bx.align.lav.Reader( lav_file ): + lavsRead += 1 + for c in lavBlock.components: + spec, chrom = bx.align.lav.src_split( c.src ) + if bedsWritten < 1: + if len( species ) == 0: + species[spec] = bed_file1 + elif len( species ) == 1: + species[spec] = bed_file2 + else: + continue # this is a pairwise alignment... + if spec in species: + species[spec].write( "%s\t%i\t%i\t%s_%s\t%i\t%s\n" % ( chrom, c.start, c.end, spec, str( bedsWritten ), 0, c.strand ) ) + bedsWritten += 1 + + for spec, file in species.items(): + print("#FILE\t%s\t%s" % (file.name, spec)) + + lav_file.close() + bed_file1.close() + bed_file2.close() + + print("%d lav blocks read, %d regions written\n" % (lavsRead, bedsWritten)) + + +if __name__ == "__main__": + main() |
diff -r 000000000000 -r 7621d36a4e9c filters/lav_to_bed.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/lav_to_bed.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,68 @@ +<tool id="lav_to_bed1" name="LAV to BED" version="1.0.0"> + <description>Converts a LAV formatted file to BED format</description> + <command interpreter="python">lav_to_bed.py $lav_file $bed_file1 $bed_file2</command> + <inputs> + <param name="lav_file" type="data" format="lav" label="LAV File" optional="False"/> + </inputs> + <outputs> + <data name="bed_file1" format="bed"/> + <data name="bed_file2" format="bed"/> + </outputs> + <tests> + <test> + <param name="lav_file" value="2.lav" ftype="lav" /> + <output name="bed_file1" file="lav_to_bed_out_1.bed" /> + <output name="bed_file2" file="lav_to_bed_out_2.bed" /> + </test> + </tests> + <help> + +**Syntax** + +This tool converts a LAV formatted file to the BED format. + +- **LAV format** LAV is an alignment format developed by Webb Miller's group at Penn State University. It is the primary output format for BLASTZ. + +- **BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. + +----- + +**Example** + +- Convert LAV format:: + + #:lav + s { + "/galaxy/data/hg16/seq/chr19.nib" 1 63811651 0 1 + "/galaxy/data/mm5/seq/chr11.nib" 1 121648857 0 1 + } + h { + "> hg16.chr19" + "> mm5.chr11 (reverse complement)" + } + a { + s 3500 + b 3001012 70568380 + e 3001075 70568443 + l 3001012 70568380 3001075 70568443 81 + } + a { + s 3900 + b 3008279 70573976 + e 3008357 70574054 + l 3008279 70573976 3008357 70574054 78 + } + #:eof + +- To two BED formatted files:: + + chr19 3001011 3001075 hg16_0 0 + + chr19 3008278 3008357 hg16_1 0 + + + **and**:: + + chr11 70568379 70568443 mm5_0 0 + + chr11 70573975 70574054 mm5_1 0 + + </help> + <code file="lav_to_bed_code.py"/> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/lav_to_bed_code.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/lav_to_bed_code.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,19 @@ +# Set build, name, and info for each output BED file +def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): + new_stdout = "" + filename_to_build = {} + for line in stdout.split("\n"): + if line.startswith("#FILE"): + fields = line.split("\t") + filename_to_build[fields[1]] = fields[2].strip() + else: + new_stdout = "%s%s" % ( new_stdout, line ) + for data in out_data.values(): + try: + data.info = "%s\n%s" % ( new_stdout, stderr ) + data.dbkey = filename_to_build[data.file_name] + data.name = "%s (%s)" % ( data.name, data.dbkey ) + app.model.context.add( data ) + app.model.context.flush() + except Exception: + continue |
diff -r 000000000000 -r 7621d36a4e9c filters/mergeCols.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/mergeCols.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,43 @@ +from __future__ import print_function + +import sys + + +def stop_err( msg ): + sys.stderr.write( msg ) + sys.exit() + + +def __main__(): + try: + infile = open( sys.argv[1], 'r') + outfile = open( sys.argv[2], 'w') + except IOError: + stop_err( 'Cannot open or create a file\n' ) + + if len( sys.argv ) < 4: + stop_err( 'No columns to merge' ) + else: + cols = sys.argv[3:] + + skipped_lines = 0 + + for line in infile: + line = line.rstrip( '\r\n' ) + if line and not line.startswith( '#' ): + fields = line.split( '\t' ) + line += '\t' + for col in cols: + try: + line += fields[ int( col ) - 1 ] + except (ValueError, IndexError): + skipped_lines += 1 + + print(line, file=outfile) + + if skipped_lines > 0: + print('Skipped %d invalid column references' % skipped_lines) + + +if __name__ == "__main__": + __main__() |
diff -r 000000000000 -r 7621d36a4e9c filters/mergeCols.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/mergeCols.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,63 @@ +<tool id="mergeCols1" name="Merge Columns" version="1.0.1"> + <description>together</description> + <command interpreter="python"> + mergeCols.py + $input1 + $out_file1 + $col1 + $col2 + #for $col in $columns + ${col.datacol} + #end for + </command> + <inputs> + <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/> + <param name="col1" label="Merge column" type="data_column" data_ref="input1" /> + <param name="col2" label="with column" type="data_column" data_ref="input1" help="Need to add more columns? Use controls below."/> + <repeat name="columns" title="Columns"> + <param name="datacol" label="Add column" type="data_column" data_ref="input1" /> + </repeat> + </inputs> + <outputs> + <data format="tabular" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input1" value="1.bed"/> + <param name="col1" value="4" /> + <param name="col2" value="1" /> + <param name="datacol" value="6" /> + <output name="out_file1" file="mergeCols.dat"/> + </test> + </tests> +<help> + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* + +----- + +**What it does** + +This tool merges columns together. Any number of valid columns can be merged in any order. + +----- + +**Example** + +Input dataset (five columns: c1, c2, c3, c4, and c5):: + + 1 10 1000 gene1 chr + 2 100 1500 gene2 chr + +merging columns "**c5,c1**" will return:: + + 1 10 1000 gene1 chr chr1 + 2 100 1500 gene2 chr chr2 + +.. class:: warningmark + +Note that all original columns are preserved and the result of merge is added as the rightmost column. + </help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/pasteWrapper.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/pasteWrapper.pl Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,35 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; +my $command = ""; +# a wrapper for paste for use in galaxy +# pasteWrapper.pl [filename1] [filename2] [delimiter] [output] + +die "Check arguments" unless @ARGV == 4; + +if ($ARGV[2] eq 'T') { + $command = "paste $ARGV[0] $ARGV[1]"; +} elsif ($ARGV[2] eq 'C') { + $command = "paste -d \",\" $ARGV[0] $ARGV[1]"; +} elsif ($ARGV[2] eq 'D') { + $command = "paste -d \"-\" $ARGV[0] $ARGV[1]"; +} elsif ($ARGV[2] eq 'U') { + $command = "paste -d \"_\" $ARGV[0] $ARGV[1]"; +} elsif ($ARGV[2] eq 'P') { + $command = "paste -d \"|\" $ARGV[0] $ARGV[1]"; +} elsif ($ARGV[2] eq 'Dt') { + $command = "paste -d \".\" $ARGV[0] $ARGV[1]"; +} elsif ($ARGV[2] eq 'Sp') { + $command = "paste -d \" \" $ARGV[0] $ARGV[1]"; +} + +open (OUT, ">$ARGV[3]") or die "Cannot create $ARGV[3]:$!\n"; +open (PASTE, "$command |") or die "Cannot run paste:$!\n"; + +while (<PASTE>) { + print OUT; +} +close OUT; +close PASTE; + |
diff -r 000000000000 -r 7621d36a4e9c filters/pasteWrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/pasteWrapper.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,66 @@ +<tool id="Paste1" name="Paste" version="1.0.0"> + <description>two files side by side</description> + <command>perl '$__tool_directory__/pasteWrapper.pl' '$input1' '$input2' $delimiter '$out_file1'</command> + <inputs> +<!-- <display>paste $input1 and $input2 using $delimiter as delimiter</display> --> + <param format="txt" name="input1" type="data" label="Paste"/> + <param format="txt" name="input2" type="data" label="and"/> + <param name="delimiter" type="select" label="Delimit by"> + <option value="T">Tab</option> + <option value="Dt">Dot</option> + <option value="C">Comma</option> + <option value="D">Dash</option> + <option value="U">Underscore</option> + <option value="P">Pipe</option> + <option value="Sp">Space</option> + </param> + </inputs> + <outputs> + <data format_source="input1" name="out_file1" metadata_source="input1"> + <change_format> + <when input="input1" value="bed" format="interval"/> + </change_format> + </data> + </outputs> + <tests> + <test> + <param name="input1" value="1.bed"/> + <param name="input2" value="2.bed"/> + <param name="delimiter" value="T"/> + <output name="out_file1" file="eq-paste.dat"/> + </test> + </tests> + <help> +.. class:: infomark + +Paste preserves column assignments of the first dataset. + +----- + +**What it does** + +This tool merges two datasets side by side. If the first (left) dataset contains column assignments such as chromosome, start, end and strand, these will be preserved. However, if you would like to change column assignments, click the pencil icon in the history item. + +----- + +**Example** + +First dataset:: + + a 1 + a 2 + a 3 + +Second dataset:: + + 20 + 30 + 40 + +Pasting them together will produce:: + + a 1 20 + a 2 30 + a 3 40 + </help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/random_lines_two_pass.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/random_lines_two_pass.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,78 @@ +#!/usr/bin/env python +# Dan Blankenberg +# Selects N random lines from a file and outputs to another file, maintaining original line order +# allows specifying a seed +# does two passes to determine line offsets/count, and then to output contents +from __future__ import print_function + +import optparse +import random + + +def get_random_by_subtraction( line_offsets, num_lines ): + while len( line_offsets ) > num_lines: + del line_offsets[ random.randint( 0, len( line_offsets ) - 1 ) ] + return line_offsets + + +def get_random_by_sample( line_offsets, num_lines ): + line_offsets = random.sample( line_offsets, num_lines ) + line_offsets.sort() + return line_offsets + + +def get_random( line_offsets, num_lines ): + if num_lines > ( len( line_offsets ) / 2 ): + return get_random_by_subtraction( line_offsets, num_lines ) + else: + return get_random_by_sample( line_offsets, num_lines ) + + +def __main__(): + parser = optparse.OptionParser() + parser.add_option( '-s', '--seed', dest='seed', action='store', type="string", default=None, help='Set the random seed.' ) + (options, args) = parser.parse_args() + + assert len( args ) == 3, "Invalid command line specified." + + input = open( args[0], 'rb' ) + output = open( args[1], 'wb' ) + num_lines = int( args[2] ) + assert num_lines > 0, "You must select at least one line." + + if options.seed is not None: + random.seed( options.seed ) + + # get line offsets + line_offsets = [] + teller = input.tell + readliner = input.readline + appender = line_offsets.append + while True: + offset = teller() + if readliner(): + appender( offset ) + else: + break + + total_lines = len( line_offsets ) + assert num_lines <= total_lines, "Error: asked to select more lines (%i) than there were in the file (%i)." % ( num_lines, total_lines ) + + # get random line offsets + line_offsets = get_random( line_offsets, num_lines ) + + # write out random lines + seeker = input.seek + writer = output.write + for line_offset in line_offsets: + seeker( line_offset ) + writer( readliner() ) + input.close() + output.close() + print("Kept %i of %i total lines." % ( num_lines, total_lines )) + if options.seed is not None: + print('Used random seed of "%s".' % options.seed) + + +if __name__ == "__main__": + __main__() |
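The essence of the two-pass scheme above, as a compact sketch (hypothetical filename; random.sample raises ValueError when fewer lines exist than requested, where the script asserts instead):

import random

def sample_lines(path, n, seed=None):
    # Pass 1: record the byte offset of every line.
    if seed is not None:
        random.seed(seed)
    offsets = []
    with open(path, "rb") as fh:  # binary mode keeps tell()/seek() exact
        while True:
            offset = fh.tell()
            if not fh.readline():
                break
            offsets.append(offset)
        # Pass 2: seek back to a sorted random subset, preserving input order.
        for off in sorted(random.sample(offsets, n)):
            fh.seek(off)
            yield fh.readline()

for line in sample_lines("1.bed", 3, seed="asdf"):
    print(line.decode().rstrip())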
diff -r 000000000000 -r 7621d36a4e9c filters/randomlines.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/randomlines.py Mon Apr 30 01:37:51 2018 -0400 |
[ |
@@ -0,0 +1,36 @@ +#!/usr/bin/env python +# Kanwei Li, 2010 +# Selects N random lines from a file and outputs to another file + +import random +import sys + + +def main(): + infile = open(sys.argv[1], 'r') + total_lines = int(sys.argv[2]) + + if total_lines < 1: + sys.stderr.write( "Must select at least one line." ) + sys.exit() + + kept = [] + n = 0 + for line in infile: + line = line.rstrip("\n") + n += 1 + if (n <= total_lines): + kept.append(line) + elif random.randint(1, n) <= total_lines: + kept.pop(random.randint(0, total_lines - 1)) + kept.append(line) + + if n < total_lines: + sys.stderr.write( "Error: asked to select more lines than there were in the file." ) + sys.exit() + + open(sys.argv[3], 'w').write( "\n".join(kept) ) + + +if __name__ == "__main__": + main() |
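This older script is a reservoir sampler: the first N lines fill the reservoir, and the n-th line thereafter replaces a random kept line with probability N/n. A minimal sketch of that scheme (unlike the two-pass tool above, it does not preserve input order):

import random

def reservoir(iterable, k):
    kept = []
    for n, item in enumerate(iterable, start=1):
        if n <= k:
            kept.append(item)
        elif random.randint(1, n) <= k:  # keep the n-th item with probability k/n
            kept[random.randint(0, k - 1)] = item  # evict a random survivor
    return kept

print(reservoir(range(100), 5))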
diff -r 000000000000 -r 7621d36a4e9c filters/randomlines.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/randomlines.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,66 @@ +<tool id="random_lines1" name="Select random lines" version="2.0.1"> + <description>from a file</description> + <command interpreter="python">random_lines_two_pass.py "${input}" "${out_file1}" "${num_lines}" + #if str( $seed_source.seed_source_selector ) == "set_seed": + --seed "${seed_source.seed}" + #end if + </command> + <inputs> + <param name="num_lines" size="5" type="integer" value="1" label="Randomly select" help="lines"/> + <param format="txt" name="input" type="data" label="from"/> + <conditional name="seed_source"> + <param name="seed_source_selector" type="select" label="Set a random seed"> + <option value="no_seed" selected="True">Don't set seed</option> + <option value="set_seed">Set seed</option> + </param> + <when value="no_seed"> + <!-- Do nothing here --> + </when> + <when value="set_seed"> + <param name="seed" type="text" label="Random seed" /> + </when> + </conditional> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input"/> + </outputs> + <tests> + <test> + <param name="num_lines" value="65"/> + <param name="input" value="1.bed"/> + <param name="seed_source_selector" value="no_seed"/> + <output name="out_file1" file="1.bed"/> + </test> + <test> + <param name="num_lines" value="1"/> + <param name="input" value="1.bed"/> + <param name="seed_source_selector" value="set_seed"/> + <param name="seed" value="asdf"/> + <output name="out_file1" file="1_bed_random_lines_1_seed_asdf_out.bed"/> + </test> + </tests> + <help> + +**What it does** + +This tool selects N random lines from a file, with no repeats, and preserving ordering. + +----- + +**Example** + +Input File:: + + chr7 56632 56652 D17003_CTCF_R6 310 + + chr7 56736 56756 D17003_CTCF_R7 354 + + chr7 56761 56781 D17003_CTCF_R4 220 + + chr7 56772 56792 D17003_CTCF_R7 372 + + chr7 56775 56795 D17003_CTCF_R4 207 + + +Selecting 2 random lines might return this:: + + chr7 56736 56756 D17003_CTCF_R7 354 + + chr7 56775 56795 D17003_CTCF_R4 207 + + + </help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/remove_beginning.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/remove_beginning.pl Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,33 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; + +# Removes the specified number of lines from the beginning of the file. +# remove_beginning.pl [input] [num_lines] [output] + +die "Check arguments" unless @ARGV == 3; + +my $inputfile = $ARGV[0]; +my $num_lines = $ARGV[1]; +my $outputfile = $ARGV[2]; + +my $curCount=0; + +my $fhIn; +open ($fhIn, "< $inputfile") or die "Cannot open source file"; + +my $fhOut; +open ($fhOut, "> $outputfile") or die "Cannot open output file"; + +while (<$fhIn>) +{ + $curCount++; + if ($curCount<=$num_lines) + { + next; + } + print $fhOut $_; +} +close ($fhIn) or die "Cannot close source file"; +close ($fhOut) or die "Cannot close output file"; |
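Behaviourally this wrapper matches tail -n +(N+1). An equivalent Python sketch using itertools.islice, with hypothetical filenames:

from itertools import islice

def remove_beginning(src, dst, num_lines):
    with open(src) as fin, open(dst, "w") as fout:
        fout.writelines(islice(fin, num_lines, None))  # drop the first num_lines

remove_beginning("input.bed", "output.bed", 5)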
diff -r 000000000000 -r 7621d36a4e9c filters/remove_beginning.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/remove_beginning.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,42 @@ +<tool id="Remove beginning1" name="Remove beginning" version="1.0.0"> + <description>of a file</description> + <command interpreter="perl">remove_beginning.pl $input $num_lines $out_file1</command> + <inputs> + <param name="num_lines" size="5" type="integer" value="1" label="Remove first" help="lines"/> + <param format="txt" name="input" type="data" label="from"/> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input"/> + </outputs> + <tests> + <test> + <param name="num_lines" value="5"/> + <param name="input" value="1.bed"/> + <output name="out_file1" file="eq-removebeginning.dat"/> + </test> + </tests> + <help> + +**What it does** + +This tool removes a specified number of lines from the beginning of a dataset. + +----- + +**Example** + +Input File:: + + chr7 56632 56652 D17003_CTCF_R6 310 + + chr7 56736 56756 D17003_CTCF_R7 354 + + chr7 56761 56781 D17003_CTCF_R4 220 + + chr7 56772 56792 D17003_CTCF_R7 372 + + chr7 56775 56795 D17003_CTCF_R4 207 + + +After removing the first 3 lines the dataset will look like this:: + + chr7 56772 56792 D17003_CTCF_R7 372 + + chr7 56775 56795 D17003_CTCF_R4 207 + + +</help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/secure_hash_message_digest.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/secure_hash_message_digest.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,49 @@ +#!/usr/bin/env python +# Dan Blankenberg +""" +A script for calculating secure hashes / message digests. +""" +import hashlib +import optparse + +from galaxy.util.odict import odict + +HASH_ALGORITHMS = [ 'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512' ] +CHUNK_SIZE = 2 ** 20  # 1mb + + +def __main__(): + # Parse Command Line + parser = optparse.OptionParser() + parser.add_option( '-a', '--algorithm', dest='algorithms', action='append', type="string", help='Algorithms to use, eg. (md5, sha1, sha224, sha256, sha384, sha512)' ) + parser.add_option( '-i', '--input', dest='input', action='store', type="string", help='Input filename' ) + parser.add_option( '-o', '--output', dest='output', action='store', type="string", help='Output filename' ) + (options, args) = parser.parse_args() + + assert options.algorithms, "You must provide at least one algorithm." + assert options.input, "You must provide an input filename." + assert options.output, "You must provide an output filename." + + algorithms = odict() + for algorithm in options.algorithms: + assert algorithm in HASH_ALGORITHMS, "Invalid algorithm specified: %s" % ( algorithm ) + assert algorithm not in algorithms, "Specify each algorithm only once." + algorithms[ algorithm ] = hashlib.new( algorithm ) + + input = open( options.input, 'rb' ) + while True: + chunk = input.read( CHUNK_SIZE ) + if chunk: + for algorithm in algorithms.values(): + algorithm.update( chunk ) + else: + break + + output = open( options.output, 'w' ) + output.write( '#%s\n' % ( '\t'.join( algorithms.keys() ) ) ) + output.write( '%s\n' % ( '\t'.join( x.hexdigest() for x in algorithms.values() ) ) ) + output.close() + + +if __name__ == "__main__": + __main__() |
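The core pattern here is feeding one file to several hash objects in fixed-size chunks, so memory use stays constant regardless of file size. A self-contained sketch of that pattern (algorithm names and filename are illustrative):

import hashlib

CHUNK_SIZE = 2 ** 20  # 1 MiB, as in the script above

def digests(path, names=("md5", "sha256")):
    hashers = {name: hashlib.new(name) for name in names}
    with open(path, "rb") as fh:  # binary mode: hashing operates on bytes
        for chunk in iter(lambda: fh.read(CHUNK_SIZE), b""):
            for h in hashers.values():
                h.update(chunk)
    return {name: h.hexdigest() for name, h in hashers.items()}

print(digests("1.bed"))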
diff -r 000000000000 -r 7621d36a4e9c filters/secure_hash_message_digest.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/secure_hash_message_digest.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,45 @@ +<tool id="secure_hash_message_digest" name="Secure Hash / Message Digest" version="0.0.1"> + <description>on a dataset</description> + <command interpreter="python">secure_hash_message_digest.py --input "${input1}" --output "${out_file1}" + #if $algorithms.value: + #for $algorithm in str( $algorithms ).split( "," ): + --algorithm "${algorithm}" + #end for + #end if + </command> + <inputs> + <param format="data" name="input1" type="data" label="Text file"/> + <param name="algorithms" type="select" multiple="True" display="checkboxes" label="Choose the algorithms"> + <option value="md5"/> + <option value="sha1"/> + <option value="sha224"/> + <option value="sha256"/> + <option value="sha384"/> + <option value="sha512"/> + <validator type="no_options" message="You must select at least one algorithm." /> + </param> + </inputs> + <outputs> + <data format="tabular" name="out_file1"/> + </outputs> + <tests> + <test> + <param name="input1" value="1.bed"/> + <param name="algorithms" value="md5,sha1,sha224,sha384,sha512"/> + <output name="out_file1" file="secure_hash_message_digest_out1.tabular" /> + </test> + </tests> + <help> + +**What it does** + +This tool outputs Secure Hashes / Message Digests of a dataset using the user selected algorithms. + +------ + +**Citation** + +If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.* + + </help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/sff_extract.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/sff_extract.py Mon Apr 30 01:37:51 2018 -0400 |
b"@@ -0,0 +1,1340 @@\n+#!/usr/bin/python\n+'''This software extracts the seq, qual and ancillary information from an sff\n+file, like the ones used by the 454 sequencer.\n+\n+Optionally, it can also split paired-end reads if given the linker sequence.\n+The splitting is done with maximum match, i.e., every occurence of the linker\n+sequence will be removed, even if occuring multiple times.'''\n+\n+# copyright Jose Blanca and Bastien Chevreux\n+# COMAV institute, Universidad Politecnica de Valencia (UPV)\n+# Valencia, Spain\n+\n+# additions to handle paired end reads by Bastien Chevreux\n+# bugfixes for linker specific lengths: Lionel Guy\n+\n+# This program is free software: you can redistribute it and/or modify\n+# it under the terms of the GNU General Public License as published by\n+# the Free Software Foundation, either version 3 of the License, or\n+# (at your option) any later version.\n+# This program is distributed in the hope that it will be useful,\n+# but WITHOUT ANY WARRANTY; without even the implied warranty of\n+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+# GNU General Public License for more details.\n+# You should have received a copy of the GNU General Public License\n+# along with this program. If not, see <http://www.gnu.org/licenses/>.\n+\n+from __future__ import print_function\n+\n+import os\n+import struct\n+import subprocess\n+import sys\n+import tempfile\n+\n+__author__ = 'Jose Blanca and Bastien Chevreux'\n+__copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux'\n+__license__ = 'GPLv3 or later'\n+__version__ = '0.2.10'\n+__email__ = 'jblanca@btc.upv.es'\n+__status__ = 'beta'\n+\n+fake_sff_name = 'fake_sff_name'\n+\n+# readname as key: lines with matches from SSAHA, one best match\n+ssahapematches = {}\n+# linker readname as key: length of linker sequence\n+linkerlengths = {}\n+\n+# set to true if something really fishy is going on with the sequences\n+stern_warning = True\n+\n+\n+def read_bin_fragment(struct_def, fileh, offset=0, data=None, byte_padding=None):\n+ '''It reads a chunk of a binary file.\n+\n+ You have to provide the struct, a file object, the offset (where to start\n+ reading).\n+ Also you can provide an optional dict that will be populated with the\n+ extracted data.\n+ If a byte_padding is given the number of bytes read will be a multiple of\n+ that number, adding the required pad at the end.\n+ It returns the number of bytes reads and the data dict.\n+ '''\n+ if data is None:\n+ data = {}\n+\n+ # we read each item\n+ bytes_read = 0\n+ for item in struct_def:\n+ # we go to the place and read\n+ fileh.seek(offset + bytes_read)\n+ n_bytes = struct.calcsize(item[1])\n+ buffer = fileh.read(n_bytes)\n+ read = struct.unpack('>' + item[1], buffer)\n+ if len(read) == 1:\n+ read = read[0]\n+ data[item[0]] = read\n+ bytes_read += n_bytes\n+\n+ # if there is byte_padding the bytes_to_read should be a multiple of the\n+ # byte_padding\n+ if byte_padding is not None:\n+ pad = byte_padding\n+ bytes_read = ((bytes_read + pad - 1) // pad) * pad\n+\n+ return (bytes_read, data)\n+\n+\n+def check_magic(magic):\n+ '''It checks that the magic number of the file matches the sff magic.'''\n+ if magic != 779314790:\n+ raise RuntimeError('This file does not seems to be an sff file.')\n+\n+\n+def check_version(version):\n+ '''It checks that the version is supported, otherwise it raises an error.'''\n+ supported = ('\\x00', '\\x00', '\\x00', '\\x01')\n+ i = 0\n+ for item in version:\n+ if version[i] != supported[i]:\n+ raise 
RuntimeError('SFF version not supported. Please contact the author of the software.')\n+ i += 1\n+\n+\n+def read_header(fileh):\n+ '''It reads the header from the sff file and returns a dict with the\n+ information'''\n+ # first we read the first part of the header\n+ head_struct = [\n+ ('magic_number', 'I'),\n+ ('version', 'cccc'),\n+ ('index_offs"..b'"pelinker_fname",\n+ help="FASTA file with paired-end linker sequences", metavar="FILE")\n+\n+ group = OptionGroup(parser, "File name options", "")\n+ group.add_option(\'-c\', \'--clip\', action="store_true", dest=\'clip\',\n+ help=\'clip (completely remove) ends with low qual and/or adaptor sequence\', default=False)\n+ group.add_option(\'-u\', \'--upper_case\', action="store_false", dest=\'mix_case\',\n+ help=\'all bases in upper case, including clipped ends\', default=True)\n+ group.add_option(\'\', \'--min_left_clip\', dest=\'min_leftclip\',\n+ metavar="INTEGER", type="int",\n+ help=\'if the left clip coming from the SFF is smaller than this value, override it\', default=0)\n+ group.add_option(\'-Q\', \'--fastq\', action="store_true", dest=\'want_fastq\',\n+ help=\'store as FASTQ file instead of FASTA + FASTA quality file\', default=False)\n+ parser.add_option_group(group)\n+\n+ group = OptionGroup(parser, "File name options", "")\n+ group.add_option("-o", "--out_basename", dest="basename",\n+ help="base name for all output files")\n+ group.add_option("-s", "--seq_file", dest="seq_fname",\n+ help="output sequence file name", metavar="FILE")\n+ group.add_option("-q", "--qual_file", dest="qual_fname",\n+ help="output quality file name", metavar="FILE")\n+ group.add_option("-x", "--xml_file", dest="xml_fname",\n+ help="output ancillary xml file name", metavar="FILE")\n+ parser.add_option_group(group)\n+\n+ # default fnames\n+ # is there an sff file?\n+ basename = \'reads\'\n+ if sys.argv[-1][-4:].lower() == \'.sff\':\n+ basename = sys.argv[-1][:-4]\n+ def_seq_fname = basename + \'.fasta\'\n+ def_qual_fname = basename + \'.fasta.qual\'\n+ def_xml_fname = basename + \'.xml\'\n+ def_pelinker_fname = \'\'\n+ parser.set_defaults(seq_fname=def_seq_fname)\n+ parser.set_defaults(qual_fname=def_qual_fname)\n+ parser.set_defaults(xml_fname=def_xml_fname)\n+ parser.set_defaults(pelinker_fname=def_pelinker_fname)\n+\n+ # we parse the cmd line\n+ (options, args) = parser.parse_args()\n+\n+ # we put the result in a dict\n+ global config\n+ config = {}\n+ for property in dir(options):\n+ if property[0] == \'_\' or property in (\'ensure_value\', \'read_file\', \'read_module\'):\n+ continue\n+ config[property] = getattr(options, property)\n+\n+ if config[\'basename\'] is None:\n+ config[\'basename\'] = basename\n+\n+ # if we have not set a file name with -s, -q or -x we set the basename\n+ # based file name\n+ if config[\'want_fastq\']:\n+ config[\'qual_fname\'] = \'\'\n+ if config[\'seq_fname\'] == def_seq_fname:\n+ config[\'seq_fname\'] = config[\'basename\'] + \'.fastq\'\n+ else:\n+ if config[\'seq_fname\'] == def_seq_fname:\n+ config[\'seq_fname\'] = config[\'basename\'] + \'.fasta\'\n+ if config[\'qual_fname\'] == def_qual_fname:\n+ config[\'qual_fname\'] = config[\'basename\'] + \'.fasta.qual\'\n+\n+ if config[\'xml_fname\'] == def_xml_fname:\n+ config[\'xml_fname\'] = config[\'basename\'] + \'.xml\'\n+\n+ # we parse the extra info for the xml file\n+ config[\'xml_info\'] = parse_extra_info(config[\'xml_info\'])\n+ return config, args\n+\n+\n+def testsome():\n+ sys.exit()\n+ return\n+\n+\n+def main():\n+ argv = sys.argv\n+ if len(argv) == 1:\n+ 
sys.argv.append(\'-h\')\n+ read_config()\n+ sys.exit()\n+ try:\n+ config, args = read_config()\n+\n+ if config[\'pelinker_fname\']:\n+ load_linker_sequences(config[\'pelinker_fname\'])\n+ if len(args) == 0:\n+ raise RuntimeError("No SFF file given?")\n+ extract_reads_from_sff(config, args)\n+ except (OSError, IOError, RuntimeError) as errval:\n+ print(errval)\n+ return 1\n+\n+ if stern_warning:\n+ return 1\n+\n+ return 0\n+\n+\n+if __name__ == "__main__":\n+ sys.exit(main())\n' |
b |
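The magic-number test in check_magic() above is easier to read in hex: 779314790 equals 0x2E736666, the big-endian ASCII bytes ".sff". A sketch of the same check in isolation (filename taken from the tool tests below):

import struct

SFF_MAGIC = 0x2E736666  # == 779314790 == b".sff" read big-endian

def check_sff(path):
    with open(path, "rb") as fh:
        (magic,) = struct.unpack(">I", fh.read(4))  # '>' big-endian, 'I' uint32
    if magic != SFF_MAGIC:
        raise RuntimeError("%s does not look like an SFF file" % path)

check_sff("2.sff")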
diff -r 000000000000 -r 7621d36a4e9c filters/sff_extractor.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/sff_extractor.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,60 @@ +<tool id="Sff_extractor" name="SFF converter" version="1.0.1"> + <description></description> + <command interpreter="python"> + #if str($fastq_output) == "fastq_false" #sff_extract.py $clip --seq_file=$out_file3 --qual_file=$out_file4 --xml_file=$out_file2 $input + #elif str($fastq_output) == "fastq_true" #sff_extract.py $clip --fastq --seq_file=$out_file1 --xml_file=$out_file2 $input + #end if# + </command> + <inputs> + <param format="sff" name="input" type="data" label="Extract from this dataset"/> + <param name="clip" type="select" label="Completely remove ends with low qual and/or adaptor sequence"> + <option value="">No</option> + <option value="--clip">Yes</option> + </param> + <param name="fastq_output" type="boolean" truevalue="fastq_true" falsevalue="fastq_false" checked="False" label="Do you want FASTQ file instead of FASTA + FASTA quality file?" /> + </inputs> + <outputs> + <data format="fastqsanger" name="out_file1" > + <filter>fastq_output is True</filter> + </data> + <data format="xml" name="out_file2"> + </data> + <data format="fasta" name="out_file3"> + <filter>fastq_output is False</filter> + </data> + <data format="qual" name="out_file4"> + <filter>fastq_output is False</filter> + </data> + </outputs> + <tests> + <test> + <param name="input" value="2.sff"/> + <param name="clip" value=""/> + <param name="fastq_output" value="false"/> + <output name="out_file2" file="sff_converter_xml_1.dat"/> + <output name="out_file3" file="sff_converter_fasta.dat"/> + <output name="out_file4" file="sff_converter_qual.dat"/> + </test> + <test> + <param name="input" value="2.sff"/> + <param name="clip" value=""/> + <param name="fastq_output" value="true"/> + <output name="out_file1" file="sff_converter_fastq.dat"/> + <output name="out_file2" file="sff_converter_xml_2.dat"/> + </test> + </tests> + <help> + +**What it does** + +This tool extracts data from the 454 Sequencer SFF format and creates three files: +sequences (FASTA), +qualities (QUAL) and +clippings (XML). + +If FASTQ output is selected, the sequences and qualities are instead written to a single FASTQ file, accompanied by the XML clippings file. + + </help> +</tool> + + |
diff -r 000000000000 -r 7621d36a4e9c filters/sorter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/sorter.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,56 @@ +""" + Sorts tabular data on one or more columns. All comments of the file are collected + and placed at the beginning of the sorted output file. + + usage: sorter.py [options] + -i, --input: Tabular file to be sorted + -o, --output: Sorted output file + -k, --key: Key (see manual for bash/sort) +""" +# 03/05/2013 guerler + +import os +import sys +from optparse import OptionParser + + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + + +def main(): + # define options + parser = OptionParser() + parser.add_option("-i", "--input") + parser.add_option("-o", "--output") + parser.add_option("-k", "--key", action="append") + + # parse + options, args = parser.parse_args() + + try: + # retrieve options + input = options.input + output = options.output + key = [" -k" + k for k in options.key] + + # grep comments + grep_comments = "(grep '^#' %s) > %s" % (input, output) + + # grep and sort columns + sort_columns = "(grep '^[^#]' %s | sort -f -t '\t' %s) >> %s" % (input, ' '.join(key), output) + + # execute + os.system(grep_comments) + os.system(sort_columns) + + except Exception as ex: + stop_err('Error running sorter.py\n' + str(ex)) + + # exit + sys.exit(0) + + +if __name__ == "__main__": + main() |
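sorter.py composes two shell pipelines: comment lines ('^#') are copied to the output first, then the remaining lines are sorted with one or more -k keys. A sketch of the strings it would build for a two-key sort (column 1 alphabetical ascending, column 3 numeric descending; filenames from the tests, key syntax as emitted by the XML wrapper below):

input, output = "sort_in1.bed", "sort_out1.bed"
keys = ["1,1", "3,3nr"]

grep_comments = "(grep '^#' %s) > %s" % (input, output)
sort_columns = "(grep '^[^#]' %s | sort -f -t '\t' %s) >> %s" % (
    input, " ".join("-k" + k for k in keys), output)

print(grep_comments)
print(sort_columns)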
diff -r 000000000000 -r 7621d36a4e9c filters/sorter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/sorter.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,188 @@ +<tool id="sort1" name="Sort" version="1.0.3"> + <description>data in ascending or descending order</description> + <command interpreter="python"> + sorter.py + + --input=${input} + --output=${out_file1} + + #if (str($style) == 'num'): + #set $style = 'n' + #elif (str($style) == 'gennum'): + #set $style = 'g' + #else: + #set $style = '' + #end if + + #set $order = '' if (str($order) == 'ASC') else 'r' + + --key=${column},${column}${style}${order} + + + #for $col in $column_set: + #set $other_column = str($col.other_column) + + #if (str($col.other_style) == 'num'): + #set $other_style = 'n' + #elif (str($col.other_style) == 'gennum'): + #set $other_style = 'g' + #else: + #set $other_style = '' + #end if + + #set $other_order = '' if (str($col.other_order) == "ASC") else 'r' + --key=${other_column},${other_column}${other_style}${other_order} + #end for + </command> + <inputs> + <param format="tabular" name="input" type="data" label="Sort Dataset" /> + <param name="column" label="on column" type="data_column" data_ref="input" accept_default="true"/> + <param name="style" type="select" label="with flavor"> + <option value="num">Numerical sort</option> + <option value="gennum">General numeric sort</option> + <option value="alpha">Alphabetical sort</option> + </param> + <param name="order" type="select" label="everything in"> + <option value="DESC">Descending order</option> + <option value="ASC">Ascending order</option> + </param> + <repeat name="column_set" title="Column selection"> + <param name="other_column" label="on column" type="data_column" data_ref="input" accept_default="true" /> + <param name="other_style" type="select" label="with flavor"> + <option value="num">Numerical sort</option> + <option value="gennum">General numeric sort</option> + <option value="alpha">Alphabetical sort</option> + </param> + <param name="other_order" type="select" label="everything in"> + <option value="DESC">Descending order</option> + <option value="ASC">Ascending order</option> + </param> + </repeat> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input"/> + </outputs> + <tests> + <test> + <param name="input" value="sort_in1.bed"/> + <param name="column" value="1"/> + <param name="style" value="alpha"/> + <param name="order" value="ASC"/> + <param name="other_column" value="3"/> + <param name="other_style" value="num"/> + <param name="other_order" value="DESC"/> + <output name="out_file1" file="sort_out1.bed"/> + </test> + <test> + <param name="input" value="sort_in1.bed"/> + <param name="column" value="1"/> + <param name="style" value="alpha"/> + <param name="order" value="ASC"/> + <param name="other_column" value="3"/> + <param name="other_style" value="num"/> + <param name="other_order" value="ASC"/> + <output name="out_file1" file="sort_out2.bed"/> + </test> + <test> + <param name="input" value="sort_in2.bed"/> + <param name="column" value="5"/> + <param name="style" value="gennum"/> + <param name="order" value="ASC"/> + <output name="out_file1" file="sort_out3.bed"/> + </test> + </tests> + <help> +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* + +----- + +**Syntax** + +This tool sorts the dataset on any number of columns in either ascending or descending order. + +* **Numerical sort** orders numbers by their magnitude, ignores all characters besides numbers, and evaluates a string of numbers to the value they signify. +* **General numeric sort** orders numbers by their general numerical value. Unlike the numerical sort option, it can handle numbers in scientific notation too. +* **Alphabetical sort** is a phonebook type sort based on the conventional order of letters in an alphabet. Each nth letter is compared with the nth letter of other words in the list, starting at the first letter of each word and advancing to the second, third, fourth, and so on, until the order is established. Therefore, in an alphabetical sort, 2 comes after 100 (1 < 2). + +----- + +**Examples** + +The list of numbers 4,17,3,5 collates to 3,4,5,17 by numerical sorting, while it collates to 17,3,4,5 by alphabetical sorting. + +Sorting the following:: + + Q d 7 II jhu 45 + A kk 4 I h 111 + Pd p 1 ktY WS 113 + A g 10 H ZZ 856 + A edf 4 tw b 234 + BBB rt 10 H ZZ 100 + A rew 10 d b 1111 + C sd 19 YH aa 10 + Hah c 23 ver bb 467 + MN gtr 1 a X 32 + N j 9 a T 205 + BBB rrf 10 b Z 134 + odfr ws 6 Weg dew 201 + C f 3 WW SW 34 + A jhg 4 I b 345 + Pd gf 7 Gthe de 567 + rS hty 90 YY LOp 89 + A g 10 H h 43 + A g 4 I h 500 + +on columns 1 (alphabetical), 3 (numerical), and 6 (numerical) in ascending order will yield:: + + A kk 4 I h 111 + A edf 4 tw b 234 + A jhg 4 I b 345 + A g 4 I h 500 + A g 10 H h 43 + A g 10 H ZZ 856 + A rew 10 d b 1111 + BBB rt 10 H ZZ 100 + BBB rrf 10 b Z 134 + C f 3 WW SW 34 + C sd 19 YH aa 10 + Hah c 23 ver bb 467 + MN gtr 1 a X 32 + N j 9 a T 205 + odfr ws 6 Weg dew 201 + Pd p 1 ktY WS 113 + Pd gf 7 Gthe de 567 + Q d 7 II jhu 45 + rS hty 90 YY LOp 89 + + +Sorting the following:: + + chr10 100 200 feature1 100.01 + + chr20 800 900 feature2 1.1 + + chr2 500 600 feature3 1000.1 + + chr1 300 400 feature4 1.1e-05 + + chr21 300 500 feature5 1.1e2 + + chr15 700 800 feature6 1.1e4 + + +on column 5 (numerical) in ascending order will yield:: + + chr1 300 400 feature4 1.1e-05 + + chr15 700 800 feature6 1.1e4 + + chr20 800 900 feature2 1.1 + + chr21 300 500 feature5 1.1e2 + + chr10 100 200 feature1 100.01 + + chr2 500 600 feature3 1000.1 + + +on column 5 (general numeric) in ascending order will yield:: + + chr1 300 400 feature4 1.1e-05 + + chr20 800 900 feature2 1.1 + + chr10 100 200 feature1 100.01 + + chr21 300 500 feature5 1.1e2 + + chr2 500 600 feature3 1000.1 + + chr15 700 800 feature6 1.1e4 + + + </help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/tailWrapper.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/tailWrapper.pl Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,19 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; + +# a wrapper for tail for use in galaxy +# tailWrapper.pl [filename] [# lines to show] [output] + +die "Check arguments" unless @ARGV == 3; +die "Line number should be an integer\n" unless $ARGV[1]=~ m/^\d+$/; + +open (OUT, ">$ARGV[2]") or die "Cannot create $ARGV[2]:$!\n"; +open (TAIL, "tail -n $ARGV[1] $ARGV[0]|") or die "Cannot run tail:$!\n"; +while (<TAIL>) { + print OUT; +} +close OUT; +close TAIL; + |
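tail -n N keeps only the last N lines of its input. For comparison, the same effect in pure Python with a bounded deque (hypothetical filename):

from collections import deque

def tail(path, n):
    with open(path) as fh:
        return list(deque(fh, maxlen=n))  # discards all but the last n lines

for line in tail("1.bed", 10):
    print(line, end="")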
diff -r 000000000000 -r 7621d36a4e9c filters/tailWrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/tailWrapper.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,42 @@ +<tool id="Show tail1" name="Select last" version="1.0.0"> + <description>lines from a dataset</description> + <command interpreter="perl">tailWrapper.pl $input $lineNum $out_file1</command> + <inputs> + <param name="lineNum" size="5" type="integer" value="10" label="Select last" help="lines"/> + <param format="txt" name="input" type="data" label="from"/> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input"/> + </outputs> + <tests> + <test> + <param name="lineNum" value="10"/> + <param name="input" value="1.bed"/> + <output name="out_file1" file="eq-showtail.dat"/> + </test> + </tests> + <help> + +**What it does** + +This tool outputs a specified number of lines from the **end** of a dataset. + +----- + +**Example** + +- Input File:: + + chr7 57134 57154 D17003_CTCF_R7 356 - + chr7 57247 57267 D17003_CTCF_R4 207 + + chr7 57314 57334 D17003_CTCF_R5 269 + + chr7 57341 57361 D17003_CTCF_R7 375 + + chr7 57457 57477 D17003_CTCF_R3 188 + + +- Show the last two lines of the above file. The result is:: + + chr7 57341 57361 D17003_CTCF_R7 375 + + chr7 57457 57477 D17003_CTCF_R3 188 + + + </help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/trimmer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/trimmer.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,112 @@ +#!/usr/bin/env python +from __future__ import print_function + +import optparse +import sys + + +def stop_err( msg ): + sys.exit(msg) + + +def main(): + usage = """%prog [options] + +options (listed below) default to 'None' if omitted + """ + parser = optparse.OptionParser(usage=usage) + + parser.add_option( + '-a', '--ascii', + dest='ascii', + action='store_true', + default=False, + help='Use ascii codes to define ignored beginnings instead of raw characters') + + parser.add_option( + '-q', '--fastq', + dest='fastq', + action='store_true', + default=False, + help='The input data is in fastq format. If selected, the script skips every even line, since they contain sequence ids') + + parser.add_option( + '-i', '--ignore', + dest='ignore', + help='A comma separated list of ignored beginnings (e.g., ">,@"), or their ascii codes (e.g., "60,42") if option -a is enabled') + + parser.add_option( + '-s', '--start', + dest='start', + default='0', + help='Trim from beginning to here (1-based)') + + parser.add_option( + '-e', '--end', + dest='end', + default='0', + help='Trim from here to the end (1-based)') + + parser.add_option( + '-f', '--file', + dest='input_txt', + default=False, + help='Name of file to be chopped. STDIN is default') + + parser.add_option( + '-c', '--column', + dest='col', + default='0', + help='Column to chop. If 0 = chop the whole line') + + options, args = parser.parse_args() + invalid_starts = [] + + if options.input_txt: + infile = open( options.input_txt, 'r') + else: + infile = sys.stdin + + if options.ignore and options.ignore != "None": + invalid_starts = options.ignore.split(',') + + if options.ascii and options.ignore and options.ignore != "None": + for i, item in enumerate( invalid_starts ): + invalid_starts[i] = chr( int( item ) ) + + col = int( options.col ) + + for i, line in enumerate( infile ): + line = line.rstrip( '\r\n' ) + if line: + if options.fastq and i % 2 == 0: + print(line) + continue + + if line[0] not in invalid_starts: + if col == 0: + if int( options.end ) > 0: + line = line[ int( options.start ) - 1:int( options.end ) ] + elif int( options.end ) < 0: + endposition = len(line) + int( options.end ) + line = line[ int( options.start ) - 1:endposition ] + else: + line = line[ int( options.start ) - 1: ] + else: + fields = line.split( '\t' ) + if col > len( fields ): + stop_err('Column %d does not exist. Check input parameters\n' % col) + + if int( options.end ) > 0: + fields[col - 1] = fields[col - 1][ int( options.start ) - 1:int( options.end ) ] + elif int( options.end ) < 0: + endposition = len(fields[col - 1]) + int( options.end ) + fields[col - 1] = fields[col - 1][ int( options.start ) - 1:endposition ] + else: + fields[col - 1] = fields[col - 1][ int( options.start ) - 1: ] + line = '\t'.join(fields) + print(line) + + +if __name__ == "__main__": + main() |
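The slicing convention the script implements: start is 1-based; end > 0 is an absolute 1-based cut-off, end < 0 counts back from the end of the string, and end == 0 means keep to the end. Extracted as a standalone sketch, checked against the examples in trimmer.xml below:

def trim(text, start, end):
    if end > 0:
        return text[start - 1:end]
    elif end < 0:
        return text[start - 1:len(text) + end]
    return text[start - 1:]

print(trim("1234567890", 2, 6))  # '23456' (Example 1 in the tool help)
print(trim("67890", 2, -2))      # '78'    (Example 3: negative end trims the tail)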
diff -r 000000000000 -r 7621d36a4e9c filters/trimmer.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/trimmer.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,140 @@ +<tool id="trimmer" name="Trim" version="0.0.1"> + <description>leading or trailing characters</description> + <command detect_errors="exit_code"> +<![CDATA[ +python '$__tool_directory__/trimmer.py' -a -f '$input1' -c $col -s $start -e $end -i '$ignore' $fastq > '$out_file1' +]]> + </command> + <inputs> + <param name="input1" type="data" format="tabular,txt" label="Input dataset" /> + <param name="col" type="integer" value="0" label="Trim this column only" help="0 = process entire line" /> + <param name="start" type="integer" value="1" label="Trim from the beginning up to this position" help="Only positive positions allowed. 1 = do not trim the beginning"/> + <param name="end" type="integer" value="0" label="Remove everything from this position to the end" help="Use negative position to indicate position starting from the end. 0 = do not trim the end"/> + <param name="fastq" type="select" label="Is input dataset in FASTQ format?" help="If set to 'Yes', the tool will not trim evenly numbered lines (0, 2, 4, etc...). This allows for trimming the seq and qual lines, only if they are not spread over multiple lines (see warning below)"> + <option value="" selected="true">No</option> + <option value="-q">Yes</option> + </param> + <param name="ignore" type="select" display="checkboxes" multiple="true" label="Ignore lines beginning with these characters" help="Lines beginning with these are not trimmed"> + <option value="62">></option> + <option value="64">@</option> + <option value="43">+</option> + <option value="60">&lt;</option> + <option value="42">*</option> + <option value="45">-</option> + <option value="61">=</option> + <option value="124">|</option> + <option value="63">?</option> + <option value="36">$</option> + <option value="46">.</option> + <option value="58">:</option> + <option value="38">&amp;</option> + <option value="37">%</option> + <option value="94">^</option> + <option value="35">#</option> + </param> + </inputs> + <outputs> + <data name="out_file1" format_source="input1" metadata_source="input1"/> + </outputs> + <tests> + <test> + <param name="input1" value="trimmer_tab_delimited.dat"/> + <param name="col" value="0"/> + <param name="start" value="1"/> + <param name="end" value="13"/> + <param name="ignore" value="62"/> + <param name="fastq" value="No"/> + <output name="out_file1" file="trimmer_a_f_c0_s1_e13_i62.dat"/> + </test> + <test> + <param name="input1" value="trimmer_tab_delimited.dat"/> + <param name="col" value="2"/> + <param name="start" value="1"/> + <param name="end" value="2"/> + <param name="ignore" value="62"/> + <param name="fastq" value="No"/> + <output name="out_file1" file="trimmer_a_f_c2_s1_e2_i62.dat"/> + </test> + <test> + <param name="input1" value="trimmer_tab_delimited.dat"/> + <param name="col" value="2"/> + <param name="start" value="2"/> + <param name="end" value="-2"/> + <param name="ignore" value="62"/> + <param name="fastq" value="No"/> + <output name="out_file1" file="trimmer_a_f_c2_s2_e-2_i62.dat"/> + </test> + </tests> + + <help> +**What it does** + +Trims a specified number of characters from a dataset or its field (if the dataset is tab-delimited). + +----- + +**Example 1** + +Trimming this dataset:: + + 1234567890 + abcdefghijk + +by setting **Trim from the beginning up to this position** to *2* and **Remove everything from this position to the end** to *6* will produce:: + + 23456 + bcdef + +----- + +**Example 2** + +Trimming column 2 of this dataset:: + + abcde 12345 fghij 67890 + fghij 67890 abcde 12345 + +by setting **Trim this column only** to *2*, **Trim from the beginning up to this position** to *2*, and **Remove everything from this position to the end** to *4* will produce:: + + abcde 234 fghij 67890 + fghij 789 abcde 12345 + +----- + +**Example 3** + +Trimming column 2 of this dataset:: + + abcde 12345 fghij 67890 + fghij 67890 abcde 12345 + +by setting **Trim this column only** to *2*, **Trim from the beginning up to this position** to *2*, and **Remove everything from this position to the end** to *-2* will produce:: + + abcde 23 fghij 67890 + fghij 78 abcde 12345 + +---- + +**Trimming FASTQ datasets** + +This tool can be used to trim sequences and quality strings in FASTQ datasets. This is done by selecting *Yes* from the **Is input dataset in FASTQ format?** dropdown. If set to *Yes*, the tool will skip all even numbered lines (see warning below). For example, trimming the last 5 bases of this dataset:: + + @081017-and-081020:1:1:1715:1759 + GGACTCAGATAGTAATCCACGCTCCTTTAAAATATC + + + II#IIIIIII$5+.(9IIIIIII$%*$G$A31I&&B + +can be done by setting **Remove everything from this position to the end** to 31:: + + @081017-and-081020:1:1:1715:1759 + GGACTCAGATAGTAATCCACGCTCCTTTAAA + + + II#IIIIIII$5+.(9IIIIIII$%*$G$A3 + +**Note** that headers are skipped. + +.. class:: warningmark + +**WARNING:** This tool will only work on properly formatted FASTQ datasets where (1) each read and quality string occupy one line and (2) '@' (read header) and "+" (quality header) lines are evenly numbered like in the above example. + </help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/ucsc_gene_bed_to_exon_bed.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/ucsc_gene_bed_to_exon_bed.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,133 @@ +#!/usr/bin/env python +""" +Read a BED file with 12 fields and print a tab separated +list of intervals corresponding to requested features of each gene. + +usage: ucsc_gene_bed_to_exon_bed.py [options] + +options: + -h, --help show this help message and exit + -rREGION, --region=REGION + Limit to region: one of coding, utr3, utr5, codon, intron, transcribed [default] + -e, --exons Only print intervals overlapping an exon + -i, --input=inputfile input file + -o, --output=outputfile output file +""" +from __future__ import print_function + +import optparse +import sys + +assert sys.version_info[:2] >= ( 2, 4 ) + + +def main(): + parser = optparse.OptionParser( usage="%prog [options] " ) + parser.add_option( "-r", "--region", dest="region", default="transcribed", + help="Limit to region: one of coding, utr3, utr5, transcribed [default]" ) + parser.add_option( "-e", "--exons", action="store_true", dest="exons", + help="Only print intervals overlapping an exon" ) + parser.add_option( "-s", "--strand", action="store_true", dest="strand", + help="Print strand after interval" ) + parser.add_option( "-i", "--input", dest="input", default=None, + help="Input file" ) + parser.add_option( "-o", "--output", dest="output", default=None, + help="Output file" ) + options, args = parser.parse_args() + assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed', 'intron', 'codon' ), "Invalid region argument" + + try: + out_file = open(options.output, "w") + except: + print("Bad output file.", file=sys.stderr) + sys.exit(0) + + try: + in_file = open(options.input) + except: + print("Bad input file.", file=sys.stderr) + sys.exit(0) + + print("Region:", options.region + ";") + + # Read table and handle each gene + for line in in_file: + try: + if line[0:1] == "#": + continue + # Parse fields from gene table + fields = line.split( '\t' ) + chrom = fields[0] + tx_start = int( fields[1] ) + tx_end = int( fields[2] ) + name = fields[3] + strand = fields[5].replace(" ", "_") + cds_start = int( fields[6] ) + cds_end = int( fields[7] ) + + # Determine the subset of the transcribed region we are interested in + if options.region == 'utr3': + if strand == '-': + region_start, region_end = tx_start, cds_start + else: + region_start, region_end = cds_end, tx_end + elif options.region == 'utr5': + if strand == '-': + region_start, region_end = cds_end, tx_end + else: + region_start, region_end = tx_start, cds_start + elif options.region == 'coding' or options.region == 'codon': + region_start, region_end = cds_start, cds_end + else: + region_start, region_end = tx_start, tx_end + + # If only interested in exons, print the portion of each exon overlapping + # the region of interest, otherwise print the span of the region + # options.exons is always TRUE + if options.exons: + exon_starts = [int(_) + tx_start for _ in fields[11].rstrip( ',\n' ).split( ',' )] + exon_ends = [int(_) for _ in fields[10].rstrip( ',\n' ).split( ',' )] + exon_ends = [x + y for x, y in zip(exon_starts, exon_ends)] + + # for Intron regions: + if options.region == 'intron': + i = 0 + while i < len(exon_starts) - 1: + intron_starts = exon_ends[i] + intron_ends = exon_starts[i + 1] + if strand: + print_tab_sep(out_file, chrom, intron_starts, intron_ends, name, "0", strand ) + else: + print_tab_sep(out_file, chrom, intron_starts, intron_ends ) + i += 1 + # for non-intron regions: + else: + for start, end in zip( exon_starts, exon_ends ): + start = max( start, region_start ) + end = min( end, region_end ) + if start < end: + if options.region == 'codon': + start += (3 - ((start - region_start) % 3)) % 3 + c_start = start + while c_start + 3 <= end: + if strand: + print_tab_sep(out_file, chrom, c_start, c_start + 3, name, "0", strand ) + else: + print_tab_sep(out_file, chrom, c_start, c_start + 3) + c_start += 3 + else: + if strand: + print_tab_sep(out_file, chrom, start, end, name, "0", strand ) + else: + print_tab_sep(out_file, chrom, start, end ) + except: + continue + + +def print_tab_sep(out_file, *args ): + """Print items in `args` to out_file separated by tabs""" + print('\t'.join(str( f ) for f in args), file=out_file) + + +if __name__ == "__main__": + main() |
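The BED12 arithmetic above, in isolation: blockSizes (field 11) and blockStarts (field 12, relative to chromStart) encode the exons, so each absolute exon interval is (chromStart + blockStart, chromStart + blockStart + blockSize). Worked through with the first BED line from the tool help below:

chrom_start = 127475281
block_sizes = [29, 172, 3225]
block_starts = [0, 10713, 13126]

exon_starts = [chrom_start + s for s in block_starts]
exon_ends = [start + size for start, size in zip(exon_starts, block_sizes)]
print(list(zip(exon_starts, exon_ends)))
# [(127475281, 127475310), (127485994, 127486166), (127488407, 127491632)]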
diff -r 000000000000 -r 7621d36a4e9c filters/ucsc_gene_bed_to_exon_bed.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/ucsc_gene_bed_to_exon_bed.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,78 @@ +<tool id="gene2exon1" name="Gene BED To Exon/Intron/Codon BED" version="1.0.0"> +<description>expander</description> + <command interpreter="python">ucsc_gene_bed_to_exon_bed.py --input=$input1 --output=$out_file1 --region=$region "--exons"</command> + <inputs> + <param name="region" type="select"> + <label>Extract</label> + <option value="transcribed">Coding Exons + UTR Exons</option> + <option value="coding">Coding Exons only</option> + <option value="utr5">5'-UTR Exons</option> + <option value="utr3">3'-UTR Exons</option> + <option value="intron">Introns</option> + <option value="codon">Codons</option> + </param> + <param name="input1" type="data" format="bed" label="from" help="this history item must contain a 12 field BED (see below)"/> + </inputs> + <outputs> + <data name="out_file1" format="bed"/> + </outputs> + <tests> + <test> + <param name="input1" value="3.bed" /> + <param name="region" value="transcribed" /> + <output name="out_file1" file="cf-gene2exon.dat"/> + </test> + </tests> +<help> + +.. class:: warningmark + +This tool works only on a BED file that contains at least 12 fields (see **Example** and **About formats** below). The output will be empty if applied to a BED file with 3 or 6 fields. + +------ + +**What it does** + +BED format can be used to represent a single gene in just one line, which contains the information about exons, coding sequence location (CDS), and positions of untranslated regions (UTRs). This tool *unpacks* this information by converting a single line describing a gene into a collection of lines representing individual exons, introns, UTRs, etc. + +------- + +**Example** + +Extracting **Coding Exons + UTR Exons** from the following two BED lines:: + + chr7 127475281 127491632 NM_000230 0 + 127486022 127488767 0 3 29,172,3225, 0,10713,13126 + chr7 127486011 127488900 D49487 0 + 127486022 127488767 0 2 155,490, 0,2399 + +will return:: + + chr7 127475281 127475310 NM_000230 0 + + chr7 127485994 127486166 NM_000230 0 + + chr7 127488407 127491632 NM_000230 0 + + chr7 127486011 127486166 D49487 0 + + chr7 127488410 127488900 D49487 0 + + +------ + +.. class:: infomark + +**About formats** + +**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and additional optional ones. In the specific case of this tool the following fields must be present:: + + 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). + 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) + 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). + 4. name - The name of the BED line. + 5. score - A score between 0 and 1000. + 6. strand - Defines the strand - either '+' or '-'. + 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. + 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. + 9. reserved - This should always be set to zero. + 10. blockCount - The number of blocks (exons) in the BED line. + 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. + 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. + + +</help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/ucsc_gene_bed_to_intron_bed.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/ucsc_gene_bed_to_intron_bed.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,82 @@ +#!/usr/bin/env python +""" +Read a BED file with 12 fields and print a tab separated +list of intervals corresponding to the introns of each gene. + +usage: ucsc_gene_bed_to_intron_bed.py [options] + +options: + -h, --help show this help message and exit + -i, --input=inputfile input file + -o, --output=outputfile output file +""" +from __future__ import print_function + +import optparse +import sys + +assert sys.version_info[:2] >= ( 2, 4 ) + + +def main(): + parser = optparse.OptionParser( usage="%prog [options] " ) + parser.add_option( "-s", "--strand", action="store_true", dest="strand", + help="Print strand after interval" ) + parser.add_option( "-i", "--input", dest="input", default=None, + help="Input file" ) + parser.add_option( "-o", "--output", dest="output", default=None, + help="Output file" ) + options, args = parser.parse_args() + + try: + out_file = open(options.output, "w") + except: + print("Bad output file.", file=sys.stderr) + sys.exit(0) + + try: + in_file = open(options.input) + except: + print("Bad input file.", file=sys.stderr) + sys.exit(0) + + # Read table and handle each gene + for line in in_file: + try: + if line[0:1] == "#": + continue + + # Parse fields from gene table + fields = line.split( '\t' ) + chrom = fields[0] + tx_start = int( fields[1] ) + int( fields[2] ) + name = fields[3] + strand = fields[5].replace(" ", "_") + int( fields[6] ) + int( fields[7] ) + + exon_starts = [int(_) + tx_start for _ in fields[11].rstrip( ',\n' ).split( ',' )] + exon_ends = [int(_) for _ in fields[10].rstrip( ',\n' ).split( ',' )] + exon_ends = [x + y for x, y in zip(exon_starts, exon_ends)] + + i = 0 + while i < len(exon_starts) - 1: + intron_starts = exon_ends[i] + 1 + intron_ends = exon_starts[i + 1] - 1 + if strand: + print_tab_sep(out_file, chrom, intron_starts, intron_ends, name, "0", strand ) + else: + print_tab_sep(out_file, chrom, intron_starts, intron_ends ) + i += 1 + except: + continue + + +def print_tab_sep(out_file, *args ): + """Print items in `args` to out_file separated by tabs""" + print('\t'.join(str( f ) for f in args), file=out_file) + + +if __name__ == "__main__": + main() |
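Introns are the gaps between consecutive exons; this script additionally applies +1/-1 so the reported interval excludes the flanking exon bases. Continuing from the exon coordinates computed in the sketch above:

exons = [(127475281, 127475310), (127485994, 127486166), (127488407, 127491632)]
introns = [(end + 1, next_start - 1)
           for (_, end), (next_start, _) in zip(exons, exons[1:])]
print(introns)  # [(127475311, 127485993), (127486167, 127488406)]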
diff -r 000000000000 -r 7621d36a4e9c filters/ucsc_gene_bed_to_intron_bed.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/ucsc_gene_bed_to_intron_bed.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,60 @@ +<tool id="gene2intron1" name="Gene BED To Intron BED" version="1.0.0"> +<description>expander</description> + <command interpreter="python">ucsc_gene_bed_to_intron_bed.py --input=$input1 --output=$out_file1</command> + <inputs> + <param name="input1" type="data" format="interval" label="UCSC Gene Table"/> + + </inputs> + <outputs> + <data name="out_file1" format="bed"/> + </outputs> + <tests> + <test> + <param name="input1" value="3.bed" /> + <output name="out_file1" file="cf-gene2intron.dat"/> + </test> + </tests> +<help> + +**Syntax** + +This tool converts a UCSC gene bed format file to a list of bed format lines corresponding to the introns of each gene. + +- **BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and twelve additional optional ones:: + + The first three BED fields (required) are: + 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). + 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) + 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). + + The twelve additional BED fields (optional) are: + 4. name - The name of the BED line. + 5. score - A score between 0 and 1000. + 6. strand - Defines the strand - either '+' or '-'. + 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. + 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. + 9. reserved - This should always be set to zero. + 10. blockCount - The number of blocks (exons) in the BED line. + 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. + 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. + 13. expCount - The number of experiments. + 14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount. + 15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount. + +----- + +**Example** + +- A UCSC gene bed format file:: + + chr7 127475281 127491632 NM_000230 0 + 127486022 127488767 0 3 29,172,3225, 0,10713,13126 + chr7 127486011 127488900 D49487 0 + 127486022 127488767 0 2 155,490, 0,2399 + +- Converts the above file to a list of bed lines containing the introns:: + + chr7 127475311 127485993 NM_000230 0 + + chr7 127486167 127488406 NM_000230 0 + + chr7 127486167 127488409 D49487 0 + + +</help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/ucsc_gene_table_to_intervals.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/ucsc_gene_table_to_intervals.py Mon Apr 30 01:37:51 2018 -0400 |
[ |
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+"""
+Read a table dump in the UCSC gene table format and print a tab separated
+list of intervals corresponding to requested features of each gene.
+
+usage: ucsc_gene_table_to_intervals.py [options]
+
+options:
+  -h, --help                show this help message and exit
+  -rREGION, --region=REGION
+                            Limit to region: one of coding, utr3, utr5, transcribed [default]
+  -e, --exons               Only print intervals overlapping an exon
+  -i, --input=inputfile     input file
+  -o, --output=outputfile   output file
+"""
+from __future__ import print_function
+
+import optparse
+import sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+
+def main():
+    parser = optparse.OptionParser( usage="%prog [options] " )
+    parser.add_option( "-r", "--region", dest="region", default="transcribed",
+                       help="Limit to region: one of coding, utr3, utr5, transcribed [default]" )
+    parser.add_option( "-e", "--exons", action="store_true", dest="exons",
+                       help="Only print intervals overlapping an exon" )
+    parser.add_option( "-s", "--strand", action="store_true", dest="strand",
+                       help="Print strand after interval" )
+    parser.add_option( "-i", "--input", dest="input", default=None,
+                       help="Input file" )
+    parser.add_option( "-o", "--output", dest="output", default=None,
+                       help="Output file" )
+    options, args = parser.parse_args()
+    assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument"
+
+    try:
+        out_file = open(options.output, "w")
+    except Exception:
+        print("Bad output file.", file=sys.stderr)
+        sys.exit(1)
+
+    try:
+        in_file = open(options.input)
+    except Exception:
+        print("Bad input file.", file=sys.stderr)
+        sys.exit(1)
+
+    print("Region:", options.region + ";")
+    print("Only overlap with Exons:", end=' ')
+    if options.exons:
+        print("Yes")
+    else:
+        print("No")
+
+    # Read table and handle each gene
+    for line in in_file:
+        try:
+            if line[0:1] == "#":
+                continue
+            # Parse fields from the gene table
+            fields = line.split( '\t' )
+            name = fields[0]
+            chrom = fields[1]
+            strand = fields[2].replace(" ", "_")
+            tx_start = int( fields[3] )
+            tx_end = int( fields[4] )
+            cds_start = int( fields[5] )
+            cds_end = int( fields[6] )
+
+            # Determine the subset of the transcribed region we are interested in
+            if options.region == 'utr3':
+                if strand == '-':
+                    region_start, region_end = tx_start, cds_start
+                else:
+                    region_start, region_end = cds_end, tx_end
+            elif options.region == 'utr5':
+                if strand == '-':
+                    region_start, region_end = cds_end, tx_end
+                else:
+                    region_start, region_end = tx_start, cds_start
+            elif options.region == 'coding':
+                region_start, region_end = cds_start, cds_end
+            else:
+                region_start, region_end = tx_start, tx_end
+
+            # If only interested in exons, print the portion of each exon overlapping
+            # the region of interest, otherwise print the span of the region
+            if options.exons:
+                exon_starts = map( int, fields[8].rstrip( ',\n' ).split( ',' ) )
+                exon_ends = map( int, fields[9].rstrip( ',\n' ).split( ',' ) )
+                for start, end in zip( exon_starts, exon_ends ):
+                    start = max( start, region_start )
+                    end = min( end, region_end )
+                    if start < end:
+                        if options.strand:
+                            print_tab_sep(out_file, chrom, start, end, name, "0", strand )
+                        else:
+                            print_tab_sep(out_file, chrom, start, end )
+            else:
+                if options.strand:
+                    print_tab_sep(out_file, chrom, region_start, region_end, name, "0", strand )
+                else:
+                    print_tab_sep(out_file, chrom, region_start, region_end )
+        except Exception:
+            # skip lines that cannot be parsed as a gene table record
+            continue
+
+
+def print_tab_sep(out_file, *args ):
+    """Print items in `args` to `out_file` separated by tabs"""
+    print('\t'.join(str( f ) for f in args), file=out_file)
+
+
+if __name__ == "__main__":
+    main() |
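The strand-aware branch above is the core of this script: on the minus strand the 3' UTR runs from txStart to cdsStart and the 5' UTR from cdsEnd to txEnd, mirroring the plus-strand cases. A minimal standalone sketch of that selection (the function name feature_bounds is illustrative, not part of the changeset)::

    def feature_bounds(region, strand, tx_start, tx_end, cds_start, cds_end):
        """Return the (start, end) sub-interval of a transcript for a feature."""
        if region == 'utr3':
            return (tx_start, cds_start) if strand == '-' else (cds_end, tx_end)
        if region == 'utr5':
            return (cds_end, tx_end) if strand == '-' else (tx_start, cds_start)
        if region == 'coding':
            return (cds_start, cds_end)
        return (tx_start, tx_end)  # 'transcribed'

    # plus strand: the 3' UTR follows the CDS; minus strand: it precedes the CDS
    assert feature_bounds('utr3', '+', 100, 1000, 200, 900) == (900, 1000)
    assert feature_bounds('utr3', '-', 100, 1000, 200, 900) == (100, 200)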
diff -r 000000000000 -r 7621d36a4e9c filters/ucsc_gene_table_to_intervals.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/ucsc_gene_table_to_intervals.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,25 @@
+<tool id="ucsc_gene_table_to_intervals1" name="Gene Table To BED" version="1.0.0">
+  <description>Parse a UCSC Gene Table dump</description>
+  <command interpreter="python">ucsc_gene_table_to_intervals.py --input=$input1 --output=$out_file1 --region=$region $exon</command>
+  <inputs>
+    <param name="input1" type="data" format="interval" label="UCSC Gene Table"/>
+    <param name="region" type="select">
+      <label>Feature Type</label>
+      <option value="transcribed">Transcribed</option>
+      <option value="coding">Coding</option>
+      <option value="utr3">3' UTR</option>
+      <option value="utr5">5' UTR</option>
+    </param>
+    <param name="exon" type="select">
+      <label>Only print intervals overlapping an exon</label>
+      <option value="">False</option>
+      <option value="--exons">True</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data name="out_file1" format="bed"/>
+  </outputs>
+  <help>
+Read a table dump in the UCSC gene table format and create a BED file corresponding to the requested feature of each gene.
+</help>
+</tool>
\ No newline at end of file |
diff -r 000000000000 -r 7621d36a4e9c filters/uniq.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/uniq.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,141 @@
+# Filename: uniq.py
+# Author: Ian N. Schenck
+# Version: 19/12/2005
+#
+# This script accepts an input file, an output file, a column
+# delimiter, and a list of columns. The script then grabs unique
+# lines based on the columns, and returns those records with a count
+# of occurrences of each unique column (ignoring trailing spaces),
+# inserted before the columns.
+#
+# This executes the command pipeline:
+#       cut -f $fields | sort | uniq -c
+#
+# -i Input file
+# -o Output file
+# -d Delimiter
+# -c Column list (Comma Separated)
+from __future__ import print_function
+
+import re
+import subprocess
+import sys
+
+
+# This function is exceedingly useful, perhaps package for reuse?
+def getopts(argv):
+    opts = {}
+    while argv:
+        if argv[0][0] == '-':
+            opts[argv[0]] = argv[1]
+            argv = argv[2:]
+        else:
+            argv = argv[1:]
+    return opts
+
+
+def main():
+    args = sys.argv[1:]
+
+    try:
+        opts = getopts(args)
+    except IndexError:
+        print("Usage:")
+        print(" -i Input file")
+        print(" -o Output file")
+        print(" -c Column list (comma separated)")
+        print(" -d Delimiter:")
+        print("     T  Tab")
+        print("     C  Comma")
+        print("     D  Dash")
+        print("     U  Underscore")
+        print("     P  Pipe")
+        print("     Dt Dot")
+        print("     Sp Space")
+        print(" -s Sorting: value (default), largest, or smallest")
+        return 0
+
+    outputfile = opts.get("-o")
+    if outputfile is None:
+        print("No output file specified.")
+        return -1
+
+    inputfile = opts.get("-i")
+    if inputfile is None:
+        print("No input file specified.")
+        return -2
+
+    delim = opts.get("-d")
+    if delim is None:
+        print("Field delimiter not specified.")
+        return -3
+
+    columns = opts.get("-c")
+    if columns is None or columns == 'None':
+        print("Columns not specified.")
+        return -4
+
+    sorting = opts.get("-s")
+    if sorting is None:
+        sorting = "value"
+    if sorting not in ["value", "largest", "smallest"]:
+        print("Unknown sorting option %r" % sorting)
+        return -5
+
+    # All inputs have been specified at this point, now validate.
+    fileRegEx = re.compile("^[A-Za-z0-9./\-_]+$")
+    columnRegEx = re.compile("([0-9]{1,},?)+")
+
+    if not columnRegEx.match(columns):
+        print("Illegal column specification.")
+        return -4
+    if not fileRegEx.match(outputfile):
+        print("Illegal output filename.")
+        return -5
+    if not fileRegEx.match(inputfile):
+        print("Illegal input filename.")
+        return -6
+
+    column_list = re.split(",", columns)
+    columns_for_display = "c" + ", c".join(column_list)
+
+    commandline = "cut "
+    # Set delimiter
+    if delim == 'C':
+        commandline += "-d \",\" "
+    if delim == 'D':
+        commandline += "-d \"-\" "
+    if delim == 'U':
+        commandline += "-d \"_\" "
+    if delim == 'P':
+        commandline += "-d \"|\" "
+    if delim == 'Dt':
+        commandline += "-d \".\" "
+    if delim == 'Sp':
+        commandline += "-d \" \" "
+
+    # set columns
+    commandline += "-f " + columns
+    # we want to remove *trailing* spaces from each field,
+    # so look for spaces then tab (for first and middle selected columns)
+    # and replace with just a tab, and remove any spaces at the end of the line
+    # (for the final selected column):
+    commandline += " " + inputfile + " | sed 's/\ *\t/\t/' | sed 's/\ *$//'"
+    commandline += " | sort | uniq -c"
+    # uniq -c puts counts at the start, so we can sort lines by numerical value
+    if sorting == "largest":
+        commandline += " | sort -n -r"
+    elif sorting == "smallest":
+        commandline += " | sort -n"
+    # uniq -c produces lines with leading spaces, use sed to remove that
+    # uniq -c puts a space between the count and the field, we want a tab.
+    # To replace just the first tab, use sed again with 1 as the index
+    commandline += " | sed 's/^\ *//' | sed 's/ /\t/1' > " + outputfile
+    errorcode = subprocess.call(commandline, shell=True)
+
+    print("Count of unique values in " + columns_for_display)
+    return errorcode
+
+
+if __name__ == "__main__":
+    main() |
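The shell pipeline assembled above amounts to counting tuples of the selected columns, with trailing spaces stripped. The same computation in pure Python with collections.Counter, as a reference sketch rather than anything the tool executes::

    from collections import Counter

    def count_unique(lines, columns, delim='\t'):
        """Count each unique combination of the 1-based selected columns."""
        counts = Counter()
        for line in lines:
            fields = line.rstrip('\n').split(delim)
            # mirror the sed cleanup: ignore trailing spaces on each field
            counts[tuple(fields[c - 1].rstrip(' ') for c in columns)] += 1
        return counts

    rows = ["chr1\t10\t100\tgene1", "chr1\t105\t200\tgene2", "chr2\t10\t100\tgene4"]
    print(count_unique(rows, [1]))  # Counter({('chr1',): 2, ('chr2',): 1})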
diff -r 000000000000 -r 7621d36a4e9c filters/uniq.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/uniq.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,105 @@ +<tool id="Count1" name="Count" version="1.0.2"> + <description>occurrences of each record</description> + <command interpreter="python">uniq.py -i $input -o $out_file1 -c "$column" -d $delim -s $sorting</command> + <inputs> + <param name="input" type="data" format="tabular" label="from dataset" help="Dataset missing? See TIP below"/> + <param name="column" type="data_column" data_ref="input" multiple="True" numerical="False" label="Count occurrences of values in column(s)" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" /> + <param name="delim" type="select" label="Delimited by"> + <option value="T">Tab</option> + <option value="Sp">Whitespace</option> + <option value="Dt">Dot</option> + <option value="C">Comma</option> + <option value="D">Dash</option> + <option value="U">Underscore</option> + <option value="P">Pipe</option> + </param> + <param name="sorting" type="select" label="How should the results be sorted?"> + <option value="value">By the values being counted</option> + <option value="largest">With the most common values first</option> + <option value="smallest">With the rarest values first</option> + </param> + </inputs> + <outputs> + <data format="tabular" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input" value="1.bed"/> + <output name="out_file1" file="uniq_out.dat"/> + <param name="column" value="1"/> + <param name="delim" value="T"/> + </test> + <test> + <param name="input" value="species_assignment.tabular" ftype="tabular"/> + <output name="out_file1" file="species_assignment_c2.tabular"/> + <param name="column" value="2"/> + <param name="delim" value="T"/> + </test> + <test> + <param name="input" value="species_assignment.tabular" ftype="tabular"/> + <output name="out_file1" file="species_assignment_c2_c3.tabular"/> + <param name="column" value="2,3"/> + <param name="delim" value="T"/> + </test> + <test> + <param name="input" value="species_assignment.tabular" ftype="tabular"/> + <output name="out_file1" file="species_assignment_c2_c3_largest.tabular"/> + <param name="column" value="2,3"/> + <param name="delim" value="T"/> + <param name="sorting" value="largest"/> + </test> + <test> + <param name="input" value="species_assignment.tabular" ftype="tabular"/> + <output name="out_file1" file="species_assignment_c2_c3_smallest.tabular"/> + <param name="column" value="2,3"/> + <param name="delim" value="T"/> + <param name="sorting" value="smallest"/> + </test> + </tests> + <help> +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* + +----- + +**Syntax** + +This tool counts occurrences of unique values in selected column(s). + +- If multiple columns are selected, counting is performed on each unique group of all values in the selected columns. +- The first column of the resulting dataset will be the count of unique values in the selected column(s) and will be followed by each value. + +----- + +**Example** + +- Input file:: + + chr1 10 100 gene1 + chr1 105 200 gene2 + chr1 205 300 gene3 + chr2 10 100 gene4 + chr2 1000 1900 gene5 + chr3 15 1656 gene6 + chr4 10 1765 gene7 + chr4 10 1765 gene8 + +- Counting unique values in column c1 will result in:: + + 3 chr1 + 2 chr2 + 1 chr3 + 2 chr4 + +- Counting unique values in the grouping of columns c2 and c3 will result in:: + + 2 10 100 + 2 10 1765 + 1 1000 1900 + 1 105 200 + 1 15 1656 + 1 205 300 + +</help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c filters/wc_gnu.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/wc_gnu.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,72 @@ +<tool id="wc_gnu" name="Line/Word/Character count" version="1.0.0"> + <description>of a dataset</description> + <command> + #set $word_to_arg = { 'characters':'m', 'words':'w', 'lines':'l' } + #set $arg_order = [ 'lines', 'words', 'characters' ] + #if not isinstance( $options.value, list ): + #set $args = [ $options.value ] + #else: + #set $args = $options.value + #end if + #if $include_header.value: + echo "#${ "\t".join( [ i for i in $arg_order if i in $args ] ) }" > $out_file1 + && + #end if + wc + #for $option in $args: + -${ word_to_arg[ str(option) ] } + #end for + $input1 | awk '{ print ${ '"\\t"'.join( [ "$%i" % ( i+1 ) for i in range( len( $args ) ) ] ) } }' + >> $out_file1 + </command> + <inputs> + <param format="txt" name="input1" type="data" label="Text file"/> + <param name="options" type="select" multiple="True" display="checkboxes" label="Desired values"> + <!-- <option value="bytes" selected="True">Byte count</option> --> + <option value="lines" selected="True">Line count</option> + <option value="words" selected="True">Word count</option> + <option value="characters" selected="True">Character count</option> + <validator type="no_options" message="You must pick at least one attribute to count." /> + </param> + <param name="include_header" type="boolean" label="Include Output header" checked="True"/> + </inputs> + <outputs> + <data format="tabular" name="out_file1"/> + </outputs> + <tests> + <test> + <param name="input1" value="1.bed"/> + <param name="options" value="lines,words,characters"/> + <param name="include_header" value="True"/> + <output name="out_file1" file="wc_gnu_out_1.tabular"/> + </test> + <test> + <param name="input1" value="1.bed"/> + <param name="options" value="lines,words,characters"/> + <param name="include_header" value="False"/> + <output name="out_file1" file="wc_gnu_out_2.tabular"/> + </test> + </tests> + <help> + +**What it does** + +This tool outputs counts of specified attributes (lines, words, characters) of a dataset. + +----- + +**Example Output** + +:: + + #lines words characters + 7499 41376 624971 + +------ + +**Citation** + +If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.* + + </help> +</tool> |
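The Cheetah block above translates the selected attribute names into wc flags (lines -> -l, words -> -w, characters -> -m) and reshapes the result into one tab-separated row. For reference, the same three counts computed directly in Python (a sketch; the tool itself shells out to wc and awk)::

    def count_attributes(path):
        """Return (lines, words, characters), matching wc -l / -w / -m on text input."""
        lines = words = characters = 0
        with open(path) as handle:
            for line in handle:
                lines += 1
                words += len(line.split())
                characters += len(line)  # decoded characters, as wc -m reports
        return lines, words, characters

    # print("#lines\twords\tcharacters")
    # print("\t".join(map(str, count_attributes("1.bed"))))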
diff -r 000000000000 -r 7621d36a4e9c filters/wig_to_bigwig.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/wig_to_bigwig.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,94 @@ +<tool id="wig_to_bigWig" name="Wig/BedGraph-to-bigWig" version="1.1.0"> + <description>converter</description> + <requirements> + <requirement type="package">ucsc_tools</requirement> + </requirements> + <stdio> + <!-- Anything other than zero is an error --> + <regex match="needLargeMem: trying to allocate 0 bytes" + description="Your input file might be empty or wrongly formatted"/> + <regex match="^Error"/> + </stdio> + <command> + <![CDATA[ + grep -v "^track" $input1 | wigToBigWig stdin $chromInfo $out_file1 + #if $settings.settingsType == "full": + -blockSize=${settings.blockSize} -itemsPerSlot=${settings.itemsPerSlot} ${settings.clip} ${settings.unc} + #else: + -clip + #end if + 2>&1 || echo "Error running wigToBigWig." >&2 + ]]> + </command> + <inputs> + <param format="wig,bedgraph" name="input1" type="data" label="Convert"> + <validator type="unspecified_build" /> + </param> + <conditional name="settings"> + <param name="settingsType" type="select" label="Converter settings to use" help="Default settings should usually be used."> + <option value="preset">Default</option> + <option value="full">Full parameter list</option> + </param> + <when value="preset" /> + <when value="full"> + <param name="blockSize" size="4" type="integer" value="256" label="Items to bundle in r-tree" help="Default is 256 (blockSize)" /> + <param name="itemsPerSlot" size="4" type="integer" value="1024" label="Data points bundled at lowest level" help="Default is 1024 (itemsPerSlot)" /> + <param name="clip" type="boolean" truevalue="-clip" falsevalue="" checked="True" label="Clip chromosome positions" help="Issue warning messages rather than dying if wig file contains items off end of chromosome. (clip)"/> + <param name="unc" type="boolean" truevalue="-unc" falsevalue="" checked="False" label="Do not use compression" help="(unc)"/> + </when> + </conditional> + </inputs> + <outputs> + <data format="bigwig" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input1" value="2.wig" dbkey="hg17" /> + <param name="settingsType" value="full" /> + <param name="blockSize" value="256" /> + <param name="itemsPerSlot" value="1024" /> + <param name="clip" value="True" /> + <param name="unc" value="False" /> + <output name="out_file1" file="2.bigwig"/> + </test> + <test> + <param name="input1" value="2.wig" dbkey="hg17" /> + <param name="settingsType" value="preset" /> + <output name="out_file1" file="2.bigwig"/> + </test> + <test> + <param name="input1" value="1.bedgraph" dbkey="hg19" ftype="bedgraph"/> + <param name="settingsType" value="preset" /> + <output name="out_file1" file="3.bigwig"/> + </test> + </tests> + <help> +**Syntax** + +This tool converts bedgraph or wiggle data into bigWig type. + +- **Wiggle format**: The .wig format is line-oriented. Wiggle data is preceded by a UCSC track definition line. Following the track definition line is the track data, which can be entered in three different formats described below. 
+ + - **BED format** with no declaration line and four columns of data:: + + chromA chromStartA chromEndA dataValueA + chromB chromStartB chromEndB dataValueB + + - **variableStep** two column data; started by a declaration line and followed with chromosome positions and data values:: + + variableStep chrom=chrN [span=windowSize] + chromStartA dataValueA + chromStartB dataValueB + + - **fixedStep** single column data; started by a declaration line and followed with data values:: + + fixedStep chrom=chrN start=position step=stepInterval [span=windowSize] + dataValue1 + dataValue2 + +- The **BedGraph format** is described in detail at the `UCSC Bioinformatics website`_ + +.. _UCSC Bioinformatics website: http://genome.ucsc.edu/goldenPath/help/bedgraph.html + +</help> +</tool> |
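Outside Galaxy, the conversion performed by the command block can be reproduced by stripping track lines and piping the rest into wigToBigWig on stdin. A sketch, assuming the UCSC wigToBigWig binary is on $PATH and using placeholder file names::

    import subprocess

    def wig_to_bigwig(wig_path, chrom_sizes, out_path, clip=True):
        """Feed a wiggle file, minus track lines, to wigToBigWig via stdin."""
        args = ["wigToBigWig", "stdin", chrom_sizes, out_path]
        if clip:
            args.append("-clip")  # warn instead of dying on off-chromosome items
        with open(wig_path) as wig:
            data = "".join(line for line in wig if not line.startswith("track"))
        subprocess.run(args, input=data.encode(), check=True)

    # wig_to_bigwig("2.wig", "hg17.chrom.sizes", "2.bigwig")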
diff -r 000000000000 -r 7621d36a4e9c filters/wiggle_to_simple.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/wiggle_to_simple.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+"""
+Read a wiggle track and print out a series of lines containing
+"chrom position score". Ignores track lines, handles bed, variableStep
+and fixedStep wiggle lines.
+"""
+from __future__ import print_function
+
+import sys
+
+import bx.wiggle
+
+from galaxy.util.ucsc import UCSCLimitException, UCSCOutWrapper
+
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+
+def main():
+    if len( sys.argv ) > 1:
+        in_file = open( sys.argv[1] )
+    else:
+        # sys.stdin is already a file object; do not wrap it in open()
+        in_file = sys.stdin
+
+    if len( sys.argv ) > 2:
+        out_file = open( sys.argv[2], "w" )
+    else:
+        out_file = sys.stdout
+
+    try:
+        for fields in bx.wiggle.IntervalReader( UCSCOutWrapper( in_file ) ):
+            out_file.write( "%s\n" % "\t".join( map( str, fields ) ) )
+    except UCSCLimitException:
+        # Wiggle data was truncated, at the very least need to warn the user.
+        print('Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.')
+    except ValueError as e:
+        in_file.close()
+        out_file.close()
+        stop_err( str( e ) )
+
+    in_file.close()
+    out_file.close()
+
+
+if __name__ == "__main__":
+    main() |
diff -r 000000000000 -r 7621d36a4e9c filters/wiggle_to_simple.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filters/wiggle_to_simple.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,88 @@ +<tool id="wiggle2simple1" name="Wiggle-to-Interval" version="1.0.0"> + <description>converter</description> + <command interpreter="python">wiggle_to_simple.py $input $out_file1 </command> + <inputs> + <param format="wig" name="input" type="data" label="Convert"/> + </inputs> + <outputs> + <data format="interval" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input" value="2.wig" /> + <output name="out_file1" file="2.interval"/> + </test> + <test> + <param name="input" value="3.wig" /> + <output name="out_file1" file="3_wig.bed"/> + </test> + </tests> + <help> +**Syntax** + +This tool converts wiggle data into interval type. + +- **Wiggle format**: The .wig format is line-oriented. Wiggle data is preceded by a UCSC track definition line. Following the track definition line is the track data, which can be entered in three different formats described below. + + - **BED format** with no declaration line and four columns of data:: + + chromA chromStartA chromEndA dataValueA + chromB chromStartB chromEndB dataValueB + + - **variableStep** two column data; started by a declaration line and followed with chromosome positions and data values:: + + variableStep chrom=chrN [span=windowSize] + chromStartA dataValueA + chromStartB dataValueB + + - **fixedStep** single column data; started by a declaration line and followed with data values:: + + fixedStep chrom=chrN start=position step=stepInterval [span=windowSize] + dataValue1 + dataValue2 + +----- + +**Example** + +- input wiggle format file:: + + #track type=wiggle_0 name="Bed Format" description="BED format" + chr19 59302000 59302300 -1.0 + chr19 59302300 59302600 -0.75 + chr19 59302600 59302900 -0.50 + chr19 59302900 59303200 -0.25 + chr19 59303200 59303500 0.0 + #track type=wiggle_0 name="variableStep" description="variableStep format" + variableStep chrom=chr19 span=150 + 59304701 10.0 + 59304901 12.5 + 59305401 15.0 + 59305601 17.5 + #track type=wiggle_0 name="fixedStep" description="fixed step" visibility=full + fixedStep chrom=chr19 start=59307401 step=300 span=200 + 1000 + 900 + 800 + 700 + 600 + +- convert the above file to interval file:: + + chr19 59302000 59302300 + -1.0 + chr19 59302300 59302600 + -0.75 + chr19 59302600 59302900 + -0.5 + chr19 59302900 59303200 + -0.25 + chr19 59303200 59303500 + 0.0 + chr19 59304701 59304851 + 10.0 + chr19 59304901 59305051 + 12.5 + chr19 59305401 59305551 + 15.0 + chr19 59305601 59305751 + 17.5 + chr19 59307701 59307901 + 1000.0 + chr19 59308001 59308201 + 900.0 + chr19 59308301 59308501 + 800.0 + chr19 59308601 59308801 + 700.0 + chr19 59308901 59309101 + 600.0 + +</help> +</tool> |
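The worked example above follows directly from the wiggle declarations: a variableStep record with span=150 at position 59304701 yields the interval 59304701..59304851. A toy version of that expansion (the real conversion is delegated to bx.wiggle.IntervalReader, which also handles the coordinate conventions)::

    def expand_variable_step(chrom, span, records):
        """Turn (start, value) variableStep records into (chrom, start, end, value)."""
        for start, value in records:
            yield chrom, start, start + span, value

    for chrom, start, end, value in expand_variable_step("chr19", 150, [(59304701, 10.0), (59304901, 12.5)]):
        print("%s\t%d\t%d\t+\t%s" % (chrom, start, end, value))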
diff -r 000000000000 -r 7621d36a4e9c genomespace/genomespace_exporter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genomespace/genomespace_exporter.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,334 @@
+#!/usr/bin/env python
+# Dan Blankenberg
+from __future__ import print_function
+
+import base64
+import binascii
+import datetime
+import hashlib
+import json
+import logging
+import optparse
+import os
+import tempfile
+
+import six
+from six.moves import http_cookiejar
+from six.moves.urllib.error import HTTPError
+from six.moves.urllib.parse import quote, urlencode, urljoin
+from six.moves.urllib.request import build_opener, HTTPCookieProcessor, Request, urlopen
+
+log = logging.getLogger( "tools.genomespace.genomespace_exporter" )
+
+try:
+    import boto
+    from boto.s3.connection import S3Connection
+except ImportError:
+    boto = None
+
+GENOMESPACE_API_VERSION_STRING = "v1.0"
+GENOMESPACE_SERVER_URL_PROPERTIES = "https://dm.genomespace.org/config/%s/serverurl.properties" % ( GENOMESPACE_API_VERSION_STRING )
+DEFAULT_GENOMESPACE_TOOLNAME = 'Galaxy'
+
+CHUNK_SIZE = 2 ** 20  # 1mb
+
+# TODO: TARGET_SPLIT_SIZE and TARGET_SIMPLE_PUT_UPLOAD_SIZE are arbitrarily defined
+# we should programmatically determine these, based upon the current environment
+TARGET_SPLIT_SIZE = 250 * 1024 * 1024  # 250 mb
+MIN_MULTIPART_UPLOAD_SIZE = 5 * 1024 * 1024  # 5mb
+MAX_SIMPLE_PUT_UPLOAD_SIZE = 5 * 1024 * 1024 * 1024  # 5gb
+TARGET_SIMPLE_PUT_UPLOAD_SIZE = MAX_SIMPLE_PUT_UPLOAD_SIZE / 2
+
+# Some basic Caching, so we don't have to reload and download everything every time,
+# especially now that we are calling the parameter's get options method 5 times
+# (6 on reload) when a user loads the tool interface
+# For now, we'll use 30 seconds as the cache valid time
+CACHE_TIME = datetime.timedelta( seconds=30 )
+GENOMESPACE_DIRECTORIES_BY_USER = {}
+
+
+def chunk_write( source_stream, target_stream, source_method="read", target_method="write" ):
+    source_method = getattr( source_stream, source_method )
+    target_method = getattr( target_stream, target_method )
+    while True:
+        chunk = source_method( CHUNK_SIZE )
+        if chunk:
+            target_method( chunk )
+        else:
+            break
+
+
+def get_cookie_opener( gs_username, gs_token, gs_toolname=None ):
+    """ Create a GenomeSpace cookie opener """
+    cj = http_cookiejar.CookieJar()
+    for cookie_name, cookie_value in [ ( 'gs-token', gs_token ), ( 'gs-username', gs_username ) ]:
+        # create a super-cookie, valid for all domains
+        cookie = http_cookiejar.Cookie(version=0, name=cookie_name, value=cookie_value, port=None, port_specified=False, domain='', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False )
+        cj.set_cookie( cookie )
+    cookie_opener = build_opener( HTTPCookieProcessor( cj ) )
+    cookie_opener.addheaders.append( ( 'gs-toolname', gs_toolname or DEFAULT_GENOMESPACE_TOOLNAME ) )
+    return cookie_opener
+
+
+def get_genomespace_site_urls():
+    genomespace_sites = {}
+    for line in urlopen( GENOMESPACE_SERVER_URL_PROPERTIES ).read().split( '\n' ):
+        line = line.rstrip()
+        if not line or line.startswith( "#" ):
+            continue
+        server, line = line.split( '.', 1 )
+        if server not in genomespace_sites:
+            genomespace_sites[server] = {}
+        line = line.split( "=", 1 )
+        genomespace_sites[server][line[0]] = line[1]
+    return genomespace_sites
+
+
+def get_directory( url_opener, dm_url, path ):
+    url = dm_url
+    i = None
+    dir_dict = {}
+    for i, sub_path in enumerate( path ):
+        url = "%s/%s" % ( url, sub_path )
+        dir_request = Request( url, headers={ 'Content-Type': 'application/json', 'Accept': 'application/json' } )
+        dir_request.get_method = lambda: 'GET'
+        try:
+            dir_dict = json.loads( url_opener.open( dir_request ).read() )
+        except HTTPError:
+            # print "e", e, url #punting, assuming lack of permissions at this low of a level...
+[...]et_directory_dict['path'], quote( target_filename, safe='' ), urlencode( upload_params ) )
+    new_file_request = Request( upload_url )  # , headers = { 'Content-Type': 'application/json', 'Accept': 'application/text' } ) #apparently http://www.genomespace.org/team/specs/updated-dm-rest-api:"Every HTTP request to the Data Manager should include the Accept header with a preference for the media types application/json and application/text." is not correct
+    new_file_request.get_method = lambda: 'GET'
+    # get url to upload to
+    target_upload_url = url_opener.open( new_file_request ).read()
+    # upload file to determined url
+    upload_headers = dict( upload_params )
+    # upload_headers[ 'x-amz-meta-md5-hash' ] = content_md5.hexdigest()
+    upload_headers[ 'Accept' ] = 'application/json'
+    upload_file_request = Request( target_upload_url, headers=upload_headers, data=input_file )
+    upload_file_request.get_method = lambda: 'PUT'
+    upload_result = urlopen( upload_file_request ).read()
+    result_url = "%s/%s" % ( target_directory_dict['url'], quote( target_filename, safe='' ) )
+    # determine available gs launch apps
+    web_tools = get_genome_space_launch_apps( genomespace_site_dict['atmServer'], url_opener, result_url, file_type )
+    if log_filename:
+        log_file = open( log_filename, 'wb' )
+        log_file.write( "<html><head><title>File uploaded to GenomeSpace from Galaxy</title></head><body>\n" )
+        log_file.write( '<p>Uploaded <a href="%s">%s/%s</a> to GenomeSpace.</p>\n' % ( result_url, target_directory_dict['path'], target_filename ) )
+        if web_tools:
+            log_file.write( "<p>You may open this file directly in the following applications:</p>\n" )
+            log_file.write( '<p><ul>\n' )
+            for web_tool in web_tools:
+                log_file.write( '<li><a href="%s">%s</a></li>\n' % ( web_tool ) )
+            log_file.write( '</p></ul>\n' )
+        else:
+            log_file.write( '<p>There are no GenomeSpace applications available for file type: %s</p>\n' % ( file_type ) )
+        log_file.write( "</body></html>\n" )
+    return upload_result
+
+
+if __name__ == '__main__':
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-s', '--genomespace_site', dest='genomespace_site', action='store', type="string", default=None, help='genomespace_site' )
+    parser.add_option( '-t', '--token', dest='token', action='store', type="string", default=None, help='token' )
+    parser.add_option( '-u', '--username', dest='username', action='store', type="string", default=None, help='username' )
+    parser.add_option( '-d', '--dataset', dest='dataset', action='store', type="string", default=None, help='dataset' )
+    parser.add_option( '-f', '--filename', dest='filename', action='store', type="string", default=None, help='filename' )
+    parser.add_option( '-y', '--subdirectory', dest='subdirectory', action='append', type="string", default=None, help='subdirectory' )
+    parser.add_option( '', '--file_type', dest='file_type', action='store', type="string", default=None, help='file_type' )
+    parser.add_option( '-c', '--content_type', dest='content_type', action='store', type="string", default=None, help='content_type' )
+    parser.add_option( '-l', '--log', dest='log', action='store', type="string", default=None, help='log' )
+    parser.add_option( '', '--genomespace_toolname', dest='genomespace_toolname', action='store', type="string", default=DEFAULT_GENOMESPACE_TOOLNAME, help='value to use for gs-toolname, used in GenomeSpace internal logging' )
+
+    (options, args) = parser.parse_args()
+
+    send_file_to_genomespace( options.genomespace_site, options.username, options.token, options.dataset, [binascii.unhexlify(_) for _ in options.subdirectory], binascii.unhexlify( options.filename ), options.file_type, options.content_type, options.log, options.genomespace_toolname ) |
diff -r 000000000000 -r 7621d36a4e9c genomespace/genomespace_exporter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genomespace/genomespace_exporter.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<tool name="GenomeSpace Exporter" id="genomespace_exporter" require_login="True" version="0.0.4">
+  <description> - send data to GenomeSpace</description>
+  <command interpreter="python">genomespace_exporter.py
+    --genomespace_site "prod"
+    #assert $__user__, Exception( 'You must be logged in to use this tool.' )
+    #set $username = $__user__.preferences.get( 'genomespace_username', None )
+    #set $token = $__user__.preferences.get( 'genomespace_token', None )
+    #assert None not in ( $username, $token ), Exception( 'You must associate a GenomeSpace OpenID with your account and log in with it.' )
+    #import binascii
+    --username "${username}"
+    --token "${token}"
+    --dataset "${input1}"
+    #if $subdirectory:
+        #for $subd in str( $subdirectory ).split( '/' ):
+            #if not $subd:
+                --subdirectory "${ binascii.hexlify( '/' ) }"
+            #else:
+                --subdirectory "${ binascii.hexlify( $subd ) }"
+            #end if
+        #end for
+    #else:
+        --subdirectory "${ binascii.hexlify( 'galaxy_export' ) }"
+        --subdirectory "${ binascii.hexlify( str( $base_url ).split( '://', 1 )[-1] ) }" ##Protocol removed by request
+    #end if
+    #if $filename:
+        --filename "${ binascii.hexlify( str( $filename ) ) }"
+    #else:
+        --filename "${ binascii.hexlify( "Galaxy History Item %s (%s) - %s: %s.%s" % ( $__app__.security.encode_id( $input1.id ), $__app__.security.encode_id( $output_log.id ), $input1.hid, $input1.name, $input1.ext ) ) }"
+    #end if
+    --file_type "${input1.ext}"
+    --content_type "${input1.get_mime()}"
+    --log "${output_log}"
+    --genomespace_toolname="\${GENOMESPACE_TOOLNAME:-Galaxy}"
+  </command>
+  <inputs>
+    <param format="data" name="input1" type="data" label="Send this dataset to GenomeSpace" />
+    <param name="base_url" type="baseurl" />
+    <param name="subdirectory" type="drill_down" display="radio" hierarchy="exact" multiple="False" optional="True" label="Choose Target Directory" dynamic_options="galaxy_code_get_genomespace_folders( genomespace_site = 'prod', trans=__trans__, value=__value__, input_dataset=input1, base_url=base_url )" help="Leave blank to generate automatically"/>
+    <param name="filename" type="text" size="80" label="Filename" help="Leave blank to generate automatically" />
+  </inputs>
+  <outputs>
+    <data format="html" name="output_log" />
+  </outputs>
+  <help>
+This tool allows you to export data to GenomeSpace. You must be logged in using your GenomeSpace OpenID; you can associate your OpenID credentials under the User Preferences panel.
+
+If you are having trouble with this tool, click here_ to refresh your GenomeSpace token before reporting errors.
+
+.. _here: ${static_path}/../user/openid_auth?openid_provider=genomespace&amp;auto_associate=True
+  </help>
+  <options refresh="True"/>
+  <code file="genomespace_exporter.py" />
+</tool> |
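Note how the wrapper hex-encodes every --subdirectory and --filename value with binascii.hexlify, and the script decodes them with unhexlify: this keeps slashes, quotes, and spaces in user-supplied names from being mangled on the command line. The round trip, sketched (Python 3 syntax here; the changeset itself targets Python 2 as well via six)::

    import binascii

    path_component = "my folder/with 'odd' chars"
    encoded = binascii.hexlify(path_component.encode()).decode()  # hex digits only, safe for argv
    assert binascii.unhexlify(encoded).decode() == path_component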
diff -r 000000000000 -r 7621d36a4e9c genomespace/genomespace_file_browser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genomespace/genomespace_file_browser.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,217 @@
+# Dan Blankenberg
+import json
+import optparse
+import os
+
+from six.moves import http_cookiejar
+from six.moves.urllib.parse import unquote_plus, urlencode, urlparse
+from six.moves.urllib.request import build_opener, HTTPCookieProcessor, Request, urlopen
+
+from galaxy.datatypes import sniff
+from galaxy.datatypes.registry import Registry
+
+GENOMESPACE_API_VERSION_STRING = "v1.0"
+GENOMESPACE_SERVER_URL_PROPERTIES = "https://dm.genomespace.org/config/%s/serverurl.properties" % ( GENOMESPACE_API_VERSION_STRING )
+DEFAULT_GENOMESPACE_TOOLNAME = 'Galaxy'
+FILENAME_VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
+
+CHUNK_SIZE = 2**20  # 1mb
+
+AUTO_GALAXY_EXT = "auto"
+DEFAULT_GALAXY_EXT = "data"
+
+# genomespace format identifier is the URL
+GENOMESPACE_FORMAT_IDENTIFIER_TO_GENOMESPACE_EXT = {}  # TODO: fix this so it is not a global variable
+# TODO: we should use a better way to set up this mapping
+GENOMESPACE_EXT_TO_GALAXY_EXT = {'rifles': 'rifles',
+                                 'lifes': 'lifes',
+                                 'cn': 'cn',
+                                 'GTF': 'gtf',
+                                 'res': 'res',
+                                 'xcn': 'xcn',
+                                 'lowercasetxt': 'lowercasetxt',
+                                 'bed': 'bed',
+                                 'CBS': 'cbs',
+                                 'genomicatab': 'genomicatab',
+                                 'gxp': 'gxp',
+                                 'reversedtxt': 'reversedtxt',
+                                 'nowhitespace': 'nowhitespace',
+                                 'unknown': 'unknown',
+                                 'txt': 'txt',
+                                 'uppercasetxt': 'uppercasetxt',
+                                 'GISTIC': 'gistic',
+                                 'GFF': 'gff',
+                                 'gmt': 'gmt',
+                                 'gct': 'gct'}
+
+GENOMESPACE_UNKNOWN_FORMAT_KEY = 'unknown'
+GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN = None
+
+
+def chunk_write( source_stream, target_stream, source_method="read", target_method="write" ):
+    source_method = getattr( source_stream, source_method )
+    target_method = getattr( target_stream, target_method )
+    while True:
+        chunk = source_method( CHUNK_SIZE )
+        if chunk:
+            target_method( chunk )
+        else:
+            break
+
+
+def get_cookie_opener( gs_username, gs_token, gs_toolname=None ):
+    """ Create a GenomeSpace cookie opener """
+    cj = http_cookiejar.CookieJar()
+    for cookie_name, cookie_value in [ ( 'gs-token', gs_token ), ( 'gs-username', gs_username ) ]:
+        # create a super-cookie, valid for all domains
+        cookie = http_cookiejar.Cookie(version=0, name=cookie_name, value=cookie_value, port=None, port_specified=False, domain='', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False )
+        cj.set_cookie( cookie )
+    cookie_opener = build_opener( HTTPCookieProcessor( cj ) )
+    cookie_opener.addheaders.append( ( 'gs-toolname', gs_toolname or DEFAULT_GENOMESPACE_TOOLNAME ) )
+    return cookie_opener
+
+
+def get_galaxy_ext_from_genomespace_format_url( url_opener, file_format_url ):
+    ext = GENOMESPACE_FORMAT_IDENTIFIER_TO_GENOMESPACE_EXT.get( file_format_url, None )
+    if ext is not None:
+        ext = GENOMESPACE_EXT_TO_GALAXY_EXT.get( ext, None )
+    if ext is None:
+        # could check content type, etc here
+        ext = AUTO_GALAXY_EXT
+    return ext
+
+
+def get_genomespace_site_urls():
+    genomespace_sites = {}
+    for line in urlopen( GENOMESPACE_SERVER_URL_PROPERTIES ).read().split( '\n' ):
+        line = line.rstrip()
+        if not line or line.startswith( "#" ):
+            continue
+[...] datasource_params.get( filetype_key, None )
+            galaxy_ext = get_galaxy_ext_from_genomespace_format_url( url_opener, filetype_url )
+            formatted_download_url = "%s?%s" % ( download_url, urlencode( [ ( 'dataformat', filetype_url ) ] ) )
+            new_file_request = Request( formatted_download_url )
+            new_file_request.get_method = lambda: 'GET'
+            target_download_url = url_opener.open( new_file_request )
+            filename = None
+            if 'Content-Disposition' in target_download_url.info():
+                # If the response has Content-Disposition, try to get filename from it
+                content_disposition = dict( x.strip().split('=') if '=' in x else ( x.strip(), '' ) for x in target_download_url.info()['Content-Disposition'].split( ';' ) )
+                if 'filename' in content_disposition:
+                    filename = content_disposition[ 'filename' ].strip( "\"'" )
+            if not filename:
+                parsed_url = urlparse( download_url )
+                filename = unquote_plus( parsed_url[2].split( '/' )[-1] )
+            if not filename:
+                filename = download_url
+            metadata_dict = None
+            original_filename = filename
+            if output_filename is None:
+                filename = ''.join( c in FILENAME_VALID_CHARS and c or '-' for c in filename )
+                while filename in used_filenames:
+                    filename = "-%s" % filename
+                used_filenames.append( filename )
+                output_filename = os.path.join( os.getcwd(), 'primary_%i_%s_visible_%s' % ( hda_id, filename, galaxy_ext ) )
+
+                metadata_dict = dict( type='new_primary_dataset',
+                                      base_dataset_id=dataset_id,
+                                      ext=galaxy_ext,
+                                      filename=output_filename,
+                                      name="GenomeSpace import on %s" % ( original_filename ) )
+            else:
+                if dataset_id is not None:
+                    metadata_dict = dict( type='dataset',
+                                          dataset_id=dataset_id,
+                                          ext=galaxy_ext,
+                                          name="GenomeSpace import on %s" % ( filename ) )
+            output_file = open( output_filename, 'wb' )
+            chunk_write( target_download_url, output_file )
+            output_file.close()
+
+            if ( galaxy_ext == AUTO_GALAXY_EXT or filetype_url == GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN ) and metadata_dict:
+                # try to sniff datatype
+                try:
+                    galaxy_ext = sniff.handle_uploaded_dataset_file( output_filename, datatypes_registry )
+                except:
+                    # sniff failed
+                    galaxy_ext = original_filename.rsplit( '.', 1 )[-1]
+                    if galaxy_ext not in datatypes_registry.datatypes_by_extension:
+                        galaxy_ext = DEFAULT_GALAXY_EXT
+                metadata_dict[ 'ext' ] = galaxy_ext
+
+            output_filename = None  # only have one filename available
+
+            # write out metadata info
+            if metadata_dict:
+                metadata_parameter_file.write( "%s\n" % json.dumps( metadata_dict ) )
+
+    metadata_parameter_file.close()
+    return True
+
+
+if __name__ == '__main__':
+    parser = optparse.OptionParser()
+    parser.add_option( '-p', '--json_parameter_file', dest='json_parameter_file', action='store', type="string", default=None, help='json_parameter_file' )
+    parser.add_option( '-s', '--genomespace_site', dest='genomespace_site', action='store', type="string", default=None, help='genomespace_site' )
+    parser.add_option( '', '--genomespace_toolname', dest='genomespace_toolname', action='store', type="string", default=DEFAULT_GENOMESPACE_TOOLNAME, help='value to use for gs-toolname, used in GenomeSpace internal logging' )
+    (options, args) = parser.parse_args()
+
+    download_from_genomespace_file_browser( options.json_parameter_file, options.genomespace_site, options.genomespace_toolname ) |
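get_galaxy_ext_from_genomespace_format_url resolves a GenomeSpace format URL to a GenomeSpace extension and then to a Galaxy extension, falling back to auto so Galaxy will sniff unknown datatypes. The same two-step lookup in isolation; the format-URL table is populated at runtime, so the entry below is a stand-in::

    AUTO_GALAXY_EXT = "auto"
    FORMAT_URL_TO_GS_EXT = {"http://www.genomespace.org/datamanager/dataformat/gtf": "GTF"}  # stand-in entry
    GS_EXT_TO_GALAXY_EXT = {"GTF": "gtf", "bed": "bed"}

    def galaxy_ext_for(format_url):
        gs_ext = FORMAT_URL_TO_GS_EXT.get(format_url)
        ext = GS_EXT_TO_GALAXY_EXT.get(gs_ext) if gs_ext is not None else None
        return ext if ext is not None else AUTO_GALAXY_EXT  # let Galaxy sniff unknowns

    print(galaxy_ext_for("http://www.genomespace.org/datamanager/dataformat/gtf"))  # gtf
    print(galaxy_ext_for("http://example.org/unknown"))                             # auto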
diff -r 000000000000 -r 7621d36a4e9c genomespace/genomespace_file_browser_dev.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genomespace/genomespace_file_browser_dev.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,15 @@ +<?xml version="1.0"?> +<tool name="GenomeSpace import" id="genomespace_file_browser_dev" tool_type="data_source" add_galaxy_url="False" force_history_refresh="True" version="0.0.1"> + <description>from file browser (development)</description> + <command interpreter="python">genomespace_file_browser.py --json_parameter_file "${output}" --genomespace_site "dev" --genomespace_toolname="\${GENOMESPACE_TOOLNAME:-Galaxy}"</command> + <inputs action="https://dmdev.genomespace.org:8444/datamanager/defaultdirectory" check_values="False" method="post"> + <display>go to GenomeSpace Data Manager </display> + <param name="appCallbackUrl" type="baseurl" value="/tool_runner?tool_id=genomespace_file_browser_dev&runtool_btn=Execute" /> + <param name="appName" type="hidden" value="Galaxy" /> + </inputs> + <uihints minwidth="800"/> + <outputs> + <data name="output" format="auto" /> + </outputs> + <options sanitize="False" refresh="True"/> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c genomespace/genomespace_file_browser_prod.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genomespace/genomespace_file_browser_prod.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,15 @@ +<?xml version="1.0"?> +<tool name="GenomeSpace import" id="genomespace_file_browser_prod" tool_type="data_source" add_galaxy_url="False" force_history_refresh="True" version="0.0.1"> + <description>from file browser</description> + <command interpreter="python">genomespace_file_browser.py --json_parameter_file "${output}" --genomespace_site "prod" --genomespace_toolname="\${GENOMESPACE_TOOLNAME:-Galaxy}"</command> + <inputs action="https://dm.genomespace.org/datamanager/defaultdirectory" check_values="False" method="post"> + <display>go to GenomeSpace Data Manager </display> + <param name="appCallbackUrl" type="baseurl" value="/tool_runner?tool_id=genomespace_file_browser_prod&runtool_btn=Execute" /> + <param name="appName" type="hidden" value="Galaxy" /> + </inputs> + <uihints minwidth="800"/> + <outputs> + <data name="output" format="auto" /> + </outputs> + <options sanitize="False" refresh="True"/> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c genomespace/genomespace_file_browser_test.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genomespace/genomespace_file_browser_test.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,15 @@ +<?xml version="1.0"?> +<tool name="GenomeSpace import" id="genomespace_file_browser_test" tool_type="data_source" add_galaxy_url="False" force_history_refresh="True" version="0.0.1"> + <description>from file browser (test)</description> + <command interpreter="python">genomespace_file_browser.py --json_parameter_file "${output}" --genomespace_site "test" --genomespace_toolname="\${GENOMESPACE_TOOLNAME:-Galaxy}"</command> + <inputs action="https://dmtest.genomespace.org:8444/datamanager/defaultdirectory" check_values="False" method="post"> + <display>go to GenomeSpace Data Manager </display> + <param name="appCallbackUrl" type="baseurl" value="/tool_runner?tool_id=genomespace_file_browser_test&runtool_btn=Execute" /> + <param name="appName" type="hidden" value="Galaxy" /> + </inputs> + <uihints minwidth="800"/> + <outputs> + <data name="output" format="auto" /> + </outputs> + <options sanitize="False" refresh="True"/> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c genomespace/genomespace_importer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genomespace/genomespace_importer.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,220 @@
+# Dan Blankenberg
+
+import json
+import optparse
+import os
+import shutil
+import tempfile
+
+from six.moves import http_cookiejar
+from six.moves.urllib.parse import parse_qs, unquote_plus, urlparse
+from six.moves.urllib.request import build_opener, HTTPCookieProcessor, Request, urlopen
+
+from galaxy.datatypes import sniff
+from galaxy.datatypes.registry import Registry
+
+GENOMESPACE_API_VERSION_STRING = "v1.0"
+GENOMESPACE_SERVER_URL_PROPERTIES = "https://dm.genomespace.org/config/%s/serverurl.properties" % ( GENOMESPACE_API_VERSION_STRING )
+DEFAULT_GENOMESPACE_TOOLNAME = 'Galaxy'
+FILENAME_VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
+
+CHUNK_SIZE = 2**20  # 1mb
+
+DEFAULT_GALAXY_EXT = "data"
+
+# genomespace format identifier is the URL
+GENOMESPACE_FORMAT_IDENTIFIER_TO_GENOMESPACE_EXT = {}  # TODO: fix this so it is not a global variable
+# TODO: we should use a better way to set up this mapping
+GENOMESPACE_EXT_TO_GALAXY_EXT = {'rifles': 'rifles',
+                                 'lifes': 'lifes',
+                                 'cn': 'cn',
+                                 'GTF': 'gtf',
+                                 'res': 'res',
+                                 'xcn': 'xcn',
+                                 'lowercasetxt': 'lowercasetxt',
+                                 'bed': 'bed',
+                                 'CBS': 'cbs',
+                                 'genomicatab': 'genomicatab',
+                                 'gxp': 'gxp',
+                                 'reversedtxt': 'reversedtxt',
+                                 'nowhitespace': 'nowhitespace',
+                                 'unknown': 'unknown',
+                                 'txt': 'txt',
+                                 'uppercasetxt': 'uppercasetxt',
+                                 'GISTIC': 'gistic',
+                                 'GFF': 'gff',
+                                 'gmt': 'gmt',
+                                 'gct': 'gct'}
+
+
+def chunk_write( source_stream, target_stream, source_method="read", target_method="write" ):
+    source_method = getattr( source_stream, source_method )
+    target_method = getattr( target_stream, target_method )
+    while True:
+        chunk = source_method( CHUNK_SIZE )
+        if chunk:
+            target_method( chunk )
+        else:
+            break
+
+
+def get_cookie_opener( gs_username, gs_token, gs_toolname=None ):
+    """ Create a GenomeSpace cookie opener """
+    cj = http_cookiejar.CookieJar()
+    for cookie_name, cookie_value in [ ( 'gs-token', gs_token ), ( 'gs-username', gs_username ) ]:
+        # create a super-cookie, valid for all domains
+        cookie = http_cookiejar.Cookie(version=0, name=cookie_name, value=cookie_value, port=None, port_specified=False, domain='', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False )
+        cj.set_cookie( cookie )
+    cookie_opener = build_opener( HTTPCookieProcessor( cj ) )
+    cookie_opener.addheaders.append( ( 'gs-toolname', gs_toolname or DEFAULT_GENOMESPACE_TOOLNAME ) )
+    return cookie_opener
+
+
+def get_galaxy_ext_from_genomespace_format_url( url_opener, file_format_url, default=DEFAULT_GALAXY_EXT ):
+    ext = GENOMESPACE_FORMAT_IDENTIFIER_TO_GENOMESPACE_EXT.get( file_format_url, None )
+    if ext is not None:
+        ext = GENOMESPACE_EXT_TO_GALAXY_EXT.get( ext, None )
+    if ext is None:
+        # could check content type, etc here
+        ext = default
+    return ext
+
+
+def get_genomespace_site_urls():
+    genomespace_sites = {}
+    for line in urlopen( GENOMESPACE_SERVER_URL_PROPERTIES ).read().split( '\n' ):
+        line = line.rstrip()
+        if not line or line.startswith( "#" ):
+            continue
+        server, line = line.split( '.', 1 )
+        if server not i[...]
+            metadata_url = url_opener.open( metadata_request )
+            file_metadata_dict = json.loads( metadata_url.read() )
+            metadata_url.close()
+            file_type = file_metadata_dict.get( 'dataFormat', None )
+            if file_type and file_type.get( 'url' ):
+                file_type = file_type.get( 'url' )
+                file_type = get_galaxy_ext_from_genomespace_format_url( url_opener, file_type, default=None )
+        except:
+            pass
+        if file_type is None:
+            # try to sniff datatype
+            try:
+                file_type = sniff.handle_uploaded_dataset_file( output_filename, datatypes_registry )
+            except:
+                pass  # sniff failed
+        if file_type is None and '.' in parsed_url[2]:
+            # still no known datatype, fall back to using extension
+            file_type = parsed_url[2].rsplit( '.', 1 )[-1]
+            file_type = GENOMESPACE_EXT_TO_GALAXY_EXT.get( file_type, file_type )
+        if file_type is None:
+            # use default extension (e.g. 'data')
+            file_type = DEFAULT_GALAXY_EXT
+
+        # save json info for single primary dataset
+        if dataset_id is not None:
+            metadata_parameter_file.write( "%s\n" % json.dumps( dict( type='dataset',
+                                                                      dataset_id=dataset_id,
+                                                                      ext=file_type,
+                                                                      name="GenomeSpace importer on %s" % ( filename ) ) ) )
+        # if using tmp file, move the file to the new file path dir to get scooped up later
+        if using_temp_file:
+            original_filename = filename
+            filename = ''.join( c in FILENAME_VALID_CHARS and c or '-' for c in filename )
+            while filename in used_filenames:
+                filename = "-%s" % filename
+            used_filenames.append( filename )
+            target_output_filename = os.path.join( os.getcwd(), 'primary_%i_%s_visible_%s' % ( hda_id, filename, file_type ) )
+            shutil.move( output_filename, target_output_filename )
+            metadata_parameter_file.write( "%s\n" % json.dumps( dict( type='new_primary_dataset',
+                                                                      base_dataset_id=base_dataset_id,
+                                                                      ext=file_type,
+                                                                      filename=target_output_filename,
+                                                                      name="GenomeSpace importer on %s" % ( original_filename ) ) ) )
+        dataset_id = None  # only one primary dataset available
+        output_filename = None  # only have one filename available
+    metadata_parameter_file.close()
+    return True
+
+
+if __name__ == '__main__':
+    parser = optparse.OptionParser()
+    parser.add_option( '-p', '--json_parameter_file', dest='json_parameter_file', action='store', type="string", default=None, help='json_parameter_file' )
+    parser.add_option( '-s', '--genomespace_site', dest='genomespace_site', action='store', type="string", default=None, help='genomespace_site' )
+    parser.add_option( '-t', '--token', dest='token', action='store', type="string", default=None, help='token' )
+    parser.add_option( '-u', '--username', dest='username', action='store', type="string", default=None, help='username' )
+    parser.add_option( '', '--genomespace_toolname', dest='genomespace_toolname', action='store', type="string", default=DEFAULT_GENOMESPACE_TOOLNAME, help='value to use for gs-toolname, used in GenomeSpace internal logging' )
+    (options, args) = parser.parse_args()
+
+    download_from_genomespace_importer( options.username, options.token, options.json_parameter_file, options.genomespace_site, options.genomespace_toolname ) |
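Both GenomeSpace download scripts sanitize file names with the same idiom: replace any character outside FILENAME_VALID_CHARS with -, then prepend - until the name no longer collides with one already used. Extracted as a sketch::

    FILENAME_VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '

    def safe_filename(name, used_filenames):
        """Sanitize name and make it unique within used_filenames."""
        name = ''.join(c if c in FILENAME_VALID_CHARS else '-' for c in name)
        while name in used_filenames:
            name = "-%s" % name
        used_filenames.append(name)
        return name

    used = []
    print(safe_filename("my:file.bed", used))   # my-file.bed
    print(safe_filename("my-file.bed", used))   # -my-file.bed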
diff -r 000000000000 -r 7621d36a4e9c genomespace/genomespace_importer.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genomespace/genomespace_importer.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<tool name="GenomeSpace Importer" id="genomespace_importer" tool_type="data_source" force_history_refresh="True" hidden="True" display_interface="False" require_login="True" version="0.0.2">
+  <description> - receive data from GenomeSpace</description>
+  <command interpreter="python">genomespace_importer.py
+    --genomespace_site "prod"
+    #assert $__user__, Exception( 'You must be logged in to use this tool.' )
+    #set $username = $__user__.preferences.get( 'genomespace_username', None )
+    #set $token = $__user__.preferences.get( 'genomespace_token', None )
+    #assert None not in ( $username, $token ), Exception( 'You must associate a GenomeSpace OpenID with your account and log in with it.' )
+    --username "${username}"
+    --token "${token}"
+    --json_parameter_file "${output_file1}"
+    --genomespace_toolname="\${GENOMESPACE_TOOLNAME:-Galaxy}"
+  </command>
+  <inputs check_values="False">
+    <!-- <param name="file_name" type="text" value="" /> -->
+    <param name="URL" type="hidden" value="" />
+  </inputs>
+  <outputs>
+    <data format="auto" name="output_file1" />
+  </outputs>
+  <help>
+This hidden data-source tool receives data sent from GenomeSpace; it is launched by GenomeSpace rather than run directly from the tool panel.
+  </help>
+  <options sanitize="False" refresh="True"/>
+</tool> |
diff -r 000000000000 -r 7621d36a4e9c maf/genebed_maf_to_fasta.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/genebed_maf_to_fasta.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,97 @@
+<tool id="GeneBed_Maf_Fasta2" name="Stitch Gene blocks" version="1.0.1">
+  <description>given a set of coding exon intervals</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <command>
+python '$__tool_directory__/interval_maf_to_merged_fasta.py' --dbkey=$dbkey --species=$maf_source_type.species
+#if $maf_source_type.maf_source == "user"
+    --mafSource='$maf_source_type.maf_file' --mafIndex='$maf_source_type.maf_file.metadata.maf_index'
+#else
+    --mafSource='$maf_source_type.maf_identifier'
+#end if
+--interval_file='$input1' --output_file='$out_file1' --mafSourceType=$maf_source_type.maf_source --geneBED --mafIndexFileDir='${GALAXY_DATA_INDEX_DIR}' --overwrite_with_gaps=$overwrite_with_gaps
+  </command>
+  <inputs>
+    <param name="input1" type="data" format="bed" label="Gene BED File">
+      <validator type="unspecified_build" />
+      <validator type="expression" message="Input must be in BED12 format.">value.metadata.columns >= 12</validator> <!-- allow 12+ columns, not as strict as possible. TODO: only list bed files with 12+ columns -->
+    </param>
+    <conditional name="maf_source_type">
+      <param name="maf_source" type="select" label="MAF Source">
+        <option value="cached" selected="true">Locally Cached Alignments</option>
+        <option value="user">Alignments in Your History</option>
+      </param>
+      <when value="user">
+        <param name="maf_file" type="data" format="maf" label="MAF File">
+          <validator type="dataset_ok_validator" />
+          <options>
+            <filter type="data_meta" ref="input1" key="dbkey" />
+          </options>
+        </param>
+        <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">
+          <options>
+            <filter type="data_meta" ref="maf_file" key="species" />
+          </options>
+        </param>
+      </when>
+      <when value="cached">
+        <param name="maf_identifier" type="select" label="MAF Type">
+          <options from_file="maf_index.loc">
+            <column name="name" index="0"/>
+            <column name="value" index="1"/>
+            <column name="dbkey" index="2"/>
+            <column name="species" index="3"/>
+            <filter type="data_meta" ref="input1" key="dbkey" column="2" multiple="True" separator=","/>
+            <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/>
+          </options>
+        </param>
+        <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">
+          <options from_file="maf_index.loc">
+            <column name="uid" index="1"/>
+            <column name="value" index="3"/>
+            <column name="name" index="3"/>
+            <filter type="param_value" ref="maf_identifier" name="uid" column="1"/>
+            <filter type="multiple_splitter" column="3" separator=","/>
+          </options>
+        </param>
+      </when>
+    </conditional>
+    <param name="overwrite_with_gaps" type="select" label="Split into Gapless MAF blocks" help="When set to Yes, blocks are divided around gaps appearing in any species. This will prevent gaps occurring in the interior of the sequence for an aligning species from overwriting a nucleotide found for the same position in a lower-scoring block.">
+      <option value="True" selected="true">No</option>
+      <option value="False">Yes</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="8.bed"/>
+      <param name="maf_source" value="cached"/>
+      <param name="maf_identifier" value="8_WAY_MULTIZ_hg17"/>
+      <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>
+      <param name="overwrite_with_gaps" value="True"/>
+      <output name="out_file1" file="gene_bed_maf_to_fasta_out.fasta" />
+    </test>
+    <test>
+      <param name="input1" value="8.bed"/>
+      <param name="maf_source" value="user"/>
+      <param name="maf_file" value="4.maf"/>
+      <param name="species" value="hg17,panTro1"/>
+      <param name="overwrite_with_gaps" value="True"/>
+      <output name="out_file1" file="gene_bed_maf_to_fasta_user_out.fasta" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+The coding sequence of a gene is usually composed of several coding exons. Each of these coding exons is an individual genomic region, which when concatenated with each other constitutes the coding sequence. A single genomic region can be covered by multiple alignment blocks. In many cases it is desirable to stitch these alignment blocks together. This tool accepts a list of gene-based intervals, in the Gene BED format. For every interval it performs the following:
+
+  * finds all MAF blocks that overlap the coding regions;
+  * sorts MAF blocks by alignment score;
+  * stitches blocks together and resolves overlaps based on alignment score;
+  * outputs alignments in FASTA format.
+  </help>
+  <expand macro="citations" />
+</tool> |
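A Gene BED (BED12) line carries the coding span in columns 7-8 (thickStart/thickEnd) and the exon layout in columns 10-12; the coding exon intervals this tool stitches are the intersections of each exon with that span. A hedged sketch of that extraction (not the parser used by interval_maf_to_merged_fasta.py)::

    def coding_exons(fields):
        """Yield (start, end) coding-exon intervals from a split BED12 line."""
        chrom_start = int(fields[1])
        thick_start, thick_end = int(fields[6]), int(fields[7])
        sizes = [int(x) for x in fields[10].rstrip(',').split(',')]
        rel_starts = [int(x) for x in fields[11].rstrip(',').split(',')]
        for size, rel_start in zip(sizes, rel_starts):
            start = chrom_start + rel_start
            end = start + size
            # clip each exon to the coding span; drop exons entirely in the UTRs
            start, end = max(start, thick_start), min(end, thick_end)
            if start < end:
                yield start, end

    line = "chr1\t100\t900\tgeneA\t0\t+\t150\t850\t0\t2\t200,300,\t0,500,"
    print(list(coding_exons(line.split('\t'))))  # [(150, 300), (600, 850)]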
diff -r 000000000000 -r 7621d36a4e9c maf/interval2maf.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/interval2maf.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,145 @@ +#!/usr/bin/env python +""" +Reads a list of intervals and a maf. Produces a new maf containing the +blocks or parts of blocks in the original that overlapped the intervals. + +If a MAF file, not UID, is provided the MAF file is indexed before being processed. + +NOTE: If two intervals overlap the same block it will be written twice. + +usage: %prog maf_file [options] + -d, --dbkey=d: Database key, ie hg17 + -c, --chromCol=c: Column of Chr + -s, --startCol=s: Column of Start + -e, --endCol=e: Column of End + -S, --strandCol=S: Column of Strand + -t, --mafType=t: Type of MAF source to use + -m, --mafFile=m: Path of source MAF file, if not using cached version + -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version + -i, --interval_file=i: Input interval file + -o, --output_file=o: Output MAF file + -p, --species=p: Species to include in output + -P, --split_blocks_by_species=P: Split blocks by species + -r, --remove_all_gap_columns=r: Remove all Gap columns + -l, --indexLocation=l: Override default maf_index.loc file + -z, --mafIndexFile=z: Directory of local maf index file ( maf_index.loc or maf_pairwise.loc ) +""" +# Dan Blankenberg +from __future__ import print_function + +import bx.align.maf +import bx.intervals.io +from bx.cookbook import doc_optparse + +from galaxy.tools.util import maf_utilities + + +def __main__(): + index = index_filename = None + + # Parse Command Line + options, args = doc_optparse.parse( __doc__ ) + + if options.dbkey: + dbkey = options.dbkey + else: + dbkey = None + if dbkey in [None, "?"]: + maf_utilities.tool_fail( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." ) + + species = maf_utilities.parse_species_option( options.species ) + + if options.chromCol: + chromCol = int( options.chromCol ) - 1 + else: + maf_utilities.tool_fail( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." ) + + if options.startCol: + startCol = int( options.startCol ) - 1 + else: + maf_utilities.tool_fail( "Start column not set, click the pencil icon in the history item to set the metadata attributes." ) + + if options.endCol: + endCol = int( options.endCol ) - 1 + else: + maf_utilities.tool_fail( "End column not set, click the pencil icon in the history item to set the metadata attributes." ) + + if options.strandCol: + strandCol = int( options.strandCol ) - 1 + else: + strandCol = -1 + + if options.interval_file: + interval_file = options.interval_file + else: + maf_utilities.tool_fail( "Input interval file has not been specified." ) + + if options.output_file: + output_file = options.output_file + else: + maf_utilities.tool_fail( "Output file has not been specified." 
) + + split_blocks_by_species = remove_all_gap_columns = False + if options.split_blocks_by_species and options.split_blocks_by_species == 'split_blocks_by_species': + split_blocks_by_species = True + if options.remove_all_gap_columns and options.remove_all_gap_columns == 'remove_all_gap_columns': + remove_all_gap_columns = True + else: + remove_all_gap_columns = True + # Finish parsing command line + + # Open indexed access to MAFs + if options.mafType: + if options.indexLocation: + index = maf_utilities.maf_index_by_uid( options.mafType, options.indexLocation ) + else: + index = maf_utilities.maf_index_by_uid( options.mafType, options.mafIndexFile ) + if index is None: + maf_utilities.tool_fail( "The MAF source specified (%s) appears to be invalid." % ( options.mafType ) ) + elif options.mafFile: + index, index_filename = maf_utilities.open_or_build_maf_index( options.mafFile, options.mafIndex, species=[dbkey] ) + if index is None: + maf_utilities.tool_fail( "Your MAF file appears to be malformed." ) + else: + maf_utilities.tool_fail( "Desired source MAF type has not been specified." ) + + # Create MAF writer + out = bx.align.maf.Writer( open(output_file, "w") ) + + # Iterate over input regions + num_blocks = 0 + num_regions = None + for num_regions, region in enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col=chromCol, start_col=startCol, end_col=endCol, strand_col=strandCol, fix_strand=True, return_header=False, return_comments=False ) ): + src = maf_utilities.src_merge( dbkey, region.chrom ) + for block in index.get_as_iterator( src, region.start, region.end ): + if split_blocks_by_species: + blocks = [ new_block for new_block in maf_utilities.iter_blocks_split_by_species( block ) if maf_utilities.component_overlaps_region( new_block.get_component_by_src_start( dbkey ), region ) ] + else: + blocks = [ block ] + for block in blocks: + block = maf_utilities.chop_block_by_region( block, src, region ) + if block is not None: + if species is not None: + block = block.limit_to_species( species ) + block = maf_utilities.orient_block_by_region( block, src, region ) + if remove_all_gap_columns: + block.remove_all_gap_columns() + out.write( block ) + num_blocks += 1 + + # Close output MAF + out.close() + + # remove index file if created during run + maf_utilities.remove_temp_index_file( index_filename ) + + if num_blocks: + print("%i MAF blocks extracted for %i regions." % ( num_blocks, ( num_regions + 1 ) )) + elif num_regions is not None: + print("No MAF blocks could be extracted for %i regions." % ( num_regions + 1 )) + else: + print("No valid regions have been provided.") + + +if __name__ == "__main__": + __main__() |
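Stripped of the Galaxy plumbing, the fetch-and-trim core of the script above looks roughly like the sketch below. The paths and coordinates are hypothetical, the index is assumed to exist already (for example, built with bx-python's maf_build_index.py script), and depending on the bx-python version the class is exposed as bx.align.maf.Indexed or MAFIndexedAccess::

    import bx.align.maf

    # Indexed access to a MAF; assumes chr7.maf.index sits next to chr7.maf.
    index = bx.align.maf.Indexed("chr7.maf", "chr7.maf.index")
    out = bx.align.maf.Writer(open("out.maf", "w"))

    src, start, end = "hg17.chr7", 127471526, 127471584
    for block in index.get(src, start, end):
        # Trim blocks that extend past the interval boundaries.
        ref = block.get_component_by_src(src)
        out.write(block.slice_by_component(src, max(start, ref.start),
                                           min(end, ref.end)))
    out.close()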
diff -r 000000000000 -r 7621d36a4e9c maf/interval2maf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/interval2maf.xml Mon Apr 30 01:37:51 2018 -0400 |
b'@@ -0,0 +1,290 @@\n+<tool id="Interval2Maf1" name="Extract MAF blocks" version="1.0.1">\n+ <description>given a set of genomic intervals</description>\n+ <macros>\n+ <import>macros.xml</import>\n+ </macros>\n+ <command>\n+python \'$__tool_directory__/interval2maf.py\' --dbkey=${input1.dbkey} --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol}\n+#if $maf_source_type.maf_source == "user"\n+ --mafFile=\'$maf_source_type.mafFile\' --mafIndex=\'$maf_source_type.mafFile.metadata.maf_index\'\n+#else\n+ --mafType=\'$maf_source_type.mafType\'\n+#end if\n+--interval_file=\'$input1\' --output_file=\'$out_file1\' --mafIndexFile=\'${GALAXY_DATA_INDEX_DIR}/maf_index.loc\' --species=$maf_source_type.species --split_blocks_by_species=$split_blocks_by_species_selector.split_blocks_by_species\n+#if $split_blocks_by_species_selector.split_blocks_by_species == "split_blocks_by_species"\n+ --remove_all_gap_columns=$split_blocks_by_species_selector.remove_all_gap_columns\n+#end if\n+ </command>\n+ <inputs>\n+ <param format="interval" name="input1" type="data" label="Choose intervals">\n+ <validator type="unspecified_build" />\n+ </param>\n+ <conditional name="maf_source_type">\n+ <param name="maf_source" type="select" label="MAF Source">\n+ <option value="cached" selected="true">Locally Cached Alignments</option>\n+ <option value="user">Alignments in Your History</option>\n+ </param>\n+ <when value="user">\n+ <param format="maf" name="mafFile" label="Choose alignments" type="data">\n+ <options>\n+ <filter type="data_meta" ref="input1" key="dbkey" />\n+ </options>\n+ <validator type="dataset_ok_validator" />\n+ </param>\n+ <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">\n+ <options>\n+ <filter type="data_meta" ref="mafFile" key="species" />\n+ </options>\n+ </param>\n+ </when>\n+ <when value="cached">\n+ <param name="mafType" type="select" label="Choose alignments">\n+ <options from_data_table="indexed_maf_files">\n+ <!--\n+ <column name="name" index="0"/>\n+ <column name="value" index="1"/>\n+ <column name="dbkey" index="2"/>\n+ <column name="species" index="3"/>\n+ -->\n+ <filter type="data_meta" ref="input1" key="dbkey" column="dbkey" multiple="True" separator=","/>\n+ <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/>\n+ </options>\n+ </param>\n+ <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">\n+ <options from_data_table="indexed_maf_files">\n+ <column name="uid" index="1"/>\n+ <column name="value" index="3"/>\n+ <column name="name" index="3"/>\n+ <filter type="param_value" ref="mafType" column="uid"/>\n+ <filter type="multiple_splitter" column="name" separator=","/>\n+ </options>\n+ </param>\n+ </when>\n+ </conditional>\n+ <conditional name="split_blocks_by_species_selector">\n+ <param name="split_blocks_by_species" type="select" label="Split blocks by species" help="Not usually applicable. 
See help below for more information.">\n+ <option value="split_blocks_by_species">Split by species</option>\n+ <option value="dont_split_blocks_by_species" selected="true">Do not split</option>\n+ </param>\n+ <when value="dont_split_blocks_by_species">\n+ <!-- do nothing here -->\n+ </when>\n+ <when value="split_blocks_by_species">\n+ <param name="remove_all_gap_columns" type="select" label="Collapse empty alignment columns"'..b'229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG\n+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG\n+\n+ a score=2047408.0\n+ s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG\n+ s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG\n+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG\n+\n+ a score=2047408.0\n+ s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG\n+ s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG\n+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG\n+\n+ a score=2047408.0\n+ s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG\n+ s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG\n+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG\n+\n+ a score=2047408.0\n+ s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG\n+ s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG\n+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG\n+\n+ a score=2047408.0\n+ s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG\n+ s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG\n+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG\n+\n+ a score=2047408.0\n+ s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG\n+ s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG\n+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG\n+\n+ a score=2047408.0\n+ s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG\n+ s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG\n+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG\n+\n+ 
a score=2047408.0\n+ s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG\n+ s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG\n+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG\n+\n+ a score=2047408.0\n+ s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG\n+ s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG\n+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG\n+ </help>\n+ <expand macro="citations" />\n+</tool>\n' |
diff -r 000000000000 -r 7621d36a4e9c maf/interval2maf_pairwise.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/interval2maf_pairwise.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,46 @@ +<tool id="Interval2Maf_pairwise1" name="Extract Pairwise MAF blocks" version="1.0.1"> + <description>given a set of genomic intervals</description> + <macros> + <import>macros.xml</import> + </macros> + <command>python '$__tool_directory__/interval2maf.py' --dbkey=${input1.dbkey} --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafType='$mafType' --interval_file='$input1' --output_file='$out_file1' --indexLocation='${GALAXY_DATA_INDEX_DIR}/maf_pairwise.loc'</command> + <inputs> + <param name="input1" type="data" format="interval" label="Interval File"> + <validator type="unspecified_build" /> + </param> + <param name="mafType" type="select" label="Choose MAF source"> + <options from_file="maf_pairwise.loc"> + <column name="name" index="0"/> + <column name="value" index="1"/> + <column name="dbkey" index="2"/> + <column name="species" index="3"/> + <filter type="data_meta" ref="input1" key="dbkey" column="2" multiple="True" separator=","/> + <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/> + </options> + </param> + </inputs> + <outputs> + <data format="maf" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input1" value="8.bed" dbkey="hg17" ftype="bed"/> + <param name="mafType" value="PAIRWISE_hg17_fr1"/> + <output name="out_file1" file="Interval2Maf_pairwise_out.maf"/> + </test> + </tests> + <help> +**What it does** + +This tool takes genomic coordinates, superimposes them on pairwise alignments (in MAF format) stored on the Galaxy site, and excises alignment blocks corresponding to each set of coordinates. Alignment blocks that extend past START and/or END positions of an interval are trimmed. Note that a single genomic interval may correspond to two or more alignment blocks. + +----- + +**Example** + +Here a single interval is superimposed on three MAF blocks. Blocks 1 and 3 are trimmed because they extend beyond boundaries of the interval: + +.. image:: ${static_path}/images/maf_icons/interval2maf.png + </help> + <expand macro="citations" /> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c maf/interval_maf_to_merged_fasta.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/interval_maf_to_merged_fasta.py Mon Apr 30 01:37:51 2018 -0400 |
b'@@ -0,0 +1,204 @@\n+#!/usr/bin/env python\n+"""\n+Reads an interval or gene BED and a MAF Source.\n+Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates\n+\n+Alignment blocks are layered ontop of each other based upon score.\n+\n+usage: %prog maf_file [options]\n+ -d, --dbkey=d: Database key, ie hg17\n+ -c, --chromCol=c: Column of Chr\n+ -s, --startCol=s: Column of Start\n+ -e, --endCol=e: Column of End\n+ -S, --strandCol=S: Column of Strand\n+ -G, --geneBED: Input is a Gene BED file, process and join exons as one region\n+ -t, --mafSourceType=t: Type of MAF source to use\n+ -m, --mafSource=m: Path of source MAF file, if not using cached version\n+ -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version\n+ -i, --interval_file=i: Input interval file\n+ -o, --output_file=o: Output MAF file\n+ -p, --species=p: Species to include in output\n+ -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species.\n+ -z, --mafIndexFileDir=z: Directory of local maf_index.loc file\n+\n+usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR\n+"""\n+# Dan Blankenberg\n+from __future__ import print_function\n+\n+import sys\n+\n+import bx.intervals.io\n+from bx.cookbook import doc_optparse\n+\n+from galaxy.tools.util import maf_utilities\n+\n+\n+def stop_err( msg ):\n+ sys.stderr.write( msg )\n+ sys.exit()\n+\n+\n+def __main__():\n+ # Parse Command Line\n+ options, args = doc_optparse.parse( __doc__ )\n+ mincols = 0\n+ strand_col = -1\n+\n+ if options.dbkey:\n+ primary_species = options.dbkey\n+ else:\n+ primary_species = None\n+ if primary_species in [None, "?", "None"]:\n+ stop_err( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." )\n+\n+ include_primary = True\n+ secondary_species = maf_utilities.parse_species_option( options.species )\n+ if secondary_species:\n+ species = list( secondary_species ) # make copy of species list\n+ if primary_species in secondary_species:\n+ secondary_species.remove( primary_species )\n+ else:\n+ include_primary = False\n+ else:\n+ species = None\n+\n+ if options.interval_file:\n+ interval_file = options.interval_file\n+ else:\n+ stop_err( "Input interval file has not been specified." )\n+\n+ if options.output_file:\n+ output_file = options.output_file\n+ else:\n+ stop_err( "Output file has not been specified." )\n+\n+ if not options.geneBED:\n+ if options.chromCol:\n+ chr_col = int( options.chromCol ) - 1\n+ else:\n+ stop_err( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." )\n+\n+ if options.startCol:\n+ start_col = int( options.startCol ) - 1\n+ else:\n+ stop_err( "Start column not set, click the pencil icon in the history item to set the metadata attributes." )\n+\n+ if options.endCol:\n+ end_col = int( options.endCol ) - 1\n+ else:\n+ stop_err( "End column not set, click the pencil icon in the history item to set the metadata attributes." 
)\n+\n+ if options.strandCol:\n+ strand_col = int( options.strandCol ) - 1\n+\n+ mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir\n+\n+ overwrite_with_gaps = True\n+ if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == \'false\':\n+ overwrite_with_gaps = False\n+\n+ # Finish parsing command line\n+\n+ # get index for mafs based on type\n+ index = index_filename = None\n+ # using specified uid for locally cached\n+ if options.mafSourceType.lower() in ["'..b'ator = maf_utilities.line_enumerator( open( interval_file, "r" ).readlines() )\n+ else:\n+ region_enumerator = enumerate(bx.intervals.io.NiceReaderWrapper(\n+ open( interval_file, \'r\' ), chrom_col=chr_col, start_col=start_col,\n+ end_col=end_col, strand_col=strand_col, fix_strand=True,\n+ return_header=False, return_comments=False ) )\n+\n+ # Step through intervals\n+ regions_extracted = 0\n+ line_count = 0\n+ for line_count, line in region_enumerator:\n+ try:\n+ if options.geneBED: # Process as Gene BED\n+ try:\n+ starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line )\n+ # create spliced alignment object\n+ alignment = maf_utilities.get_spliced_region_alignment(\n+ index, primary_species, fields[0], starts, ends,\n+ strand=\'+\', species=species, mincols=mincols,\n+ overwrite_with_gaps=overwrite_with_gaps )\n+ primary_name = secondary_name = fields[3]\n+ alignment_strand = fields[5]\n+ except Exception as e:\n+ print("Error loading exon positions from input line %i: %s" % ( line_count, e ))\n+ continue\n+ else: # Process as standard intervals\n+ try:\n+ # create spliced alignment object\n+ alignment = maf_utilities.get_region_alignment(\n+ index, primary_species, line.chrom, line.start,\n+ line.end, strand=\'+\', species=species, mincols=mincols,\n+ overwrite_with_gaps=overwrite_with_gaps )\n+ primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end )\n+ secondary_name = ""\n+ alignment_strand = line.strand\n+ except Exception as e:\n+ print("Error loading region positions from input line %i: %s" % ( line_count, e ))\n+ continue\n+\n+ # Write alignment to output file\n+ # Output primary species first, if requested\n+ if include_primary:\n+ output.write( ">%s.%s\\n" % ( primary_species, primary_name ) )\n+ if alignment_strand == "-":\n+ output.write( alignment.get_sequence_reverse_complement( primary_species ) )\n+ else:\n+ output.write( alignment.get_sequence( primary_species ) )\n+ output.write( "\\n" )\n+ # Output all remainging species\n+ for spec in secondary_species or alignment.get_species_names( skip=primary_species ):\n+ if secondary_name:\n+ output.write( ">%s.%s\\n" % ( spec, secondary_name ) )\n+ else:\n+ output.write( ">%s\\n" % ( spec ) )\n+ if alignment_strand == "-":\n+ output.write( alignment.get_sequence_reverse_complement( spec ) )\n+ else:\n+ output.write( alignment.get_sequence( spec ) )\n+ output.write( "\\n" )\n+\n+ output.write( "\\n" )\n+ regions_extracted += 1\n+ except Exception as e:\n+ print("Unexpected error from input line %i: %s" % ( line_count, e ))\n+ continue\n+\n+ # close output file\n+ output.close()\n+\n+ # remove index file if created during run\n+ maf_utilities.remove_temp_index_file( index_filename )\n+\n+ # Print message about success for user\n+ if regions_extracted > 0:\n+ print("%i regions were processed successfully." 
% ( regions_extracted ))\n+ else:\n+ print("No regions were processed successfully.")\n+ if line_count > 0 and options.geneBED:\n+ print("This tool requires your input file to conform to the 12 column BED standard.")\n+\n+\n+if __name__ == "__main__":\n+ __main__()\n' |
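Condensed to its core, the per-interval flow of interval_maf_to_merged_fasta.py is one call to maf_utilities.get_region_alignment followed by FASTA output. A hedged usage sketch, with hypothetical file names, build, and coordinates::

    from galaxy.tools.util import maf_utilities

    # Index the user-supplied MAF for the primary build, as the script does.
    index, index_filename = maf_utilities.open_or_build_maf_index(
        "my.maf", "my.maf.index", species=["hg17"])

    # Layer every overlapping block (highest score wins) into one
    # merged alignment covering the interval.
    alignment = maf_utilities.get_region_alignment(
        index, "hg17", "chr7", 127471526, 127471584, strand="+",
        species=["hg17", "panTro1"], mincols=0, overwrite_with_gaps=True)

    for spec in ("hg17", "panTro1"):
        print(">%s\n%s" % (spec, alignment.get_sequence(spec)))

    maf_utilities.remove_temp_index_file(index_filename)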
diff -r 000000000000 -r 7621d36a4e9c maf/interval_maf_to_merged_fasta.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/interval_maf_to_merged_fasta.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,112 @@ +<tool id="Interval_Maf_Merged_Fasta2" name="Stitch MAF blocks" version="1.0.1"> + <description>given a set of genomic intervals</description> + <macros> + <import>macros.xml</import> + </macros> + <command> +python '$__tool_directory__/interval_maf_to_merged_fasta.py' --dbkey=$dbkey --species=$maf_source_type.species +#if $maf_source_type.maf_source == "user" + --mafSource='$maf_source_type.maf_file' --mafIndex='$maf_source_type.maf_file.metadata.maf_index' +#else + --mafSource='$maf_source_type.maf_identifier' +#end if +--interval_file='$input1' --output_file='$out_file1' --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafSourceType=$maf_source_type.maf_source --mafIndexFileDir='${GALAXY_DATA_INDEX_DIR}' --overwrite_with_gaps=$overwrite_with_gaps + </command> + <inputs> + <param format="interval" name="input1" type="data" label="Choose intervals"> + <validator type="unspecified_build" /> + </param> + <conditional name="maf_source_type"> + <param name="maf_source" type="select" label="MAF Source"> + <option value="cached" selected="true">Locally Cached Alignments</option> + <option value="user">Alignments in Your History</option> + </param> + <when value="user"> + <param name="maf_file" type="data" format="maf" label="MAF File"> + <options> + <filter type="data_meta" ref="input1" key="dbkey" /> + </options> + <validator type="dataset_ok_validator" /> + </param> + <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment"> + <options> + <filter type="data_meta" ref="maf_file" key="species" /> + </options> + </param> + </when> + <when value="cached"> + <param name="maf_identifier" type="select" label="MAF Type" > + <options from_file="maf_index.loc"> + <column name="name" index="0"/> + <column name="value" index="1"/> + <column name="dbkey" index="2"/> + <column name="species" index="3"/> + <filter type="data_meta" ref="input1" key="dbkey" column="2" multiple="True" separator=","/> + <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/> + </options> + </param> + <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment"> + <options from_file="maf_index.loc"> + <column name="uid" index="1"/> + <column name="value" index="3"/> + <column name="name" index="3"/> + <filter type="param_value" ref="maf_identifier" name="uid" column="1"/> + <filter type="multiple_splitter" column="3" separator=","/> + </options> + </param> + </when> + </conditional> + <param name="overwrite_with_gaps" type="select" label="Split into Gapless MAF blocks" help="When set to Yes, blocks are divided around gaps appearing in any species. 
This will prevent gaps occurring in the interior of the sequence for an aligning species from overwriting a nucleotide found for the same position in a lower-scoring block."> + <option value="True" selected="true">No</option> + <option value="False">Yes</option> + </param> + </inputs> + <outputs> + <data format="fasta" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input1" value="13.bed" dbkey="hg18" ftype="bed"/> + <param name="maf_source" value="cached"/> + <param name="maf_identifier" value="17_WAY_MULTIZ_hg18"/> + <param name="species" value="hg18,mm8"/> + <param name="overwrite_with_gaps" value="True"/> + <output name="out_file1" file="interval_maf_to_merged_fasta_out3.fasta" /> + </test> + <test> + <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/> + <param name="maf_source" value="cached"/> + <param name="maf_identifier" value="8_WAY_MULTIZ_hg17"/> + <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/> + <param name="overwrite_with_gaps" value="True"/> + <output name="out_file1" file="interval_maf_to_merged_fasta_out.dat" /> + </test> + <test> + <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/> + <param name="maf_source" value="user"/> + <param name="maf_file" value="5.maf"/> + <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/> + <param name="overwrite_with_gaps" value="True"/> + <output name="out_file1" file="interval_maf_to_merged_fasta_user_out.dat" /> + </test> + </tests> + <help> +**What it does** + +A single genomic region can be covered by multiple alignment blocks. In many cases it is desirable to stitch these alignment blocks together. This tool accepts a list of genomic intervals. For every interval it performs the following: + + * finds all MAF blocks that overlap the interval; + * sorts MAF blocks by alignment score; + * stitches blocks together and resolves overlaps based on alignment score; + * outputs alignments in FASTA format. + +------ + +**Example** + +Here three MAF blocks overlapping a single interval are stitched together. Space between blocks 2 and 3 is filled with gaps: + +.. image:: ${static_path}/images/maf_icons/stitchMaf.png + </help> + <expand macro="citations" /> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c maf/macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/macros.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,7 @@ +<macros> + <xml name="citations"> + <citations> + <citation type="doi">10.1093/bioinformatics/btr398</citation> + </citations> + </xml> +</macros> |
diff -r 000000000000 -r 7621d36a4e9c maf/maf_by_block_number.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/maf_by_block_number.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,50 @@ +#!/usr/bin/env python +# Dan Blankenberg +""" +Reads a list of block numbers and a maf. Produces a new maf containing the +blocks specified by number. +""" +from __future__ import print_function + +import sys + +import bx.align.maf + +from galaxy.tools.util import maf_utilities + + +def __main__(): + input_block_filename = sys.argv[1].strip() + input_maf_filename = sys.argv[2].strip() + output_filename1 = sys.argv[3].strip() + block_col = int( sys.argv[4].strip() ) - 1 + if block_col < 0: + print("Invalid column specified", file=sys.stderr) + sys.exit(0) + species = maf_utilities.parse_species_option( sys.argv[5].strip() ) + + maf_writer = bx.align.maf.Writer( open( output_filename1, 'w' ) ) + # we want to maintain order of block file and write blocks as many times as they are listed + failed_lines = [] + for ctr, line in enumerate( open( input_block_filename, 'r' ) ): + try: + block_wanted = int( line.split( "\t" )[block_col].strip() ) + except: + failed_lines.append( str( ctr ) ) + continue + try: + for count, block in enumerate( bx.align.maf.Reader( open( input_maf_filename, 'r' ) ) ): + if count == block_wanted: + if species: + block = block.limit_to_species( species ) + maf_writer.write( block ) + break + except: + print("Your MAF file appears to be malformed.", file=sys.stderr) + sys.exit() + if len( failed_lines ) > 0: + print("Failed to extract from %i lines (%s)." % ( len( failed_lines ), ",".join( failed_lines ) )) + + +if __name__ == "__main__": + __main__() |
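Note the design choice in maf_by_block_number.py: the MAF is re-read from the top for every requested line, which preserves the order (and duplicates) of the block list at the cost of one full scan per line. Where neither matters, a single-pass variant is possible; a sketch with hypothetical file names, not the shipped behavior::

    import bx.align.maf

    # Collect the wanted block numbers once (first column, zero-based).
    wanted = set()
    for line in open("block_numbers.txt"):
        try:
            wanted.add(int(line.split("\t")[0]))
        except ValueError:
            continue  # skip unparseable lines, as the tool does

    # Stream the MAF a single time, writing each matching block.
    out = bx.align.maf.Writer(open("out.maf", "w"))
    for count, block in enumerate(bx.align.maf.Reader(open("in.maf"))):
        if count in wanted:
            out.write(block)
    out.close()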
diff -r 000000000000 -r 7621d36a4e9c maf/maf_by_block_number.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/maf_by_block_number.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,35 @@ +<tool id="maf_by_block_number1" name="Extract MAF by block number" version="1.0.1"> + <description>given a set of block numbers and a MAF file</description> + <macros> + <import>macros.xml</import> + </macros> + <command>python '$__tool_directory__/maf_by_block_number.py' '$input1' '$input2' '$out_file1' $block_col $species</command> + <inputs> + <param format="txt" name="input1" type="data" label="Block Numbers"/> + <param format="maf" name="input2" label="MAF File" type="data"/> + <param name="block_col" type="data_column" label="Column containing Block number" data_ref="input1" accept_default="True" /> + <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment"> + <options> + <filter type="data_meta" ref="input2" key="species" /> + </options> + </param> + </inputs> + <outputs> + <data format="maf" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input1" value="maf_by_block_numbers.dat"/> + <param name="input2" value="3.maf"/> + <param name="block_col" value="1"/> + <param name="species" value="hg17,panTro1,mm5,rn3,canFam1"/> + <output name="out_file1" file="maf_by_block_number_out.dat" /> + </test> + </tests> + <help> +**What it does** + +This tool takes a list of block numbers, one per line, and extracts the corresponding MAF blocks from the provided file. Block numbers start at 0. + </help> + <expand macro="citations" /> +</tool> |
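A block-numbers input is simply one number per line, zero-based; per the comment in maf_by_block_number.py, a number listed twice is extracted twice. A hypothetical example that pulls blocks 0, 2 (twice), and 5::

    0
    2
    2
    5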
diff -r 000000000000 -r 7621d36a4e9c maf/maf_filter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/maf_filter.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,74 @@ +# Dan Blankenberg +# Filters a MAF file according to the provided code file, which is generated in maf_filter.xml <configfiles> +# Also allows filtering by number of columns in a block, and limiting output species +from __future__ import print_function + +import os +import shutil +import sys + +import bx.align.maf + +from galaxy.tools.util import maf_utilities + + +def main(): + # Read command line arguments + try: + script_file = sys.argv.pop( 1 ) + maf_file = sys.argv.pop( 1 ) + out_file = sys.argv.pop( 1 ) + additional_files_path = sys.argv.pop( 1 ) + species = maf_utilities.parse_species_option( sys.argv.pop( 1 ) ) + min_size = int( sys.argv.pop( 1 ) ) + max_size = int( sys.argv.pop( 1 ) ) + if max_size < 1: + max_size = sys.maxsize + min_species_per_block = int( sys.argv.pop( 1 ) ) + exclude_incomplete_blocks = int( sys.argv.pop( 1 ) ) + if species: + num_species = len( species ) + else: + num_species = len( sys.argv.pop( 1 ).split( ',') ) + except: + print("One or more arguments are missing.\nUsage: maf_filter.py maf_filter_file input_maf output_maf path_to_save_debug species_to_keep", file=sys.stderr) + sys.exit() + + # Open input and output MAF files + try: + maf_reader = bx.align.maf.Reader( open( maf_file, 'r' ) ) + maf_writer = bx.align.maf.Writer( open( out_file, 'w' ) ) + except: + print("Your MAF file appears to be malformed.", file=sys.stderr) + sys.exit() + + # Save script file for debugging/verification info later + os.mkdir( additional_files_path ) + shutil.copy( script_file, os.path.join( additional_files_path, 'debug.txt' ) ) + + # Loop through blocks, running filter on each + # 'maf_block' and 'ret_val' are used/shared in the provided code file + # 'ret_val' should be set to True if the block is to be kept + i = -1 + blocks_kept = 0 + for i, maf_block in enumerate( maf_reader ): + if min_size <= maf_block.text_size <= max_size: + local = {'maf_block': maf_block, 'ret_val': False} + exec(compile(open( script_file ).read(), script_file, 'exec'), {}, local) + if local['ret_val']: + # Species limiting must be done after filters as filters could be run on non-requested output species + if species: + maf_block = maf_block.limit_to_species( species ) + if len( maf_block.components ) >= min_species_per_block and ( not exclude_incomplete_blocks or len( maf_block.components ) >= num_species ): + maf_writer.write( maf_block ) + blocks_kept += 1 + maf_writer.close() + maf_reader.close() + if i < 0: + print("Your file contains no valid maf_blocks.") + else: + print('Kept %s of %s blocks (%.2f%%).' % ( blocks_kept, i + 1, float( blocks_kept ) / float( i + 1 ) * 100.0 )) + + +if __name__ == "__main__": + main() |
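The <configfiles>-generated filter referenced above is applied by exec-ing the snippet once per block: maf_block is passed in and ret_val is read back. A stripped-down sketch of that mechanism; the one-line snippet here is a stand-in, since Galaxy generates the real filter from the tool form::

    import bx.align.maf

    # Stand-in for the generated filter file: keep blocks >= 10 columns.
    snippet = compile("ret_val = maf_block.text_size >= 10",
                      "<maf_filter>", "exec")

    for maf_block in bx.align.maf.Reader(open("in.maf")):
        local = {"maf_block": maf_block, "ret_val": False}
        exec(snippet, {}, local)
        if local["ret_val"]:
            print("keeping block with %i columns" % maf_block.text_size)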
diff -r 000000000000 -r 7621d36a4e9c maf/maf_filter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/maf_filter.xml Mon Apr 30 01:37:51 2018 -0400 |
b'@@ -0,0 +1,196 @@\n+<tool id="MAF_filter" name="Filter MAF" version="1.0.1">\n+ <description>by specified attributes</description>\n+ <macros>\n+ <import>macros.xml</import>\n+ </macros>\n+ <command>python \'$__tool_directory__/maf_filter.py\' \'$maf_filter_file\' \'$input1\' \'$out_file1\' \'$out_file1.files_path\' $species $min_size $max_size $min_species_per_block $exclude_incomplete_blocks ${input1.metadata.species}</command>\n+ <configfiles>\n+ <configfile name="maf_filter_file">\n+#set $is_isnot_valid = {"==":"==", "!=":"!=", "in":"in", "not in":"not in"}\n+def maf_block_pass_filter( maf_block ):\n+#for $maf_filter in $maf_filters:\n+#if $len( $maf_filter[\'species1_attributes\'][\'filter_condition\'] ) == 0:\n+#continue\n+#end if\n+ primary_component = maf_block.get_component_by_src_start( """$maf_filter[\'species1\'].value.encode( \'string_escape\' )""".decode( \'string_escape\' ) )\n+ if primary_component is not None:\n+#if $maf_filter[\'species1_attributes\'][\'species1_attribute_type\'] == \'attribute_chr\':\n+ if primary_component.src.split( "." )[-1] $is_isnot_valid.get( $maf_filter[\'species1_attributes\'][\'species1_is_isnot\'].value.strip(), \'is in\' ) """$maf_filter[\'species1_attributes\'][\'species1_attribute\'].value.encode( \'string_escape\' )""".decode( \'string_escape\' ).split( "," ):\n+#else\n+ if primary_component.strand $is_isnot_valid.get( $maf_filter[\'species1_attributes\'][\'species1_is_isnot\'].value.strip(), \'==\' ) """$maf_filter[\'species1_attributes\'][\'species1_attribute\'].value.encode( \'string_escape\' )""".decode( \'string_escape\' ):\n+#end if\n+#for $filter_condition in $maf_filter[\'species1_attributes\'][\'filter_condition\']:\n+ secondary_component = maf_block.get_component_by_src_start( """$filter_condition[\'species2\'].value.encode( \'string_escape\' )""".decode( \'string_escape\' ) )\n+#if $filter_condition[\'species2_attributes\'][\'species2_attribute_type\'] == \'attribute_chr\':\n+ if secondary_component is not None:\n+ if not ( secondary_component.src.split( "." 
)[-1] $is_isnot_valid.get( $filter_condition[\'species2_attributes\'][\'species2_is_isnot\'].value.strip(), \'is in\' ) """$filter_condition[\'species2_attributes\'][\'species2_attribute\'].value.encode( \'string_escape\' )""".decode( \'string_escape\' ).split( "," ) ):\n+ return False\n+#else:\n+ if secondary_component is not None:\n+ if not ( secondary_component.strand $is_isnot_valid.get( $filter_condition[\'species2_attributes\'][\'species2_is_isnot\'].value.strip(), \'==\' ) """$filter_condition[\'species2_attributes\'][\'species2_attribute\'].value.encode( \'string_escape\' )""".decode( \'string_escape\' ) ):\n+ return False\n+#end if\n+#end for\n+#end for\n+ return True\n+\n+\n+ret_val = maf_block_pass_filter( maf_block )</configfile>\n+ </configfiles>\n+ <inputs>\n+ <param name="input1" type="data" format="maf" label="MAF File"/>\n+ <param name="min_size" label="Minimum Size" value="0" type="integer"/>\n+ <param name="max_size" label="Maximum Size" value="0" type="integer" help="A maximum size less than 1 indicates no limit"/>\n+ <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">\n+ <options>\n+ <filter type="data_meta" ref="input1" key="species" />\n+ </options>\n+ </param>\n+ <param name="min_species_per_block" type="select" label="Exclude blocks which have only one species" >\n+ <option value="2">Yes</option>\n+ <option value="1" selected="True">No</option>\n+ </param>\n+ <param name="exclude_incomplete_blocks" type="select" label="Exclude blocks which have missing species" >\n+ <option value="1">Yes</option>\n+ <option value="0" selected="True">No</option>\n+ </param>\n+ <repeat name="maf_filters" title="Filter">\n+ <param name="speci'..b'abel="Species Attribute">\n+ <option value="attribute_strand">Strand</option>\n+ <option value="attribute_chr" selected="true">Chromosome</option>\n+ </param>\n+ <when value="attribute_strand">\n+ <param name="species2_is_isnot" type="select" label="Conditional">\n+ <option value="==">Is</option>\n+ <option value="!=">Is Not</option>\n+ </param>\n+ <param name="species2_attribute" type="select" label="Strand">\n+ <option value="+" selected="true">+</option>\n+ <option value="-">-</option>\n+ </param>\n+ </when>\n+ <when value="attribute_chr">\n+ <param name="species2_is_isnot" type="select" label="Conditional">\n+ <option value="in">Is</option>\n+ <option value="not in">Is Not</option>\n+ </param>\n+ <param name="species2_attribute" type="text" label="Chromosome" value="chr1"/>\n+ </when>\n+ </conditional>\n+ </repeat>\n+ </when>\n+ </conditional>\n+ </repeat>\n+ </inputs>\n+ <outputs>\n+ <data format="maf" name="out_file1" />\n+ </outputs>\n+<!--\n+ <tests>\n+ <test>\n+ <param name="input1" value="4.maf"/>\n+ <param name="species" value="bosTau2,canFam2,hg17,panTro1,rheMac2,rn3"/>\n+ <param name="exclude_incomplete_blocks" value="0"/>\n+ <param name="min_species_per_block" value="1"/>\n+ <param name="min_size" value="0"/>\n+ <param name="max_size" value="0"/>\n+ <param name="species1" value="hg17"/>\n+ <param name="species2" value="hg17"/>\n+ <param name="species1_attribute_type" value="attribute_chr"/>\n+ <param name="species1_is_isnot" value="in"/>\n+ <param name="species1_attribute" value="chr1"/>\n+ <param name="filter_condition"/> Test will ERROR when this is set or when it is not set.\n+ <output name="out_file1" file="cf_maf_limit_to_species.dat"/>\n+ </test>\n+ </tests>\n+-->\n+ <help>\n+This tool allows you to build complex 
filters to be applied to each alignment block of a MAF file. You can define restraints on species based upon chromosome and strand. You can specify comma separated lists of chromosomes where appropriate.\n+\n+.. class:: infomark\n+\n+For example, this tool is useful to restrict a set of alignments to only those blocks which contain alignments between chromosomes that are considered homologous.\n+\n+-----\n+\n+.. class:: warningmark\n+\n+If a species is not found in a particular block, all filters on that species are ignored.\n+\n+-----\n+\n+This tool allows the user to remove any undesired species from a MAF file. If no species are specified then all species will be kept. If species are specified, columns which contain only gaps are removed. The options for this are:\n+\n+ * **Exclude blocks which have missing species** - suppose you want to restrict an 8-way alignment to human, mouse, and rat. The tool will first remove all other species. Next, if this option is set to **YES** the tool WILL NOT return MAF blocks, which do not include human, mouse, or rat. This means that all alignment blocks returned by the tool will have exactly three sequences in this example.\n+\n+ * **Exclude blocks which have only one species** - if this option is set to **YES** all single sequence alignment blocks WILL NOT be returned.\n+\n+-----\n+\n+You can also provide a size range and limit your output to the MAF blocks which fall within the specified range.\n+ </help>\n+ <expand macro="citations" />\n+</tool>\n' |
diff -r 000000000000 -r 7621d36a4e9c maf/maf_limit_size.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/maf_limit_size.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,40 @@ +#!/usr/bin/env python +# Dan Blankenberg +""" +Removes blocks that fall outside of the specified size range. +""" +from __future__ import print_function + +import sys + +import bx.align.maf + + +def __main__(): + input_maf_filename = sys.argv[1].strip() + output_filename1 = sys.argv[2].strip() + min_size = int( sys.argv[3].strip() ) + max_size = int( sys.argv[4].strip() ) + if max_size < 1: + max_size = sys.maxsize + maf_writer = bx.align.maf.Writer( open( output_filename1, 'w' ) ) + try: + maf_reader = bx.align.maf.Reader( open( input_maf_filename, 'r' ) ) + except: + print("Your MAF file appears to be malformed.", file=sys.stderr) + sys.exit() + + blocks_kept = 0 + i = -1 + for i, m in enumerate( maf_reader ): + if min_size <= m.text_size <= max_size: + maf_writer.write( m ) + blocks_kept += 1 + if i < 0: + print("Your file contains no MAF blocks.") + else: + print('Kept %s of %s blocks (%.2f%%).' % ( blocks_kept, i + 1, float( blocks_kept ) / float( i + 1 ) * 100.0 )) + + +if __name__ == "__main__": + __main__() |
diff -r 000000000000 -r 7621d36a4e9c maf/maf_limit_size.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/maf_limit_size.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,29 @@ +<tool id="maf_limit_size1" name="Filter MAF blocks" version="1.0.1"> + <description>by Size</description> + <macros> + <import>macros.xml</import> + </macros> + <command>python '$__tool_directory__/maf_limit_size.py' '$input1' '$out_file1' $min_size $max_size</command> + <inputs> + <param format="maf" name="input1" label="MAF File" type="data"/> + <param name="min_size" label="Minimum Size" value="0" type="integer"/> + <param name="max_size" label="Maximum Size" value="0" type="integer" help="A maximum size less than 1 indicates no limit"/> + </inputs> + <outputs> + <data format="maf" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input1" value="3.maf" ftype="maf" /> + <param name="min_size" value="0"/> + <param name="max_size" value="0"/> + <output name="out_file1" file="maf_limit_size1_out.maf" /> + </test> + </tests> + <help> +**What it does** + +This tool takes a MAF file and a size range and extracts the MAF blocks which fall within the specified range. + </help> + <expand macro="citations" /> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c maf/maf_limit_to_species.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/maf_limit_to_species.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,55 @@ +#!/usr/bin/env python +""" +Read a maf file and write out a new maf with only blocks having the +required species, after dropping any other species and removing +columns containing only gaps. + +usage: %prog species,species2,... input_maf output_maf allow_partial min_species_per_block +""" +# Dan Blankenberg +from __future__ import print_function + +import sys + +import bx.align.maf + +from galaxy.tools.util import maf_utilities + + +def main(): + species = maf_utilities.parse_species_option( sys.argv[1] ) + if species: + spec_len = len( species ) + else: + spec_len = 0 + try: + maf_reader = bx.align.maf.Reader( open( sys.argv[2], 'r' ) ) + maf_writer = bx.align.maf.Writer( open( sys.argv[3], 'w' ) ) + except: + print("Your MAF file appears to be malformed.", file=sys.stderr) + sys.exit() + allow_partial = False + if int( sys.argv[4] ): + allow_partial = True + min_species_per_block = int( sys.argv[5] ) + + maf_blocks_kept = 0 + for m in maf_reader: + if species: + m = m.limit_to_species( species ) + m.remove_all_gap_columns() + spec_in_block_len = len( maf_utilities.get_species_in_block( m ) ) + if ( not species or allow_partial or spec_in_block_len == spec_len ) and spec_in_block_len > min_species_per_block: + maf_writer.write( m ) + maf_blocks_kept += 1 + + maf_reader.close() + maf_writer.close() + + if species: + print("Restricted to species: %s." % ", ".join( species )) + print("%i MAF blocks have been kept." % maf_blocks_kept) + + +if __name__ == "__main__": + main() |
diff -r 000000000000 -r 7621d36a4e9c maf/maf_limit_to_species.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/maf_limit_to_species.xml Mon Apr 30 01:37:51 2018 -0400 |
b |
@@ -0,0 +1,45 @@ +<tool id="MAF_Limit_To_Species1" name="Filter MAF blocks" version="1.0.0"> + <description>by Species</description> + <macros> + <import>macros.xml</import> + </macros> + <command>python '$__tool_directory__/maf_limit_to_species.py' $species '$input1' '$out_file1' $allow_partial $min_species</command> + <inputs> + <param name="input1" type="data" format="maf" label="MAF file"/> + <param name="allow_partial" type="select" label="Exclude blocks which have missing species" > + <option value="1">No</option> + <option value="0">Yes</option> + </param> + <param name="min_species" type="select" label="Exclude blocks which have only one species" > + <option value="1">Yes</option> + <option value="0">No</option> + </param> + <param name="species" type="select" label="Species to keep" display="checkboxes" multiple="true"> + <options> + <filter type="data_meta" ref="input1" key="species" /> + </options> + </param> + </inputs> + <outputs> + <data format="maf" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input1" value="4.maf"/> + <param name="species" value="bosTau2,canFam2,hg17,panTro1,rheMac2,rn3"/> + <param name="allow_partial" value="0"/> + <param name="min_species" value="0"/> + <output name="out_file1" file="cf_maf_limit_to_species.dat"/> + </test> + </tests> + <help> +**What It Does** + +This tool allows the user to remove any undesired species from a MAF file. Columns which contain only gaps are removed. The options for this tool are: + + * **Exclude blocks which have missing species** - suppose you want to restrict an 8-way alignment to human, mouse, and rat. The tool will first remove all other species. Next, if this option is set to **YES** the tool WILL NOT return MAF blocks which do not include human, mouse, or rat. This means that all alignment blocks returned by the tool will have exactly three sequences in this example. + + * **Exclude blocks which have only one species** - if this option is set to **YES** all single sequence alignment blocks WILL NOT be returned. + </help> + <expand macro="citations" /> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c maf/maf_reverse_complement.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/maf_reverse_complement.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,45 @@ +#!/usr/bin/env python +""" +Reads a MAF file. Produces a MAF file containing +the reverse complement for each block in the source file. + +usage: %prog input_maf_file output_maf_file +""" +# Dan Blankenberg +from __future__ import print_function + +import sys + +import bx.align.maf + +from galaxy.tools.util import maf_utilities + + +def __main__(): + # Parse Command Line + input_file = sys.argv.pop( 1 ) + output_file = sys.argv.pop( 1 ) + species = maf_utilities.parse_species_option( sys.argv.pop( 1 ) ) + + try: + maf_writer = bx.align.maf.Writer( open( output_file, 'w' ) ) + except: + print("Unable to open output file", file=sys.stderr) + sys.exit() + try: + count = 0 + for maf in bx.align.maf.Reader( open( input_file ) ): + maf = maf.reverse_complement() + if species: + maf = maf.limit_to_species( species ) + maf_writer.write( maf ) + count += 1 + except: + print("Your MAF file appears to be malformed.", file=sys.stderr) + sys.exit() + print("%i regions were reverse complemented." % count) + maf_writer.close() + + +if __name__ == "__main__": + __main__() |
diff -r 000000000000 -r 7621d36a4e9c maf/maf_reverse_complement.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/maf_reverse_complement.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,47 @@ +<tool id="MAF_Reverse_Complement_1" name="Reverse Complement" version="1.0.1"> + <description>a MAF file</description> + <macros> + <import>macros.xml</import> + </macros> + <command>python '$__tool_directory__/maf_reverse_complement.py' '$input1' '$out_file1' $species</command> + <inputs> + <param format="maf" name="input1" label="Alignment File" type="data"/> + <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment"> + <options> + <filter type="data_meta" ref="input1" key="species" /> + </options> + </param> + </inputs> + <outputs> + <data format="maf" name="out_file1" metadata_source="input1"/> + </outputs> + <tests> + <test> + <param name="input1" value="3.maf" dbkey="hg17" ftype="maf"/> + <param name="species" value="hg17,panTro1,mm5,rn3,canFam1"/> + <output name="out_file1" file="maf_reverse_complement_out.dat"/> + </test> + </tests> + <help> +**What it does** + +This tool takes a MAF file and creates a new MAF file, where each block has been reversed complemented. + +**Example** + +This MAF Block:: + + a score=8157.000000 + s hg17.chr7 127471526 58 + 158628139 AATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG + s panTro1.chr6 129885407 58 + 161576975 AATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG + s mm5.chr6 28904928 54 + 149721531 AA----CGTTTCATTGATTGCTCATCATTTAAAAAAAGAAATTCCTCAGTGGAAGAGG + +becomes:: + + a score=8157.000000 + s hg17.chr7 31156555 58 - 158628139 CCTCTTCCACTATAGACCTCCTTAAACAAAATAATGAAAAATGAATAAACCACAAATT + s panTro1.chr6 31691510 58 - 161576975 CCTCTTCCACTATAGACCTCCTTAAACAAAATAATGAAAAACGAATAAACCACAAATT + s mm5.chr6 120816549 54 - 149721531 CCTCTTCCACTGAGGAATTTCTTTTTTTAAATGATGAGCAATCAATGAAACG----TT + </help> + <expand macro="citations" /> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c maf/maf_split_by_species.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/maf_split_by_species.py Mon Apr 30 01:37:51 2018 -0400 |
[ |
@@ -0,0 +1,46 @@ +#!/usr/bin/env python +""" +Read a maf and split blocks by unique species combinations +""" +from __future__ import print_function + +import sys + +from bx.align import maf + +from galaxy.tools.util import maf_utilities +from galaxy.util import string_as_bool + + +def __main__(): + try: + maf_reader = maf.Reader( open( sys.argv[1] ) ) + except Exception as e: + maf_utilities.tool_fail( "Error opening MAF: %s" % e ) + try: + out = maf.Writer( open( sys.argv[2], "w") ) + except Exception as e: + maf_utilities.tool_fail( "Error opening file for output: %s" % e ) + try: + collapse_columns = string_as_bool( sys.argv[3] ) + except Exception as e: + maf_utilities.tool_fail( "Error determining collapse columns value: %s" % e ) + + start_count = 0 + end_count = 0 + for start_count, start_block in enumerate( maf_reader ): + for block in maf_utilities.iter_blocks_split_by_species( start_block ): + if collapse_columns: + block.remove_all_gap_columns() + out.write( block ) + end_count += 1 + out.close() + + if end_count: + print("%i alignment blocks created from %i original blocks." % ( end_count, start_count + 1 )) + else: + print("No alignment blocks were created.") + + +if __name__ == "__main__": + __main__() |
diff -r 000000000000 -r 7621d36a4e9c maf/maf_split_by_species.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/maf/maf_split_by_species.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,217 @@
+<tool id="MAF_split_blocks_by_species1" name="Split MAF blocks" version="1.0.0">
+  <description>by Species</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <command>python '$__tool_directory__/maf_split_by_species.py' '$input1' '$out_file1' $collapse_columns</command>
+  <inputs>
+    <param format="maf" name="input1" type="data" label="MAF file to split"/>
+    <param name="collapse_columns" type="select" label="Collapse empty alignment columns" help="Removes columns that are gaps in all sequences">
+      <option value="True" selected="true">Yes</option>
+      <option value="False">No</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="maf" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="maf_split_by_species_in.maf"/>
+      <param name="collapse_columns" value="True"/>
+      <output name="out_file1" file="maf_split_by_species_collapsed_out.maf"/>
+    </test>
+    <test>
+      <param name="input1" value="maf_split_by_species_in.maf"/>
+      <param name="collapse_columns" value="False"/>
+      <output name="out_file1" file="maf_split_by_species_not_collapsed_out.maf"/>
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool examines each MAF block for multiple occurrences of a species in a single block. When this occurs, a block is split into multiple blocks where every combination of one sequence per species per block is represented.
+
+The interface for this tool has two inputs:
+
+ * **MAF file to split**. Choose multiple alignments from history to be split by species.
+ * **Collapse empty alignment columns**. Whether alignment columns containing only gaps in the new blocks should be removed.
+
+-----
+
+**Example 1**: **Collapse empty alignment columns is Yes**:
+
+For the following alignment::
+
+ ##maf version=1
+ a score=2047408.0
+ s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG
+ s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG
+ s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG
+ s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG
+ s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG
+ s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG
+ s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG
+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG
+
+the tool will create **a single** history item containing 12 alignment blocks (notice that no columns contain only gaps)::
+
+ ##maf version=1
+ a score=2047408.0
+ s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG
+ s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG
+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG
+
+ a score=2047408.0
+ s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG
+ s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG
+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG
+
+ a score=2047408.0
+ s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGG[...]
+
+[...]
+
+ [...]
+ s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG
+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG
+
+ a score=2047408.0
+ s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG
+ s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG
+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG
+
+ a score=2047408.0
+ s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG
+ s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG
+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG
+
+ a score=2047408.0
+ s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG
+ s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG
+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG
+
+ a score=2047408.0
+ s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG
+ s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG
+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG
+
+ a score=2047408.0
+ s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG
+ s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG
+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG
+
+ a score=2047408.0
+ s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG
+ s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG
+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG
+
+ a score=2047408.0
+ s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG
+ s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG
+ s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG
+
+-------
+
+.. class:: infomark
+
+**About formats**
+
+**MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes.
+
+ - The .maf format is line-oriented. Each multiple alignment ends with a blank line.
+ - Each sequence in an alignment is on a single line.
+ - Lines starting with # are considered to be comments.
+ - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment.
+ - Some MAF files may contain two optional line types:
+
+   - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line;
+   - An "e" line containing information about the size of the gap between the alignments that span the current block.
+  </help>
+  <expand macro="citations" />
+</tool>
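The "every combination" rule in the help above is a cartesian product over the per-species rows of a block. A minimal, hypothetical Python sketch of that idea (toy row strings rather than bx-python alignment objects; this is not the tool's own maf_split_by_species.py)::

    # Toy model of the split: one output block per combination of one row
    # per species. Two species1 rows x one each for species2/3 -> 2 blocks.
    from itertools import product

    block = {
        "species1": ["s species1.chr1 147984545 85 + ...",
                     "s species1.chr1 147984545 83 - ..."],
        "species2": ["s species2.chr1 129723125 85 + ..."],
        "species3": ["s species3.chr3 68255714 76 - ..."],
    }

    for combo in product(*(block[spec] for spec in sorted(block))):
        print("a score=2047408.0")
        for row in combo:
            print(" " + row)
        print()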
diff -r 000000000000 -r 7621d36a4e9c maf/maf_stats.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maf/maf_stats.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+# Dan Blankenberg
+"""
+Reads a list of intervals and a maf. Outputs a new set of intervals with statistics appended.
+"""
+from __future__ import print_function
+
+import sys
+
+import bx.intervals.io
+from bx.bitset import BitSet
+
+from galaxy.tools.util import maf_utilities
+
+
+def __main__():
+    maf_source_type = sys.argv.pop( 1 )
+    input_maf_filename = sys.argv[1].strip()
+    input_interval_filename = sys.argv[2].strip()
+    output_filename = sys.argv[3].strip()
+    dbkey = sys.argv[4].strip()
+    try:
+        chr_col = int( sys.argv[5].strip() ) - 1
+        start_col = int( sys.argv[6].strip() ) - 1
+        end_col = int( sys.argv[7].strip() ) - 1
+    except Exception:
+        print("You appear to be missing metadata. You can specify your metadata by clicking on the pencil icon associated with your interval file.", file=sys.stderr)
+        sys.exit()
+    summary = sys.argv[8].strip()
+    if summary.lower() == "true":
+        summary = True
+    else:
+        summary = False
+
+    mafIndexFile = "%s/maf_index.loc" % sys.argv[9]
+    try:
+        maf_index_filename = sys.argv[10].strip()
+    except Exception:
+        maf_index_filename = None
+    index = index_filename = None
+    if maf_source_type == "user":
+        # index maf for use here
+        index, index_filename = maf_utilities.open_or_build_maf_index( input_maf_filename, maf_index_filename, species=[dbkey] )
+        if index is None:
+            print("Your MAF file appears to be malformed.", file=sys.stderr)
+            sys.exit()
+    elif maf_source_type == "cached":
+        # access existing indexes
+        index = maf_utilities.maf_index_by_uid( input_maf_filename, mafIndexFile )
+        if index is None:
+            print("The MAF source specified (%s) appears to be invalid." % ( input_maf_filename ), file=sys.stderr)
+            sys.exit()
+    else:
+        print('Invalid source type specified: %s' % maf_source_type, file=sys.stderr)
+        sys.exit()
+
+    out = open(output_filename, 'w')
+
+    num_region = None
+    num_bad_region = 0
+    species_summary = {}
+    total_length = 0
+    # loop through interval file
+    for num_region, region in enumerate( bx.intervals.io.NiceReaderWrapper( open( input_interval_filename, 'r' ), chrom_col=chr_col, start_col=start_col, end_col=end_col, fix_strand=True, return_header=False, return_comments=False ) ):
+        src = "%s.%s" % ( dbkey, region.chrom )
+        region_length = region.end - region.start
+        if region_length < 1:
+            num_bad_region += 1
+            continue
+        total_length += region_length
+        coverage = { dbkey: BitSet( region_length ) }
+
+        for block in index.get_as_iterator( src, region.start, region.end ):
+            for spec in maf_utilities.get_species_in_block( block ):
+                if spec not in coverage:
+                    coverage[spec] = BitSet( region_length )
+            for block in maf_utilities.iter_blocks_split_by_species( block ):
+                if maf_utilities.component_overlaps_region( block.get_component_by_src( src ), region ):
+                    # need to chop and orient the block
+                    block = maf_utilities.orient_block_by_region( maf_utilities.chop_block_by_region( block, src, region ), src, region, force_strand='+' )
+                    start_offset, alignment = maf_utilities.reduce_block_by_primary_genome( block, dbkey, region.chrom, region.start )
+                    for i in range( len( alignment[dbkey] ) ):
+                        for spec, text in alignment.items():
+                            if text[i] != '-':
+                                coverage[spec].set( start_offset + i )
+        if summary:
+            # record summary
+            for key in coverage.keys():
+                if key not in species_summary:
+                    species_summary[key] = 0
+                species_summary[key] = species_summary[key] + coverage[key].count_range()
+        else:
+            # print coverage for interval
+            coverage_sum = coverage[dbkey].count_range()
+            out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), dbkey, coverage_sum, region_length - coverage_sum ) )
+            keys = list(coverage.keys())
+            keys.remove( dbkey )
+            keys.sort()
+            for key in keys:
+                coverage_sum = coverage[key].count_range()
+                out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), key, coverage_sum, region_length - coverage_sum ) )
+    if summary:
+        out.write( "#species\tnucleotides\tcoverage\n" )
+        for spec in species_summary:
+            out.write( "%s\t%s\t%.4f\n" % ( spec, species_summary[spec], float( species_summary[spec] ) / total_length ) )
+    out.close()
+    if num_region is not None:
+        print("%i regions were processed with a total length of %i." % ( num_region + 1, total_length ))
+    if num_bad_region:
+        print("%i regions were invalid." % ( num_bad_region ))
+    maf_utilities.remove_temp_index_file( index_filename )
+
+
+if __name__ == "__main__":
+    __main__()
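For orientation, the per-region bookkeeping above can be modeled with a plain set() standing in for bx.bitset.BitSet: set.add() plays the role of BitSet.set() and len() the role of count_range(). The alignment dict below is invented toy data, not output of the real script::

    region_length = 10
    alignment = {                       # species -> aligned text over the region
        "hg18": "ACGT--ACGT",
        "mm8":  "AC----AC-T",
    }
    coverage = {spec: set() for spec in alignment}
    for i in range(region_length):
        for spec, text in alignment.items():
            if text[i] != '-':
                coverage[spec].add(i)   # BitSet.set(start_offset + i) in the script
    for spec in sorted(coverage):
        covered = len(coverage[spec])   # BitSet.count_range() in the script
        print(spec, covered, region_length - covered)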
diff -r 000000000000 -r 7621d36a4e9c maf/maf_stats.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maf/maf_stats.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,112 @@
+<tool id="maf_stats1" name="MAF Coverage Stats" version="1.0.1">
+  <description>Alignment coverage information</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <command>
+python '$__tool_directory__/maf_stats.py' $maf_source_type.maf_source
+#if $maf_source_type.maf_source == "user":
+  '$input2'
+#else:
+  '$maf_source_type.mafType'
+#end if
+'$input1' '$out_file1' $dbkey ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} $summary '${GALAXY_DATA_INDEX_DIR}'
+#if $maf_source_type.maf_source == "user":
+  '$input2.metadata.maf_index'
+#end if
+  </command>
+  <inputs>
+    <param format="interval" name="input1" label="Interval File" type="data">
+      <validator type="unspecified_build" />
+    </param>
+    <conditional name="maf_source_type">
+      <param name="maf_source" type="select" label="MAF Source">
+        <option value="cached" selected="true">Locally Cached Alignments</option>
+        <option value="user">Alignments in Your History</option>
+      </param>
+      <when value="user">
+        <param format="maf" name="input2" label="MAF File" type="data">
+          <options>
+            <filter type="data_meta" ref="input1" key="dbkey" />
+          </options>
+          <validator type="dataset_ok_validator" />
+        </param>
+      </when>
+      <when value="cached">
+        <param name="mafType" type="select" label="MAF Type">
+          <options from_file="maf_index.loc">
+            <column name="name" index="0"/>
+            <column name="value" index="1"/>
+            <column name="dbkey" index="2"/>
+            <filter type="data_meta" ref="input1" key="dbkey" column="2" multiple="True" separator=","/>
+            <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/>
+          </options>
+        </param>
+      </when>
+    </conditional>
+    <param name="summary" type="select" label="Type of Output">
+      <option value="false" selected="true">Coverage by Region</option>
+      <option value="true">Summarize Coverage</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="interval" name="out_file1" metadata_source="input1">
+      <change_format>
+        <when input="summary" value="true" format="tabular" />
+      </change_format>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>
+      <param name="maf_source" value="cached"/>
+      <param name="mafType" value="8_WAY_MULTIZ_hg17"/>
+      <output name="out_file1" file="maf_stats_interval_out.dat"/>
+      <param name="summary" value="false"/>
+    </test>
+    <test>
+      <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>
+      <param name="maf_source" value="cached"/>
+      <param name="mafType" value="8_WAY_MULTIZ_hg17"/>
+      <output name="out_file1" file="maf_stats_summary_out.dat"/>
+      <param name="summary" value="true"/>
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool takes a MAF file and an interval file and relates coverage information by interval for each species.
+If a column does not exist in the reference genome, it is not included in the output.
+
+Consider the interval: "chrX 1000 1100 myInterval"
+ Let's suppose we want to do stats on three way alignments for H, M, and R. The result looks like this::
+
+ chrX 1000 1100 myInterval H XXX YYY
+ chrX 1000 1100 myInterval M XXX YYY
+ chrX 1000 1100 myInterval R XXX YYY
+
+ where XXX and YYY are:
+
+ XXX = number of nucleotides
+ YYY = number of gaps
+
+----
+
+Alternatively, you can request only summary information for a set of intervals:
+
+  ======== =========== ========
+  #species nucleotides coverage
+  ======== =========== ========
+  hg18     30639       0.2372
+  rheMac2  7524        0.0582
+  panTro2  30390       0.2353
+  ======== =========== ========
+
+ where **coverage** is the number of nucleotides divided by the total length of the provided intervals.
+  </help>
+  <expand macro="citations" />
+</tool>
diff -r 000000000000 -r 7621d36a4e9c maf/maf_thread_for_species.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maf/maf_thread_for_species.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+"""
+Read a maf file and write out a new maf with only blocks having all of
+the passed in species, after dropping any other species and removing columns
+containing only gaps. This will attempt to fuse together any blocks
+which are adjacent after the unwanted species have been dropped.
+
+usage: %prog input_maf output_maf species1,species2
+"""
+# Dan Blankenberg
+from __future__ import print_function
+
+import sys
+
+import bx.align.maf
+from bx.align.tools.fuse import FusingAlignmentWriter
+from bx.align.tools.thread import get_components_for_species, remove_all_gap_columns
+
+
+def main():
+    input_file = sys.argv.pop( 1 )
+    output_file = sys.argv.pop( 1 )
+    species = sys.argv.pop( 1 ).split( ',' )
+
+    try:
+        maf_reader = bx.align.maf.Reader( open( input_file ) )
+    except Exception:
+        print("Unable to open source MAF file", file=sys.stderr)
+        sys.exit()
+    try:
+        maf_writer = FusingAlignmentWriter( bx.align.maf.Writer( open( output_file, 'w' ) ) )
+    except Exception:
+        print("Unable to open output file", file=sys.stderr)
+        sys.exit()
+    try:
+        for m in maf_reader:
+            new_components = m.components
+            if species != ['None']:
+                new_components = get_components_for_species( m, species )
+            if new_components:
+                remove_all_gap_columns( new_components )
+                m.components = new_components
+                m.score = 0.0
+                maf_writer.write( m )
+    except Exception as e:
+        print("Error stepping through MAF File: %s" % e, file=sys.stderr)
+        sys.exit()
+    maf_reader.close()
+    maf_writer.close()
+
+    print("Restricted to species: %s." % ", ".join( species ))
+
+
+if __name__ == "__main__":
+    main()
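The fusing step delegated to bx's FusingAlignmentWriter has a simple criterion once the unwanted species are dropped: for every remaining species, the next block must pick up exactly where the previous one ended, on the same chromosome and strand. A standalone sketch of that check (can_fuse is a hypothetical helper, not the bx implementation; the coordinates are taken from the tool help below)::

    def can_fuse(a, b):
        """a, b: dicts of species -> (chrom, start, end, strand)."""
        if set(a) != set(b):
            return False
        return all(a[s][0] == b[s][0] and a[s][3] == b[s][3] and a[s][2] == b[s][1]
                   for s in a)

    block1 = {"hg17": ("chr7", 127471195, 127471526, "+"),
              "panTro1": ("chr6", 129885076, 129885407, "+")}
    block2 = {"hg17": ("chr7", 127471526, 127471584, "+"),
              "panTro1": ("chr6", 129885407, 129885465, "+")}
    print(can_fuse(block1, block2))  # True: the two help-example blocks join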
diff -r 000000000000 -r 7621d36a4e9c maf/maf_thread_for_species.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maf/maf_thread_for_species.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,54 @@
+<tool id="MAF_Thread_For_Species1" name="Join MAF blocks" version="1.0.0">
+  <description>by Species</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <command>python '$__tool_directory__/maf_thread_for_species.py' '$input1' '$out_file1' $species</command>
+  <inputs>
+    <param format="maf" name="input1" type="data" label="MAF file"/>
+    <param name="species" type="select" label="Species to keep" display="checkboxes" multiple="true">
+      <options>
+        <filter type="data_meta" ref="input1" key="species" />
+      </options>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="maf" name="out_file1" metadata_source="input1"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="3.maf" ftype="maf"/>
+      <param name="species" value="hg17,panTro1"/>
+      <output name="out_file1" file="maf_thread_for_species.dat"/>
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool allows the user to merge MAF blocks which are adjoining in each specified species from a MAF file. Columns which contain only gaps are removed. Species which are not desired are removed from the output.
+
+**Example**
+
+Specifying the desired species as hg17 and panTro1 with this MAF file::
+
+ ##maf version=1
+ a score=60426.000000
+ s hg17.chr7 127471195 331 + 158628139 gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCC-------------------------------AAATACT-GCCACTGATGTCCTG-----ATGGAGGTA-------TGAA-------------------AACATCCACTAA
+ s panTro1.chr6 129885076 331 + 161576975 gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCC-------------------------------AAATACT-GCCACTGATGTCCTG-----ATGGAGGTA-------TGAA-------------------AACATCCACTAA
+ s mm5.chr6 28904571 357 + 149721531 CTCCACTCTCGTTTGCTGTT----------------CTGTCACCATGGAAACAAA-CGAGGGTGGTCCAGTTACTATCTTGACTGCAGCTGGCAGTCAGTT-GCCACT-----CAGGAATAAGGCTATGCCATT-GATCCACTGAACCGTGATCTGGAAACCTGGCTGTTGTTT-------CAAGCCTTGGGGCCAGTTTGCGGTGTTACTCATGA--CTCTAAGATCGTGTGCTTG----CTGCAGGAAGAGACAGCAAGGGGGTTACATTTAAAAAGCCCCCAGTTTAGCTATAGGCAGGCCAACAGGTGTAAAAATACTCACTAGTAATGGGCTGAACTCATGGAGGTAGCATTAGTGAGACACTGTAACTGTTTTTTTAAAAATCACTAA
+ s rn3.chr4 56178191 282 + 187371129 CTTCACTCTCATTTGCTGTT----------------CTGTCACTATGGAGACAAACACAGGCTAGCCCAGTTACTATCTTGATCACAGCAGCT-GTCAGCTAGCTGCCACTCACAGGAATAAGGCCATACCATT-GATCCACTGAACCTTGATCTAGGAATTTGGC----------------------TGGGGCCAGTTTGCGGTGTCACTCATGA--CTCTAAGATTGTGTGTTTG----CTCCAGGAAGAGACGGCAAGAGGATTACCTTTAAAAGGTTC---------------------------------GGAGTCTAGCTGTAGACAGCCCA-----ATG--GGTA-------TAAC-------------------AATACTCACTAA
+
+ a score=8157.000000
+ s hg17.chr7 127471526 58 + 158628139 AATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG
+ s panTro1.chr6 129885407 58 + 161576975 AATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG
+ s mm5.chr6 28904928 54 + 149721531 AA----CGTTTCATTGATTGCTCATCATTTAAAAAAAGAAATTCCTCAGTGGAAGAGG
+
+results in::
+
+ ##maf version=1
+ a score=0.0
+ s hg17.chr7 127471195 389 + 158628139 gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGGTATGAAAACATCCACTAAAATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG
+ s panTro1.chr6 129885076 389 + 161576975 gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGGTATGAAAACATCCACTAAAATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG
+  </help>
+  <expand macro="citations" />
+</tool>
diff -r 000000000000 -r 7621d36a4e9c maf/maf_to_bed.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maf/maf_to_bed.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+"""
+Read a maf and output intervals for specified list of species.
+"""
+from __future__ import print_function
+
+import os
+import sys
+
+from bx.align import maf
+
+
+def __main__():
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    # where to store files that become additional output
+    database_tmp_dir = sys.argv[5]
+
+    species = sys.argv[3].split(',')
+    partial = sys.argv[4]
+    output_id = sys.argv[6]
+    out_files = {}
+    primary_spec = None
+
+    if "None" in species:
+        species = set()
+        try:
+            for i, m in enumerate( maf.Reader( open( input_filename, 'r' ) ) ):
+                for c in m.components:
+                    spec, chrom = maf.src_split( c.src )
+                    if not spec or not chrom:
+                        spec = chrom = c.src
+                    species.add(spec)
+        except Exception:
+            print("Invalid MAF file specified", file=sys.stderr)
+            return
+
+    if "?" in species:
+        print("Invalid dbkey specified", file=sys.stderr)
+        return
+
+    for i, spec in enumerate( species ):
+        if i == 0:
+            out_files[spec] = open( output_filename, 'w' )
+            primary_spec = spec
+        else:
+            out_files[ spec ] = open( os.path.join( database_tmp_dir, 'primary_%s_%s_visible_bed_%s' % ( output_id, spec, spec ) ), 'w' )
+    num_species = len( species )
+
+    print("Restricted to species:", ",".join( species ))
+
+    file_in = open( input_filename, 'r' )
+    maf_reader = maf.Reader( file_in )
+
+    block_num = -1
+
+    for i, m in enumerate( maf_reader ):
+        block_num += 1
+        if "None" not in species:
+            m = m.limit_to_species( species )
+        components = m.components
+        if len(components) < num_species and partial == "partial_disallowed":
+            continue
+        for c in components:
+            spec, chrom = maf.src_split( c.src )
+            if not spec or not chrom:
+                spec = chrom = c.src
+            if spec not in out_files.keys():
+                out_files[ spec ] = open( os.path.join( database_tmp_dir, 'primary_%s_%s_visible_bed_%s' % ( output_id, spec, spec ) ), 'w' )
+
+            if c.strand == "-":
+                out_files[spec].write( chrom + "\t" + str( c.src_size - c.end ) + "\t" + str( c.src_size - c.start ) + "\t" + spec + "_" + str( block_num ) + "\t" + "0\t" + c.strand + "\n" )
+            else:
+                out_files[spec].write( chrom + "\t" + str( c.start ) + "\t" + str( c.end ) + "\t" + spec + "_" + str( block_num ) + "\t" + "0\t" + c.strand + "\n" )
+
+    file_in.close()
+    for file_out in out_files.keys():
+        out_files[file_out].close()
+
+    print("#FILE1_DBKEY\t%s" % ( primary_spec ))
+
+
+if __name__ == "__main__":
+    __main__()
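The only subtle arithmetic above is the strand flip: MAF components count start/size on the component's own strand, while BED wants forward-strand coordinates, so "-" components are reflected through src_size. A self-contained sketch (maf_component_to_bed is a hypothetical helper; the rheMac2 numbers come from the MAF examples used elsewhere in this changeset)::

    def maf_component_to_bed(start, size, strand, src_size):
        # MAF: start/size counted on the given strand; BED: forward strand, half-open.
        end = start + size
        if strand == "-":
            return src_size - end, src_size - start
        return start, end

    # s rheMac2.chr10 89144112 69 - 94855758
    print(maf_component_to_bed(89144112, 69, "-", 94855758))  # (5711577, 5711646)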
diff -r 000000000000 -r 7621d36a4e9c maf/maf_to_bed.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maf/maf_to_bed.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,129 @@
+<tool id="MAF_To_BED1" name="MAF to BED" version="1.0.0">
+  <description>Converts a MAF formatted file to the BED format</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <code file="maf_to_bed_code.py"/>
+  <command>python '$__tool_directory__/maf_to_bed.py' '${input1}' '${out_file1}' '${species}' ${complete_blocks} '.' '${out_file1.id}'</command>
+  <inputs>
+    <param format="maf" name="input1" type="data" label="MAF file to convert"/>
+    <param name="species" type="select" label="Select species" display="checkboxes" multiple="true" help="a separate history item will be created for each checked species">
+      <options>
+        <filter type="data_meta" ref="input1" key="species" />
+      </options>
+    </param>
+    <param name="complete_blocks" type="select" label="Exclude blocks which have a requested species missing">
+      <option value="partial_allowed">include blocks with missing species</option>
+      <option value="partial_disallowed">exclude blocks with missing species</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="bed" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="4.maf"/>
+      <param name="species" value="hg17"/>
+      <param name="complete_blocks" value="partial_disallowed"/>
+      <output name="out_file1" file="cf_maf_to_bed.dat"/>
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool converts every MAF block to an interval line (in BED format; scroll down for description of MAF and BED formats) describing the position of that alignment block within a corresponding genome.
+
+The interface for this tool contains two pages (steps):
+
+ * **Step 1 of 2**. Choose multiple alignments from history to be converted to BED format.
+ * **Step 2 of 2**. Choose species from the alignment to be included in the output and specify how to deal with alignment blocks that lack one or more species:
+
+   * **Choose species** - the tool reads the alignment provided during Step 1 and generates a list of species contained within that alignment. Using checkboxes you can specify taxa to be included in the output (only the reference genome, shown in **bold**, is selected by default). If you select more than one species, then more than one history item will be created.
+   * **Choose to include/exclude blocks with missing species** - if an alignment block does not contain any one of the species you selected within the **Choose species** menu and this option is set to **exclude blocks with missing species**, then coordinates of such a block **will not** be included in the output (see **Example 2** below).
+
+-----
+
+**Example 1**: **Include only reference genome** (hg18 in this case) and **include blocks with missing species**:
+
+For the following alignment::
+
+ ##maf version=1
+ a score=68686.000000
+ s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+ s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+ s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------
+ s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------
+ s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C
+
+ a score=10289.000000
+ s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+ s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+ s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG
+
+the tool will create **a single** history item containing the following (**note** that field 4 is added to the output and is numbered iteratively: hg18_0, hg18_1 etc.)::
+
+ chr20 56827368 56827443 hg18_0 0 +
+ chr20 56827443 56827480 hg18_1 0 +
+
+-----
+
+**Example 2**: **Include hg18 and mm8** and **exclude blocks with missing species**:
+
+For the following alignment::
+
+ ##maf version=1
+ a score=68686.000000
+ s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+ s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+ s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------
+ s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------
+ s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C
+
+ a score=10289.000000
+ s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+ s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+ s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG
+
+the tool will create **two** history items (one for hg18 and one for mm8) containing the following (**note** that both history items contain only one line describing the first alignment block. The second MAF block is not included in the output because it does not contain mm8):
+
+History item **1** (for hg18)::
+
+ chr20 56827368 56827443 hg18_0 0 +
+
+History item **2** (for mm8)::
+
+ chr2 173910832 173910893 mm8_0 0 +
+
+-------
+
+.. class:: infomark
+
+**About formats**
+
+**MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes.
+
+ - The .maf format is line-oriented. Each multiple alignment ends with a blank line.
+ - Each sequence in an alignment is on a single line.
+ - Lines starting with # are considered to be comments.
+ - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment.
+ - Some MAF files may contain two optional line types:
+
+   - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line;
+   - An "e" line containing information about the size of the gap between the alignments that span the current block.
+
+**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and a number of additional optional ones:
+
+The first three BED fields (required) are::
+
+ 1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+ 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+ 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+
+Additional (optional) fields are::
+
+ 4. name - The name of the BED line.
+ 5. score - A score between 0 and 1000.
+ 6. strand - Defines the strand - either '+' or '-'.
+  </help>
+  <expand macro="citations" />
+</tool>
diff -r 000000000000 -r 7621d36a4e9c maf/maf_to_bed_code.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maf/maf_to_bed_code.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,19 @@
+def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
+    output_data = next(iter(out_data.values()))
+    new_stdout = ""
+    split_stdout = stdout.split("\n")
+    for line in split_stdout:
+        if line.startswith("#FILE1"):
+            fields = line.split("\t")
+            dbkey = fields[1]
+            output_data.dbkey = dbkey
+            output_data.name = "%s (%s)" % ( output_data.name, dbkey )
+            app.model.context.add( output_data )
+            app.model.context.flush()
+        else:
+            new_stdout = "%s\n%s" % ( new_stdout, line )
+    for data in output_data.creating_job.output_datasets:
+        data = data.dataset
+        data.info = new_stdout
+        app.model.context.add( data )
+        app.model.context.flush()
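This hook relies on a small stdout convention: maf_to_bed.py prints a line of the form '#FILE1_DBKEY<tab>dbkey', which the hook lifts out of stdout while the remaining lines become the dataset's info text. The convention in isolation (toy stdout string; no Galaxy objects involved)::

    stdout = "Restricted to species: hg17\n#FILE1_DBKEY\thg17\n"
    info_lines, dbkey = [], None
    for line in stdout.splitlines():
        if line.startswith("#FILE1"):
            dbkey = line.split("\t")[1]     # becomes output_data.dbkey
        else:
            info_lines.append(line)         # becomes the dataset info text
    print(dbkey)                            # hg17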
diff -r 000000000000 -r 7621d36a4e9c maf/maf_to_fasta.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maf/maf_to_fasta.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,196 @@
+<tool id="MAF_To_Fasta1" name="MAF to FASTA" version="1.0.1">
+  <description>Converts a MAF formatted file to FASTA format</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <command>
+python
+#if $fasta_target_type.fasta_type == "multiple"
+  '$__tool_directory__/maf_to_fasta_multiple_sets.py' '$input1' '$out_file1' $fasta_target_type.species $fasta_target_type.complete_blocks
+#else
+  '$__tool_directory__/maf_to_fasta_concat.py' $fasta_target_type.species '$input1' '$out_file1'
+#end if
+  </command>
+  <inputs>
+    <param format="maf" name="input1" type="data" label="MAF file to convert"/>
+    <conditional name="fasta_target_type">
+      <param name="fasta_type" type="select" label="Type of FASTA Output">
+        <option value="multiple" selected="true">Multiple Blocks</option>
+        <option value="concatenated">One Sequence per Species</option>
+      </param>
+      <when value="multiple">
+        <param name="species" type="select" label="Select species" display="checkboxes" multiple="true" help="checked taxa will be included in the output">
+          <options>
+            <filter type="data_meta" ref="input1" key="species" />
+          </options>
+        </param>
+        <param name="complete_blocks" type="select" label="Choose to">
+          <option value="partial_allowed">include blocks with missing species</option>
+          <option value="partial_disallowed">exclude blocks with missing species</option>
+        </param>
+      </when>
+      <when value="concatenated">
+        <param name="species" type="select" label="Species to extract" display="checkboxes" multiple="true">
+          <options>
+            <filter type="data_meta" ref="input1" key="species" />
+          </options>
+        </param>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="3.maf" ftype="maf"/>
+      <param name="fasta_type" value="concatenated"/>
+      <param name="species" value="canFam1"/>
+      <output name="out_file1" file="cf_maf2fasta_concat.dat" ftype="fasta"/>
+    </test>
+    <test>
+      <param name="input1" value="4.maf" ftype="maf"/>
+      <param name="fasta_type" value="multiple"/>
+      <param name="species" value="hg17,panTro1,rheMac2,rn3,mm7,canFam2,bosTau2,dasNov1"/>
+      <param name="complete_blocks" value="partial_allowed"/>
+      <output name="out_file1" file="cf_maf2fasta_new.dat" ftype="fasta"/>
+    </test>
+  </tests>
+  <help>
+**Types of MAF to FASTA conversion**
+
+ * **Multiple Blocks** converts a single MAF block to a single FASTA block. For example, if you have 6 MAF blocks, they will be converted to 6 FASTA blocks.
+ * **One Sequence per Species** converts MAF blocks to a single aggregated FASTA block. For example, if you have 6 MAF blocks, they will be converted and concatenated into a single FASTA block.
+
+-------
+
+**What it does**
+
+This tool converts MAF blocks to FASTA format and concatenates them into a single FASTA block or outputs multiple FASTA blocks separated by empty lines.
+
+The interface for this tool contains two pages (steps):
+
+ * **Step 1 of 2**. Choose multiple alignments from history to be converted to FASTA format.
+ * **Step 2 of 2**. Choose the type of output as well as the species from the alignment to be included in the output.
+
+   Multiple Block output has additional options:
+
+   * **Choose species** - the tool reads the alignment provided during Step 1 and generates a list of species contained within that alignment. Using checkboxes you can specify taxa to be included in the output (all species are selected by default).
+   * **Choose to include/exclude blocks with missing species** - if an alignment block does not contain any one of the species you selected within the **Choose species** menu and this option is set to **exclude blocks with missing species**, then such a[...]
+
+[...]
+
+ s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+ s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------
+ s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------
+ s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C
+
+ a score=10289.000000
+ s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+ s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+ s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG
+
+will be converted to::
+
+ >hg18.chr20(+):56827368-56827443|hg18_0
+ GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+ >panTro2.chr20(+):56528685-56528760|panTro2_0
+ GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+ >rheMac2.chr10(-):89144112-89144181|rheMac2_0
+ GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------
+ >mm8.chr2(+):173910832-173910893|mm8_0
+ AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------
+ >canFam2.chr24(+):46551822-46551889|canFam2_0
+ CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C
+
+ >hg18.chr20(+):56827443-56827480|hg18_1
+ ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+ >panTro2.chr20(+):56528760-56528797|panTro2_1
+ ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+ >rheMac2.chr10(-):89144181-89144218|rheMac2_1
+ ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG
+
+-----
+
+**Example 2b**: Multiple Block Approach **Include hg18 and mm8** and **exclude blocks with missing species**:
+
+The following alignment::
+
+ ##maf version=1
+ a score=68686.000000
+ s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+ s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+ s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------
+ s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------
+ s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C
+
+ a score=10289.000000
+ s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+ s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+ s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG
+
+will be converted to (**note** that the second MAF block, which does not have mm8, is not included in the output)::
+
+ >hg18.chr20(+):56827368-56827443|hg18_0
+ GACAGGGTGCATCTGGGAGGGCCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC
+ >mm8.chr2(+):173910832-173910893|mm8_0
+ AGAAGGATCCACCT---------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------
+
+------
+
+.. class:: infomark
+
+**About formats**
+
+ **MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes.
+
+ - The .maf format is line-oriented. Each multiple alignment ends with a blank line.
+ - Each sequence in an alignment is on a single line.
+ - Lines starting with # are considered to be comments.
+ - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment.
+ - Some MAF files may contain two optional line types:
+
+   - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line;
+   - An "e" line containing information about the size of the gap between the alignments that span the current block.
+  </help>
+  <expand macro="citations" />
+</tool>
diff -r 000000000000 -r 7621d36a4e9c maf/maf_to_fasta_concat.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maf/maf_to_fasta_concat.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+"""
+Read a maf and output a single block fasta file, concatenating blocks
+
+usage %prog species1,species2 maf_file out_file
+"""
+# Dan Blankenberg
+from __future__ import print_function
+
+import sys
+
+from bx.align import maf
+
+from galaxy.tools.util import maf_utilities
+
+
+def __main__():
+    try:
+        species = maf_utilities.parse_species_option( sys.argv[1] )
+    except Exception as e:
+        maf_utilities.tool_fail( "Error determining species value: %s" % e )
+    try:
+        input_filename = sys.argv[2]
+    except Exception as e:
+        maf_utilities.tool_fail( "Error reading MAF filename: %s" % e )
+    try:
+        file_out = open( sys.argv[3], 'w' )
+    except Exception as e:
+        maf_utilities.tool_fail( "Error opening file for output: %s" % e )
+
+    if species:
+        print("Restricted to species: %s" % ', '.join( species ))
+    else:
+        print("Not restricted to species.")
+
+    if not species:
+        try:
+            species = maf_utilities.get_species_in_maf( input_filename )
+        except Exception as e:
+            maf_utilities.tool_fail( "Error determining species in input MAF: %s" % e )
+
+    for spec in species:
+        file_out.write( ">" + spec + "\n" )
+        try:
+            for start_block in maf.Reader( open( input_filename, 'r' ) ):
+                for block in maf_utilities.iter_blocks_split_by_species( start_block ):
+                    block.remove_all_gap_columns()  # remove extra gaps
+                    component = block.get_component_by_src_start( spec )  # blocks only have one occurrence of a particular species, so this is safe
+                    if component:
+                        file_out.write( component.text )
+                    else:
+                        file_out.write( "-" * block.text_size )
+        except Exception as e:
+            maf_utilities.tool_fail( "Your MAF file appears to be malformed: %s" % e )
+        file_out.write( "\n" )
+    file_out.close()
+
+
+if __name__ == "__main__":
+    __main__()
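The concatenation rule above (pad with '-' whenever a species misses a block, so every output sequence keeps the same column count) in a toy, self-contained form; the blocks, widths, and species names below are invented for illustration::

    blocks = [
        {"width": 5, "rows": {"hg17": "AC-GT", "mm5": "ACCGT"}},
        {"width": 3, "rows": {"hg17": "GGA"}},       # mm5 absent from this block
    ]
    for spec in ("hg17", "mm5"):
        # present: the species' aligned text; absent: all-gap padding
        seq = "".join(b["rows"].get(spec, "-" * b["width"]) for b in blocks)
        print(">%s\n%s" % (spec, seq))
    # >hg17 -> AC-GTGGA ; >mm5 -> ACCGT---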
diff -r 000000000000 -r 7621d36a4e9c maf/maf_to_fasta_multiple_sets.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maf/maf_to_fasta_multiple_sets.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+"""
+Read a maf and output a multiple block fasta file.
+"""
+# Dan Blankenberg
+from __future__ import print_function
+
+import sys
+
+from bx.align import maf
+
+from galaxy.tools.util import maf_utilities
+
+
+def __main__():
+    try:
+        maf_reader = maf.Reader( open( sys.argv[1] ) )
+    except Exception as e:
+        maf_utilities.tool_fail( "Error opening input MAF: %s" % e )
+    try:
+        file_out = open( sys.argv[2], 'w' )
+    except Exception as e:
+        maf_utilities.tool_fail( "Error opening file for output: %s" % e )
+    try:
+        species = maf_utilities.parse_species_option( sys.argv[3] )
+        if species:
+            num_species = len( species )
+        else:
+            num_species = 0
+    except Exception as e:
+        maf_utilities.tool_fail( "Error determining species value: %s" % e )
+    try:
+        partial = sys.argv[4]
+    except Exception as e:
+        maf_utilities.tool_fail( "Error determining keep partial value: %s" % e )
+
+    if species:
+        print("Restricted to species: %s" % ', '.join( species ))
+    else:
+        print("Not restricted to species.")
+
+    for block_num, block in enumerate( maf_reader ):
+        if species:
+            block = block.limit_to_species( species )
+        if len( maf_utilities.get_species_in_block( block ) ) < num_species and partial == "partial_disallowed":
+            continue
+        spec_counts = {}
+        for component in block.components:
+            spec, chrom = maf_utilities.src_split( component.src )
+            if spec not in spec_counts:
+                spec_counts[ spec ] = 0
+            else:
+                spec_counts[ spec ] += 1
+            file_out.write( "%s\n" % maf_utilities.get_fasta_header( component, { 'block_index': block_num, 'species': spec, 'sequence_index': spec_counts[ spec ] }, suffix="%s_%i_%i" % ( spec, block_num, spec_counts[ spec ] ) ) )
+            file_out.write( "%s\n" % component.text )
+        file_out.write( "\n" )
+    file_out.close()
+
+
+if __name__ == "__main__":
+    __main__()
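The headers written here follow the 'src(strand):start-end|suffix' pattern visible in the maf_to_fasta.xml examples above; below is a simplified, hypothetical stand-in for maf_utilities.get_fasta_header (fasta_header is not a real Galaxy function, and it omits the per-block sequence index that this script adds to the suffix)::

    def fasta_header(src, strand, start, size, species, block_index):
        # MAF components give start/size; the header shows a start-end range.
        return ">%s(%s):%d-%d|%s_%d" % (src, strand, start, start + size,
                                        species, block_index)

    print(fasta_header("hg18.chr20", "+", 56827368, 75, "hg18", 0))
    # >hg18.chr20(+):56827368-56827443|hg18_0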
diff -r 000000000000 -r 7621d36a4e9c maf/maf_to_interval.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maf/maf_to_interval.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+
+"""
+Read a maf and output intervals for specified list of species.
+"""
+import os
+import sys
+
+from bx.align import maf
+
+from galaxy.tools.util import maf_utilities
+
+
+def __main__():
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    output_id = sys.argv[3]
+    # where to store files that become additional output
+    database_tmp_dir = sys.argv[4]
+    primary_spec = sys.argv[5]
+    species = sys.argv[6].split( ',' )
+    all_species = sys.argv[7].split( ',' )
+    partial = sys.argv[8]
+    keep_gaps = sys.argv[9]
+    out_files = {}
+
+    if "None" in species:
+        species = []
+
+    if primary_spec not in species:
+        species.append( primary_spec )
+    if primary_spec not in all_species:
+        all_species.append( primary_spec )
+
+    all_species.sort()
+    for spec in species:
+        if spec == primary_spec:
+            out_files[ spec ] = open( output_filename, 'w' )
+        else:
+            out_files[ spec ] = open( os.path.join( database_tmp_dir, 'primary_%s_%s_visible_interval_%s' % ( output_id, spec, spec ) ), 'w' )
+        out_files[ spec ].write( '#chrom\tstart\tend\tstrand\tscore\tname\t%s\n' % ( '\t'.join( all_species ) ) )
+    num_species = len( all_species )
+
+    file_in = open( input_filename, 'r' )
+    maf_reader = maf.Reader( file_in )
+
+    for i, m in enumerate( maf_reader ):
+        for j, block in enumerate( maf_utilities.iter_blocks_split_by_species( m ) ):
+            if len( block.components ) < num_species and partial == "partial_disallowed":
+                continue
+            sequences = {}
+            for c in block.components:
+                spec, chrom = maf_utilities.src_split( c.src )
+                if keep_gaps == 'remove_gaps':
+                    sequences[ spec ] = c.text.replace( '-', '' )
+                else:
+                    sequences[ spec ] = c.text
+            sequences = '\t'.join( [ sequences.get( _, '' ) for _ in all_species ] )
+            for spec in species:
+                c = block.get_component_by_src_start( spec )
+                if c is not None:
+                    spec2, chrom = maf_utilities.src_split( c.src )
+                    assert spec2 == spec, 'Species name inconsistency found in component: %s != %s' % ( spec, spec2 )
+                    out_files[ spec ].write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( chrom, c.forward_strand_start, c.forward_strand_end, c.strand, m.score, "%s_%s_%s" % (spec, i, j), sequences ) )
+    file_in.close()
+    for file_out in out_files.values():
+        file_out.close()
+
+
+if __name__ == "__main__":
+    __main__()
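Each output row above carries the interval for one species plus one aligned-text column for every species in the MAF, left empty when a species misses the block. A toy reconstruction of one such row (data lifted from the maf_to_interval.xml help example below)::

    all_species = ["canFam2", "hg18", "mm8", "panTro2", "rheMac2"]
    block_texts = {"hg18":    "ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG",
                   "panTro2": "ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG",
                   "rheMac2": "ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG"}
    # absent species (canFam2, mm8) contribute empty columns
    sequences = "\t".join(block_texts.get(s, "") for s in all_species)
    print("\t".join(["chr20", "56827443", "56827480", "+", "10289.0",
                     "hg18_1_0", sequences]))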
diff -r 000000000000 -r 7621d36a4e9c maf/maf_to_interval.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maf/maf_to_interval.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,128 @@
+<tool id="MAF_To_Interval1" name="MAF to Interval" version="1.0.0">
+  <description>Converts a MAF formatted file to the Interval format</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <command>python '$__tool_directory__/maf_to_interval.py' '${input1}' '${out_file1}' '${out_file1.id}' '.' '${input1.dbkey}' '${species}' '${input1.metadata.species}' ${complete_blocks} ${remove_gaps}</command>
+  <inputs>
+    <param format="maf" name="input1" type="data" label="MAF file to convert"/>
+    <param name="species" type="select" label="Select additional species" display="checkboxes" multiple="true" help="The species matching the dbkey of the alignment is always included. A separate history item will be created for each species.">
+      <options>
+        <filter type="data_meta" ref="input1" key="species" />
+        <filter type="remove_value" meta_ref="input1" key="dbkey" />
+      </options>
+    </param>
+    <param name="complete_blocks" type="select" label="Exclude blocks which have a species missing">
+      <option value="partial_allowed">include blocks with missing species</option>
+      <option value="partial_disallowed">exclude blocks with missing species</option>
+    </param>
+    <param name="remove_gaps" type="select" label="Remove Gap characters from sequences">
+      <option value="keep_gaps">keep gaps</option>
+      <option value="remove_gaps">remove gaps</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="interval" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="4.maf" dbkey="hg17"/>
+      <param name="complete_blocks" value="partial_disallowed"/>
+      <param name="remove_gaps" value="keep_gaps"/>
+      <param name="species" value="panTro1" />
+      <output name="out_file1" file="maf_to_interval_out_hg17.interval"/>
+      <output name="out_file1" file="maf_to_interval_out_panTro1.interval"/>
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool converts every MAF block to a set of genomic intervals describing the position of that alignment block within a corresponding genome. Sequences from aligning species are also included in the output.
+
+The interface for this tool contains several options:
+
+ * **MAF file to convert**. Choose multiple alignments from history to be converted to the Interval format.
+ * **Choose species**. Choose additional species from the alignment to be included in the output.
+ * **Exclude blocks which have a species missing**. If an alignment block does not contain any one of the species found in the alignment set and this option is set to **exclude blocks with missing species**, then coordinates of such a block **will not** be included in the output (see **Example 2** below).
+ * **Remove Gap characters from sequences**. Gaps can be removed from sequences before they are output.
+
+
+-----
+
+**Example 1**: **Include only reference genome** (hg18 in this case) and **include blocks with missing species**:
+
+For the following alignment::
+
+ ##maf version=1
+ a score=68686.000000
+ s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+ s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+ s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------
+ s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------
+ s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C
+
+ a score=10289.000000
+ s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+ s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+ s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG
+
+the tool w[...]
+
+ #chrom	start	end	strand	score	name	canFam2	hg18	mm8	panTro2	rheMac2
+ chr20	56827368	56827443	+	68686.0	hg18_0_0	CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C	GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-	AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------	GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-	GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------
+ chr20	56827443	56827480	+	10289.0	hg18_1_0		ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG		ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG	ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG
+
+
+-----
+
+**Example 2**: **Include hg18 and mm8** and **exclude blocks with missing species**:
+
+For the following alignment::
+
+ ##maf version=1
+ a score=68686.000000
+ s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+ s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+ s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------
+ s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------
+ s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C
+
+ a score=10289.000000
+ s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+ s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+ s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG
+
+the tool will create **two** history items (one for hg18 and one for mm8) containing the following (**note** that both history items contain only one line describing the first alignment block. The second MAF block is not included in the output because it does not contain mm8):
+
+History item **1** (for hg18)::
+
+ #chrom	start	end	strand	score	name	canFam2	hg18	mm8	panTro2	rheMac2
+ chr20	56827368	56827443	+	68686.0	hg18_0_0	CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C	GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-	AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------	GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-	GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------
+
+
+History item **2** (for mm8)::
+
+ #chrom	start	end	strand	score	name	canFam2	hg18	mm8	panTro2	rheMac2
+ chr2	173910832	173910893	+	68686.0	mm8_0_0	CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C	GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-	AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------	GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-	GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------
+
+
+-------
+
+.. class:: infomark
+
+**About formats**
+
+**MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes.
+
+ - The .maf format is line-oriented. Each multiple alignment ends with a blank line.
+ - Each sequence in an alignment is on a single line.
+ - Lines starting with # are considered to be comments.
+ - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment.
+ - Some MAF files may contain two optional line types:
+
+   - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line;
+   - An "e" line containing information about the size of the gap between the alignments that span the current block.
+  </help>
+  <expand macro="citations" />
+</tool>
diff -r 000000000000 -r 7621d36a4e9c maf/vcf_to_maf_customtrack.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maf/vcf_to_maf_customtrack.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,163 @@
+# Dan Blankenberg
+from __future__ import print_function
+
+import sys
+from optparse import OptionParser
+
+import bx.align.maf
+import galaxy_utils.sequence.vcf
+from six import Iterator
+
+UNKNOWN_NUCLEOTIDE = '*'
+
+
+class PopulationVCFParser( Iterator ):
+    def __init__( self, reader, name ):
+        self.reader = reader
+        self.name = name
+        self.counter = 0
+
+    def __next__( self ):
+        rval = []
+        vc = next(self.reader)
+        for i, allele in enumerate( vc.alt ):
+            rval.append( ( '%s_%i.%i' % ( self.name, i + 1, self.counter + 1 ), allele ) )
+        self.counter += 1
+        return ( vc, rval )
+
+    def __iter__( self ):
+        return self
+
+
+class SampleVCFParser( Iterator ):
+    def __init__( self, reader ):
+        self.reader = reader
+        self.counter = 0
+
+    def __next__( self ):
+        rval = []
+        vc = next(self.reader)
+        alleles = [ vc.ref ] + vc.alt
+
+        if 'GT' in vc.format:
+            gt_index = vc.format.index( 'GT' )
+            for sample_name, sample_value in zip( vc.sample_names, vc.sample_values ):
+                gt_indexes = []
+                for i in sample_value[ gt_index ].replace( '|', '/' ).replace( '\\', '/' ).split( '/' ):  # Do we need to consider phase here?
+                    try:
+                        gt_indexes.append( int( i ) )
+                    except ValueError:
+                        gt_indexes.append( None )
+                for i, allele_i in enumerate( gt_indexes ):
+                    if allele_i is not None:
+                        rval.append( ( '%s_%i.%i' % ( sample_name, i + 1, self.counter + 1 ), alleles[ allele_i ] ) )
+        self.counter += 1
+        return ( vc, rval )
+
+    def __iter__( self ):
+        return self
+
+
+def main():
+    usage = "usage: %prog [options] output_file dbkey inputfile pop_name"
+    parser = OptionParser( usage=usage )
+    parser.add_option( "-p", "--population", action="store_true", dest="population", default=False, help="Create MAF on a per population basis")
+    parser.add_option( "-s", "--sample", action="store_true", dest="sample", default=False, help="Create MAF on a per sample basis")
+    parser.add_option( "-n", "--name", dest="name", default='Unknown Custom Track', help="Name for Custom Track")
+    parser.add_option( "-g", "--galaxy", action="store_true", dest="galaxy", default=False, help="Tool is being executed by Galaxy (adds extra error messaging).")
+    ( options, args ) = parser.parse_args()
+
+    if len( args ) < 3:
+        if options.galaxy:
+            print("It appears that you forgot to specify an input VCF file, click 'Add new VCF...' to add at least one input.\n", file=sys.stderr)
+        parser.error( "Need to specify an output file, a dbkey and at least one input file" )
+
+    if not ( options.population ^ options.sample ):
+        parser.error( 'You must specify either a per population conversion or a per sample conversion, but not both' )
+
+    out = open( args.pop(0), 'w' )
+    out.write( 'track name="%s" visibility=pack\n' % options.name.replace( "\"", "'" ) )
+
+    maf_writer = bx.align.maf.Writer( out )
+
+    dbkey = args.pop(0)
+
+    vcf_files = []
+    if options.population:
+        i = 0
+        while args:
+            filename = args.pop( 0 )
+            pop_name = args.pop( 0 ).replace( ' ', '_' )
+            if not pop_name:
+                pop_name = 'population_%i' % ( i + 1 )
+            vcf_files.append( PopulationVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ), pop_name ) )
+            i += 1
+    else:
+        while args:
+            filename = args.pop( 0 )
+            vcf_files.append( SampleVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ) ) )
+
+    non_spec_skipped = 0
+    for vcf_file in vcf_files:
+        for vc, variants in vcf_file:
+            num_ins = 0
+            num_dels = 0
+            for variant_name, variant_text in variants:
+                if 'D' in variant_text:
+                    num_dels = max( num_dels, int( variant_text[1:] ) )
+                elif 'I' in variant_text:
+                    num_ins = max( num_ins, len( variant_text ) - 1 )
+
+            alignment = bx.align.maf.Alignment()
+            ref_text = vc.ref + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) )
+            start_pos = vc.pos - 1
+            if num_dels and start_pos:
+                ref_text = UNKNOWN_NUCLEOTIDE + ref_text
+                start_pos -= 1
+            alignment.add_component( bx.align.maf.Component(
+                src='%s.%s%s' % ( dbkey, ("chr" if not vc.chrom.startswith("chr") else ""), vc.chrom ),
+                start=start_pos, size=len( ref_text.replace( '-', '' ) ),
+                strand='+', src_size=start_pos + len( ref_text ),
+                text=ref_text ) )
+            for variant_name, variant_text in variants:
+                # FIXME:
+                # skip non-spec. compliant data, see: http://1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3 for format spec
+                # this check is due to data having indels not represented in the published format spec,
+                # e.g. 1000 genomes pilot 1 indel data: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/indels/CEU.SRP000031.2010_03.indels.sites.vcf.gz
+                if variant_text and variant_text[0] in [ '-', '+' ]:
+                    non_spec_skipped += 1
+                    continue
+
+                # do we need a left padding unknown nucleotide (do we have deletions)?
+                if num_dels and start_pos:
+                    var_text = UNKNOWN_NUCLEOTIDE
+                else:
+                    var_text = ''
+                if 'D' in variant_text:
+                    cur_num_del = int( variant_text[1:] )
+                    pre_del = min( len( vc.ref ), cur_num_del )
+                    post_del = cur_num_del - pre_del
+                    var_text = var_text + '-' * pre_del + '-' * num_ins + '-' * post_del
+                    var_text = var_text + UNKNOWN_NUCLEOTIDE * ( len( ref_text ) - len( var_text ) )
+                elif 'I' in variant_text:
+                    cur_num_ins = len( variant_text ) - 1
+                    var_text = var_text + vc.ref + variant_text[1:] + '-' * ( num_ins - cur_num_ins ) + UNKNOWN_NUCLEOTIDE * max( 0, ( num_dels - 1 ) )
+                else:
+                    var_text = var_text + variant_text + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) )
+                alignment.add_component( bx.align.maf.Component(
+                    src=variant_name, start=0,
+                    size=len( var_text.replace( '-', '' ) ), strand='+',
+                    src_size=len( var_text.replace( '-', '' ) ),
+                    text=var_text ) )
+            maf_writer.write( alignment )
+
+    maf_writer.close()
+
+    if non_spec_skipped:
+        print('Skipped %i non-specification compliant indels.' % non_spec_skipped)
+
+
+if __name__ == "__main__":
+    main()
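The VCF v3.3 indel notation is the trickiest part above: 'D4' deletes four reference bases, 'IGA' inserts GA, and '*' marks unknown flanking bases. Replaying the microsat1 record (REF=G, ALT=D4,IGA) from the tool help reproduces the three alignment texts shown there; this is a standalone recomputation for illustration, not the script's own code path::

    UNKNOWN = '*'
    ref, alts, pos = 'G', ['D4', 'IGA'], 1234567
    num_dels = max(int(a[1:]) for a in alts if a.startswith('D'))   # 4
    num_ins = max(len(a) - 1 for a in alts if a.startswith('I'))    # 2

    ref_text = ref + '-' * num_ins + UNKNOWN * (num_dels - len(ref))
    if num_dels and pos - 1:
        ref_text = UNKNOWN + ref_text     # pad left for the deleted context
    print(ref_text)                       # *G--***

    pre = min(len(ref), num_dels)         # D4: every deleted position is a gap
    print(UNKNOWN + '-' * (pre + num_ins + (num_dels - pre)))       # *------

    ins = 'IGA'                           # IGA: ref base, inserted GA, unknowns
    print(UNKNOWN + ref + ins[1:] + '-' * (num_ins - (len(ins) - 1))
          + UNKNOWN * max(0, num_dels - 1))                         # *GGA***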
diff -r 000000000000 -r 7621d36a4e9c maf/vcf_to_maf_customtrack.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maf/vcf_to_maf_customtrack.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,127 @@
+<tool id="vcf_to_maf_customtrack1" name="VCF to MAF Custom Track" version="1.0.0">
+    <description>for display at UCSC</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <command>
+python '$__tool_directory__/vcf_to_maf_customtrack.py' '$out_file1'
+#if $vcf_source_type.vcf_file
+    '${vcf_source_type.vcf_file[0].vcf_input.dbkey}'
+#else
+    '?'
+#end if
+${vcf_source_type.vcf_source} -n '$track_name'
+#for $vcf_repeat in $vcf_source_type.vcf_file
+    '${vcf_repeat.vcf_input}'
+    #if $vcf_source_type.vcf_source == '-p'
+        '${vcf_repeat.population_name}'
+    #end if
+#end for
+-g
+    </command>
+    <inputs>
+        <param name="track_name" type="text" label="Custom Track Name" value="Galaxy Custom Track" size="30" />
+        <conditional name="vcf_source_type">
+            <param name="vcf_source" type="select" label="VCF Source Type">
+                <option value="-p" selected="true">Per Population (file)</option>
+                <option value="-s">Per Sample</option>
+            </param>
+            <when value="-p">
+                <repeat name="vcf_file" title="VCF population file" min="1">
+                    <param format="tabular" name="vcf_input" type="data" label="VCF file"/>
+                    <param name="population_name" type="text" label="Name for this population" value=""/>
+                </repeat>
+            </when>
+            <when value="-s">
+                <repeat name="vcf_file" title="VCF sample file" min="1">
+                    <param format="tabular" name="vcf_input" type="data" label="VCF file"/>
+                    <!-- add column count validator >= 8? -->
+                </repeat>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data format="mafcustomtrack" name="out_file1" />
+    </outputs>
+<!-- <tests>
+        <test>
+            <param name="track_name" value="Galaxy Custom Track"/>
+            <param name="vcf_source" value="Per Population"/>
+            <param name="vcf_input" value="vcf_to_maf_in.vcf" ftype="tabular"/>
+            <param name="population_name" value=""/>
+            <output name="out_file1" file="vcf_to_maf_population_out.mafcustomtrack"/>
+        </test>
+        <test>
+            <param name="track_name" value="Galaxy Custom Track"/>
+            <param name="vcf_source" value="Per Sample"/>
+            <param name="vcf_input" value="vcf_to_maf_in.vcf" ftype="tabular"/>
+            <output name="out_file1" file="vcf_to_maf_sample_out.mafcustomtrack"/>
+        </test>
+    </tests> -->
+    <help>
+**What it does**
+
+This tool converts a Variant Call Format (VCF) file into a Multiple Alignment Format (MAF) custom track file suitable for display at genome browsers.
+
+This file should be used for display purposes only (e.g. as a UCSC Custom Track). Performing an analysis using the output created by this tool as input is not recommended; the source VCF file should be used when performing an analysis.
+
+*Unknown nucleotides* are represented as '*' as required to allow the display to draw properly; these include e.g. reference bases which appear before a deletion and are not available without querying the original reference sequence.
+
+**Example**
+
+Starting with a VCF::
+
+  ##fileformat=VCFv3.3
+  ##fileDate=20090805
+  ##source=myImputationProgramV3.1
+  ##reference=1000GenomesPilot-NCBI36
+  ##phasing=partial
+  ##INFO=NS,1,Integer,"Number of Samples With Data"
+  ##INFO=DP,1,Integer,"Total Depth"
+  ##INFO=AF,-1,Float,"Allele Frequency"
+  ##INFO=AA,1,String,"Ancestral Allele"
+  ##INFO=DB,0,Flag,"dbSNP membership, build 129"
+  ##INFO=H2,0,Flag,"HapMap2 membership"
+  ##FILTER=q10,"Quality below 10"
+  ##FILTER=s50,"Less than 50% of samples have data"
+  ##FORMAT=GT,1,String,"Genotype"
+  ##FORMAT=GQ,1,Integer,"Genotype Quality"
+  ##FORMAT=DP,1,Integer,"Read Depth"
+  ##FORMAT=HQ,2,Integer,"Haplotype Quality"
+  #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
+  20 14370 rs6054257 G A 29 0 NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:-1,-1
+  20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:-1,-1
+  20 1110696 rs6040355 A G,T 67 0 NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:-1,-1
+  20 1230237 . T . 47 0 NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2:-1,-1
+  20 1234567 microsat1 G D4,IGA 50 0 NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
+
+Under the following conditions: **VCF Source type:** *Per Population (file)*, **Name for this population:** *CHB+JPT*
+Results in the following MAF custom track::
+
+  track name="Galaxy Custom Track" visibility=pack
+  ##maf version=1
+  a score=0
+  s hg18.chr20 14369 1 + 14370 G
+  s CHB+JPT_1.1 0 1 + 1 A
+
+  a score=0
+  s hg18.chr20 17329 1 + 17330 T
+  s CHB+JPT_1.2 0 1 + 1 A
+
+  a score=0
+  s hg18.chr20 1110695 1 + 1110696 A
+  s CHB+JPT_1.3 0 1 + 1 G
+  s CHB+JPT_2.3 0 1 + 1 T
+
+  a score=0
+  s hg18.chr20 1230236 1 + 1230237 T
+  s CHB+JPT_1.4 0 1 + 1 .
+
+  a score=0
+  s hg18.chr20 1234565 5 + 1234572 *G--***
+  s CHB+JPT_1.5 0 1 + 1 *------
+  s CHB+JPT_2.5 0 7 + 7 *GGA***
+    </help>
+    <expand macro="citations" />
+</tool>
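For orientation, the Cheetah command template above expands to an argument vector along these lines for one per-population input (the file names and dbkey here are illustrative, not from the changeset)::

    # output file, dbkey, mode flag, track name, then (vcf, population) pairs, then -g
    argv = ['vcf_to_maf_customtrack.py', 'out.mafcustomtrack', 'hg18',
            '-p', '-n', 'Galaxy Custom Track',
            'input.vcf', 'CHB+JPT', '-g']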
diff -r 000000000000 -r 7621d36a4e9c meme/fimo.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/meme/fimo.xml Mon Apr 30 01:37:51 2018 -0400 |
b'@@ -0,0 +1,238 @@\n+<tool id="meme_fimo" name="FIMO" version="0.0.1">\n+ <requirements><requirement type="package">meme</requirement></requirements>\n+ <description>- Find Individual Motif Occurrences</description>\n+ <command interpreter="python">fimo_wrapper.py \'fimo --o "${$html_outfile.files_path}" --verbosity "1"\n+ \n+ #if str( $options_type.options_type_selector ) == \'advanced\':\n+ --max-seq-length "${options_type.max_seq_length}" \n+ --max-stored-scores "${options_type.max_stored_scores }" \n+ --motif-pseudo "${options_type.motif_pseudo}" \n+ ${options_type.norc} \n+ --output-pthresh "${options_type.output_pthresh}" \n+\n+ \n+ #for $motif in $options_type.motifs:\n+ --motif "${motif.motif}"\n+ #end for\n+ \n+ #if str( $options_type.bgfile_type.bgfile_type_selector ) == \'motif-file\':\n+ --bgfile "motif-file"\n+ #elif str( $options_type.bgfile_type.bgfile_type_selector ) == \'motif-file\':\n+ --bgfile "${options_type.bgfile_type.bgfile}"\n+ #end if\n+ \n+ #if str( $options_type.qvalue_type.qvalue_type_selector ) == \'no-qvalue\':\n+ --no-qvalue\n+ #else:\n+ --output-qthresh "${options_type.qvalue_type.output_qthresh}"\n+ #end if\n+ #end if\n+ \n+ "${input_motifs}" \n+ \n+ #if str( $fasta_type.fasta_type_selector ) == \'history\':\n+ "${fasta_type.input_database}"\n+ #else:\n+ "${fasta_type.input_database.fields.path}"\n+ #end if\n+\n+ \'\n+ \n+ \'${html_outfile.files_path}\'\n+ \n+ \'${html_outfile}\'\n+ \n+ \'${interval_outfile}\'\n+ \n+ \'${txt_outfile}\'\n+ \n+ \'${xml_outfile}\'\n+ \n+ \'${gff_outfile}\'\n+ \n+ </command>\n+ <inputs>\n+ <param format="memexml" name="input_motifs" type="data" label="\'MEME output\' formatted file"/>\n+ \n+ <conditional name="fasta_type">\n+ <param name="fasta_type_selector" type="select" label="Source for sequence to search">\n+ <option value="cached">Locally Cached sequences</option>\n+ <option value="history" selected="true">Sequences from your history</option>\n+ </param>\n+ <when value="cached">\n+ <param name="input_database" type="select" label="Genome to search">\n+ <options from_data_table="all_fasta">\n+ </options>\n+ </param>\n+ </when>\n+ <when value="history">\n+ <param format="fasta" name="input_database" type="data" label="Sequences"/>\n+ </when>\n+ </conditional>\n+ \n+ <conditional name="options_type">\n+ <param name="options_type_selector" type="select" label="Options Configuration">\n+ <option value="basic" selected="true">Basic</option>\n+ <option value="advanced">Advanced</option>\n+ </param>\n+ <when value="basic">\n+ <!-- do nothing here -->\n+ </when>\n+ <when value="advanced">\n+ \n+ <conditional name="bgfile_type">\n+ <param name="bgfile_type_selector" type="select" label="Background file type">\n+ <option value="motif-file">Use Frequencies from Motif File</option>\n+ <option value="default" selected="true">Use frequencies from non-redundant database (default)</option>\n+ <option value="bgfile">Use Frequencies from Background File</option>\n+ </param>\n+ <when value="motif-file">\n+ <!-- do nothing here -->\n+ </when>\n+ <when value="default">\n+ <!-- do nothing here -->\n+ </when>\n+ <when value="bgfile">\n+ <param name="bgfile" type="data" format="txt" optional="True" label="Background Model" />\n+ </when>\n+ </conditional>\n+ \n+ <repeat name="motifs" title="Limit to specified motif">\n+ <param name="motif" type="text" value="" label="Specify motif by id" />\n+ </repeat>\n+ \n+ <param name="max_seq_length" type="integer" value="250000000" label="Maximum input sequence length" />\n+ <param name="max_stored_scores" 
type="integer" value="100000" label="Maximum score count to store" />\n+ <param name="motif_pseudo" type="float" value="0.1" label="Pseudocount to add to counts in motif matrix" />\n+ <param name="norc" label="Do not check reverse'..b'n="1"/>\n+ </option>\n+ </action>\n+ </when>\n+ </conditional>\n+ </actions>\n+ </data>\n+ <data format="tabular" name="txt_outfile" label="${tool.name} on ${on_string} (text)">\n+ <actions>\n+ <conditional name="fasta_type.fasta_type_selector">\n+ <when value="cached">\n+ <action type="metadata" name="dbkey">\n+ <option type="from_data_table" name="all_fasta" column="1" offset="0">\n+ <filter type="param_value" ref="fasta_type.input_database" column="0"/>\n+ </option>\n+ </action>\n+ </when>\n+ </conditional>\n+ </actions>\n+ </data>\n+ <data format="tabular" name="gff_outfile" label="${tool.name} on ${on_string} (almost-gff)">\n+ <actions>\n+ <conditional name="fasta_type.fasta_type_selector">\n+ <when value="cached">\n+ <action type="metadata" name="dbkey">\n+ <option type="from_data_table" name="all_fasta" column="1" offset="0">\n+ <filter type="param_value" ref="fasta_type.input_database" column="0"/>\n+ </option>\n+ </action>\n+ </when>\n+ </conditional>\n+ </actions>\n+ </data>\n+ <data format="cisml" name="xml_outfile" label="${tool.name} on ${on_string} (xml)">\n+ <actions>\n+ <conditional name="fasta_type.fasta_type_selector">\n+ <when value="cached">\n+ <action type="metadata" name="dbkey">\n+ <option type="from_data_table" name="all_fasta" column="1" offset="0">\n+ <filter type="param_value" ref="fasta_type.input_database" column="0"/>\n+ </option>\n+ </action>\n+ </when>\n+ </conditional>\n+ </actions>\n+ </data>\n+ <data format="interval" name="interval_outfile" label="${tool.name} on ${on_string} (interval)">\n+ <actions>\n+ <conditional name="fasta_type.fasta_type_selector">\n+ <when value="cached">\n+ <action type="metadata" name="dbkey">\n+ <option type="from_data_table" name="all_fasta" column="1" offset="0">\n+ <filter type="param_value" ref="fasta_type.input_database" column="0"/>\n+ </option>\n+ </action>\n+ </when>\n+ </conditional>\n+ </actions>\n+ </data>\n+ </outputs>\n+ <tests>\n+ <test>\n+ <param name="input_motifs" value="meme/meme/meme_output_xml_1.xml" ftype="memexml"/>\n+ <param name="fasta_type_selector" value="history"/>\n+ <param name="input_database" value="phiX.fasta" ftype="fasta"/>\n+ <param name="options_type_selector" value="basic"/>\n+ <param name="non_commercial_use" value="True"/>\n+ <output name="html_outfile" file="meme/fimo/fimo_output_html_1.html" lines_diff="12"/>\n+ <output name="txt_outfile" file="meme/fimo/fimo_output_txt_1.txt" lines_diff="0"/>\n+ <output name="gff_outfile" file="meme/fimo/fimo_output_almost-gff_1.txt" lines_diff="0"/>\n+ <output name="xml_outfile" file="meme/fimo/fimo_output_xml_1.xml" lines_diff="8"/>\n+ <output name="interval_outfile" file="meme/fimo/fimo_output_interval_1.txt" lines_diff="0"/>\n+ </test>\n+ </tests>\n+ <help>\n+\n+.. class:: warningmark\n+\n+**WARNING: This tool is only available for non-commercial use. Use for educational, research and non-profit purposes is permitted. Before using, be sure to review, agree, and comply with the license.**\n+\n+.. class:: infomark\n+\n+**To cite FIMO:**\n+`Grant CE, Bailey TL, Noble WS. FIMO: scanning for occurrences of a given motif. Bioinformatics. 2011 Apr 1;27(7):1017-8. <http://www.ncbi.nlm.nih.gov/pubmed/21330290>`_\n+\n+\n+For detailed information on FIMO, click here_. 
To view the license_.\n+\n+------\n+\n+**Citation**\n+\n+If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.*\n+\n+\n+.. _here: http://meme.nbcr.net/meme/fimo-intro.html\n+.. _license: http://meme.nbcr.net/meme/COPYRIGHT.html\n+\n+ </help>\n+</tool>\n' |
diff -r 000000000000 -r 7621d36a4e9c meme/fimo_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/meme/fimo_wrapper.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# Dan Blankenberg
+"""
+Read text output from FIMO and create an interval file.
+"""
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+from galaxy_utils.sequence.transform import DNA_reverse_complement
+
+buffsize = 1048576
+
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+
+def main():
+    assert len( sys.argv ) == 8, "Wrong number of arguments"
+    sys.argv.pop(0)
+    fimo_cmd = sys.argv.pop(0)
+    html_path = sys.argv.pop(0)
+    html_out = sys.argv.pop(0)
+    interval_out = sys.argv.pop(0)
+    txt_out = sys.argv.pop(0)
+    xml_out = sys.argv.pop(0)
+    gff_out = sys.argv.pop(0)
+
+    # run fimo
+    try:
+        tmp_stderr = tempfile.NamedTemporaryFile()
+        proc = subprocess.Popen( args=fimo_cmd, shell=True, stderr=tmp_stderr )
+        returncode = proc.wait()
+        tmp_stderr.seek(0)
+        stderr = ''
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+
+        if returncode != 0:
+            raise Exception(stderr)
+    except Exception as e:
+        raise Exception('Error running FIMO:\n' + str( e ))
+
+    shutil.move( os.path.join( html_path, 'fimo.txt' ), txt_out )
+    shutil.move( os.path.join( html_path, 'fimo.gff' ), gff_out )
+    shutil.move( os.path.join( html_path, 'fimo.xml' ), xml_out )
+    shutil.move( os.path.join( html_path, 'fimo.html' ), html_out )
+
+    out_file = open( interval_out, 'w' )  # text mode: the writes below are str, not bytes
+    out_file.write( "#%s\n" % "\t".join( ( "chr", "start", "end", "pattern name", "score", "strand", "matched sequence", "p-value", "q-value" ) ) )
+    for line in open( txt_out ):
+        if line.startswith( '#' ):
+            continue
+        fields = line.rstrip( "\n\r" ).split( "\t" )
+        start, end = int( fields[2] ), int( fields[3] )
+        sequence = fields[7]
+        if start > end:
+            start, end = end, start  # flip start and end, and set strand
+            strand = "-"
+            sequence = DNA_reverse_complement( sequence )  # we want sequences relative to strand; FIMO always provides + stranded sequence
+        else:
+            strand = "+"
+        start -= 1  # make 0-based start position
+        out_file.write( "%s\n" % "\t".join( [ fields[1], str( start ), str( end ), fields[0], fields[4], strand, sequence, fields[5], fields[6] ] ) )
+    out_file.close()
+
+
+if __name__ == "__main__":
+    main()
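The start/end handling above is the whole trick: FIMO reports 1-based inclusive coordinates and writes start greater than stop for minus-strand hits. A minimal sketch of the same normalisation, with an inline reverse complement standing in for galaxy_utils' DNA_reverse_complement (names are illustrative)::

    comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}

    def to_interval(start, end, seq):
        if start > end:                    # minus-strand hit
            start, end = end, start
            seq = ''.join(comp[b] for b in reversed(seq.upper()))
            return start - 1, end, '-', seq
        return start - 1, end, '+', seq    # 0-based, half-open either way

    print(to_interval(10, 14, 'ACGTA'))    # (9, 14, '+', 'ACGTA')
    print(to_interval(14, 10, 'ACGTA'))    # (9, 14, '-', 'TACGT')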
diff -r 000000000000 -r 7621d36a4e9c meme/meme.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/meme/meme.xml Mon Apr 30 01:37:51 2018 -0400 |
b'@@ -0,0 +1,353 @@\n+<tool id="meme_meme" name="MEME" version="1.0.0">\n+ <requirements><requirement type=\'package\'>meme</requirement></requirements>\n+ <description>- Multiple Em for Motif Elicitation</description>\n+ <command>meme "$input1" -o "${html_outfile.files_path}" \n+ -nostatus\n+ \n+ ##-p 8 ##number of processors\n+ \n+ #if str( $options_type.options_type_selector ) == \'advanced\':\n+ -sf "${ str( $options_type.sf ).replace( \' \', \'_\' ) }"\n+ -${options_type.alphabet_type.alphabet_type_selector} \n+ -mod "${options_type.mod_type.mod_type_selector}" \n+ -nmotifs "${options_type.nmotifs}" \n+ -wnsites "${options_type.wnsites}"\n+ \n+ #if $options_type.evt < float(\'inf\'):\n+ -evt "${options_type.evt}" \n+ #end if\n+ \n+ #if str( $options_type.mod_type.mod_type_selector ) != \'oops\':\n+ #if str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == \'nsites\':\n+ -nsites "${options_type.mod_type.motif_occurrence_type.nsites}"\n+ #elif str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == \'min_max_sites\':\n+ -minsites "${options_type.mod_type.motif_occurrence_type.minsites}" -maxsites "${options_type.mod_type.motif_occurrence_type.maxsites}"\n+ #end if\n+ #end if\n+ \n+ #if str( $options_type.motif_width_type.motif_width_type_selector ) == \'exact\':\n+ -w "${options_type.motif_width_type.width}"\n+ #else\n+ -minw "${options_type.motif_width_type.minw}" -maxw "${options_type.motif_width_type.maxw}"\n+ #end if\n+ \n+ #if str( $options_type.motif_trim_type.motif_trim_type_selector ) == \'nomatrim\':\n+ -nomatrim\n+ #else\n+ -wg "${options_type.motif_trim_type.wg}" -ws "${options_type.motif_trim_type.ws}" ${options_type.motif_trim_type.noendgaps}\n+ #end if\n+ \n+ #if str( $options_type.bfile ) != \'None\':\n+ -bfile "${options_type.bfile}"\n+ #end if\n+ \n+ #if str( $options_type.pspfile ) != \'None\':\n+ -psp "${options_type.pspfile}"\n+ #end if\n+ \n+ #if str( $options_type.alphabet_type.alphabet_type_selector ) == "dna":\n+ ${options_type.alphabet_type.revcomp} ${options_type.alphabet_type.pal}\n+ #end if\n+ \n+ -maxiter "${options_type.maxiter}" -distance "${options_type.distance}"\n+ \n+ -prior "${options_type.alphabet_type.prior_type.prior_type_selector}"\n+ #if str( $options_type.alphabet_type.prior_type.prior_type_selector ) != \'addone\':\n+ -b "${options_type.alphabet_type.prior_type.prior_b}" \n+ #if str( $options_type.alphabet_type.prior_type.plib ) != \'None\':\n+ -plib "${options_type.alphabet_type.prior_type.plib}"\n+ #end if\n+ #end if\n+ \n+ #if str( $options_type.alphabet_type.spmap_type.spmap_type_selector ) == \'cons\':\n+ -cons "${options_type.alphabet_type.spmap_type.cons}" \n+ #else\n+ -spmap "${options_type.alphabet_type.spmap_type.spmap_type_selector}"\n+ -spfuzz "${options_type.alphabet_type.spmap_type.spfuzz}" \n+ #end if\n+ \n+ #if str( $options_type.branching_type.branching_type_selector ) == \'x_branch\':\n+ -x_branch -bfactor "${options_type.branching_type.bfactor}" -heapsize "${options_type.branching_type.heapsize}"\n+ #end if\n+ \n+ ##-maxsize "1000000" ##remove hardcoded maxsize? 
should increase number of processors instead\n+ \n+ #end if\n+ \n+ 2>&1 || echo "Error running MEME."\n+ \n+ \n+ && mv ${html_outfile.files_path}/meme.html ${html_outfile}\n+ \n+ && mv ${html_outfile.files_path}/meme.txt ${txt_outfile}\n+ \n+ && mv ${html_outfile.files_path}/meme.xml ${xml_outfile}\n+ \n+ </command>\n+ <inputs>\n+ <param format="fasta" name="input1" type="data" label="Sequences"/>\n+ \n+ <conditional name="options_type">\n+ <param name="options_type_selector" type="select" label="Options Configuration">\n+ <option value="basic" selected="true">Basic</option>\n+ <option value="advanced">Advanced</option>\n+ </param>\n+ <when value="basic">\n+ <!-- do nothing here -->\n+ </when>\n+ <when value="advanced">\n+ \n+ '..b'</conditional>\n+ \n+ <conditional name="motif_trim_type">\n+ <param name="motif_trim_type_selector" type="select" label="Motif trim type">\n+ <option value="nomatrim">No motif trim</option>\n+ <option value="trim" selected="true">Trim motif</option>\n+ </param>\n+ <when value="nomatrim">\n+ <!-- no values here -->\n+ </when>\n+ <when value="trim">\n+ <param name="wg" type="integer" value="11" label="Gap cost" />\n+ <param name="ws" type="integer" value="1" label="Space cost" />\n+ <param name="noendgaps" label="Do not penalize endgaps" type="boolean" truevalue="-noendgaps" falsevalue="" checked="False"/>\n+ </when>\n+ </conditional>\n+ \n+ <param name="bfile" type="data" format="txt" optional="True" label="Background Model" />\n+ <param name="pspfile" type="data" format="txt" optional="True" label="Position-Specific Prior" />\n+ \n+ <param name="maxiter" type="integer" value="50" label="Number of iterations of EM to run" />\n+ <param name="distance" type="float" value="0.001" label="Convergence criterion" />\n+ \n+ <conditional name="branching_type">\n+ <param name="branching_type_selector" type="select" label="x-branching type">\n+ <option value="x_branch">Perform x-branching</option>\n+ <option value="no_x_branch" selected="true">No x-branching</option>\n+ </param>\n+ <when value="no_x_branch">\n+ <!-- no values here -->\n+ </when>\n+ <when value="x_branch">\n+ <param name="bfactor" type="integer" value="3" label="Number of iterations of branching" />\n+ <param name="heapsize" type="integer" value="64" label="Maximum number of heaps to use" />\n+ </when>\n+ </conditional>\n+ \n+ </when>\n+ </conditional>\n+ \n+ <param name="non_commercial_use" label="I certify that I am not using this tool for commercial purposes." type="boolean" truevalue="NON_COMMERCIAL_USE" falsevalue="COMMERCIAL_USE" checked="False">\n+ <validator type="expression" message="This tool is only available for non-commercial use.">value == True</validator>\n+ </param>\n+ \n+ </inputs>\n+ <outputs>\n+ <data format="html" name="html_outfile" label="${tool.name} on ${on_string} (html)"/>\n+ <data format="txt" name="txt_outfile" label="${tool.name} on ${on_string} (text)"/>\n+ <data format="memexml" name="xml_outfile" label="${tool.name} on ${on_string} (xml)"/>\n+ </outputs>\n+ <tests>\n+ <test>\n+ <param name="input1" value="meme/meme/meme_input_1.fasta" ftype="fasta" dbkey="hg19"/>\n+ <param name="options_type_selector" value="basic"/>\n+ <param name="non_commercial_use" value="True"/>\n+ <output name="html_outfile" file="meme/meme/meme_output_html_1.html" lines_diff="12"/>\n+ <output name="txt_outfile" file="meme/meme/meme_output_txt_1.txt" lines_diff="12"/>\n+ <output name="xml_outfile" file="meme/meme/meme_output_xml_1.xml" lines_diff="8"/>\n+ </test>\n+ </tests>\n+ <help>\n+\n+.. 
class:: warningmark\n+\n+**WARNING: This tool is only available for non-commercial use. Use for educational, research and non-profit purposes is permitted. Before using, be sure to review, agree, and comply with the license.**\n+\n+If you want to specify sequence weights, you must include them at the top of your input FASTA file.\n+\n+.. class:: infomark\n+\n+**To cite MEME:**\n+Timothy L. Bailey and Charles Elkan, "Fitting a mixture model by expectation maximization to discover motifs in biopolymers", Proceedings of the Second International Conference on Intelligent Systems for Molecular Biology, pp. 28-36, AAAI Press, Menlo Park, California, 1994. \n+\n+\n+For detailed information on MEME, click here_. To view the license_.\n+\n+------\n+\n+**Citation**\n+\n+If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.*\n+\n+.. _here: http://meme.nbcr.net/meme/meme-intro.html\n+.. _license: http://meme.nbcr.net/meme/COPYRIGHT.html\n+\n+ </help>\n+</tool>\n' |
diff -r 000000000000 -r 7621d36a4e9c metag_tools/blat_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metag_tools/blat_wrapper.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import tempfile
+
+assert sys.version_info[:2] >= (2, 4)
+
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+
+def check_nib_file( dbkey, GALAXY_DATA_INDEX_DIR ):
+    nib_file = "%s/alignseq.loc" % GALAXY_DATA_INDEX_DIR
+    nib_path = ''
+    nibs = {}
+    for i, line in enumerate( open( nib_file ) ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( "#" ):
+            fields = line.split( '\t' )
+            if len( fields ) < 3:
+                continue
+            if fields[0] == 'seq':
+                nibs[( fields[1] )] = fields[2]
+    if dbkey in nibs:
+        nib_path = nibs[( dbkey )]
+    return nib_path
+
+
+def check_twobit_file( dbkey, GALAXY_DATA_INDEX_DIR ):
+    twobit_file = "%s/twobit.loc" % GALAXY_DATA_INDEX_DIR
+    twobit_path = ''
+    twobits = {}
+    for i, line in enumerate( open( twobit_file ) ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( "#" ):
+            fields = line.split( '\t' )
+            if len( fields ) < 2:
+                continue
+            twobits[( fields[0] )] = fields[1]
+    if dbkey in twobits:
+        twobit_path = twobits[( dbkey )]
+    return twobit_path
+
+
+def __main__():
+    # I/O
+    source_format = sys.argv[1]  # 0: dbkey; 1: upload file
+    target_file = sys.argv[2]
+    query_file = sys.argv[3]
+    output_file = sys.argv[4]
+    min_iden = sys.argv[5]
+    tile_size = sys.argv[6]
+    one_off = sys.argv[7]
+
+    try:
+        float(min_iden)
+    except ValueError:
+        stop_err('Invalid value for minimal identity.')
+
+    try:
+        test = int(tile_size)
+        assert test >= 6 and test <= 18
+    except (ValueError, AssertionError):
+        stop_err('Invalid value for tile size. DNA word size must be between 6 and 18.')
+
+    try:
+        test = int(one_off)
+        assert test >= 0 and test <= int(tile_size)
+    except (ValueError, AssertionError):
+        stop_err('Invalid value for mismatch numbers in the word')
+
+    GALAXY_DATA_INDEX_DIR = sys.argv[8]
+
+    all_files = []
+    if source_format == '0':
+        # check target genome
+        dbkey = target_file
+        nib_path = check_nib_file( dbkey, GALAXY_DATA_INDEX_DIR )
+        twobit_path = check_twobit_file( dbkey, GALAXY_DATA_INDEX_DIR )
+        if not os.path.exists( nib_path ) and not os.path.exists( twobit_path ):
+            stop_err("No sequences are available for %s, request them by reporting this error." % dbkey)
+
+        # check the query file, see whether all of them are legitimate sequence
+        if nib_path and os.path.isdir( nib_path ):
+            compress_files = os.listdir(nib_path)
+            target_path = nib_path
+        elif twobit_path:
+            compress_files = [twobit_path]
+            target_path = ""
+        else:
+            stop_err("Requested genome build has no available sequence.")
+
+        for file in compress_files:
+            file = "%s/%s" % ( target_path, file )
+            file = os.path.normpath(file)
+            all_files.append(file)
+    else:
+        all_files = [target_file]
+
+    for detail_file_path in all_files:
+        output_tempfile = tempfile.NamedTemporaryFile().name
+        command = "blat %s %s %s -oneOff=%s -tileSize=%s -minIdentity=%s -mask=lower -noHead -out=pslx 2>&1" % ( detail_file_path, query_file, output_tempfile, one_off, tile_size, min_iden )
+        os.system( command )
+        os.system( 'cat %s >> %s' % ( output_tempfile, output_file ) )
+        os.remove( output_tempfile )
+
+
+if __name__ == '__main__':
+    __main__()
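The wrapper builds one shell string per target file and runs it with os.system; the same invocation sketched with subprocess (file names are illustrative; the flags come straight from the command string above)::

    import subprocess

    cmd = ['blat', 'ref.2bit', 'reads.fa', 'hits.pslx',
           '-oneOff=0', '-tileSize=11', '-minIdentity=90.0',
           '-mask=lower', '-noHead', '-out=pslx']
    subprocess.check_call(cmd)  # avoids the shell-quoting pitfalls of os.system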
diff -r 000000000000 -r 7621d36a4e9c metag_tools/blat_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metag_tools/blat_wrapper.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,99 @@ +<tool id="blat_wrapper" name="BLAT" version="1.0.0"> + <description> compare sequencing reads against UCSC genome builds</description> + <command interpreter="python"> + #if $source.source_select=="database" #blat_wrapper.py 0 $source.dbkey $input_query $output1 $iden $tile_size $one_off + #else #blat_wrapper.py 1 $source.input_target $input_query $output1 $iden $tile_size $one_off + #end if# ${GALAXY_DATA_INDEX_DIR} + </command> + <inputs> + <conditional name="source"> + <param name="source_select" type="select" label="Target source"> + <option value="database">Genome Build</option> + <option value="input_ref">Your Upload File</option> + </param> + <when value="database"> + <param name="dbkey" type="genomebuild" label="Genome" /> + </when> + <when value="input_ref"> + <param name="input_target" type="data" format="fasta" label="Reference sequence" /> + </when> + </conditional> + <param name="input_query" type="data" format="fasta" label="Sequence file"/> + <param name="iden" type="float" size="15" value="90.0" label="Minimal identity (-minIdentity)" /> + <param name="tile_size" type="integer" size="15" value="11" label="Minimal size of exact match (-tileSize)" help="Must be between 6 and 18."/> + <param name="one_off" type="integer" size="15" value="0" label="Number of mismatch in the word (-oneOff)" help="Must be between 0 and 2." /> + </inputs> + <outputs> + <data name="output1" format="tabular"/> + </outputs> + <requirements> + <requirement type="binary">blat</requirement> + </requirements> + <tests> + <test> + <param name="source_select" value="database" /> + <param name="dbkey" value="eschColi_K12" /> + <param name="input_query" value="blat_wrapper_test1.fa" ftype="fasta"/> + <param name="iden" value="90.0" /> + <param name="tile_size" value="11" /> + <param name="one_off" value="0" /> + <output name="output1" file="blat_wrapper_test1.out" /> + </test> + </tests> + <help> + +.. class:: warningmark + +Using a smaller word size (*Minimal Size of Exact Match*) will increase the computational time. + +.. class:: warningmark + +Using a larger mismatch number (*Number of Mismatch in the Word*) will increase the computational time. + +----- + +**What it does** + +This tool currently uses the **BLAT** alignment program. Your short reads file is searched against a genome build or another uploaded file. + +----- + +**Example** + +- Input a multiple fasta file:: + + >seq1 + TGGTAATGGTGGTTTTTTTTTTTTTTTTTTATTTTT + +- Use the default settings: + + - alignment identity must be higher than or equal to 90%. + + - minimal size of exact match to trigger an alignment is 11. + + - allow 0 mismatches in the above exact match size. + +- Search against ce2 (C. elegans March 2004), partial result:: + + 25 1 0 0 0 0 0 0 + seq1 36 10 36 chrI 15080483 9704438 9704464 1 26, 10, 9704438, ggttttttttttttttttttattttt, ggtttttttttttttttttttttttt, + 27 0 0 0 0 0 1 32 + seq1 36 9 36 chrI 15080483 1302536 1302595 2 21,6, 9,30, 1302536,1302589, tggtttttttttttttttttt,attttt, tggtttttttttttttttttt,attttt, + +----- + +**Parameters** + +- *Minimal Identity* (**-minIdentity**) : In percent, the minimum sequence identity between the query and target alignment. Default is 90. + +- *Minimal Size of Exact Match* (**-tileSize**) : The size of a match that will trigger an alignment. Default is 11. Usually between 8 and 12. Must be between 6 and 18. + +- *Number of Mismatch in the Word* (**-oneOff**) : The number of mismatches allowed in the word (tile size) and still triggers an alignment. Default is 0. 
+ +----- + +**Reference** + + **BLAT**: Kent, W James, BLAT--the BLAST-like alignment tool. (2002) Genome Research:12(4) 656-664. + + + </help> +</tool> |
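The partial result above is standard -noHead PSL output (pslx adds two sequence columns at the end). A sketch of pulling the alignment coordinates out of one row, following the published PSL column order (the helper name is illustrative)::

    def psl_coords(row):
        f = row.rstrip('\n').split('\t')
        matches, mismatches = int(f[0]), int(f[1])
        strand, qname = f[8], f[9]
        tname, tstart, tend = f[13], int(f[15]), int(f[16])  # target coords, 0-based half-open
        return qname, tname, strand, tstart, tend, matches, mismatches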
diff -r 000000000000 -r 7621d36a4e9c metag_tools/shrimp_color_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metag_tools/shrimp_color_wrapper.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+"""
+SHRiMP wrapper : Color space
+"""
+
+import os
+import os.path
+import re
+import sys
+import tempfile
+
+assert sys.version_info[:2] >= (2, 4)
+
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+
+def __main__():
+    # SHRiMP path
+    shrimp = 'rmapper-cs'
+
+    # I/O
+    input_target_file = sys.argv[1]  # fasta
+    input_query_file = sys.argv[2]
+    shrimp_outfile = sys.argv[3]  # shrimp output
+
+    # SHRiMP parameters
+    spaced_seed = '1111001111'
+    seed_matches_per_window = '2'
+    seed_hit_taboo_length = '4'
+    seed_generation_taboo_length = '0'
+    seed_window_length = '115.0'
+    max_hits_per_read = '100'
+    max_read_length = '1000'
+    kmer = '-1'
+    sw_match_value = '100'
+    sw_mismatch_value = '-150'
+    sw_gap_open_ref = '-400'
+    sw_gap_open_query = '-400'
+    sw_gap_ext_ref = '-70'
+    sw_gap_ext_query = '-70'
+    sw_crossover_penalty = '-140'
+    sw_full_hit_threshold = '68.0'
+    sw_vector_hit_threshold = '60.0'
+
+    # TODO: put the threshold on each of these parameters
+    if len(sys.argv) > 4:
+        # a spaced seed is a string of 1s (must match) and 0s (don't care)
+        if re.match('^[01]+$', sys.argv[4]):
+            spaced_seed = sys.argv[4]
+        else:
+            stop_err('Spaced seed must be a combination of 1s and 0s.')
+
+        seed_matches_per_window = sys.argv[5]
+        seed_hit_taboo_length = sys.argv[6]
+        seed_generation_taboo_length = sys.argv[7]
+        seed_window_length = sys.argv[8]
+        max_hits_per_read = sys.argv[9]
+        max_read_length = sys.argv[10]
+        kmer = sys.argv[11]
+        sw_match_value = sys.argv[12]
+        sw_mismatch_value = sys.argv[13]
+        sw_gap_open_ref = sys.argv[14]
+        sw_gap_open_query = sys.argv[15]
+        sw_gap_ext_ref = sys.argv[16]
+        sw_gap_ext_query = sys.argv[17]
+        sw_crossover_penalty = sys.argv[18]
+        sw_full_hit_threshold = sys.argv[19]
+        sw_vector_hit_threshold = sys.argv[20]
+
+    # temp file for shrimp log file
+    shrimp_log = tempfile.NamedTemporaryFile().name
+
+    # SHRiMP command
+    command = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-x', sw_crossover_penalty, '-h', sw_full_hit_threshold, '-v', sw_vector_hit_threshold, input_query_file, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])
+
+    try:
+        os.system(command)
+    except Exception as e:
+        stop_err(str(e))
+
+    # check SHRiMP output: count number of lines
+    num_hits = 0
+    if shrimp_outfile:
+        for i, line in enumerate(open(shrimp_outfile)):
+            line = line.rstrip('\r\n')
+            if not line or line.startswith('#'):
+                continue
+            try:
+                line.split()
+                num_hits += 1
+            except Exception as e:
+                stop_err(str(e))
+
+    if num_hits == 0:  # no hits generated
+        err_msg = ''
+        if shrimp_log:
+            for i, line in enumerate(open(shrimp_log)):
+                if line.startswith('error'):  # deal with memory error:
+                    err_msg += line  # error: realloc failed: Cannot allocate memory
+                if re.search('Reads Matched', line):  # deal with zero hits
+                    if int(line[8:].split()[2]) == 0:
+                        err_msg = 'Zero hits found.\n'
+        stop_err('SHRiMP Failed due to:\n' + err_msg)
+
+    # remove temp. files
+    if os.path.exists(shrimp_log):
+        os.remove(shrimp_log)
+
+
+if __name__ == '__main__':
+    __main__()
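A spaced seed such as 1111001111 marks the columns that must match (1) and the don't-care columns (0); a toy illustration of the idea, not SHRiMP's own implementation::

    def seed_match(seed, ref_window, read_window):
        # only the '1' columns of the seed are compared
        return all(r == q for s, r, q in zip(seed, ref_window, read_window) if s == '1')

    print(seed_match('1111001111', 'ACGTACGTAC', 'ACGTTTGTAC'))  # True: both mismatches sit under 0s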
diff -r 000000000000 -r 7621d36a4e9c metag_tools/shrimp_color_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metag_tools/shrimp_color_wrapper.xml Mon Apr 30 01:37:51 2018 -0400 |
b'@@ -0,0 +1,172 @@\n+<tool id="shrimp_color_wrapper" name="SHRiMP for Color-space" version="1.0.0">\n+ <description>reads mapping against reference sequence </description>\n+ <requirements>\n+ <requirement type="binary">rmapper-cs</requirement>\n+ </requirements>\n+ <command interpreter="">\n+python \'$__tool_directory__/shrimp_color_wrapper.py\' \'$input_target\' \'$input_query\' \'$output1\'\n+#if $param.skip_or_full == "full"\n+ \'$param.spaced_seed\' $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_crossover_penalty $param.sw_full_hit_threshold $param.sw_vector_hit_threshold\n+#end if\n+ </command>\n+ <inputs>\n+ <param name="input_query" type="data" format="csfasta" label="Align sequencing reads" help="No dataset? Read tip below"/>\n+ <param name="input_target" type="data" format="fasta" label="against reference" />\n+ <conditional name="param">\n+ <param name="skip_or_full" type="select" label="SHRiMP settings to use" help="For most mapping needs use Commonly used settings. If you want full control use Full List">\n+ <option value="skip">Commonly used</option>\n+ <option value="full">Full Parameter List</option>\n+ </param>\n+ <when value="skip" />\n+ <when value="full">\n+ <param name="spaced_seed" type="text" size="30" value="1111001111" label="Spaced Seed" />\n+ <param name="seed_matches_per_window" type="integer" size="5" value="2" label="Seed Matches per Window" />\n+ <param name="seed_hit_taboo_length" type="integer" size="5" value="4" label="Seed Hit Taboo Length" />\n+ <param name="seed_generation_taboo_length" type="integer" size="5" value="0" label="Seed Generation Taboo Length" />\n+ <param name="seed_window_length" type="float" size="10" value="115.0" label="Seed Window Length" help="in percentage"/>\n+ <param name="max_hits_per_read" type="integer" size="10" value="100" label="Maximum Hits per Read" />\n+ <param name="max_read_length" type="integer" size="10" value="1000" label="Maximum Read Length" />\n+ <param name="kmer" type="integer" size="10" value="-1" label="Kmer Std. Deviation Limit" help="-1 as None"/>\n+ <param name="sw_match_value" type="integer" size="10" value="100" label="S-W Match Value" />\n+ <param name="sw_mismatch_value" type="integer" size="10" value="-150" label="S-W Mismatch Value" />\n+ <param name="sw_gap_open_ref" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Reference)" />\n+ <param name="sw_gap_open_query" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Query)" />\n+ <param name="sw_gap_ext_ref" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Reference)" />\n+ <param name="sw_gap_ext_query" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Query)" />\n+ <param name="sw_crossover_penalty" type="integer" size="10" value="-140" label="S-W Crossover Penalty" />\n+ <param name="sw_full_hit_threshold" type="float" size="10" value="68.0" label="S-W Full Hit Threshold" help="in percent'..b'1\'s dictate positions that\n+ must match. 
A string of all 1\'s will result in a simple kmer scan.\n+ -n Seed Matches per Window (default: 2)\n+ The number of seed matches per window dictates how many seeds\n+ must match within some window length of the genome before that\n+ region is considered for Smith-Waterman alignment. A lower\n+ value will increase sensitivity while drastically increasing\n+ running time. Higher values will have the opposite effect.\n+ -t Seed Hit Taboo Length (default: 4)\n+ The seed taboo length specifies how many target genome bases\n+ or colours must exist prior to a previous seed match in order\n+ to count another seed match as a hit.\n+ -9 Seed Generation Taboo Length (default: 0)\n+\n+ -w Seed Window Length (default: 115.00%)\n+ This parameter specifies the genomic span in bases (or colours)\n+ in which *seed_matches_per_window* must exist before the read\n+ is given consideration by the Simth-Waterman alignment machinery.\n+ -o Maximum Hits per Read (default: 100)\n+ This parameter specifies how many hits to remember for each read.\n+ If more hits are encountered, ones with lower scores are dropped\n+ to make room.\n+ -r Maximum Read Length (default: 1000)\n+ This parameter specifies the maximum length of reads that will\n+ be encountered in the dataset. If larger reads than the default\n+ are used, an appropriate value must be passed to *rmapper*.\n+ -d Kmer Std. Deviation Limit (default: -1 [None])\n+ This option permits pruning read kmers, which occur with\n+ frequencies greater than *kmer_std_dev_limit* standard\n+ deviations above the average. This can shorten running\n+ time at the cost of some sensitivity.\n+ *Note*: A negative value disables this option.\n+ -m S-W Match Value (default: 100)\n+ The value applied to matches during the Smith-Waterman score calculation.\n+ -i S-W Mismatch Value (default: -150)\n+ The value applied to mismatches during the Smith-Waterman\n+ score calculation.\n+ -g S-W Gap Open Penalty (Reference) (default: -400)\n+ The value applied to gap opens along the reference sequence\n+ during the Smith-Waterman score calculation.\n+ *Note*: Note that for backward compatibility, if -g is set\n+ and -q is not set, the gap open penalty for the query will\n+ be set to the same value as specified for the reference.\n+ -q S-W Gap Open Penalty (Query) (default: -400)\n+ The value applied to gap opens along the query sequence during\n+ the Smith-Waterman score calculation.\n+ -e S-W Gap Extend Penalty (Reference) (default: -70)\n+ The value applied to gap extends during the Smith-Waterman score calculation.\n+ *Note*: Note that for backward compatibility, if -e is set\n+ and -f is not set, the gap exten penalty for the query will\n+ be set to the same value as specified for the reference.\n+ -f S-W Gap Extend Penalty (Query) (default: -70)\n+ The value applied to gap extends during the Smith-Waterman score calculation.\n+ -x\n+ -h S-W Full Hit Threshold (default: 68.00%)\n+ In letter-space, this parameter determines the threshold\n+ score for both vectored and full Smith-Waterman alignments.\n+ Any values less than this quantity will be thrown away.\n+ *Note* This option differs slightly in meaning between letter-space and color-space.\n+ -v\n+ </help>\n+ <citations>\n+ <citation type="doi">10.1371/journal.pcbi.1000386</citation>\n+ </citations>\n+</tool>\n' |
diff -r 000000000000 -r 7621d36a4e9c metag_tools/shrimp_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metag_tools/shrimp_wrapper.py Mon Apr 30 01:37:51 2018 -0400 |
b'@@ -0,0 +1,642 @@\n+#!/usr/bin/env python\n+"""\n+TODO\n+1. decrease memory usage\n+2. multi-fasta fastq file, ex. 454\n+3. split reads into small chuncks?\n+\n+SHRiMP wrapper\n+\n+Inputs:\n+1. reference seq\n+2. reads\n+\n+Outputs:\n+1. table of 8 columns:\n+ chrom ref_loc read_id read_loc ref_nuc read_nuc quality coverage\n+2. SHRiMP output\n+\n+Parameters:\n+ -s Spaced Seed (default: 111111011111)\n+ -n Seed Matches per Window (default: 2)\n+ -t Seed Hit Taboo Length (default: 4)\n+ -9 Seed Generation Taboo Length (default: 0)\n+ -w Seed Window Length (default: 115.00%)\n+ -o Maximum Hits per Read (default: 100)\n+ -r Maximum Read Length (default: 1000)\n+ -d Kmer Std. Deviation Limit (default: -1 [None])\n+\n+ -m S-W Match Value (default: 100)\n+ -i S-W Mismatch Value (default: -150)\n+ -g S-W Gap Open Penalty (Reference) (default: -400)\n+ -q S-W Gap Open Penalty (Query) (default: -400)\n+ -e S-W Gap Extend Penalty (Reference) (default: -70)\n+ -f S-W Gap Extend Penalty (Query) (default: -70)\n+ -h S-W Hit Threshold (default: 68.00%)\n+\n+Command:\n+%rmapper -s spaced_seed -n seed_matches_per_window -t seed_hit_taboo_length -9 seed_generation_taboo_length -w seed_window_length -o max_hits_per_read -r max_read_length -d kmer -m sw_match_value -i sw_mismatch_value -g sw_gap_open_ref -q sw_gap_open_query -e sw_gap_ext_ref -f sw_gap_ext_query -h sw_hit_threshold <query> <target> > <output> 2> <log>\n+\n+SHRiMP output:\n+>7:2:1147:982/1 chr3 + 36586562 36586595 2 35 36 2900 3G16G13\n+>7:2:1147:982/1 chr3 + 95338194 95338225 4 35 36 2700 9T7C14\n+>7:2:587:93/1 chr3 + 14913541 14913577 1 35 36 2960 19--16\n+"""\n+from __future__ import print_function\n+\n+import os\n+import os.path\n+import re\n+import sys\n+import tempfile\n+\n+assert sys.version_info[:2] >= (2.4)\n+\n+\n+def stop_err( msg ):\n+ sys.stderr.write( "%s\\n" % msg )\n+ sys.exit()\n+\n+\n+def reverse_complement(s):\n+ complement_dna = {"A": "T", "T": "A", "C": "G", "G": "C", "a": "t", "t": "a", "c": "g", "g": "c", "N": "N", "n": "n", ".": ".", "-": "-"}\n+ reversed_s = []\n+ for i in s:\n+ reversed_s.append(complement_dna[i])\n+ reversed_s.reverse()\n+ return "".join(reversed_s)\n+\n+\n+def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read, insertion_size):\n+ invalid_editstring_char = 0\n+ all_score_file = score_files.split(\',\')\n+\n+ if len(all_score_file) != hit_per_read:\n+ stop_err(\'One or more query files is missing. 
Please check your dataset.\')\n+\n+ temp_table_name = tempfile.NamedTemporaryFile().name\n+ temp_table = open(temp_table_name, \'w\')\n+\n+ outfile = open(table_outfile, \'w\')\n+\n+ # reference seq: not a single fasta seq\n+ refseq = {}\n+ chrom_cov = {}\n+ seq = \'\'\n+ title = None\n+\n+ for i, line in enumerate(open(ref_file)):\n+ line = line.rstrip()\n+ if not line or line.startswith(\'#\'):\n+ continue\n+\n+ if line.startswith(\'>\'):\n+ if seq:\n+ if title in refseq:\n+ pass\n+ else:\n+ refseq[title] = seq\n+ chrom_cov[title] = {}\n+ seq = \'\'\n+ title = line[1:]\n+ else:\n+ seq += line\n+ if seq:\n+ if title not in refseq:\n+ refseq[title] = seq\n+ chrom_cov[title] = {}\n+\n+ # find hits: one end and/or the other\n+ hits = {}\n+ for i, line in enumerate(open(result_file)):\n+ line = line.rstrip()\n+ if not line or line.startswith(\'#\'):\n+ continue\n+\n+ # FORMAT: readname contigname strand contigstart cont'..b"ef, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])\n+\n+ try:\n+ os.system(command)\n+ except Exception as e:\n+ if os.path.exists(query_fasta):\n+ os.remove(query_fasta)\n+ if os.path.exists(query_qual):\n+ os.remove(query_qual)\n+ stop_err(str(e))\n+\n+ else: # paired\n+ command_end1 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])\n+ command_end2 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target_file, '>>', shrimp_outfile, '2>>', shrimp_log])\n+\n+ try:\n+ os.system(command_end1)\n+ os.system(command_end2)\n+ except Exception as e:\n+ if os.path.exists(query_fasta_end1):\n+ os.remove(query_fasta_end1)\n+ if os.path.exists(query_fasta_end2):\n+ os.remove(query_fasta_end2)\n+ if os.path.exists(query_qual_end1):\n+ os.remove(query_qual_end1)\n+ if os.path.exists(query_qual_end2):\n+ os.remove(query_qual_end2)\n+ stop_err(str(e))\n+\n+ # check SHRiMP output: count number of lines\n+ num_hits = 0\n+ if shrimp_outfile:\n+ for i, line in enumerate(open(shrimp_outfile)):\n+ line = line.rstrip('\\r\\n')\n+ if not line or line.startswith('#'):\n+ continue\n+ try:\n+ line.split()\n+ num_hits += 1\n+ except Exception as e:\n+ stop_err(str(e))\n+\n+ if num_hits == 0: # no hits generated\n+ err_msg = ''\n+ if shrimp_log:\n+ for i, line in enumerate(open(shrimp_log)):\n+ if line.startswith('error'): # deal with memory error:\n+ err_msg += line # error: realloc failed: Cannot allocate memory\n+ if re.search('Reads Matched', line): # deal with zero hits\n+ if int(line[8:].split()[2]) == 0:\n+ err_msg = 'Zero hits found.\\n'\n+ stop_err('SHRiMP Failed due to:\\n' + err_msg)\n+\n+ # convert to table\n+ if type_of_reads == 'single':\n+ generate_sub_table(shrimp_outfile, input_target_file, query_qual, 
table_outfile, hit_per_read, insertion_size)\n+ else:\n+ generate_sub_table(shrimp_outfile, input_target_file, query_qual_end1 + ',' + query_qual_end2, table_outfile, hit_per_read, insertion_size)\n+\n+ # remove temp. files\n+ if type_of_reads == 'single':\n+ if os.path.exists(query_fasta):\n+ os.remove(query_fasta)\n+ if os.path.exists(query_qual):\n+ os.remove(query_qual)\n+ else:\n+ if os.path.exists(query_fasta_end1):\n+ os.remove(query_fasta_end1)\n+ if os.path.exists(query_fasta_end2):\n+ os.remove(query_fasta_end2)\n+ if os.path.exists(query_qual_end1):\n+ os.remove(query_qual_end1)\n+ if os.path.exists(query_qual_end2):\n+ os.remove(query_qual_end2)\n+\n+ if os.path.exists(shrimp_log):\n+ os.remove(shrimp_log)\n+\n+\n+if __name__ == '__main__':\n+ __main__()\n" |
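The docstring above shows SHRiMP hits carrying edit strings such as 3G16G13 (runs of matches interleaved with substituted bases) and 19--16 (deletions). A toy tokenizer for just those two cases; this is not the wrapper's own parser, and insertions and colour-space crossovers are omitted::

    import re

    def parse_editstring(es):
        ops = []
        for tok in re.findall(r'\d+|[ACGTN]|-', es):
            if tok.isdigit():
                ops.append(('match', int(tok)))   # run of matching bases
            elif tok == '-':
                ops.append(('del', tok))          # one deleted base
            else:
                ops.append(('sub', tok))          # one substituted base
        return ops

    print(parse_editstring('3G16G13'))
    # [('match', 3), ('sub', 'G'), ('match', 16), ('sub', 'G'), ('match', 13)]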
diff -r 000000000000 -r 7621d36a4e9c metag_tools/shrimp_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metag_tools/shrimp_wrapper.xml Mon Apr 30 01:37:51 2018 -0400 |
b'@@ -0,0 +1,273 @@\n+<tool id="shrimp_wrapper" name="SHRiMP for Letter-space" version="1.0.0">\n+ <description>reads mapping against reference sequence </description>\n+ <requirements>\n+ <requirement type="binary">rmapper-ls</requirement>\n+ </requirements>\n+ <command>\n+python \'$__tool_directory__/shrimp_wrapper.py\' \'$input_target\' \'$output1\' \'$output2\'\n+#if $type_of_reads.single_or_paired=="single" and $param.skip_or_full=="skip"\n+ \'$type_of_reads.input_query\'\n+#elif $type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="skip")\n+ \'$type_of_reads.input1,$type_of_reads.input2,$type_of_reads.insertion_size\'\n+#elif $type_of_reads.single_or_paired=="single" and $param.skip_or_full=="full"\n+ \'$type_of_reads.input_query\' \'$param.spaced_seed\' $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold\n+#elif $type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="full"\n+ \'$type_of_reads.input1,$type_of_reads.input2,$type_of_reads.insertion_size\' \'$param.spaced_seed\' $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold\n+#end if\n+ </command>\n+ <inputs>\n+ <conditional name="type_of_reads">\n+ <param name="single_or_paired" type="select" label="Single- or Paired-ends">\n+ <option value="single">Single-end</option>\n+ <option value="paired">Paired-end</option>\n+ </param>\n+ <when value="single">\n+ <param name="input_query" type="data" format="fastqsolexa" label="Align sequencing reads" help="No dataset? Read tip below"/>\n+ </when>\n+ <when value="paired">\n+ <param name="insertion_size" type="integer" size="5" value="600" label="Insertion length between two ends" help="bp" />\n+ <param name="input1" type="data" format="fastqsolexa" label="Align sequencing reads, one end" />\n+ <param name="input2" type="data" format="fastqsolexa" label="and the other end" />\n+ </when>\n+ </conditional>\n+ <param name="input_target" type="data" format="fasta" label="against reference" />\n+ <conditional name="param">\n+ <param name="skip_or_full" type="select" label="SHRiMP settings to use" help="For most mapping needs use Commonly used settings. 
If you want full control use Full List">\n+ <option value="skip">Commonly used</option>\n+ <option value="full">Full Parameter List</option>\n+ </param>\n+ <when value="skip" />\n+ <when value="full">\n+ <param name="spaced_seed" type="text" size="30" value="111111011111" label="Spaced Seed" />\n+ <param name="seed_matches_per_window" type="integer" size="5" value="2" label="Seed Matches per Window" />\n+ <param name="seed_hit_taboo_length" type="integer" size="5" value="4" label="Seed Hit Taboo Length" />\n+ <param name="seed_generation_taboo_length" type="integer" size="5" value="0" label="Seed Generation Taboo Length" />\n+ <param name="seed_window_length" type="float" size="10" value="115.0" label="Seed Window Length" help="in percentage"/>\n+ <param name="max_hits_per_read" type="integer" size="10" value="100" '..b'atching, whereas 1\'s dictate positions that\n+ must match. A string of all 1\'s will result in a simple kmer scan.\n+ -n Seed Matches per Window (default: 2)\n+ The number of seed matches per window dictates how many seeds\n+ must match within some window length of the genome before that\n+ region is considered for Smith-Waterman alignment. A lower\n+ value will increase sensitivity while drastically increasing\n+ running time. Higher values will have the opposite effect.\n+ -t Seed Hit Taboo Length (default: 4)\n+ The seed taboo length specifies how many target genome bases\n+ or colors must exist prior to a previous seed match in order\n+ to count another seed match as a hit.\n+ -9 Seed Generation Taboo Length (default: 0)\n+\n+ -w Seed Window Length (default: 115.00%)\n+ This parameter specifies the genomic span in bases (or colours)\n+ in which *seed_matches_per_window* must exist before the read\n+ is given consideration by the Simth-Waterman alignment machinery.\n+ -o Maximum Hits per Read (default: 100)\n+ This parameter specifies how many hits to remember for each read.\n+ If more hits are encountered, ones with lower scores are dropped\n+ to make room.\n+ -r Maximum Read Length (default: 1000)\n+ This parameter specifies the maximum length of reads that will\n+ be encountered in the dataset. If larger reads than the default\n+ are used, an appropriate value must be passed to *rmapper*.\n+ -d Kmer Std. Deviation Limit (default: -1 [None])\n+ This option permits pruning read kmers, which occur with\n+ frequencies greater than *kmer_std_dev_limit* standard\n+ deviations above the average. 
This can shorten running\n+ time at the cost of some sensitivity.\n+ *Note*: A negative value disables this option.\n+ -m S-W Match Value (default: 100)\n+ The value applied to matches during the Smith-Waterman score calculation.\n+ -i S-W Mismatch Value (default: -150)\n+ The value applied to mismatches during the Smith-Waterman\n+ score calculation.\n+ -g S-W Gap Open Penalty (Reference) (default: -400)\n+ The value applied to gap opens along the reference sequence\n+ during the Smith-Waterman score calculation.\n+ *Note*: Note that for backward compatibility, if -g is set\n+ and -q is not set, the gap open penalty for the query will\n+ be set to the same value as specified for the reference.\n+ -q S-W Gap Open Penalty (Query) (default: -400)\n+ The value applied to gap opens along the query sequence during\n+ the Smith-Waterman score calculation.\n+ -e S-W Gap Extend Penalty (Reference) (default: -70)\n+ The value applied to gap extends during the Smith-Waterman score calculation.\n+ *Note*: Note that for backward compatibility, if -e is set\n+ and -f is not set, the gap exten penalty for the query will\n+ be set to the same value as specified for the reference.\n+ -f S-W Gap Extend Penalty (Query) (default: -70)\n+ The value applied to gap extends during the Smith-Waterman score calculation.\n+ -h S-W Hit Threshold (default: 68.00%)\n+ In letter-space, this parameter determines the threshold\n+ score for both vectored and full Smith-Waterman alignments.\n+ Any values less than this quantity will be thrown away.\n+ *Note* This option differs slightly in meaning between letter-space and color-space.\n+ </help>\n+ <citations>\n+ <citation type="doi">10.1371/journal.pcbi.1000386</citation>\n+ </citations>\n+</tool>\n' |
diff -r 000000000000 -r 7621d36a4e9c next_gen_conversion/bwa_solid2fastq_modified.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next_gen_conversion/bwa_solid2fastq_modified.pl Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,89 @@ +#!/usr/bin/perl -w + +# Author: lh3 +# Note: Ideally, this script should be written in C. It is a bit slow at present. + +use strict; +use warnings; +use Getopt::Std; + +my %opts; +my $version = '0.1.3'; +my $usage = qq{ +Usage: solid2fastq.pl <paired> <outfile1> <outfile2> <F3.csfasta> <F3.qual> <R3.csfasta> <R3.qual> + +Note: <in.title> is the string showed in the `# Title:' line of a + ".csfasta" read file. Then <in.title>F3.csfasta is read sequence + file and <in.title>F3_QV.qual is the quality file. If + <in.title>R3.csfasta is present, this script assumes reads are + paired; otherwise reads will be regarded as single-end. + + The read name will be <out.prefix>:panel_x_y/[12] with `1' for R3 + tag and `2' for F3. Usually you may want to use short <out.prefix> + to save diskspace. Long <out.prefix> also causes troubles to maq. + +}; + +getopts('', \%opts); +die($usage) if (@ARGV != 7); +my ($is_paired,$outfile1,$outfile2,$f3reads,$f3qual,$r3reads,$r3qual) = @ARGV; +my (@fhr, @fhw); +my $fn = ''; +my @fn_suff = ($f3reads,$f3qual,$r3reads,$r3qual); +if ($is_paired eq "yes") { # paired end + for (0 .. 3) { + $fn = $fn_suff[$_]; + $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); + open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); + } + open($fhw[0], "|gzip >$outfile2") || die; + open($fhw[1], "|gzip >$outfile1") || die; + my (@df, @dr); + @df = &read1(1); @dr = &read1(2); + while (@df && @dr) { + if ($df[0] eq $dr[0]) { # mate pair + print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1]; + @df = &read1(1); @dr = &read1(2); + } + } + close($fhr[$_]) for (0 .. $#fhr); + close($fhw[$_]) for (0 .. $#fhw); +} else { # single end + for (0 .. 1) { + my $fn = "$fn_suff[$_]"; + $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); + open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); + } + open($fhw[2], "|gzip >$outfile1") || die; + my @df; + while (@df = &read1(1, $fhr[0], $fhr[1])) { + print {$fhw[2]} $df[1]; + } + close($fhr[$_]) for (0 .. $#fhr); + close($fhw[2]); +} + +sub read1 { + my $i = shift(@_); + my $j = ($i-1)<<1; + my ($key, $seq); + my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]); + while (<$fhs>) { + my $t = <$fhq>; + if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) { + $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines + #print $key; + die(qq/** unmatched read name: '$_' != '$t'\n/) unless ($_ eq $t); + my $name = "$1_$2_$3/$i"; + $_ = substr(<$fhs>, 2); + tr/0123./ACGTN/; + my $s = $_; + $_ = <$fhq>; + s/^(\d+)\s*//; + s/(\d+)\s*/chr($1+33)/eg; + $seq = qq/\@$name\n$s+\n$_\n/; + last; + } + } + return defined($seq)? ($key, $seq) : (); +} |
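The tr/0123./ACGTN/ and chr($1+33) substitutions above do two independent jobs: renaming colour digits to pseudo-bases (the "double encoding" trick, so colour-space reads can flow through letter-space tools) and turning numeric qualities into Phred+33 ASCII. The same transforms sketched in Python (illustrative only, not the Perl script)::

    DOUBLE_ENCODE = str.maketrans('0123.', 'ACGTN')

    def double_encode(csfasta_seq):
        # drop the primer base and first colour, then rename colour digits
        return csfasta_seq[2:].translate(DOUBLE_ENCODE)

    def encode_quals(qual_line):
        # numeric qualities -> Phred+33 characters
        return ''.join(chr(int(q) + 33) for q in qual_line.split())

    print(double_encode('T01230'))   # 'CGTA'
    print(encode_quals('27 12 30'))  # '<-?'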
diff -r 000000000000 -r 7621d36a4e9c next_gen_conversion/fastq_conversions.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next_gen_conversion/fastq_conversions.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+"""
+Performs various conversions around Sanger FASTQ data
+
+usage: %prog [options]
+   -c, --command=c: Command to run
+   -i, --input=i: Input file to be converted
+   -o, --outputFastqsanger=o: FASTQ Sanger converted output file for sol2std
+   -s, --outputFastqsolexa=s: FASTQ Solexa converted output file
+   -f, --outputFasta=f: FASTA converted output file
+
+usage: %prog command input_file output_file
+"""
+
+import os
+import sys
+
+from bx.cookbook import doc_optparse
+
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+
+def __main__():
+    # Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+
+    cmd = "fq_all2std.pl %s %s > %s"
+    if options.command == 'sol2std':
+        cmd = cmd % (options.command, options.input, options.outputFastqsanger)
+    elif options.command == 'std2sol':
+        cmd = cmd % (options.command, options.input, options.outputFastqsolexa)
+    elif options.command == 'fq2fa':
+        cmd = cmd % (options.command, options.input, options.outputFasta)
+    try:
+        os.system(cmd)
+    except Exception as eq:
+        stop_err("Error converting data format.\n" + str(eq))
+
+
+if __name__ == "__main__":
+    __main__()
diff -r 000000000000 -r 7621d36a4e9c next_gen_conversion/fastq_conversions.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next_gen_conversion/fastq_conversions.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,133 @@ +<tool id="fastq_conversions" name="FASTQ Conversions" version="1.0.0"> + <description>converts between FASTQ data and other data formats</description> + <command interpreter="python"> + fastq_conversions.py + --command=$conversionType.type + --input=$input + #if $conversionType.type == "sol2std": + --outputFastqsanger=$outputFastqsanger + #else: + --outputFastqsanger="None" + #end if + #if $conversionType.type == "std2sol": + --outputFastqsolexa=$outputFastqsolexa + #else: + --outputFastqsolexa="None" + #end if + #if $conversionType.type == "fq2fa": + --outputFasta=$outputFasta + #else: + --outputFasta="None" + #end if + </command> + <inputs> + <conditional name="conversionType"> + <param name="type" type="select" label="What type of conversion do you want to do?"> + <option value="sol2std">Solexa/Illumina FASTQ to standard Sanger FASTQ</option> + <option value="std2sol">Standard Sanger FASTQ to Solexa/Illumina FASTQ</option> + <option value="fq2fa">Various FASTQ to FASTA</option> + </param> + <when value="sol2std"> + <param name="input" type="data" format="fastqsolexa" label="File to convert" /> + </when> + <when value="std2sol"> + <param name="input" type="data" format="fastqsanger" label="File to convert" /> + </when> + <when value="fq2fa"> + <param name="input" type="data" format="fastqsolexa, fastqsanger" label="File to convert" /> + </when> + </conditional> + </inputs> + <outputs> + <data name="outputFastqsanger" format="fastqsanger"> + <filter>conversionType['type'] == 'sol2std'</filter> + </data> + <data name="outputFastqsolexa" format="fastqsolexa"> + <filter>conversionType['type'] == 'std2sol'</filter> + </data> + <data name="outputFasta" format="fasta"> + <filter>conversionType['type'] == 'fq2fa'</filter> + </data> + </outputs> + <tests> + <test> + <param name="type" value="sol2std" /> + <param name="input" value="fastq_conv_in1.fastq" ftype="fastqsolexa" /> + <output name="outputFastqsanger" file="fastq_conv_out1.fastqsanger" /> + </test> + <test> + <param name="type" value="std2sol" /> + <param name="input" value="1.fastqsanger" ftype="fastqsanger" /> + <output name="outputFastqsolexa" file="fastq_conv_out2.fastqsolexa" /> + </test> + <test> + <param name="type" value="fq2fa" /> + <param name="input" value="1.fastqsanger" ftype="fastqsanger" /> + <output name="outputFasta" file="fastq_conv_out4.fasta" /> + </test> + </tests> + <help> +**What it does** + +This tool offers several conversions options relating to the FASTQ format. + +----- + +**Examples** + +- Converting the Solexa/Illumina FASTQ data:: + + @081017-and-081020:1:1:1715:1759 + GGACTCAGATAGTAATCCACGCTCCTTTAAAATATC + + + II#IIIIIII$5+.(9IIIIIII$%*$G$A31I&&B + +- will produce the following Sanger FASTQ data:: + + @081017-and-081020:1:1:1715:1759 + GGACTCAGATAGTAATCCACGCTCCTTTAAAATATC + + + ++!+++++++!!!!!"+++++++!!!!)!%!!+!!%! 
+ +- Converting standard Sanger FASTQ:: + + @1831_573_1004/1 + AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG + + + ><C&&9952+C>5<.?<79,=42<292:<(9/-7 + @1831_573_1050/1 + TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT + + + ;@@17?@=>7??@A8?==@4A?A4)&+.'&+'1, + +- will produce the following Solexa/Illumina FASTQ data:: + + @1831_573_1004/1 + AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG + + + ][bEEXXTQJb]T[M^[VXK\SQ[QXQY[GXNLV + @1831_573_1050/1 + TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT + + + Z__PV^_\]V^^_`W^\\_S`^`SHEJMFEJFPK + +- Converting the Sanger FASTQ data:: + + @1831_573_1004/1 + AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG + + + ><C&&9952+C>5<.?<79,=42<292:<(9/-7 + @1831_573_1050/1 + TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT + + + ;@@17?@=>7??@A8?==@4A?A4)&+.'&+'1, + +- will produce the following FASTA data:: + + >1831_573_1004/1 + AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG + >1831_573_1050/1 + TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT + + </help> +</tool> |
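The quality-score arithmetic behind the examples above is easy to check by hand. A minimal sketch, assuming the standard encodings (Solexa/Illumina 1.0: ASCII offset 64 on a log-odds scale; Sanger: ASCII offset 33 on a phred scale) -- the tool itself delegates the real conversion to fq_all2std.pl::

    import math

    def solexa_char_to_sanger(c):
        # decode the Solexa character, convert log-odds -> phred, re-encode
        q_solexa = ord(c) - 64
        q_phred = 10.0 * math.log10(10.0 ** (q_solexa / 10.0) + 1.0)
        return chr(int(round(q_phred)) + 33)

    print(solexa_char_to_sanger('I'))  # Solexa Q9 -> phred Q10 -> '+', as above
    print(solexa_char_to_sanger('$'))  # Solexa Q-28 -> phred Q0 -> '!', as above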
diff -r 000000000000 -r 7621d36a4e9c next_gen_conversion/fastq_gen_conv.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next_gen_conversion/fastq_gen_conv.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,177 @@
+"""
+Converts any type of FASTQ file to Sanger type and makes small adjustments if necessary.
+
+usage: %prog [options]
+    -i, --input=i: Input FASTQ candidate file
+    -r, --origType=r: Original type
+    -a, --allOrNot=a: Whether or not to check all blocks
+    -b, --blocks=b: Number of blocks to check
+    -o, --output=o: Output file
+
+usage: %prog input_file output_file
+"""
+
+import math
+import sys
+
+from bx.cookbook import doc_optparse
+
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+
+def all_bases_valid(seq):
+    """Confirm that the sequence contains only bases"""
+    valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', 'N']
+    for base in seq:
+        if base not in valid_bases:
+            return False
+    return True
+
+
+def __main__():
+    # Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    orig_type = options.origType
+    if orig_type == 'sanger' and options.allOrNot == 'not':
+        max_blocks = int(options.blocks)
+    else:
+        max_blocks = -1
+    fin = open(options.input, 'r')
+    fout = open(options.output, 'w')
+    range_min = 1000
+    range_max = -5
+    block_num = 0
+    bad_blocks = 0
+    base_len = -1
+    line_count = 0
+    lines = []
+    line = fin.readline()
+    while line:
+        # once enough Sanger blocks have been checked, pass the rest through untouched
+        if line.strip() and max_blocks >= 0 and block_num > 0 and orig_type == 'sanger' and block_num >= max_blocks:
+            fout.write(line)
+            if line_count % 4 == 0:
+                block_num += 1
+            line_count += 1
+        elif line.strip():
+            # the line that starts a block, with a name
+            if line_count % 4 == 0 and line.startswith('@'):
+                lines.append(line)
+            else:
+                # if we expect a sequence of bases
+                if line_count % 4 == 1 and all_bases_valid(line.strip()):
+                    lines.append(line)
+                    base_len = len(line.strip())
+                # if we expect the second name line
+                elif line_count % 4 == 2 and line.startswith('+'):
+                    lines.append(line)
+                # if we expect a sequence of qualities and it's the expected length
+                elif line_count % 4 == 3:
+                    split_line = line.strip().split()
+                    # decimal qualities
+                    if len(split_line) == base_len:
+                        # convert
+                        phred_list = []
+                        for ch in split_line:
+                            int_ch = int(ch)
+                            if int_ch < range_min:
+                                range_min = int_ch
+                            if int_ch > range_max:
+                                range_max = int_ch
+                            if int_ch >= 0 and int_ch <= 93:
+                                phred_list.append(chr(int_ch + 33))
+                        # make sure we haven't lost any quality values
+                        if len(phred_list) == base_len:
+                            # print first three lines
+                            for l in lines:
+                                fout.write(l)
+                            # print converted quality line
+                            fout.write(''.join(phred_list))
+                            # reset
+                            lines = []
+                            base_len = -1
+                        # abort if so
+                        else:
+                            bad_blocks += 1
+                            lines = []
+                            base_len = -1
+                    # ascii qualities
+                    elif len(split_line[0]) == base_len:
+                        qualities = []
+                        # print converted quality line
+                        if orig_type == 'illumina':
+                            for c in line.strip():
+                                if ord(c) - 64 < range_min:
+                                    range_min = ord(c) - 64
+                                if ord(c) - 64 > range_max:
+                                    range_max = ord(c) - 64
+                                if ord(c) < 64 or ord(c) > 126:
+                                    bad_blocks += 1
+                                    base_len = -1
+                                    lines = []
+                                    break
+                                else:
+                                    qualities.append( chr( ord(c) - 31 ) )
+                            quals = ''.join(qualities)
+                        elif orig_type == 'solexa':
+                            for c in line.strip():
+                                if ord(c) - 64 < range_min:
+                                    range_min = ord(c) - 64
+                                if ord(c) - 64 > range_max:
+                                    range_max = ord(c) - 64
+                                if ord(c) < 59 or ord(c) > 126:
+                                    bad_blocks += 1
+                                    base_len = -1
+                                    lines = []
+                                    break
+                                else:
+                                    p = 10.0 ** ( ( ord(c) - 64 ) / -10.0 ) / ( 1 + 10.0 ** ( ( ord(c) - 64 ) / -10.0 ) )
+                                    qualities.append( chr( int( -10.0 * math.log10( p ) ) + 33 ) )
+                            quals = ''.join(qualities)
+                        else:  # 'sanger'
+                            for c in line.strip():
+                                if ord(c) - 33 < range_min:
+                                    range_min = ord(c) - 33
+                                if ord(c) - 33 > range_max:
+                                    range_max = ord(c) - 33
+                                if ord(c) < 33 or ord(c) > 126:
+                                    bad_blocks += 1
+                                    base_len = -1
+                                    lines = []
+                                    break
+                                else:
+                                    qualities.append(c)
+                            quals = ''.join(qualities)
+                        # make sure we don't have bad qualities
+                        if len(quals) == base_len:
+                            # print first three lines
+                            for l in lines:
+                                fout.write(l)
+                            # print out quality line
+                            fout.write(quals + '\n')
+                            # reset
+                            lines = []
+                            base_len = -1
+                    else:
+                        bad_blocks += 1
+                        base_len = -1
+                        lines = []
+                    # mark the successful end of a block
+                    block_num += 1
+            line_count += 1
+        line = fin.readline()
+    fout.close()
+    fin.close()
+    if range_min != 1000 and range_min != -5:
+        outmsg = 'The range of quality values found were: %s to %s' % (range_min, range_max)
+    else:
+        outmsg = ''
+    if bad_blocks > 0:
+        outmsg += '\nThere were %s bad blocks skipped' % (bad_blocks)
+    sys.stdout.write(outmsg)
+
+
+if __name__ == "__main__":
+    __main__()
diff -r 000000000000 -r 7621d36a4e9c next_gen_conversion/fastq_gen_conv.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next_gen_conversion/fastq_gen_conv.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,103 @@
+<tool id="fastq_gen_conv" name="FASTQ Groomer" version="1.0.0">
+    <description>converts any FASTQ to Sanger</description>
+    <command>
+python '$__tool_directory__/fastq_gen_conv.py'
+    --input='$input'
+    --origType=$origTypeChoice.origType
+    #if $origTypeChoice.origType == "sanger":
+        --allOrNot=$origTypeChoice.howManyBlocks.allOrNot
+        #if $origTypeChoice.howManyBlocks.allOrNot == "not":
+            --blocks=$origTypeChoice.howManyBlocks.blocks
+        #else:
+            --blocks="None"
+        #end if
+    #else:
+        --allOrNot="None"
+        --blocks="None"
+    #end if
+    --output='$output'
+    </command>
+    <inputs>
+        <param name="input" type="data" format="fastq" label="Groom this dataset" />
+        <conditional name="origTypeChoice">
+            <param name="origType" type="select" label="How do you think quality values are scaled?" help="See below for explanation">
+                <option value="solexa">Solexa/Illumina 1.0</option>
+                <option value="illumina">Illumina 1.3+</option>
+                <option value="sanger">Sanger (validation only)</option>
+            </param>
+            <when value="solexa" />
+            <when value="illumina" />
+            <when value="sanger">
+                <conditional name="howManyBlocks">
+                    <param name="allOrNot" type="select" label="Since your fastq is already in Sanger format you can check it for consistency">
+                        <option value="all">Check all (may take a while)</option>
+                        <option selected="true" value="not">Check selected number of blocks</option>
+                    </param>
+                    <when value="all" />
+                    <when value="not">
+                        <param name="blocks" type="integer" value="1000" label="How many blocks (four lines each) do you want to check?" />
+                    </when>
+                </conditional>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="output" format="fastqsanger"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="fastq_gen_conv_in1.fastq" ftype="fastq" />
+            <param name="origType" value="solexa" />
+            <output name="output" ftype="fastqsanger" file="fastq_gen_conv_out1.fastqsanger" />
+        </test>
+        <test>
+            <param name="input" value="fastq_gen_conv_in2.fastq" ftype="fastq" />
+            <param name="origType" value="sanger" />
+            <param name="allOrNot" value="not" />
+            <param name="blocks" value="3" />
+            <output name="output" ftype="fastqsanger" file="fastq_gen_conv_out2.fastqsanger" />
        </test>
+    </tests>
+    <help>
+**What it does**
+
+Galaxy's pipeline for mapping Illumina data requires the data to be in fastq format, with quality values conforming to the so-called "Sanger" format. Unfortunately there are many other types of fastq. The main objective of this tool is therefore to "groom" the various types of fastq into Sanger-conforming fastq that can be used in downstream applications such as mapping.
+
+.. class:: infomark
+
+**TIP**: If the input dataset is already in Sanger format the tool does not perform conversion. However, validation (described below) is still performed.
+
+-----
+
+**Types of fastq datasets**
+
+A good description of fastq datasets can be found `here`__, while a description of Galaxy's fastq "logic" can be found `here`__. Because the ranges of quality values within the different types of fastq datasets overlap, it is very difficult to detect them automatically. This tool supports conversion of two commonly found types (Solexa/Illumina 1.0 and Illumina 1.3+) into fastq Sanger.
+
+ .. __: http://en.wikipedia.org/wiki/FASTQ_format
+ .. __: https://wiki.galaxyproject.org/Learn/Datatypes#Fastq
+
+.. class:: warningmark
+
+**NOTE** that there is also a type of fastq format where quality values are represented by a list of space-delimited integers (e.g., 40 40 20 15 -5 20 ...). This tool **does not** handle such fastq. If you have such a dataset, it needs to be converted into ASCII-type fastq (where quality values are encoded by characters) by the "Numeric-to-ASCII" utility before it can be accepted by this tool.
+
+-----
+
+**Validation**
+
+In addition to converting quality values to Sanger format, the tool also checks the input dataset for consistency. Specifically, it:
+
+- skips empty lines
+- checks that blocks are properly formed by making sure that:
+
+  #. there are four lines per block
+  #. the first line starts with "@"
+  #. the third line starts with "+"
+  #. the lengths of the second line (sequence) and the fourth line (quality string) are identical
+
+- checks that quality values are within range for the chosen fastq format (i.e., the format provided by the user in the **How do you think quality values are scaled?** drop-down)
+
+To see exactly what the tool does you can take a look at its source code `here`__.
+
+ .. __: http://bitbucket.org/galaxy/galaxy-central/src/tip/tools/next_gen_conversion/fastq_gen_conv.py
+    </help>
+</tool>
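The four block-level checks listed under **Validation** are simple to express on their own. A condensed sketch with hypothetical helper names, not the tool's actual code -- see fastq_gen_conv.py above for the real logic, which additionally range-checks the quality characters against the claimed format::

    def read_blocks(fh):
        # yield four-line records, skipping empty lines as the groomer does
        block = []
        for line in fh:
            if line.strip():
                block.append(line.strip())
                if len(block) == 4:
                    yield block
                    block = []

    def block_is_valid(block):
        name, seq, name2, quals = block
        return (name.startswith('@') and
                name2.startswith('+') and
                len(seq) == len(quals) and
                all(base in 'aAcCgGtTN' for base in seq))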
diff -r 000000000000 -r 7621d36a4e9c next_gen_conversion/solid2fastq.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next_gen_conversion/solid2fastq.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,201 @@
+#!/usr/bin/env python
+
+import optparse
+import sqlite3
+import string
+import sys
+import tempfile
+
+import six
+
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+
+def solid2sanger( quality_string, min_qual=0 ):
+    sanger = ""
+    quality_string = quality_string.rstrip( " " )
+    for qv in quality_string.split(" "):
+        try:
+            if int( qv ) < 0:
+                qv = '0'
+            if int( qv ) < min_qual:
+                return False
+            sanger += chr( int( qv ) + 33 )
+        except ValueError:
+            # non-numeric tokens are skipped
+            pass
+    return sanger
+
+
+def Translator(frm='', to='', delete=''):
+    if len(to) == 1:
+        to = to * len(frm)
+    if six.PY2:
+        trans = string.maketrans(frm, to)
+
+        def callable(s):
+            return s.translate(trans, delete)
+    else:
+        # str.translate() takes a single table argument on Python 3,
+        # so fold the deletions into the table built by str.maketrans()
+        trans = str.maketrans(frm, to, delete)
+
+        def callable(s):
+            return s.translate(trans)
+
+    return callable
+
+
+def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode=False, trim_first_base=False, pair_end_flag='', min_qual=0, table_name=None ):
+    # Reads from two files, f_reads (csfasta reads) and f_qual (quality values), and produces output
+    # in one of three formats depending on the out parameter:
+    #   fastq = fastq format
+    #   txt = space-delimited format with defline, reads, and qvs
+    #   db = dump data into a sqlite3 db
+    # IMPORTANT! If out = db, two options must be provided:
+    #   1. f_out must be a db connection object initialized with sqlite3.connect()
+    #   2. table_name must be provided
+    if out == 'db':
+        cursor = f_out.cursor()
+        sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name
+        cursor.execute(sql)
+
+    lines = []
+    line = " "
+    while line:
+        for f in [ f_reads, f_qual ]:
+            line = f.readline().rstrip( '\n\r' )
+            while line.startswith( '#' ):
+                line = f.readline().rstrip( '\n\r' )
+            lines.append( line )
+
+        if lines[0].startswith( '>' ) and lines[1].startswith( '>' ):
+            if lines[0] != lines[1]:
+                stop_err('Reads and quality score files are out of sync and likely corrupted. Please check your input data')
+
+            defline = lines[0][1:]
+            if trim_name and ( defline[ len(defline) - 3: ] == "_F3" or defline[ len(defline) - 3: ] == "_R3" ):
+                defline = defline[ :len(defline) - 3 ]
+
+        elif ( not lines[0].startswith( '>' ) and not lines[1].startswith( '>' ) and len( lines[0] ) > 0 and len( lines[1] ) > 0 ):
+            if trim_first_base:
+                lines[0] = lines[0][1:]
+            if double_encode:
+                de = Translator(frm="0123.", to="ACGTN")
+                lines[0] = de(lines[0])
+            qual = solid2sanger( lines[1], int( min_qual ) )
+            if qual:
+                if out == 'fastq':
+                    f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) )
+                if out == 'txt':
+                    f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) )
+                if out == 'db':
+                    cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) )
+        lines = []
+
+
+def main():
+    usage = "%prog --fr F3.csfasta --fq F3.qual --fout fastq_output_file [option]"
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option(
+        '--fr', '--f_reads',
+        metavar="F3_CSFASTA_FILE",
+        dest='fr',
+        help='Name of F3 file with color space reads')
+    parser.add_option(
+        '--fq', '--f_qual',
+        metavar="F3_QUAL_FILE",
+        dest='fq',
+        help='Name of F3 file with color quality values')
+    parser.add_option(
+        '--fout', '--f3_fastq_output',
+        metavar="F3_OUTPUT",
+        dest='fout',
+        help='Name for F3 output file')
+    parser.add_option(
+        '--rr', '--r_reads',
+        metavar="R3_CSFASTA_FILE",
+        dest='rr',
+        default=False,
+        help='Name of R3 file with color space reads')
+    parser.add_option(
+        '--rq', '--r_qual',
+        metavar="R3_QUAL_FILE",
+        dest='rq',
+        default=False,
+        help='Name of R3 file with color quality values')
+    parser.add_option(
+        '--rout',
+        metavar="R3_OUTPUT",
+        dest='rout',
+        help='Name for R3 output file')
+    parser.add_option(
+        '-q', '--min_qual',
+        dest='min_qual',
+        default='-1000',
+        help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000')
+    parser.add_option(
+        '-t', '--trim_name',
+        dest='trim_name',
+        action='store_true',
+        default=False,
+        help='Trim _R3 and _F3 off read names. Default is False')
+    parser.add_option(
+        '-f', '--trim_first_base',
+        dest='trim_first_base',
+        action='store_true',
+        default=False,
+        help='Remove the first base of reads in color-space. Default is False')
+    parser.add_option(
+        '-d', '--double_encode',
+        dest='de',
+        action='store_true',
+        default=False,
+        help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False')
+
+    options, args = parser.parse_args()
+
+    if not ( options.fout and options.fr and options.fq ):
+        parser.error("""
+        One or more of the three required parameters is missing:
+        (1) --fr F3.csfasta file
+        (2) --fq F3.qual file
+        (3) --fout name of output file
+        Use --help for more info
+        """)
+
+    fr = open( options.fr, 'r' )
+    fq = open( options.fq, 'r' )
+    f_out = open( options.fout, 'w' )
+
+    if options.rr and options.rq:
+        rr = open( options.rr, 'r' )
+        rq = open( options.rq, 'r' )
+        if not options.rout:
+            parser.error("Provide the name for R3 output using the --rout option. Use --help for more info")
+        r_out = open( options.rout, 'w' )
+
+        db = tempfile.NamedTemporaryFile()
+
+        try:
+            con = sqlite3.connect(db.name)
+            cur = con.cursor()
+        except Exception:
+            stop_err('Cannot connect to %s\n' % db.name)
+
+        merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" )
+        merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" )
+        cur.execute('create index f3_name on f3( name )')
+        cur.execute('create index r3_name on r3( name )')
+
+        cur.execute('select * from f3,r3 where f3.name = r3.name')
+        for item in cur:
+            f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) )
+            r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) )
+
+    else:
+        merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual )
+
+    f_out.close()
+
+
+if __name__ == "__main__":
+    main()
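The pairing strategy in main() deserves a note: both mates are written to an on-disk SQLite database and matched by read name, so an entire SOLiD run never has to fit in memory. A stripped-down sketch of that join, using an in-memory database and made-up reads purely for illustration (table and column names follow the script)::

    import sqlite3

    con = sqlite3.connect(':memory:')  # the script uses a NamedTemporaryFile on disk
    for table in ('f3', 'r3'):
        con.execute('create table %s (name varchar(50) not null, read blob, qv blob)' % table)
        con.execute('create index %s_name on %s(name)' % (table, table))
    con.execute("insert into f3 values ('1831_573_1004', 'T0003...', '%>CC...')")
    con.execute("insert into r3 values ('1831_573_1004', 'G0131...', '+?;A...')")
    # only names present in both tables are emitted, which is what keeps
    # mates at the same position in the two output FASTQ files
    for row in con.execute('select * from f3, r3 where f3.name = r3.name'):
        print('@%s/1\n%s\n+\n%s' % (row[0], row[1], row[2]))
        print('@%s/2\n%s\n+\n%s' % (row[3], row[4], row[5]))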
diff -r 000000000000 -r 7621d36a4e9c next_gen_conversion/solid2fastq.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next_gen_conversion/solid2fastq.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,154 @@ +<tool id="solid2fastq" name="Convert" version="1.0.0"> + <description>SOLiD output to fastq</description> + <command interpreter="python"> + #if $is_run.paired == "no" #solid2fastq.py --fr=$input1 --fq=$input2 --fout=$out_file1 -q $qual $trim_name $trim_first_base $double_encode + #elif $is_run.paired == "yes" #solid2fastq.py --fr=$input1 --fq=$input2 --fout=$out_file1 --rr=$input3 --rq=$input4 --rout=$out_file2 -q $qual $trim_name $trim_first_base $double_encode + #end if# + </command> + <inputs> + <param name="input1" type="data" format="csfasta" label="Select reads"/> + <param name="input2" type="data" format="qualsolid" label="Select qualities"/> + <conditional name="is_run"> + <param name="paired" type="select" label="Is this a mate-pair run?"> + <option value="no" selected="true">No</option> + <option value="yes">Yes</option> + </param> + <when value="yes"> + <param name="input3" type="data" format="csfasta" label="Select Reverse reads"/> + <param name="input4" type="data" format="qualsolid" label="Select Reverse qualities"/> + </when> + <when value="no"> + </when> + </conditional> + <param name="qual" label="Remove reads containing color qualities below this value" type="integer" value="0"/> + <param name="trim_name" type="select" label="Trim trailing "_F3" and "_R3" ?"> + <option value="-t" selected="true">Yes</option> + <option value="">No</option> + </param> + <param name="trim_first_base" type="select" label="Trim first base?"> + <option value="-f">Yes (BWA)</option> + <option value="" selected="true">No (bowtie)</option> + </param> + <param name="double_encode" type="select" label="Double encode?"> + <option value="-d">Yes (BWA)</option> + <option value="" selected="true">No (bowtie)</option> + </param> + </inputs> + <outputs> + <data format="fastqcssanger" name="out_file1"/> + <data format="fastqcssanger" name="out_file2"> + <filter>is_run['paired'] == 'yes'</filter> + </data> + </outputs> + <tests> + <test> + <param name="input1" value="fr.csfasta" ftype="csfasta"/> + <param name="input2" value="fr.qualsolid" ftype="qualsolid" /> + <param name="paired" value="no"/> + <param name="qual" value="0" /> + <param name="trim_first_base" value="No" /> + <param name="trim_name" value="No" /> + <param name="double_encode" value="No"/> + <output name="out_file1" file="solid2fastq_out_1.fastq"/> + </test> + <test> + <param name="input1" value="fr.csfasta" ftype="csfasta"/> + <param name="input2" value="fr.qualsolid" ftype="qualsolid" /> + <param name="paired" value="yes"/> + <param name="input3" value="rr.csfasta" ftype="csfasta"/> + <param name="input4" value="rr.qualsolid" ftype="qualsolid" /> + <param name="qual" value="0" /> + <param name="trim_first_base" value="No" /> + <param name="trim_name" value="Yes" /> + <param name="double_encode" value="No"/> + <output name="out_file1" file="solid2fastq_out_2.fastq"/> + <output name="out_file2" file="solid2fastq_out_3.fastq"/> + </test> + </tests> +<help> +**What it does** + +Converts output of SOLiD instrument (versions 3.5 and earlier) to fastq format suitable for bowtie, bwa, and PerM mappers. 
+ +-------- + +**Input datasets** + +Below are examples of forward (F3) reads and quality scores: + +Reads:: + + >1831_573_1004_F3 + T00030133312212111300011021310132222 + >1831_573_1567_F3 + T03330322230322112131010221102122113 + +Quality scores:: + + >1831_573_1004_F3 + 4 29 34 34 32 32 24 24 20 17 10 34 29 20 34 13 30 34 22 24 11 28 19 17 34 17 24 17 25 34 7 24 14 12 22 + >1831_573_1567_F3 + 8 26 31 31 16 22 30 31 28 29 22 30 30 31 32 23 30 28 28 31 19 32 30 32 19 8 32 10 13 6 32 10 6 16 11 + + +**Mate pairs** + +If your data is from a mate-paired run, you will have additional read and quality datasets that will look similar to the ones above with one exception: the names of reads will be ending with "_R3". +In this case choose **Yes** from the *Is this a mate-pair run?* drop down and you will be able to select R reads. When processing mate pairs this tool generates two output files: one for F3 reads and the other for R3 reads. +The reads are guaranteed to be paired -- mated reads will be in the same position in F3 and R3 fastq file. However, because pairing is verified it may take a while to process an entire SOLiD run (several hours). + +------ + +**Explanation of parameters** + +**Remove reads containing color qualities below this value** - any read that contains as least one color call with quality lower than the specified value **will not** be reported. + +**Trim trailing "_F3" and "_R3"?** - does just that. Not necessary for bowtie. Required for BWA. + +**Trim first base?** - SOLiD reads contain an adapter base such as the first T in this read:: + + >1831_573_1004_F3 + T00030133312212111300011021310132222 + +this option removes this base leaving only color calls. Not necessary for bowtie. Required for BWA. + +**Double encode?** - converts color calls (0123.) to pseudo-nucleotides (ACGTN). Not necessary for bowtie. Required for BWA. + +------ + +**Examples of output** + +When all parameters are left "as-is" you will get this (using reads and qualities shown above):: + + @1831_573_1004 + T00030133312212111300011021310132222 + + + %%>CCAA9952+C>5C.?C79,=42C292:C(9/-7 + @1831_573_1004 + T03330322230322112131010221102122113 + + + );@@17?@=>7??@A8?==@4A?A4)A+.'A+'1, + +Setting *Trim first base from reads* to **Yes** will produce this:: + + @1831_573_1004 + 00030133312212111300011021310132222 + + + %%>CCAA9952+C>5C.?C79,=42C292:C(9/-7 + @1831_573_1004 + 03330322230322112131010221102122113 + + + );@@17?@=>7??@A8?==@4A?A4)A+.'A+'1, + +Finally, setting *Double encode* to **Yes** will yield:: + + @1831_573_1004 + TAAATACTTTCGGCGCCCTAAACCAGCTCACTGGGG + + + %%>CCAA9952+C>5C.?C79,=42C292:C(9/-7 + @1831_573_1004 + TATTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT + + + );@@17?@=>7??@A8?==@4A?A4)A+.'A+'1, +</help> +</tool> |
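Double encoding, as described above, is a straight character-for-character translation. A tiny sketch (the Python 3 form of the Translator() helper in solid2fastq.py), reproducing the "Double encode" example output from the help::

    table = str.maketrans('0123.', 'ACGTN')
    read = 'T00030133312212111300011021310132222'
    print(read.translate(table))  # -> TAAATACTTTCGGCGCCCTAAACCAGCTCACTGGGG

The leading adapter base T passes through unchanged because it is not in the source alphabet; with *Trim first base?* set to **Yes** it would be removed before translation.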
diff -r 000000000000 -r 7621d36a4e9c next_gen_conversion/solid_to_fastq.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next_gen_conversion/solid_to_fastq.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,74 @@ +#!/usr/bin/env python + +""" +Converts SOLiD data to Sanger FASTQ format. + +usage: %prog [options] + -i, --input1=i: Forward reads file + -q, --input2=q: Forward qual file + -I, --input3=I: Reverse reads file + -Q, --input4=Q: Reverse qual file + -o, --output1=o: Forward output + -r, --output2=r: Reverse output + +usage: %prog forward_reads_file forwards_qual_file reverse_reads_file(or_None) reverse_qual_file(or_None) output_file ouptut_id output_dir +""" + +import os +import sys +import tempfile + +from bx.cookbook import doc_optparse + + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + + +def replaceNeg1(fin, fout): + line = fin.readline() + while line.strip(): + fout.write(line.replace('-1', '1')) + line = fin.readline() + fout.seek(0) + return fout + + +def __main__(): + # Parse Command Line + options, args = doc_optparse.parse( __doc__ ) + # common temp file setup + tmpf = tempfile.NamedTemporaryFile() # forward reads + tmpqf = tempfile.NamedTemporaryFile() + tmpqf = replaceNeg1(open(options.input2, 'r'), tmpqf) + # if paired-end data (have reverse input files) + if options.input3 != "None" and options.input4 != "None": + tmpr = tempfile.NamedTemporaryFile() # reverse reads + # replace the -1 in the qualities file + tmpqr = tempfile.NamedTemporaryFile() + tmpqr = replaceNeg1(open(options.input4, 'r'), tmpqr) + cmd1 = "%s/bwa_solid2fastq_modified.pl 'yes' %s %s %s %s %s %s 2>&1" % (os.path.split(sys.argv[0])[0], tmpf.name, tmpr.name, options.input1, tmpqf.name, options.input3, tmpqr.name) + try: + os.system(cmd1) + os.system('gunzip -c %s >> %s' % (tmpf.name, options.output1)) + os.system('gunzip -c %s >> %s' % (tmpr.name, options.output2)) + except Exception as eq: + stop_err("Error converting data to fastq format.\n" + str(eq)) + tmpr.close() + tmpqr.close() + # if single-end data + else: + cmd1 = "%s/bwa_solid2fastq_modified.pl 'no' %s %s %s %s %s %s 2>&1" % (os.path.split(sys.argv[0])[0], tmpf.name, None, options.input1, tmpqf.name, None, None) + try: + os.system(cmd1) + os.system('gunzip -c %s >> %s' % (tmpf.name, options.output1)) + except Exception as eq: + stop_err("Error converting data to fastq format.\n" + str(eq)) + tmpqf.close() + tmpf.close() + sys.stdout.write('converted SOLiD data') + + +if __name__ == "__main__": + __main__() |
diff -r 000000000000 -r 7621d36a4e9c next_gen_conversion/solid_to_fastq.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/next_gen_conversion/solid_to_fastq.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,101 @@ +<tool id="solid_to_fastq" name="SOLiD-to-FASTQ" version="1.0.0"> + <description>converts SOLiD data to FASTQ data</description> + <command interpreter="python"> + solid_to_fastq.py + --input1=$input1 + --input2=$input2 + #if $paired.pairedSingle == "single": + --input3="None" + --input4="None" + #else: + --input3=$input3 + --input4=$input4 + #end if + --output1=$output1 + #if $paired.pairedSingle == "single": + --output2="None" + #else: + --output2=$output2 + #end if + </command> + <inputs> + <conditional name="paired"> + <param name="pairedSingle" type="select" label="Is this library mate-paired?"> + <option value="single">Single</option> + <option value="paired">Paired</option> + </param> + <when value="single"> + <param name="input1" type="data" format="csfasta" label="F3 read file" /> + <param name="input2" type="data" format="qualsolid" label="F3 qual file" /> + </when> + <when value="paired"> + <param name="input1" type="data" format="csfasta" label="F3 read file" /> + <param name="input2" type="data" format="qualsolid" label="F3 qual file" /> + <param name="input3" type="data" format="csfasta" label="R3 read file" /> + <param name="input4" type="data" format="qualsolid" label="R3 qual file" /> + </when> + </conditional> + </inputs> + <outputs> + <!-- Variable number of outputs. Either one (for single-end) or two (for paired-end) --> + <data name="output1" format="fastqsanger"/> + <data name="output2" format="fastqsanger"> + <filter>paired['pairedSingle'] == 'paired'</filter> + </data> + </outputs> + <tests> + <test> + <param name="pairedSingle" value="single" /> + <param name="input1" value="s2fq_phiX.csfasta" ftype="csfasta" /> + <param name="input2" value="s2fq_phiX.qualsolid" ftype="qualsolid" /> + <output name="output1" file="s2fq_out1.fastqsanger" /> + </test> + <test> + <param name="pairedSingle" value="paired" /> + <param name="input1" value="s2fq_paired_F3.csfasta" ftype="csfasta" /> + <param name="input2" value="s2fq_paired_F3_QV.qualsolid" ftype="qualsolid" /> + <param name="input3" value="s2fq_paired_R3.csfasta" ftype="csfasta" /> + <param name="input4" value="s2fq_paired_R3_QV.qualsolid" ftype="qualsolid" /> + <output name="output1" file="s2fq_out2.fastqsanger" /> + <!-- testing framework does not deal with multiple outputs yet + <output name="output2" file="s2fq_out3.fastqsanger" /> + --> + </test> + </tests> + <help> + +**What it does** + +This tool takes reads and quality files and converts them to FASTQ data ( Sanger variant ). Any -1 qualities are converted to 1 before being converted to FASTQ. Note that it also converts sequences to base pairs. + +----- + +**Example** + +- Converting the following sequences:: + + >1831_573_1004_F3 + T00030133312212111300011021310132222 + >1831_573_1567_F3 + T03330322230322112131010221102122113 + +- and quality scores:: + + >1831_573_1004_F3 + 4 29 34 34 32 32 24 24 20 17 10 34 29 20 34 13 30 34 22 24 11 28 19 17 34 17 24 17 25 34 7 24 14 12 22 + >1831_573_1567_F3 + 8 26 31 31 16 22 30 31 28 29 22 30 30 31 32 23 30 28 28 31 19 32 30 32 19 8 32 10 13 6 32 10 6 16 11 + +- will produce the following Sanger FASTQ data:: + + @1831_573_1004/1 + AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG + + + >CCAA9952+C>5C.?C79,=42C292:C(9/-7 + @1831_573_1567/1 + TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT + + + ;@@17?@=>7??@A8?==@4A?A4)A+.'A+'1, + + </help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c ngs_simulation/ngs_simulation.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngs_simulation/ngs_simulation.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,280 @@
+#!/usr/bin/env python
+"""
+Runs Ben's simulation.
+
+usage: %prog [options]
+    -i, --input=i: Input genome (FASTA format)
+    -g, --genome=g: If built-in, the genome being used
+    -l, --read_len=l: Read length
+    -c, --avg_coverage=c: Average coverage
+    -e, --error_rate=e: Error rate (0-1)
+    -n, --num_sims=n: Number of simulations to run
+    -p, --polymorphism=p: Frequency/ies for minor allele (comma-separated list of 0-1)
+    -d, --detection_thresh=d: Detection thresholds (comma-separated list of 0-1)
+    -p, --output_png=p: Plot output
+    -s, --summary_out=s: Whether or not to output a file with summary of all simulations
+    -m, --output_summary=m: File name for output summary of all simulations
+    -f, --new_file_path=f: Directory for summary output files
+"""
+# removed output of all simulation results on request (not working)
+#    -r, --sim_results=r: Output all tabular simulation results (number of polymorphisms times number of detection thresholds)
+#    -o, --output=o: Base name for summary output for each run
+from __future__ import print_function

+import itertools
+import os
+import random
+import sys
+import tempfile
+
+from bx.cookbook import doc_optparse
+from rpy import r
+
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+
+def __main__():
+    # Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    # validate parameters
+    error = ''
+    try:
+        read_len = int( options.read_len )
+        if read_len <= 0:
+            raise Exception(' greater than 0')
+    except TypeError as e:
+        error = ': %s' % str( e )
+    if error:
+        stop_err( 'Make sure your number of reads is an integer value%s' % error )
+    error = ''
+    try:
+        avg_coverage = int( options.avg_coverage )
+        if avg_coverage <= 0:
+            raise Exception(' greater than 0')
+    except Exception as e:
+        error = ': %s' % str( e )
+    if error:
+        stop_err( 'Make sure your average coverage is an integer value%s' % error )
+    error = ''
+    try:
+        error_rate = float( options.error_rate )
+        if error_rate >= 1.0:
+            error_rate = 10 ** ( -error_rate / 10.0 )
+        elif error_rate < 0:
+            raise Exception(' between 0 and 1')
+    except Exception as e:
+        error = ': %s' % str( e )
+    if error:
+        stop_err( 'Make sure the error rate is a decimal value%s or the quality score is at least 1' % error )
+    try:
+        num_sims = int( options.num_sims )
+    except TypeError as e:
+        stop_err( 'Make sure the number of simulations is an integer value: %s' % str( e ) )
+    if options.polymorphism != 'None':
+        polymorphisms = [ float( p ) for p in options.polymorphism.split( ',' ) ]
+    else:
+        stop_err( 'Select at least one polymorphism value to use' )
+    if options.detection_thresh != 'None':
+        detection_threshes = [ float( dt ) for dt in options.detection_thresh.split( ',' ) ]
+    else:
+        stop_err( 'Select at least one detection threshold to use' )
+
+    # mutation dictionaries
+    hp_dict = { 'A': 'G', 'G': 'A', 'C': 'T', 'T': 'C', 'N': 'N' }  # heteroplasmy dictionary
+    mt_dict = { 'A': 'C', 'C': 'A', 'G': 'T', 'T': 'G', 'N': 'N' }  # misread dictionary
+
+    # read fasta file to seq string
+    all_lines = open( options.input, 'rb' ).readlines()
+    seq = ''
+    for line in all_lines:
+        line = line.rstrip()
+        if line.startswith('>'):
+            pass
+        else:
+            seq += line.upper()
+    seq_len = len( seq )
+
+    # output file name template
+# removed output of all simulation results on request (not working)
+#    if options.sim_results == "true":
+#        out_name_template = os.path.join( options.new_file_path, 'primary_output%s_' + options.output + '_visible_tabular' )
+#    else:
+#        out_name_template = tempfile.NamedTemporaryFile().name + '_%s'
+    out_name_template = tempfile.NamedT
[... diff truncated in the changeset source ...]
 output error sums and genome size to summary file
+            output.write( '%d\t%d\n' % ( fpos, fneg ) )
+            sim_count += 1
+        # close output up
+        output.close()
+
+    # Parameters (heteroplasmy, error threshold, colours)
+    r( '''
+    het=c(%s)
+    err=c(%s)
+    grade = (0:32)/32
+    hues = rev(gray(grade))
+    ''' % ( ','.join( [ str( p ) for p in polymorphisms ] ), ','.join( [ str( d ) for d in detection_threshes ] ) ) )
+
+    # Suppress warnings
+    r( 'options(warn=-1)' )
+
+    # Create allsum (for FP) and allneg (for FN) objects
+    r( 'allsum <- data.frame()' )
+    for polymorphism in polymorphisms:
+        for detection_thresh in detection_threshes:
+            output = outputs[ polymorphism ][ detection_thresh ]
+            cmd = '''
+            ngsum = read.delim('%s', header=T)
+            ngsum$fprate <- ngsum$FP/%s
+            ngsum$hetcol <- %s
+            ngsum$errcol <- %s
+            allsum <- rbind(allsum, ngsum)
+            ''' % ( output, seq_len, polymorphism, detection_thresh )
+            r( cmd )
+
+    if os.path.getsize( output ) == 0:
+        for p in outputs.keys():
+            for d in outputs[ p ].keys():
+                sys.stderr.write(outputs[ p ][ d ] + ' ' + str( os.path.getsize( outputs[ p ][ d ] ) ) + '\n')
+
+    if options.summary_out == "true":
+        r( 'write.table(summary(ngsum), file="%s", quote=FALSE, sep="\t", row.names=FALSE)' % options.output_summary )
+
+    # Summary objects (these could be printed)
+    r( '''
+    tr_pos <- tapply(allsum$fprate,list(allsum$hetcol,allsum$errcol), mean)
+    tr_neg <- tapply(allsum$FN,list(allsum$hetcol,allsum$errcol), mean)
+    cat('\nFalse Positive Rate Summary\n\t', file='%s', append=T, sep='\t')
+    write.table(format(tr_pos, digits=4), file='%s', append=T, quote=F, sep='\t')
+    cat('\nFalse Negative Rate Summary\n\t', file='%s', append=T, sep='\t')
+    write.table(format(tr_neg, digits=4), file='%s', append=T, quote=F, sep='\t')
+    ''' % tuple( [ options.output_summary ] * 4 ) )
+
+    # Setup graphs
+    r( '''
+    png('%s', width=800, height=500, units='px', res=250)
+    layout(matrix(data=c(1,2,1,3,1,4), nrow=2, ncol=3), widths=c(4,6,2), heights=c(1,10,10))
+    ''' % options.output_png )
+
+    # Main title
+    genome = ''
+    if options.genome:
+        genome = '%s: ' % options.genome
+    r( '''
+    par(mar=c(0,0,0,0))
+    plot(1, type='n', axes=F, xlab='', ylab='')
+    text(1,1,paste('%sVariation in False Positives and Negatives (', %s, ' simulations, coverage ', %s,')', sep=''), font=2, family='sans', cex=0.7)
+    ''' % ( genome, options.num_sims, options.avg_coverage ) )
+
+    # False positive boxplot
+    r( '''
+    par(mar=c(5,4,2,2), las=1, cex=0.35)
+    boxplot(allsum$fprate ~ allsum$errcol, horizontal=T, ylim=rev(range(allsum$fprate)), cex.axis=0.85)
+    title(main='False Positives', xlab='false positive rate', ylab='')
+    ''' )
+
+    # False negative heatmap (note zlim command!)
+    num_polys = len( polymorphisms )
+    num_dets = len( detection_threshes )
+    r( '''
+    par(mar=c(5,4,2,1), las=1, cex=0.35)
+    image(1:%s, 1:%s, tr_neg, zlim=c(0,1), col=hues, xlab='', ylab='', axes=F, border=1)
+    axis(1, at=1:%s, labels=rownames(tr_neg), lwd=1, cex.axis=0.85, axs='i')
+    axis(2, at=1:%s, labels=colnames(tr_neg), lwd=1, cex.axis=0.85)
+    title(main='False Negatives', xlab='minor allele frequency', ylab='detection threshold')
+    ''' % ( num_polys, num_dets, num_polys, num_dets ) )
+
+    # Scale alongside
+    r( '''
+    par(mar=c(2,2,2,3), las=1)
+    image(1, grade, matrix(grade, ncol=length(grade), nrow=1), col=hues, xlab='', ylab='', xaxt='n', las=1, cex.axis=0.85)
+    title(main='Key', cex=0.35)
+    mtext('false negative rate', side=1, cex=0.35)
+    ''' )
+
+    # Close graphics
+    r( '''
+    layout(1)
+    dev.off()
+    ''' )
+
+
+if __name__ == "__main__":
+    __main__()
diff -r 000000000000 -r 7621d36a4e9c ngs_simulation/ngs_simulation.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngs_simulation/ngs_simulation.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,222 @@
+<tool id="ngs_simulation" name="Simulate" version="1.0.0">
+<!--<tool id="ngs_simulation" name="Simulate" force_history_refresh="True" version="1.0.0">-->
+    <description>Illumina runs</description>
+    <requirements>
+        <requirement type="package" version="1.0.3">rpy</requirement>
+    </requirements>
+    <command interpreter="python">
+    ngs_simulation.py
+    #if $in_type.input_type == "built-in"
+    --input="${in_type.genome.fields.path}"
+    --genome=$in_type.genome
+    #else
+    --input="${ in_type.input1 }"
+    #end if
+    --read_len=$read_len
+    --avg_coverage=$avg_coverage
+    --error_rate=$error_rate
+    --num_sims=$num_sims
+    --polymorphism=$polymorphism
+    --detection_thresh=$detection_thresh
+    --output_png="${ output_png }"
+    --summary_out=$summary_out
+    --output_summary="${ output_summary }"
+    --new_file_path="."
+    </command>
+<!-- If we want to include the file of all simulation results:
+    sim_results=$sim_results
+    output=$output.id
+-->
+    <inputs>
+        <conditional name="in_type">
+            <param name="input_type" type="select" label="Use a built-in FASTA file or one from the history?">
+                <option value="built-in">Built-in</option>
+                <option value="history">History file</option>
+            </param>
+            <when value="built-in">
+                <param name="genome" type="select" label="Select a built-in genome" help="if your genome of interest is not listed, contact the Galaxy team">
+                    <options from_data_table="ngs_sim_fasta" />
+                </param>
+            </when>
+            <when value="history">
+                <param name="input1" type="data" format="fasta" label="Input genome (FASTA format)" />
+            </when>
+        </conditional>
+        <param name="read_len" type="integer" value="76" label="Read length" />
+        <param name="avg_coverage" type="integer" value="200" label="Average coverage" />
+        <param name="error_rate" type="float" value="0.001" label="Error rate or quality score" help="Quality score if integer 1 or greater; error rate if between 0 and 1" />
+        <param name="num_sims" type="integer" value="100" label="The number of simulations to run" />
+        <param name="polymorphism" type="select" multiple="true" label="Frequency/ies for minor allele">
+            <option value="0.001">0.001</option>
+            <option value="0.002">0.002</option>
+            <option value="0.003">0.003</option>
+            <option value="0.004">0.004</option>
+            <option value="0.005">0.005</option>
+            <option value="0.006">0.006</option>
+            <option value="0.007">0.007</option>
+            <option value="0.008">0.008</option>
+            <option value="0.009">0.009</option>
+            <option value="0.01">0.01</option>
+            <option value="0.02">0.02</option>
+            <option value="0.03">0.03</option>
+            <option value="0.04">0.04</option>
+            <option value="0.05">0.05</option>
+            <option value="0.06">0.06</option>
+            <option value="0.07">0.07</option>
+            <option value="0.08">0.08</option>
+            <option value="0.09">0.09</option>
+            <option value="0.1">0.1</option>
+            <option value="0.2">0.2</option>
+            <option value="0.3">0.3</option>
+            <option value="0.4">0.4</option>
+            <option value="0.5">0.5</option>
+            <option value="0.6">0.6</option>
+            <option value="0.7">0.7</option>
+            <option value="0.8">0.8</option>
+            <option value="0.9">0.9</option>
+            <option value="1.0">1.0</option>
+            <validator type="no_options" message="You must select at least one value" />
+        </param>
+        <param name="detection_thresh" type="select" multiple="true" label="Detection thresholds">
+            <option value="0.001">0.001</option>
+            <option value="0.002">0.002</option>
+            <option value="0.003">0.003</option>
+            <option value="0.004">0.004</option>
+            <option value="0.005">0.005</option>
+            <option value="0.006">0.006</option>
+            <option value="0.007">0.007</option>
+            <option value="0.008
[... diff truncated in the changeset source ...]
ed output files.
+    -->
+    <!--
+    <test>
+        <param name="input_type" value="history" />
+        <param name="input1" value="ngs_simulation_in1.fasta" ftype="fasta" />
+        <param name="read_len" value="76" />
+        <param name="avg_coverage" value="200" />
+        <param name="error_rate" value="0.001" />
+        <param name="num_sims" value="25" />
+        <param name="polymorphism" value="0.02,0.04,0.1" />
+        <param name="detection_thresh" value="0.01,0.02" />
+        <param name="summary_out" value="true" />
+        <output name="output_png" file="ngs_simulation_out1.png" />
+        <output name="output_summary" file="ngs_simulation_out2.tabular" />
+    </test>
+    <test>
+        <param name="input_type" value="built-in" />
+        <param name="genome" value="pUC18" />
+        <param name="read_len" value="50" />
+        <param name="avg_coverage" value="150" />
+        <param name="error_rate" value="0.005" />
+        <param name="num_sims" value="25" />
+        <param name="polymorphism" value="0.001,0.005" />
+        <param name="detection_thresh" value="0.001,0.002" />
+        <param name="summary_out" value="false" />
+        <output name="output_png" file="ngs_simulation_out3.png" />
+    </test>
+    -->
+    </tests>
+    <help>
+
+**What it does**
+
+This tool simulates an Illumina run and provides plots of false positives and false negatives. It allows for a range of simulation parameters to be set. Note that this simulation sets only one (randomly chosen) position in the genome as polymorphic, according to the value specified. Superimposed on this are "sequencing errors", which are uniformly (and randomly) distributed. Polymorphisms are assigned using the detection threshold, so if the detection threshold is set to the same as the minor allele frequency, the expected false negative rate is 50%.
+
+**Parameter list**
+
+These are the parameters that should be set for the simulation::
+
+  Read length (which is the same for all reads)
+  Average Coverage
+  Frequency for Minor Allele
+  Sequencing Error Rate
+  Detection Threshold
+  Number of Simulations
+
+You should also choose either a built-in genome or supply your own FASTA file.
+
+**Output**
+
+There are one or two output datasets. The first is a png that contains two different plots and is always generated. The second is optional and is a text file with some summary information about the simulations that were run. Below are some example outputs for a 10-simulation run on phiX with the default settings::
+
+  Read length                   76
+  Average coverage              200
+  Error rate/quality score      0.001
+  Number of simulations         100
+  Frequencies for minor allele  0.002
+                                0.004
+  Detection thresholds          0.003
+                                0.005
+                                0.007
+  Include summary file          Yes
+
+Plot output (png):
+
+.. image:: ${static_path}/images/ngs_simulation.png
+
+Summary output (txt)::
+
+        FP              FN          GENOMESIZE.5386   fprate            hetcol          errcol
+  Min.   : 71.0   Min.   :0.0   Mode:logical   Min.   :0.01318   Min.   :0.004   Min.   :0.007
+  1st Qu.: 86.0   1st Qu.:1.0   NA's:10        1st Qu.:0.01597   1st Qu.:0.004   1st Qu.:0.007
+  Median : 92.5   Median :1.0   NA             Median :0.01717   Median :0.004   Median :0.007
+  Mean   : 93.6   Mean   :0.9   NA             Mean   :0.01738   Mean   :0.004   Mean   :0.007
+  3rd Qu.:100.8   3rd Qu.:1.0   NA             3rd Qu.:0.01871   3rd Qu.:0.004   3rd Qu.:0.007
+  Max.   :123.0   Max.   :1.0   NA             Max.   :0.02284   Max.   :0.004   Max.   :0.007
+
+  False Positive Rate Summary
+          0.003    0.005    0.007
+  0.001 0.17711  0.10854  0.01673
+  0.009 0.18049  0.10791  0.01738
+
+  False Negative Rate Summary
+        0.003  0.005  0.007
+  0.001   1.0    0.8    1.0
+  0.009   0.4    0.7    0.9
+
+    </help>
+</tool>
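The 50% expected false negative rate quoted in the help follows from the detection rule: at the single polymorphic site, the observed minor-allele count is roughly binomial around the true frequency, so it falls below a threshold equal to that frequency about half the time. A quick empirical check of this claim, with illustrative parameters and sequencing error ignored (not the tool's actual simulation code)::

    import random

    coverage, maf, sims = 200, 0.05, 10000
    threshold = maf
    missed = 0
    for _ in range(sims):
        minor = sum(1 for _ in range(coverage) if random.random() < maf)
        if minor / float(coverage) < threshold:  # site not called polymorphic
            missed += 1
    print('empirical false negative rate: %.2f' % (missed / float(sims)))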
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/BEAM2_wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/BEAM2_wrapper.sh Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# +# Galaxy wrapper for Yu Zhang's BEAM2 adds two new options +# significance=foo renames significance.txt to foo after BEAM2 is run +# posterior=bar renames posterior.txt to bar after BEAM2 is run +# + +set -e + +export PATH=$PATH:$(dirname $0) + +## options +significance= +posterior= +new_args= +map= +ped= + +TFILE="/tmp/BEAM2.$$.tmp" + +## separate significance and posterior arguments from arguments to BEAM2 +until [ $# -eq 0 ] +do + case $1 in + significance=*) + significance=${1#significance=} + ;; + posterior=*) + posterior=${1#posterior=} + ;; + map=*) + map=${1#map=} + ;; + ped=*) + ped=${1#ped=} + ;; + *) + if [ -z "$new_args" ]; then + new_args=$1 + else + new_args="$new_args $1" + fi + ;; + esac + + shift +done + +## convert input for use with BEAM2 +lped_to_geno.pl $map $ped > $TFILE +if [ $? -ne 0 ]; then + echo "failed: lped_to_geno.pl $map $ped > $TFILE" + exit 1 +fi + +## run BEAM2 +BEAM2 $TFILE $new_args 1>/dev/null +if [ $? -ne 0 ]; then + echo "failed: BEAM2 $TFILE $new_args" + exit 1 +fi + +mergeSnps.pl significance.txt $TFILE +if [ $? -ne 0 ]; then + echo "failed: mergeSnps.pl significance.txt $TFILE" + exit 1 +fi + +## move output files +mv significance.txt $significance +mv posterior.txt $posterior + +## cleanup +rm -f $TFILE + |
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/beam.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/beam.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,141 @@ +<tool id="hgv_beam" name="BEAM" version="1.0.0"> + <description>significant single- and multi-locus SNP associations in case-control studies</description> + + <command interpreter="bash"> + BEAM2_wrapper.sh map=${input.extra_files_path}/${input.metadata.base_name}.map ped=${input.extra_files_path}/${input.metadata.base_name}.ped $burnin $mcmc $pvalue significance=$significance posterior=$posterior + </command> + + <inputs> + <param format="lped" name="input" type="data" label="Dataset"/> + <param name="burnin" label="Number of MCMC burn-in steps" type="integer" value="200" /> + <param name="mcmc" label="Number of MCMC sampling steps" type="integer" value="200" /> + <param name="pvalue" label="Significance cutoff (after Bonferroni adjustment)" type="float" value="0.05" /> + </inputs> + + <outputs> + <data format="tabular" name="significance" /> + <data format="tabular" name="posterior" /> + </outputs> + + <requirements> + <requirement type="package">beam</requirement> + <requirement type="binary">mv</requirement> + <requirement type="binary">rm</requirement> + </requirements> + + <!-- broken. will be fixed soon. + <tests> + <test> + <param name='input' value='gpass_and_beam_input' ftype='lped' > + <metadata name='base_name' value='gpass_and_beam_input' /> + <composite_data value='gpass_and_beam_input.ped' /> + <composite_data value='gpass_and_beam_input.map' /> + <edit_attributes type='name' value='gpass_and_beam_input' /> + </param> + <param name="burnin" value="200"/> + <param name="mcmc" value="200"/> + <param name="pvalue" value="0.05"/> + <output name="significance" file="beam_output1.tab"/> + <output name="posterior" file="beam_output2.tab"/> + </test> + </tests> + --> + + <help> +.. class:: infomark + +This tool can take a long time to run, depending on the number of SNPs, the +sample size, and the number of MCMC steps specified. If you have hundreds +of thousands of SNPs, it may take over a day. The main tasks that slow down +this tool are searching for interactions and dynamically partitioning the +SNPs into blocks. Optimization is certainly possible, but hasn't been done +yet. **If your only interest is to detect SNPs with primary effects (i.e., +single-SNP associations), please use the GPASS tool instead.** + +----- + +**Dataset formats** + +The input dataset must be in lped_ format. The output datasets are both tabular_. +(`Dataset missing?`_) + +.. _lped: ${static_path}/formatHelp.html#lped +.. _tabular: ${static_path}/formatHelp.html#tabular +.. _Dataset missing?: ${static_path}/formatHelp.html + +----- + +**What it does** + +BEAM (Bayesian Epistasis Association Mapping) uses a Markov Chain Monte Carlo (MCMC) method to infer SNP block structures and detect both single-marker +and interaction effects from case-control SNP data. +This tool also partitions SNPs into blocks based on linkage disequilibrium (LD). The method utilized is Bayesian, so the outputs are posterior probabilities of association, along with block partitions. An advantage of this method is that it provides uncertainty measures for the associations and block partitions, and it scales well from small to large sample sizes. It is powerful in detecting gene-gene interactions, although slow for large datasets. + +----- + +**Example** + +- input map file:: + + 1 rs0 0 738547 + 1 rs1 0 5597094 + 1 rs2 0 9424115 + etc. 
+ +- input ped file:: + + 1 1 0 0 1 1 G G A A A A A A A A A G A A G G G G A A G G G G G G A A A A A G A A G G A G A G A A G G A A G G A A G G A G A A G G A A G G A A A G A G G G A G G G G G A A A G A A G G G G G G G G A G A A A A A A A A + 1 1 0 0 1 1 G G A G G G A A A A A G A A G G G G G G A A G G A G A G G G G G A G G G A G A A G G A G G G A A G G G G A G A G G G A G A A A A G G G G A G A G G G A G A A A A A G G G A G G G A G G G G G A A G G A G + etc. + +- first output file, significance.txt:: + + ID chr position results + rs0 chr1 738547 10 20 score= 45.101397 , df= 8 , p= 0.000431 , N=1225 + +- second output file, posterior.txt:: + + id: chr position marginal + interaction = total posterior + 0: 1 738547 0.0000 + 0.0000 = 0.0000 + 1: 1 5597094 0.0000 + 0.0000 = 0.0000 + 2: 1 9424115 0.0000 + 0.0000 = 0.0000 + 3: 1 13879818 0.0000 + 0.0000 = 0.0000 + 4: 1 13934751 0.0000 + 0.0000 = 0.0000 + 5: 1 16803491 0.0000 + 0.0000 = 0.0000 + 6: 1 17236854 0.0000 + 0.0000 = 0.0000 + 7: 1 18445387 0.0000 + 0.0000 = 0.0000 + 8: 1 21222571 0.0000 + 0.0000 = 0.0000 + etc. + + id: chr position block_boundary | allele counts in cases and controls + 0: 1 738547 1.000 | 156 93 251 | 169 83 248 + 1: 1 5597094 1.000 | 323 19 158 | 328 16 156 + 2: 1 9424115 1.000 | 366 6 128 | 369 11 120 + 3: 1 13879818 1.000 | 252 31 217 | 278 32 190 + 4: 1 13934751 1.000 | 246 64 190 | 224 58 218 + 5: 1 16803491 1.000 | 91 160 249 | 91 174 235 + 6: 1 17236854 1.000 | 252 43 205 | 249 44 207 + 7: 1 18445387 1.000 | 205 66 229 | 217 56 227 + 8: 1 21222571 1.000 | 353 9 138 | 352 8 140 + etc. + + The "id" field is an internally used index. + +----- + +**References** + +Zhang Y, Liu JS. (2007) +Bayesian inference of epistatic interactions in case-control studies. +Nat Genet. 39(9):1167-73. Epub 2007 Aug 26. + +Zhang Y, Zhang J, Liu JS. (2010) +Block-based bayesian epistasis association mapping with application to WTCCC type 1 diabetes data. +Submitted. + + </help> + <citations> + <citation type="doi">10.1038/ng2110</citation> + <citation type="doi">10.1214/11-AOAS469</citation> + </citations> +</tool> |
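Since posterior.txt interleaves two differently shaped sections, pulling the per-SNP posteriors out takes a little care. A small reader sketch, assuming only the column layout shown in the example above (the 8-field "marginal + interaction = total" lines; the hypothetical helper name is not part of BEAM)::

    def read_posteriors(path):
        posteriors = {}
        for line in open(path):
            fields = line.split()
            # e.g. ['0:', '1', '738547', '0.0000', '+', '0.0000', '=', '0.0000']
            if len(fields) == 8 and fields[0].rstrip(':').isdigit():
                chrom, position = fields[1], int(fields[2])
                posteriors[(chrom, position)] = (float(fields[3]),   # marginal
                                                 float(fields[5]),   # interaction
                                                 float(fields[7]))   # total
        return posteriors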
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/gpass.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/gpass.pl Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,79 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; +use File::Temp qw/ tempfile /; + +$ENV{'PATH'} .= ':' . dirname($0); + +#this is a wrapper for gpass that converts a linkage pedigree file to input +#for this program + +my($map, $ped, $out, $fdr) = @ARGV; + +if (!$map or !$ped or !$out or !$fdr) { die "missing args\n"; } + +my($fh, $name) = tempfile(); +#by default this file is removed when these variable go out of scope +print $fh "map=$map ped=$ped\n"; +close $fh; #converter will overwrite, just keep name + +#run converter +system("lped_to_geno.pl $map $ped > $name") == 0 + or die "system lped_to_geno.pl $map $ped > $name failed\n"; + +#system("cp $name tmp.middle"); + +#run GPASS +system("gpass $name -o $out -fdr $fdr 1>/dev/null") == 0 + or die "system gpass $name -o $out -fdr $fdr, failed\n"; + +#merge SNP data with results +merge(); + +exit; + +######################################## + +#merge the input and output files so have SNP data with result +sub merge { + open(FH, $out) or die "Couldn't open $out, $!\n"; + my %res; + my @ind; + while (<FH>) { + chomp; + my $line = $_; + if ($line =~ /^(\d+)/) { $res{$1} = $line; push(@ind, $1); } + else { $res{'index'} = $line; } + } + close FH; + if (!@ind) { return; } #no results, leave alone + @ind = sort { $a <=> $b } @ind; + $res{'index'} =~ s/Index/#ID\tchr\tposition/; + #read input file to get SNP data + open(FH, $name) or die "Couldn't open $name, $!\n"; + my $i = 0; #index is 0 based not counting header line + my $c = shift @ind; + while (<FH>) { + chomp; + if (/^ID/) { next; } + my @f = split(/\s+/); + if ($i == $c) { + $res{$i} =~ s/^$i/$f[0]\t$f[1]\t$f[2]/; + if (!@ind) { last; } + $c = shift @ind; + } + $i++; + } + close FH; + #now reprint results with SNP data included + open(FH, ">", $out) or die "Couldn't write to $out, $!\n"; + print FH $res{'index'}, "\n"; + delete $res{'index'}; + foreach $i (keys %res) { + print FH $res{$i}, "\n"; + } + close FH; +} + |
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/gpass.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/gpass.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,115 @@
+<tool id="hgv_gpass" name="GPASS" version="1.0.0">
+  <description>significant single-SNP associations in case-control studies</description>
+
+  <command interpreter="perl">
+    gpass.pl ${input1.extra_files_path}/${input1.metadata.base_name}.map ${input1.extra_files_path}/${input1.metadata.base_name}.ped $output $fdr
+  </command>
+
+  <inputs>
+    <param name="input1" type="data" format="lped" label="Dataset"/>
+    <param name="fdr" type="float" value="0.05" label="FDR"/>
+  </inputs>
+
+  <outputs>
+    <data name="output" format="tabular" />
+  </outputs>
+
+  <requirements>
+    <requirement type="package">gpass</requirement>
+  </requirements>
+
+  <!-- we need to be able to set the seed for the random number generator
+  <tests>
+    <test>
+      <param name='input1' value='gpass_and_beam_input' ftype='lped' >
+        <metadata name='base_name' value='gpass_and_beam_input' />
+        <composite_data value='gpass_and_beam_input.ped' />
+        <composite_data value='gpass_and_beam_input.map' />
+        <edit_attributes type='name' value='gpass_and_beam_input' />
+      </param>
+      <param name="fdr" value="0.05" />
+      <output name="output" file="gpass_output.txt" />
+    </test>
+  </tests>
+  -->
+
+  <help>
+**Dataset formats**
+
+The input dataset must be in lped_ format, and the output is tabular_.
+(`Dataset missing?`_)
+
+.. _lped: ${static_path}/formatHelp.html#lped
+.. _tabular: ${static_path}/formatHelp.html#tab
+.. _Dataset missing?: ${static_path}/formatHelp.html
+
+-----
+
+**What it does**
+
+GPASS (Genome-wide Poisson Approximation for Statistical Significance)
+detects significant single-SNP associations in case-control studies at a
+user-specified FDR. Unlike previous methods, this tool can accurately
+approximate the genome-wide significance and FDR of SNP associations,
+while adjusting for millions of multiple comparisons, within seconds or minutes.
+
+The program has two main functionalities:
+
+1. Detect significant single-SNP associations at a user-specified false
+   discovery rate (FDR).
+
+   *Note*: a "typical" definition of FDR could be
+   FDR = E(# of false positive SNPs / # of significant SNPs)
+
+   This definition, however, is inappropriate for association mapping, since
+   SNPs are highly correlated. Our FDR is defined differently to account for
+   SNP correlations, and thus obtains a proper FDR in terms of the
+   "proportion of false positive loci".
+
+2. Approximate the significance of a list of candidate SNPs, adjusting for
+   multiple comparisons. If you have isolated a few SNPs of interest and want
+   to know their significance in a GWAS, you can supply the GWAS data and let
+   the program specifically test those SNPs.
+
+*Also note*: the SNPs in a study cannot be both few in number and clustered
+in a single local region. A few hundred SNPs, or tens of SNPs spread across
+different regions, will be fine. The sample size cannot be too small either;
+around 100 or more individuals (case + control combined) will be fine.
+Otherwise, use permutation.
+
+-----
+
+**Example**
+
+- input map file::
+
+    1  rs0  0  738547
+    1  rs1  0  5597094
+    1  rs2  0  9424115
+    etc.
+
+- input ped file::
+
+    1 1 0 0 1 1 G G A A A A A A A A A G A A G G G G A A G G G G G G A A A A A G A A G G A G A G A A G G A A G G A A G G A G A A G G A A G G A A A G A G G G A G G G G G A A A G A A G G G G G G G G A G A A A A A A A A
+    1 1 0 0 1 1 G G A G G G A A A A A G A A G G G G G G A A G G A G A G G G G G A G G G A G A A G G A G G G A A G G G G A G A G G G A G A A A A G G G G A G A G G G A G A A A A A G G G A G G G A G G G G G A A G G A G
+    etc.
+
+- output dataset, showing significant SNPs, their p-values, and FDR::
+
+    #ID   chr   position   Statistics  adj-Pvalue  FDR
+    rs35  chr1  136606952  4.890849    0.991562    0.682138
+    rs36  chr1  137748344  4.931934    0.991562    0.795827
+    rs44  chr2  14423047   7.712832    0.665086    0.218776
+    etc.
+
+-----
+
+**Reference**
+
+Zhang Y, Liu JS. (2011)
+Fast and accurate significance approximation for genome-wide association studies.
+J Am Stat Assoc. (doi:10.1198/jasa.2011.ap10657)
+
+  </help>
+  <citations>
+    <citation type="doi">10.1198/jasa.2011.ap10657</citation>
+  </citations>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/ldtools.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/ldtools.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,114 @@ +<tool id="hgv_ldtools" name="LD" version="1.0.0"> + <description>linkage disequilibrium and tag SNPs</description> + + <command interpreter="bash"> + ldtools_wrapper.sh rsquare=$rsquare freq=$freq input=$input output=$output + </command> + + <inputs> + <param format="tabular" name="input" type="data" label="Dataset"/> + <param name="rsquare" label="r<sup>2</sup> threshold" type="float" value="0.64"> + <validator type="in_range" message="rsquare must be in range [0.00, 1.00]" min="0.00" max="1.00" /> + </param> + <param name="freq" label="Minimum allele frequency threshold" type="float" value="0.00"> + <validator type="in_range" message="freq must be in range (0.00, 0.50]" min="0.00" max="0.50" /> + </param> + </inputs> + + <outputs> + <data format="tabular" name="output" /> + </outputs> + + <tests> + <test> + <param name="input" value="ldInput1.txt" /> + <param name="rsquare" value="0.64" /> + <param name="freq" value="0.00" /> + <output name="output" file="ldOutput1.txt" /> + </test> + </tests> + + <help> +**Dataset formats** + +The input and output datasets are tabular_. +(`Dataset missing?`_) + +.. _tabular: ${static_path}/formatHelp.html#tab +.. _Dataset missing?: ${static_path}/formatHelp.html + +----- + +**What it does** + +This tool can be used to analyze the patterns of linkage disequilibrium +(LD) between polymorphic sites in a locus. SNPs are grouped based on the +threshold level of LD as measured by r\ :sup:`2` (regardless of genomic +position), and a representative "tag SNP" is reported for each group. +The other SNPs in the group are in LD with the tag SNP, but not necessarily +with each other. + +The underlying algorithm is the same as the one used in ldSelect (Carlson +et al. 2004). However, this tool is implemented to be much faster and more +efficient than ldSelect. + +The input is a tabular file with genotype information for each individual +at each SNP site, in exactly four columns: site ID, sample ID, and the +two allele nucleotides. + +----- + +**Example** + +- input file:: + + rs2334386 NA20364 G T + rs2334386 NA20363 G G + rs2334386 NA20360 G G + rs2334386 NA20359 G G + rs2334386 NA20358 G G + rs2334386 NA20356 G G + rs2334386 NA20357 G G + rs2334386 NA20350 G G + rs2334386 NA20349 G G + rs2334386 NA20348 G G + rs2334386 NA20347 G G + rs2334386 NA20346 G G + rs2334386 NA20345 G G + rs2334386 NA20344 G G + rs2334386 NA20342 G G + etc. 
+ +- output file:: + + rs2238748 rs2793064,rs6518516,rs6518517,rs2283641,rs5993533,rs715590,rs2072123,rs2105421,rs2800954,rs1557847,rs807750,rs807753,rs5993488,rs8138035,rs2800980,rs2525079,rs5992353,rs712966,rs2525036,rs807743,rs1034727,rs807744,rs2074003 + rs2871023 rs1210715,rs1210711,rs5748189,rs1210709,rs3788298,rs7284649,rs9306217,rs9604954,rs1210703,rs5748179,rs5746727,rs5748190,rs5993603,rs2238766,rs885981,rs2238763,rs5748165,rs9605996,rs9606001,rs5992398 + rs7292006 rs13447232,rs5993665,rs2073733,rs1057457,rs756658,rs5992395,rs2073760,rs739369,rs9606017,rs739370,rs4493360,rs2073736 + rs2518840 rs1061325,rs2283646,rs362148,rs1340958,rs361956,rs361991,rs2073754,rs2040771,rs2073740,rs2282684 + rs2073775 rs10160,rs2800981,rs807751,rs5993492,rs2189490,rs5747997,rs2238743 + rs5747263 rs12159924,rs2300688,rs4239846,rs3747025,rs3747024,rs3747023,rs2300691 + rs433576 rs9605439,rs1109052,rs400509,rs401099,rs396012,rs410456,rs385105 + rs2106145 rs5748131,rs2013516,rs1210684,rs1210685,rs2238767,rs2277837 + rs2587082 rs2257083,rs2109659,rs2587081,rs5747306,rs2535704,rs2535694 + rs807667 rs2800974,rs756651,rs762523,rs2800973,rs1018764 + rs2518866 rs1206542,rs807467,rs807464,rs807462,rs712950 + rs1110661 rs1110660,rs7286607,rs1110659,rs5992917,rs1110662 + rs759076 rs5748760,rs5748755,rs5748752,rs4819925,rs933461 + rs5746487 rs5992895,rs2034113,rs2075455,rs1867353 + rs5748212 rs5746736,rs4141527,rs5748147,rs5748202 + etc. + +----- + +**Reference** + +Carlson CS, Eberle MA, Rieder MJ, Yi Q, Kruglyak L, Nickerson DA. (2004) +Selecting a maximally informative set of single-nucleotide polymorphisms for +association analyses using linkage disequilibrium. +Am J Hum Genet. 74(1):106-20. Epub 2003 Dec 15. + + </help> + <citations> + <citation type="doi">10.1086/381000</citation> + </citations> +</tool> |
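For orientation, the grouping idea can be sketched in a few lines of Python.
This is a simplified greedy variant using a textbook correlation-based
r\ :sup:`2` on 0/1/2 minor-allele counts; it is not necessarily the exact
estimator or tie-breaking used by the ldtools scripts::

    # Sketch: group SNPs by pairwise r^2 and pick greedy "tag" SNPs.
    def rsq(g1, g2):
        n = len(g1)
        m1, m2 = sum(g1) / float(n), sum(g2) / float(n)
        cov = sum((a - m1) * (b - m2) for a, b in zip(g1, g2)) / n
        v1 = sum((a - m1) ** 2 for a in g1) / n
        v2 = sum((b - m2) ** 2 for b in g2) / n
        return (cov * cov) / (v1 * v2) if v1 and v2 else 0.0

    def tag_groups(genos, threshold=0.64):
        # genos: {snp_id: [0/1/2 per sample]} -> {tag: set(grouped SNPs)}
        snps = list(genos)
        neighbors = {s: set() for s in snps}
        for i, s1 in enumerate(snps):
            for s2 in snps[i + 1:]:
                if rsq(genos[s1], genos[s2]) >= threshold:
                    neighbors[s1].add(s2)
                    neighbors[s2].add(s1)
        groups, left = {}, set(snps)
        while left:
            # the SNP covering the most remaining SNPs becomes the tag
            tag = max(left, key=lambda s: len(neighbors[s] & left))
            groups[tag] = neighbors[tag] & left
            left -= groups[tag] | {tag}
        return groups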
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/ldtools_wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/ldtools_wrapper.sh Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+#
+# Galaxy wrapper for Aakrosh Ratan's ldtools
+#
+
+set -e
+
+export PATH=$PATH:$(dirname $0)
+
+## pagetag options
+input=
+rsquare=0.64
+freq=0.00
+sample=###
+
+## senatag options
+excluded=###
+required=###
+output=
+
+until [ $# -eq 0 ]
+do
+    case $1 in
+        rsquare=*)
+            rsquare=${1#rsquare=}
+            ;;
+        freq=*)
+            freq=${1#freq=}
+            ;;
+        input=*)
+            input=${1#input=}
+            ;;
+        output=*)
+            output=${1#output=}
+            ;;
+        *)
+            if [ -z "$new_args" ]; then
+                new_args=$1
+            else
+                new_args="$new_args $1"
+            fi
+            ;;
+    esac
+
+    shift
+done
+
+## run pagetag
+## (guard with "if !" so the failure message is reachable under set -e)
+if ! pagetag.py --rsquare $rsquare --freq $freq $input snps.txt neighborhood.txt &> /dev/null; then
+    echo "failed: pagetag.py --rsquare $rsquare --freq $freq $input snps.txt neighborhood.txt"
+    exit 1
+fi
+
+## run senatag
+if ! senatag.py neighborhood.txt snps.txt > $output 2> /dev/null; then
+    echo "failed: senatag.py neighborhood.txt snps.txt"
+    exit 1
+fi
+
+## cleanup
+rm -f snps.txt neighborhood.txt
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/linkToDavid.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/linkToDavid.pl Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,59 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+###################################################
+# linkToDavid.pl
+# Generates a link to DAVID for a list of gene IDs.
+###################################################
+
+if (!@ARGV or scalar @ARGV != 4) {
+    print "usage: linkToDavid.pl infile.tab 1basedCol idType outfile\n";
+    exit 1;
+}
+
+my $in = shift @ARGV;
+my $col = shift @ARGV;
+my $type = shift @ARGV;
+my $out = shift @ARGV;
+
+if ($col < 1) {
+    print "ERROR: the column number should use 1-based counting\n";
+    exit 1;
+}
+my @gene;
+open(FH, $in) or die "Couldn't open $in, $!\n";
+while (<FH>) {
+    chomp;
+    my @f = split(/\t/);
+    if (scalar @f < $col) {
+        print "ERROR: there is no column $col in $in\n";
+        exit 1;
+    }
+    if ($f[$col-1]) { push(@gene, $f[$col-1]); }
+}
+close FH or die "Couldn't close $in, $!\n";
+
+if (scalar @gene > 400) {
+    print "ERROR: DAVID only allows 400 genes submitted via a link\n";
+    exit 1;
+}
+
+my $link = 'http://david.abcc.ncifcrf.gov/api.jsp?type=TYPE&ids=GENELIST&tool=summary';
+
+my $g = join(",", @gene);
+$link =~ s/GENELIST/$g/;
+$link =~ s/TYPE/$type/;
+#print output
+if (length $link > 2048) {
+    print "ERROR: too many genes to fit in the URL, please select a smaller set\n";
+    exit 1;
+}
+open(FH, ">", $out) or die "Couldn't open $out, $!\n";
+print FH "<html><head><title>DAVID link</title></head><body>\n",
+    '<A TARGET=_BLANK HREF="', $link, '">click here to send the list of identifiers to DAVID</A>', "\n",
+    '</body></html>', "\n";
+close FH or die "Couldn't close $out, $!\n";
+
+exit;
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/linkToDavid.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/linkToDavid.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,114 @@ +<tool id="hgv_david" name="DAVID" version="1.0.1"> + <description>functional annotation for a list of genes</description> + + <command interpreter="perl"> + linkToDavid.pl $input $numerical_column $type $out_file1 + </command> + + <inputs> + <param name="input" type="data" format="tabular" label="Dataset" /> + <param name="numerical_column" type="data_column" data_ref="input" label="Column with identifiers" /> + <param name="type" label="Identifier type" type="select"> + <option value="AFFYMETRIX_3PRIME_IVT_ID">AFFYMETRIX_3PRIME_IVT_ID</option> + <option value="AFFYMETRIX_EXON_GENE_ID">AFFYMETRIX_EXON_GENE_ID</option> + <option value="AFFYMETRIX_SNP_ID">AFFYMETRIX_SNP_ID</option> + <option value="AGILENT_CHIP_ID">AGILENT_CHIP_ID</option> + <option value="AGILENT_ID">AGILENT_ID</option> + <option value="AGILENT_OLIGO_ID">AGILENT_OLIGO_ID</option> + <option value="ENSEMBL_GENE_ID">ENSEMBL_GENE_ID</option> + <option value="ENSEMBL_TRANSCRIPT_ID">ENSEMBL_TRANSCRIPT_ID</option> + <option value="ENTREZ_GENE_ID">ENTREZ_GENE_ID</option> + <option value="FLYBASE_GENE_ID">FLYBASE_GENE_ID</option> + <option value="FLYBASE_TRANSCRIPT_ID">FLYBASE_TRANSCRIPT_ID</option> + <option value="GENBANK_ACCESSION">GENBANK_ACCESSION</option> + <option value="GENOMIC_GI_ACCESSION">GENOMIC_GI_ACCESSION</option> + <option value="GENPEPT_ACCESSION">GENPEPT_ACCESSION</option> + <option value="ILLUMINA_ID">ILLUMINA_ID</option> + <option value="IPI_ID">IPI_ID</option> + <option value="MGI_ID">MGI_ID</option> + <option value="OFFICIAL_GENE_SYMBOL" selected="true">OFFICIAL_GENE_SYMBOL</option> + <option value="PFAM_ID">PFAM_ID</option> + <!--option value="PIR_ACCESSION">PIR_ACCESSION</option--> + <option value="PIR_ID">PIR_ID</option> + <option value="PROTEIN_GI_ACCESSION">PROTEIN_GI_ACCESSION</option> + <!--option value="PIR_NREF_ID">PIR_NREF_ID</option--> + <option value="REFSEQ_GENOMIC">REFSEQ_GENOMIC</option> + <option value="REFSEQ_MRNA">REFSEQ_MRNA</option> + <option value="REFSEQ_PROTEIN">REFSEQ_PROTEIN</option> + <option value="REFSEQ_RNA">REFSEQ_RNA</option> + <option value="RGD_ID">RGD_ID</option> + <option value="SGD_ID">SGD_ID</option> + <option value="TAIR_ID">TAIR_ID</option> + <option value="UCSC_GENE_ID">UCSC_GENE_ID</option> + <option value="UNIGENE">UNIGENE</option> + <option value="UNIPROT_ACCESSION">UNIPROT_ACCESSION</option> + <option value="UNIPROT_ID">UNIPROT_ID</option> + <option value="UNIREF100_ID">UNIREF100_ID</option> + <option value="WORMBASE_GENE_ID">WORMBASE_GENE_ID</option> + <option value="WORMPEP_ID">WORMPEP_ID</option> + <option value="ZFIN_ID">ZFIN_ID</option> + </param> + </inputs> + + <outputs> + <data format="html" name="out_file1" /> + </outputs> + + <tests> + <test> + <param name="input" ftype="tabular" value="linkToDavid.tabular" /> + <param name="numerical_column" value="1" /> + <param name="type" value="ENTREZ_GENE_ID" /> + <output name="out_file1" file="linkToDavid_1.out" /> + </test> + </tests> + + <help> + .. class:: infomark + +The list is limited to 400 IDs. + +----- + +**Dataset formats** + +The input dataset is in tabular_ format. The output dataset is html_ with +a link to the DAVID website as described below. +(`Dataset missing?`_) + +.. _tabular: ${static_path}/formatHelp.html#tab +.. _html: ${static_path}/formatHelp.html#html +.. 
_Dataset missing?: ${static_path}/formatHelp.html + +----- + +**What it does** + +This tool creates a link to the Database for Annotation, +Visualization, and Integrated Discovery (DAVID) website at NIH, +sending a list of IDs from the selected column of a tabular +Galaxy dataset. To follow the created link, click on the +eye icon once the Galaxy tool has finished running. + +DAVID provides a comprehensive set of functional annotation tools +to help investigators discover biological meaning behind large +lists of genes. + +----- + +**References** + +Huang DW, Sherman BT, Lempicki RA. (2009) Systematic and integrative analysis +of large gene lists using DAVID bioinformatics resources. +Nat Protoc. 4(1):44-57. + +Dennis G, Sherman BT, Hosack DA, Yang J, Gao W, Lane HC, Lempicki RA. (2003) +DAVID: database for annotation, visualization, and integrated discovery. +Genome Biol. 4(5):P3. Epub 2003 Apr 3. + + </help> + <citations> + <citation type="doi">10.1038/nprot.2008.211</citation> + <citation type="doi">10.1186/gb-2003-4-5-p3</citation> + </citations> +</tool> |
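The link construction itself is simple enough to restate. Below is a minimal
Python sketch of what linkToDavid.pl builds, including the two limits it
enforces (the function name is hypothetical)::

    # Sketch: build a DAVID summary link for a gene list, with the
    # 400-ID and 2048-character limits checked by linkToDavid.pl.
    def david_link(gene_ids, id_type):
        if len(gene_ids) > 400:
            raise ValueError("DAVID only allows 400 genes submitted via a link")
        url = ("http://david.abcc.ncifcrf.gov/api.jsp?type=%s&ids=%s&tool=summary"
               % (id_type, ",".join(gene_ids)))
        if len(url) > 2048:
            raise ValueError("too many genes to fit in the URL")
        return url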
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/linkToGProfile.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/linkToGProfile.pl Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,89 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+###################################################
+# linkToGProfile.pl
+# Generates a link to g:Profiler for a list of gene IDs.
+# g:Profiler a web-based toolset for functional profiling of gene lists from large-scale experiments (2007) NAR 35 W193-W200
+###################################################
+
+if (!@ARGV or scalar @ARGV < 4) {
+    print "usage: linkToGProfile.pl infile.tab idType outfile -gene=1basedCol -chr=1basedCol -start=1basedCol -end=1basedCol\n";
+    exit 1;
+}
+
+my $in = shift @ARGV;
+my $type = shift @ARGV;
+my $out = shift @ARGV;
+
+my $col = 9999; #large unrealistic default
+my $chr = 9999;
+my $st = 9999;
+my $end = 9999;
+foreach (@ARGV) {
+    if (/gene=(\d+)/) { $col = $1; }
+    elsif (/chr=(\d+)/) { $chr = $1; }
+    elsif (/start=(\d+)/) { $st = $1; }
+    elsif (/end=(\d+)/) { $end = $1; }
+    elsif (/region=1/) { $type = 'region'; }
+}
+
+if ($col < 1 or $chr < 1 or $st < 1 or $end < 1) {
+    print "ERROR: the column numbers should use 1-based counting\n";
+    exit 1;
+}
+my @gene;
+my @pos;
+open(FH, $in) or die "Couldn't open $in, $!\n";
+while (<FH>) {
+    chomp;
+    my @f = split(/\t/);
+    if ($type ne 'region') {
+        if (scalar @f < $col) {
+            print "ERROR: there is no column $col in $in for type $type\n";
+            exit 1;
+        }
+        if ($f[$col-1]) { push(@gene, $f[$col-1]); }
+    }else {
+        if (scalar @f < $chr or scalar @f < $st or scalar @f < $end) {
+            print "ERROR: there are not enough columns ($chr,$st,$end) in $in\n";
+            exit 1;
+        }
+        if ($f[$chr-1]) {
+            $f[$chr-1] =~ s/chr//;
+            push(@pos, "$f[$chr-1]:$f[$st-1]:$f[$end-1]");
+        }
+    }
+}
+close FH or die "Couldn't close $in, $!\n";
+
+#region_query = 1 for coordinates X:1:10
+#can now do POST method
+#http://biit.cs.ut.ee/gprofiler/index.cgi?organism=hsapiens&query=pax6&term=&analytical=1&user_thr=1&sort_by_structure=1&output=txt
+my $g = ''; #query string: joined IDs, or joined regions if intervals were given
+if (@gene) { $g = join("+", @gene); }
+if (@pos) { $g = join("+", @pos); }
+my %params = (
+"analytical"=>1,
+"organism"=>"hsapiens",
+"query"=>$g,
+"term"=>"",
+"output"=>"png",
+"prefix"=>$type,
+"user_thr"=>"1.00"
+);
+if (@pos) { $params{"region_query"} = 1; }
+
+open(FH, ">", $out) or die "Couldn't open $out, $!\n";
+print FH "<html><head><title>g:Profiler link</title></head><body>\n";
+print FH '<form method="POST" action="http://biit.cs.ut.ee/gprofiler/index.cgi">';
+foreach my $k (keys %params) {
+    print FH "<input type='hidden' name='$k' value='$params{$k}'>\n";
+}
+print FH '<input type="Submit" name="foo" value="Send to g:Profiler">';
+print FH '</form></body></html>', "\n";
+close FH or die "Couldn't close $out, $!\n";
+
+#also do a link that prints text that could be pulled back into Galaxy?
+exit;
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/linkToGProfile.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/linkToGProfile.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,93 @@
+<tool id="hgv_linkToGProfile" name="g:Profiler" version="1.0.0">
+  <description>tools for functional profiling of gene lists</description>
+
+  <command interpreter="perl">
+    linkToGProfile.pl $input $type $out_file1 -region=$region -gene=$genes -chr=${input.metadata.chromCol} -start=${input.metadata.startCol} -end=${input.metadata.endCol}
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Dataset" />
+    <param name="genes" type="data_column" data_ref="input" label="Column with identifiers" />
+    <param name="region" type="select" label="Or use genomic intervals">
+      <option value="0" selected="true">No</option>
+      <option value="1">Yes</option>
+    </param>
+    <param name="type" label="Identifier type if numeric" type="select">
+      <option value="ENTREZGENE_ACC" selected="true">Entrez Gene Acc</option>
+      <option value="MIM_MORBID">OMIM Morbid Map</option>
+      <option value="MIM_GENE">OMIM Gene ID</option>
+      <option value="AFFY_HUGENE_1_0_ST_V1">AFFY_HUGENE_1_0_ST_V1</option>
+      <option value="HGNC_AUTOMATIC_GENE_ACC">HGNC_AUTOMATIC_GENE_ACC</option>
+      <option value="HGNC_MB001_ACC">HGNC_MB001_ACC</option>
+      <option value="HGNC_ACC">HGNC_ACC</option>
+      <option value="WIKIGENE_ACC">WIKIGENE_ACC</option>
+      <option value="DBASS5_ACC">DBASS5_ACC</option>
+      <option value="ILLUMINA_HUMANWG_6_V1">ILLUMINA_HUMANWG_6_V1</option>
+      <option value="AFFY_HUEX_1_0_ST_V2">AFFY_HUEX_1_0_ST_V2</option>
+      <option value="DBASS3_ACC">DBASS3_ACC</option>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data format="html" name="out_file1" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" ftype="tabular" value="linkToGProfile.tabular" />
+      <param name="genes" value="2" />
+      <param name="type" value="ENTREZGENE_ACC" />
+      <output name="out_file1" file="linkToGProfile_1.out" />
+    </test>
+  </tests>
+
+  <help>
+**Dataset formats**
+
+The input dataset is tabular_ with a column of identifiers.
+The output dataset is html_ with a link to g:Profiler.
+(`Dataset missing?`_)
+
+.. _tabular: ${static_path}/formatHelp.html#tab
+.. _html: ${static_path}/formatHelp.html#html
+.. _Dataset missing?: ${static_path}/formatHelp.html
+
+-----
+
+**What it does**
+
+This tool creates a link to the g:GOSt tool (Gene Group Functional
+Profiling), which is part of the g:Profiler site at the University
+of Tartu in Estonia.  g:GOSt retrieves the most significant Gene
+Ontology (GO) terms, KEGG and REACTOME pathways, and TRANSFAC motifs
+for a user-specified group of genes, proteins, or microarray probes.
+g:GOSt also allows analysis of ranked or ordered lists of genes,
+visual browsing of GO graph structure, interactive visualization of
+retrieved results, and many other features.  Multiple testing
+corrections are applied to extract only statistically important
+results.
+
+The g:GOSt form is pre-filled with gene, protein, or microarray probe
+IDs from the selected column of a tabular Galaxy dataset.  Alternatively,
+you can choose to use genomic coordinates (these must be on the latest
+genome build used by Ensembl).  The coordinates don't have to be for
+genes; they can be for SNPs, and g:GOSt will map them to gene IDs.
+To follow the created link, click on the eye icon once the Galaxy tool
+has finished running.  Once at the g:Profiler site, scroll down to see
+the g:GOSt results.  You can also adjust the options in the g:GOSt
+form to your liking, or use the row of links between the form and
+the results to run other g:Profiler tools using the same list of IDs.
+ +----- + +**Reference** + +Reimand J, Kull M, Peterson H, Hansen J, Vilo J. (2007) g:Profiler -- a web-based +toolset for functional profiling of gene lists from large-scale experiments. +Nucleic Acids Res. 35(Web Server issue):W193-200. Epub 2007 May 3. + + </help> + <citations> + <citation type="doi">10.1093/nar/gkm226</citation> + </citations> +</tool> |
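The POST form that linkToGProfile.pl writes boils down to a fixed set of
hidden fields. A minimal sketch of the same parameter assembly (the function
name is hypothetical; the field names are those used in the Perl script
above)::

    # Sketch: the hidden form fields sent to g:Profiler; for region
    # queries the query tokens are chr:start:end and region_query=1.
    def gprofiler_params(items, prefix, region=False):
        params = {
            "analytical": 1,
            "organism": "hsapiens",
            "query": "+".join(items),
            "term": "",
            "output": "png",
            "prefix": prefix,
            "user_thr": "1.00",
        }
        if region:
            params["region_query"] = 1
        return params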
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/lped_to_geno.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/lped_to_geno.pl Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,104 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+#convert from a MAP and PED file to a genotype file (format desc from PLINK)
+#assumes not many SNPs but lots of individuals
+# transposed formats are used when lots of SNPs (TPED, TFAM)
+
+if (!@ARGV or scalar @ARGV != 2) {
+    print "usage: lped_to_geno.pl infile.map infile.ped > outfile\n";
+    exit 1;
+}
+
+my $map = shift @ARGV;
+my $ped = shift @ARGV;
+
+my @snp; #array to hold SNPs from map file
+open(FH, $map) or die "Couldn't open $map, $!\n";
+while (<FH>) {
+    chomp;
+    my @f = split(/\s+/); #3 or 4 columns
+    #chrom ID [distance|morgans] position
+    if (!exists $f[3]) { $f[3] = $f[2]; } #only 3 columns
+    #have to leave these in so we know which to skip later
+    #if ($f[3] < 0) { next; } #way of excluding SNPs
+    #if ($f[0] eq '0') { next; } #unplaced SNP
+    if ($f[0] !~ /chr/) { $f[0] = "chr$f[0]"; }
+    push(@snp, "$f[0]:$f[3]:$f[1]");
+}
+close FH or die "Couldn't finish $map, $!\n";
+
+#rows are individuals, columns are SNPs (7 & up)
+#familyId indId fatherId motherId sex phenotype(-9|0|1|2) alleles....
+#need to print one row per SNP
+my @allele; #alleles to go with @snp
+my @pheno; #marker for phenotype
+open(FH, $ped) or die "Couldn't open $ped, $!\n";
+while (<FH>) {
+    chomp;
+    my @f = split(/\s+/);
+    if (!defined $f[5]) { die "ERROR undefined phenotype $f[0] $f[1] $f[2] $f[3] $f[4]\n"; }
+    #-9 is always unknown, 0 unknown or unaffected, 1|2 is affected
+    #either -9|0|1 or 0|1|2
+    push(@pheno, $f[5]);
+    my $j = 0;
+    for(my $i = 6; $i < $#f; $i+=2) {
+        if (!$allele[$j]) { $allele[$j] = ''; }
+        #can be ACTG or 1234 (for haploview etc) or 0 for missing
+        if ($f[$i] eq '1') { $f[$i] = 'A'; }
+        elsif ($f[$i] eq '2') { $f[$i] = 'C'; }
+        elsif ($f[$i] eq '3') { $f[$i] = 'G'; }
+        elsif ($f[$i] eq '4') { $f[$i] = 'T'; }
+        if ($f[$i+1] eq '1') { $f[$i+1] = 'A'; }
+        elsif ($f[$i+1] eq '2') { $f[$i+1] = 'C'; }
+        elsif ($f[$i+1] eq '3') { $f[$i+1] = 'G'; }
+        elsif ($f[$i+1] eq '4') { $f[$i+1] = 'T'; }
+        $f[$i] = uc($f[$i]);
+        $f[$i+1] = uc($f[$i+1]);
+        $allele[$j] .= " $f[$i]$f[$i+1]";
+        $j++;
+    }
+    if ($j > scalar @snp) {
+        die "ERROR: more allele columns in the ped file than there are SNP positions in the map file.\n";
+    }
+}
+close FH or die "Couldn't close $ped, $!\n";
+
+print "ID Chr Pos";
+my $max = 0;
+foreach (@pheno) { if ($_ > $max) { $max = $_; } }
+if ($max > 1) {
+    foreach (@pheno) { if ($_ > 0) { print " ", $_ - 1; }} #go from 1/2 to 0/1
+}else {
+    foreach (@pheno) { print " $_"; }
+}
+print "\n";
+for(my $i = 0; $i <= $#snp; $i++) { #foreach snp
+    $allele[$i] =~ /(\w)/; #use the first allele seen as the reference for counting
+    my $nt = $1;
+    my $j = 0;
+    my @t = split(/:/, $snp[$i]);
+    if ($t[0] eq 'chr0' or $t[1] < 0) { next; } #skip this SNP
+    if ($t[0] eq 'chrX') { $t[0] = 'chr23'; }
+    elsif ($t[0] eq 'chrY') { $t[0] = 'chr24'; }
+    elsif ($t[0] eq 'chrXY') { $t[0] = 'chr23'; }
+    elsif ($t[0] eq 'chrMT') { $t[0] = 'chr25'; }
+    print "$t[2] $t[0] $t[1]";
+    $allele[$i] =~ s/^\s+//;
+    foreach my $p (split(/\s+/, $allele[$i])) {
+        if ($pheno[$j] > 0 or ($max == 1 && $pheno[$j] > -1)) { #pheno 0 or -9 skip
+            #change AA BB AB to 2 0 1 (count of the reference allele $nt)
+            if ($p eq "$nt$nt") { print " 2"; }
+            elsif ($p =~ /$nt/) { print " 1"; }
+            else { print " 0"; }
+        }
+        $j++;
+    }
+    print "\n";
+}
+
+exit;
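The core recoding done per SNP above -- two allele calls per individual
collapsed into a count of one reference allele -- is worth seeing in
isolation. A minimal Python sketch under the same conventions (numeric
alleles 1-4 mapped to ACGT, the first allele seen treated as the reference;
the helper name is hypothetical)::

    # Sketch: recode one SNP's allele pairs to 0/1/2 counts of the
    # first allele seen, as lped_to_geno.pl does per output row.
    NUMERIC = {"1": "A", "2": "C", "3": "G", "4": "T"}

    def recode(pairs):
        pairs = [(NUMERIC.get(a, a).upper(), NUMERIC.get(b, b).upper())
                 for a, b in pairs]
        ref = pairs[0][0]  # first allele seen at this site
        return [(a == ref) + (b == ref) for a, b in pairs]

    # recode([("G", "G"), ("A", "G"), ("A", "A")]) -> [2, 1, 0]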
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/lps.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/lps.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,323 @@
+<tool id="hgv_lps" name="LPS" version="1.0.0">
+  <description>LASSO-Patternsearch algorithm</description>
+
+  <command interpreter="bash">
+    lps_tool_wrapper.sh $lambda_fac $input_file $label_column $output_file $log_file
+    Initialization 0
+    #if $advanced.options == "true":
+      Sample $advanced.sample
+      Verbosity $advanced.verbosity
+      Standardize $advanced.standardize
+      initialLambda $advanced.initialLambda
+      #if $advanced.continuation.continuation == "1":
+        Continuation $advanced.continuation.continuation
+        continuationSteps $advanced.continuation.continuationSteps
+        accurateIntermediates $advanced.continuation.accurateIntermediates
+      #end if
+      printFreq $advanced.printFreq
+      #if $advanced.newton.newton == "1":
+        Newton $advanced.newton.newton
+        NewtonThreshold $advanced.newton.newtonThreshold
+      #end if
+      HessianSampleFraction $advanced.hessianSampleFraction
+      BB 0
+      Monotone 0
+      FullGradient $advanced.fullGradient
+      GradientFraction $advanced.gradientFraction
+      InitialAlpha $advanced.initialAlpha
+      AlphaIncrease $advanced.alphaIncrease
+      AlphaDecrease $advanced.alphaDecrease
+      AlphaMax $advanced.alphaMax
+      c1 $advanced.c1
+      MaxIter $advanced.maxIter
+      StopTol $advanced.stopTol
+      IntermediateTol $advanced.intermediateTol
+      FinalOnly $advanced.finalOnly
+    #end if
+  </command>
+
+  <inputs>
+    <param name="input_file" type="data" format="tabular" label="Dataset"/>
+    <param name="label_column" type="data_column" data_ref="input_file" numerical="true" label="Label column" help="Column containing outcome labels: +1 or -1."/>
+    <param name="lambda_fac" label="Lambda_fac" type="float" value="0.03" help="Target value of the regularization parameter, expressed as a fraction of the calculated lambda_max.">
+      <validator type="in_range" message="0.00 < lambda_fac <= 1.00" min="0.00" max="1.00"/>
+    </param>
+    <conditional name="advanced">
+      <param name="options" type="select" label="Advanced Options">
+        <option value="false" selected="true">Hide advanced options</option>
+        <option value="true">Show advanced options</option>
+      </param>
+      <when value="false">
+        <!-- no options -->
+      </when>
+      <when value="true">
+        <!-- HARDCODED: 'Sample' we don't support passing an array -->
+        <param name="sample" type="float" value="1.0" label="Sample fraction" help="Sample this fraction of the data set.">
+          <validator type="in_range" message="0.0 <= sample <= 1.0" min="0.0" max="1.0"/>
+        </param>
+        <!-- HARDCODED: 'Initialization' = 0 :: Initialize at beta=0 -->
+        <param name="verbosity" type="select" format="integer" label="Verbosity">
+          <option value="0" selected="true">Little output</option>
+          <option value="1">More output</option>
+          <option value="2">Still more output</option>
+        </param>
+        <param name="standardize" type="select" format="integer" label="Standardize" help="Scales and shifts each column so that it has mean zero and variance 1.">
+          <option value="0" selected="true">Don't standardize</option>
+          <option value="1">Standardize</option>
+        </param>
+        <param name="initialLambda" type="float" value="0.8" label="Initial lambda" help="First value of lambda to be used in the continuation scheme, expressed as a fraction of lambda_max.">
+          <validator type="in_range" message="0.0 < initialLambda < 1.0" min="0.0" max="1.0"/>
+        </param>
+        <conditional name="continuation">
+          <param name="continuation" type="select" format="integer" label="Continuation" help="Use continuation strategy to start with a larger value of lambda, decreasing it successively to lambda_fac.">
+            <option value="0" selected="true">Don't use continuation</option>
[...]
+no regularization. At the high end, when Lambda_fac = 1, there will be
+"too much" regularization, and all of the weights will equal zero.
+
+The LPS tool creates two output datasets. The first, called the results
+file, is a tabular dataset containing one column of weights for each
+value of the regularization parameter lambda that was tried. The weight
+columns are in order from left to right by decreasing values of lambda.
+The first N-1 rows in each column are the weights for the N-1 attributes
+in your input dataset. The final row is a constant, the intercept.
+
+Let **x** be a row from your input dataset and let **b** be a column
+from the results file. To compute the probability that row **x** has
+a label value of +1:
+
+  Probability(row **x** has label value = +1) = 1 / [1 + exp{**x** \* **b**\[1..N-1\] + **b**\[N\]}]
+
+where **x** \* **b**\[1..N-1\] represents matrix multiplication.
+
+The second output dataset, called the log file, is a text file which
+contains additional data about the fitted L1-regularized logistic
+regression model. These data include the number of features, the
+computed value of lambda_max, the actual values of lambda used, the
+optimal values of the log-likelihood and regularized log-likelihood
+functions, the number of non-zeros, and the number of iterations.
+
+Website: http://pages.cs.wisc.edu/~swright/LPS/
+
+-----
+
+**Example**
+
+- input file::
+
+    +1 1 0 0 0 0 1 0 1 1 ...
+    +1 1 1 1 0 0 1 0 1 1 ...
+    +1 1 0 1 0 1 0 1 0 1 ...
+    etc.
+
+- output results file::
+
+    0
+    0
+    0
+    0
+    0.025541
+    etc.
+
+- output log file::
+
+    Data set has 100 vectors with 50 features.
+    calculateLambdaMax: n=50, m=100, m+=50, m-=50
+    computed value of lambda_max: 5.0000e-01
+
+    lambda=2.96e-02 solution:
+      optimal log-likelihood function value: 6.46e-01
+      optimal *regularized* log-likelihood function value: 6.79e-01
+      number of nonzeros at the optimum: 5
+      number of iterations required: 43
+    etc.
+
+-----
+
+**References**
+
+Koh K, Kim S-J, Boyd S. (2007)
+An interior-point method for large-scale l1-regularized logistic regression.
+Journal of Machine Learning Research. 8:1519-1555.
+
+Shi W, Wahba G, Wright S, Lee K, Klein R, Klein B. (2008)
+LASSO-Patternsearch algorithm with application to ophthalmology and genomic data.
+Stat Interface. 1(1):137-153.
+
+<!--
+Wright S, Nowak R, Figueiredo M. (2009)
+Sparse reconstruction via separable approximation.
+IEEE Transactions on Signal Processing. 57:2479-2493.
+
+Shi J, Yin W, Osher S, Sajda P. (2010)
+A fast hybrid algorithm for large scale l1-regularized logistic regression.
+Journal of Machine Learning Research. 11:713-741.
+
+Byrd R, Chin G, Neveitt W, Nocedal J. (2010)
+On the use of stochastic Hessian information in unconstrained optimization.
+Technical Report. Northwestern University. June 16, 2010.
+
+Wright S. (2010)
+Accelerated block-coordinate relaxation for regularized optimization.
+Technical Report. University of Wisconsin. August 10, 2010.
+-->
+
+  </help>
+  <citations>
+    <citation type="bibtex">@ARTICLE{Kim07aninterior-point,
+      author = {Seung-jean Kim and Kwangmoo Koh and Michael Lustig and Stephen Boyd and Dimitry Gorinevsky},
+      title = {An interior-point method for large-scale l1-regularized logistic regression},
+      journal = {Journal of Machine Learning Research},
+      year = {2007},
+      volume = {8},
+      pages = {1519--1555},
+}</citation>
+    <citation type="bibtex">@ARTICLE{Shi08lasso-patternsearchalgorithm,
+      author = {Weiliang Shi and Grace Wahba and Stephen Wright and Kristine Lee and Ronald Klein and Barbara Klein},
+      title = {LASSO-Patternsearch Algorithm with Application to Ophthalmology and Genomic Data},
+      journal = {Stat Interface},
+      year = {2008},
+      volume = {1},
+      number = {1},
+      pages = {137--153}
+}</citation>
+  </citations>
+</tool>
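The scoring formula quoted in the help is easy to apply directly. A minimal
Python sketch (the function name is hypothetical; the sign convention inside
exp{} is copied verbatim from the help text above)::

    # Sketch: probability that a row has label +1, given one weight
    # column b (N-1 weights followed by the intercept) from the
    # LPS results file.
    import math

    def prob_label_plus1(x, b):
        z = sum(xi * bi for xi, bi in zip(x, b[:-1])) + b[-1]
        return 1.0 / (1.0 + math.exp(z))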
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/lps_tool_wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/lps_tool_wrapper.sh Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# script for execution of deployed applications
+#
+# Sets up the MCR environment for the current architecture and executes
+# the specified command.
+#
+
+export PATH=$PATH:$(dirname $0)
+
+MCRROOT=${MCRROOT:-/galaxy/software/linux2.6-x86_64/bin/MCR-7.11/v711}
+MWE_ARCH=glnxa64
+
+if [ "$MWE_ARCH" = "sol64" ] ; then
+    LD_LIBRARY_PATH=.:/usr/lib/lwp:${MCRROOT}/runtime/glnxa64
+else
+    LD_LIBRARY_PATH=.:${MCRROOT}/runtime/glnxa64
+fi
+
+LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRROOT}/bin/glnxa64
+LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRROOT}/sys/os/glnxa64
+
+if [ "$MWE_ARCH" = "maci" -o "$MWE_ARCH" = "maci64" ]; then
+    DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/System/Library/Frameworks/JavaVM.framework/JavaVM:/System/Library/Frameworks/JavaVM.framework/Libraries
+else
+    MCRJRE=${MCRROOT}/sys/java/jre/glnxa64/jre/lib/amd64
+    LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/native_threads
+    LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/server
+    LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/client
+    LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}
+fi
+
+XAPPLRESDIR=${MCRROOT}/X11/app-defaults
+
+export LD_LIBRARY_PATH XAPPLRESDIR
+
+lps_tool "$@"
+
+exit 0
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/master2gd_snp.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/master2gd_snp.pl Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,221 @@
+#!/usr/bin/perl -w
+use strict;
+
+#convert from master variant file to snp table (Webb format?)
+#new format for version 2.0, also different format for cancer normal pairs
+#set columns for 2.0 version Cancer format
+my $aCnt1 = 21;
+my $aCnt2 = 22;
+
+#snp table format:
+#1. chr
+#2. position (0 based)
+#3. ref allele
+#4. second allele
+#5. overall quality
+#foreach individual (6-9, 10-13, ...)
+#a. count of allele in 3
+#b. count of allele in 4
+#c. genotype call (-1, or count of ref allele)
+#d. quality of genotype call (quality of non-ref allele from masterVar)
+
+if (!@ARGV) {
+    print "usage: master2gd_snp.pl masterVar.txt[.gz|.bz2] [-tab=snpTable.txt -addColsOnly -build=hg19 -name=na ] > newSnpTable.txt\n";
+    exit;
+}
+
+my $in = shift @ARGV;
+my $tab;
+my $tabOnly;
+my $build;
+my $name;
+foreach (@ARGV) {
+    if (/-tab=(.*)/) { $tab = $1; }
+    elsif (/-addColsOnly/) { $tabOnly = 1; }
+    elsif (/-build=(.*)/) { $build = $1; }
+    elsif (/-name=(.*)/) { $name = $1; }
+}
+
+#WARNING loads snp table in memory, this could take > 1G ram
+my %old;
+my $colcnt = 0;
+my @head;
+if ($tab) {
+    open(FH, $tab) or die "Couldn't open $tab, $!\n";
+    while (<FH>) {
+        chomp;
+        if (/^#/) { push(@head, $_); next; }
+        my @f = split(/\t/);
+        $old{"$f[0]:$f[1]"} = join("\t", @f);
+        $colcnt = scalar @f;
+    }
+    close FH or die "Couldn't close $tab, $!\n";
+}
+
+if ($in =~ /\.gz$/) {
+    open(FH, "zcat $in |") or die "Couldn't open $in, $!\n";
+}elsif ($in =~ /\.bz2$/) {
+    open(FH, "bzcat $in |") or die "Couldn't open $in, $!\n";
+}else {
+    open(FH, $in) or die "Couldn't open $in, $!\n";
+}
+prepHeader();
+if (@head) { #keep old header, add new?
+    print join("\n", @head), "\n";
+}
+while (<FH>) {
+    chomp;
+    #FORMAT_VERSION 2.0
+    if (/^#FORMAT_VERSION\s+1\./) {
+        $aCnt1 = 16;
+        $aCnt2 = 17;
+    }
+    if (/^#/) { next; }
+    if (/^>/) { next; } #headers
+    if (/^\s*$/) { next; }
+    my @f = split(/\t/);
+    if (!$f[6]) { next; } #no varType column; input is probably still compressed
+    if ($f[6] ne 'snp') { next; } #table only has substitutions
+    if ($f[5] eq 'het-alt') { next; } #skip heterozygous with no ref match
+    if ($f[5] =~ /(hom|het)/) { #zygosity #haploid chrX and chrY?
+        my $a = $f[7]; #reference allele
+        my $a2;
+        my $freq;
+        my $freq2;
+        my $sc;
+        my $alt;
+        my $g = 1; #genotype == ref allele count
+        if ($f[8] eq $f[9]) { #should be homozygous?
+            $a2 = $f[8];
+            $g = 0;
+            $sc = $f[10]; #is this the best one to use? or smallest?
+        }else {
+            if ($a ne $f[8]) { $a2 = $f[8]; $alt = 8; }
+            elsif ($a ne $f[9]) { $a2 = $f[9]; $alt = 9; }
+        }
+        if (defined $f[10] && defined $f[11] && $alt) { #VAF score in 2.0 format
+            $sc = $f[$alt+2];
+        }
+        #version 1.12 columns 16 & 17, version 2.0 Cancer columns 21 & 22
+        if (defined $f[$aCnt1] && defined $f[$aCnt2] && $alt) {
+            if ($alt == 8) {
+                $freq = $f[$aCnt2];
+                $freq2 = $f[$aCnt1];
+            }elsif ($alt == 9) {
+                $freq = $f[$aCnt1];
+                $freq2 = $f[$aCnt2];
+            }
+        }elsif (defined $f[$aCnt1]) {
+            $freq = 0;
+            $freq2 = $f[$aCnt1];
+        }
+        #if starting a new table, or a new SNP in an old table
+        #add option to only build on current table?
+        if (!$tab) {
+            print "$f[2]\t$f[3]\t$a\t$a2\t-1";
+        }elsif (!$tabOnly && !exists $old{"$f[2]:$f[3]"}) {
+            print "$f[2]\t$f[3]\t$a\t$a2\t-1";
+        }elsif (exists $old{"$f[2]:$f[3]"}) {
+            print $old{"$f[2]:$f[3]"};
+            $old{"$f[2]:$f[3]"} = ''; #unset so we know it is printed
+        }elsif ($tabOnly && !exists $old{"$f[2]:$f[3]"}) {
+            next; #skip this one entirely
+        }
+        if ($colcnt && !exists $old{"$f[2]:$f[3]"}) {
+            #new SNP: pad for missing individuals
+            my $i = 5;
+            while ($i < $colcnt) {
+                print "\t-1\t-1\t-1\t-1";
+                $i += 4;
+            }
+        }
+        #add columns for this individual
+        print "\t$freq\t$freq2\t$g\t$sc\n";
+    }elsif ($f[5] eq 'hap') {
+        my $g = 0;
+        my $freq = 0;
+        my $freq2 = 0;
+        if (defined $f[10]) { $freq2 = $f[10]; }
+        my $sc = -1;
+        if (defined $f[$aCnt1]) { $sc = $f[$aCnt1]; }
+        if ($f[8]) {
+            if (!$tab) {
+                print "$f[2]\t$f[3]\t$f[7]\t$f[8]\t-1";
+            }elsif (!$tabOnly && !exists $old{"$f[2]:$f[3]"}) {
+                print "$f[2]\t$f[3]\t$f[7]\t$f[8]\t-1";
+            }elsif (exists $old{"$f[2]:$f[3]"}) {
+                print $old{"$f[2]:$f[3]"};
+                $old{"$f[2]:$f[3]"} = ''; #unset so we know it is printed
+            }elsif ($tabOnly && !exists $old{"$f[2]:$f[3]"}) {
+                next; #skip this one entirely
+            }
+            if ($colcnt && !exists $old{"$f[2]:$f[3]"}) {
+                #new SNP: pad for missing individuals
+                my $i = 5;
+                while ($i < $colcnt) {
+                    print "\t-1\t-1\t-1\t-1";
+                    $i += 4;
+                }
+            }
+            #add columns for this individual
+            print "\t$freq\t$freq2\t$g\t$sc\n";
+        }
+    }
+}
+close FH or die "Couldn't close $in, $!\n";
+
+#if adding to a snp table, now we need to finish those not in the latest set
+foreach my $k (keys %old) {
+    if ($old{$k} ne '') { #not printed yet
+        print $old{$k}, "\t-1\t-1\t-1\t-1\n"; #plus blanks for this one
+    }
+}
+
+exit;
+
+#parse the old header and add to it, or create a new one
+sub prepHeader {
+    if (!$build) { $build = 'hg19'; } #set default
+    my @cnames;
+    my @ind;
+    my $n;
+    if (@head) { #parse previous header
+        my $h = join("", @head); #may split between lines
+        if ($h =~ /"column_names":\[(.*?)\]/) {
+            my @t = split(/,/, $1);
+            foreach (@t) { s/"//g; }
+            @cnames = @t;
+            $n = $cnames[$#cnames];
+            $n =~ s/Q//;
+            $n++;
+        }
+        if ($h =~ /"dbkey":"(.*?)"/) { $build = $1; }
+        if ($h =~ /"individuals":\[(.*)\]/) {
+            my $t = $1;
+            $t =~ s/\]\].*/]/; #remove if there are more categories
+            @ind = split(/,/, $t);
+        }
+    }else { #start new header
+        @cnames = ("chr", "pos", "A", "B", "Q");
+        $n = 1;
+    }
+    #add current
+    if (!$name) { $name = 'na'; }
+    my $stcol = $colcnt + 1;
+    if ($stcol == 1) { $stcol = 6; } #move past initial columns
+    push(@ind, "[\"$name\",$stcol]");
+    push(@cnames, "${n}A", "${n}B", "${n}G", "${n}Q");
+    #reassign head
+    undef @head;
+    foreach (@cnames) { $_ = "\"$_\""; } #quote name
+    $head[0] = "#{\"column_names\":[" . join(",", @cnames) . "],";
+    $head[1] = "#\"individuals\":[" . join(",", @ind) . "],";
+    $head[2] = "#\"dbkey\":\"$build\",\"pos\":2,\"rPos\":2,\"ref\":1,\"scaffold\":1,\"species\":\"$build\"}";
+}
+####End
+
+##example header
+#{"column_names":["chr","pos","A","B","Q","1A","1B","1G","1Q","2A","2B","2G","2Q","3A","3B","3G","3Q","4A","4B","4G","4Q","5A","5B","5G","5Q","6A","6B","6G","6Q","7A","7B","7G","7Q","8A","8B","8G",
+#"8Q","9A","9B","9G","9Q","10A","10B","10G","10Q"],"dbkey":"hg19","individuals":[["Boh_15M",6],["Boh_19M",10],["Paya_27F",14],["Paya_2F",18],["Paya_32F",22],["Ruil_2M",26],["Ruil_36M",30],["Ruil_3M",
+#34],["Ruil_40",38],["Ruil_47F",42]],"pos":2,"rPos":2,"ref":1,"scaffold":1,"species":"hg19"}
+#chr1 10290 C T 46.4 0 2 0 7 1 2 0 4 3 2 1 22 0 0 -1 0 1 0 1 4 0 2 0 7 0 0 -1 0 2 3 1 14 0 1 0 4 1 1 1 6
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/master2gd_snp.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/master2gd_snp.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,86 @@ +<tool id="master2gd_snp" name="MasterVar to gd_snp" hidden="false" version="1.0.0"> + <description>Convert from MasterVar to gd_snp table</description> + <command interpreter="perl"> + #if $snptab.tab2 == "yes" + #if $snptab.colsOnly == "addColsOnly" #master2gd_snp.pl $input1 -tab=$snptab.input2 -name=$indName -build=${input1.metadata.dbkey} -addColsOnly > $out_file1 + #else #master2gd_snp.pl $input1 -tab=$snptab.input2 -name=$indName -build=${input1.metadata.dbkey} > $out_file1 + #end if + #else #master2gd_snp.pl $input1 -name=$indName -build=${input1.metadata.dbkey} > $out_file1 + #end if + </command> + <inputs> + <param format="tab" name="input1" type="data" label="Complete Genomics MasterVar dataset" /> + <conditional name="snptab"> + <param name="tab2" type="select" label="Append to gd_snp table in history"> + <option value="yes">yes</option> + <option value="no" selected="true">no</option> + </param> + <when value="yes"> + <param format="gd_snp" name="input2" type="data" label="gd_snp table" /> + <param name="colsOnly" type="select" label="Skip new SNPs"> + <option value="" selected="true">no</option> + <option value="addColsOnly">yes</option> + </param> + </when> + <when value="no"> <!-- do nothing --> + </when> + </conditional> + <param name="indName" type="text" size="20" label="Label for new individual/group" value="na" /> + </inputs> + <outputs> + <data format="gd_snp" name="out_file1" /> + </outputs> + <tests> + <test> + <param name='input1' value='masterVarTest.txt' ftype='tab' /> + <param name='tab2' value='no' /> + <param name='indName' value='na' /> + <output name="output" file="master2snp_output.txt" /> + </test> + </tests> + + <help> +**Dataset formats** + +The input dataset is in the MasterVar_ format provided by the Complete Genomics +analysis process (Galaxy considers this to be tabular_, but it must have the +columns specified for MasterVar). +The output dataset is a gd_snp_ table. (`Dataset missing?`_) + +.. _Dataset missing?: ./static/formatHelp.html +.. _gd_snp: ./static/formatHelp.html#gd_snp +.. _MasterVar: ./static/formatHelp.html#mastervar +.. _tabular: ./static/formatHelp.html#tab + +----- + +**What it does** + +This converts a Complete Genomics MasterVar file to gd_snp format, +so it can be used with the genome diversity tools. +It can either +start a new dataset or append to an old one. When appending, if any new SNPs +appear only in the MasterVar file they can either be skipped or backfilled with +"-1" (unknown) for previous individuals/groups in the gd_snp dataset. +Positions homozygous for the reference are skipped. + + +----- + +**Examples** + +- input MasterVar file:: + + 934 2 chr1 41980 41981 hom snp A G G 76 97 dbsnp.86:rs806721 425 1 1 1 2 -170 ERVL-E-int:ERVL:47.4 2 1.17 N + 935 2 chr1 41981 42198 hom ref = = = -170 1.17 N + 1102 2 chr1 53205 53206 het-ref snp G C G 93 127 dbsnp.100:rs2854676 477 7 30 0 37 -127 2 1.17 N + etc. + +- output:: + + chr1 41980 A G -1 0 1 0 76 + chr1 53205 G C -1 30 7 1 93 + etc. + +</help> +</tool> |
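The gd_snp row layout the converter emits -- five fixed columns, then four
per individual -- can be made explicit with a small sketch (the helper name
is hypothetical)::

    # Sketch: one gd_snp row: chr, pos, ref allele, alt allele, overall
    # quality, then (countA, countB, genotype, quality) per individual,
    # padding missing individuals with -1s as master2gd_snp.pl does.
    def gd_snp_row(chrom, pos, ref, alt, calls, n_individuals):
        # calls: {individual index: (countA, countB, genotype, quality)}
        row = [chrom, str(pos), ref, alt, "-1"]
        for i in range(n_individuals):
            row += [str(v) for v in calls.get(i, (-1, -1, -1, -1))]
        return "\t".join(row)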
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/master2pg.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/master2pg.pl Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,131 @@
+#!/usr/bin/perl -w
+use strict;
+
+#convert from master variant file to pgSnp
+my $snpsOnly = 1; #flag for if doing SNPs or indels
+if (@ARGV && $ARGV[0] eq 'indel') { shift @ARGV; $snpsOnly = 0; }
+my $in = shift @ARGV;
+open(FH, $in) or die "Couldn't open input file $in, $!\n";
+
+while (<FH>) {
+    chomp;
+    if (/^#/) { next; }
+    if (/^>/) { next; } #headers
+    if (/^\s*$/) { next; }
+    my @f = split(/\t/);
+    if (!$f[5]) { next; } #no zygosity column; input is probably still compressed
+    if ($f[5] =~ /(hom|het)/) { #zygosity #haploid chrX and chrY?
+        #only get snps for now
+        if ($snpsOnly && $f[6] eq 'snp') { #varType
+            my $a;
+            my $c = 2;
+            my $freq = '';
+            my $sc = '';
+            if ($f[8] eq $f[9]) { #should be homozygous?
+                $a = $f[8];
+                $c = 1;
+            }else {
+                $a = "$f[8]/$f[9]";
+            }
+            if (defined $f[10] && $c == 1) {
+                $sc = $f[10];
+            }elsif (defined $f[10] && defined $f[11] && $c == 2) {
+                $sc = "$f[10],$f[11]";
+            }
+            if (defined $f[16] && $c == 1) {
+                $freq = $f[16];
+            }elsif (defined $f[16] && defined $f[17] && $c == 2) {
+                $freq = "$f[16],$f[17]";
+            }
+            print "$f[2]\t$f[3]\t$f[4]\t$a\t$c\t$freq\t$sc\n";
+        }elsif (!$snpsOnly) {
+            if ($f[8] =~ /^\s*$/) { undef $f[8]; }
+            if ($f[9] =~ /^\s*$/) { undef $f[9]; }
+            my $a;
+            my $c = 2;
+            #do indels
+            if ($f[6] eq "ins") {
+                if (defined $f[8] && defined $f[9] && $f[8] eq $f[9]) { $a = $f[8]; $c = 1; }
+                elsif (defined $f[8] && defined $f[9] && $f[8] ne '?' && $f[9] ne '?') {
+                    $a = "$f[8]/$f[9]";
+                }elsif (!defined $f[8] && defined $f[9]) {
+                    $a = "$f[9]/-";
+                }elsif (defined $f[8] && !defined $f[9]) {
+                    $a = "$f[8]/-";
+                }
+            }elsif ($f[6] eq "del") {
+                if (!defined $f[8] && !defined $f[9]) {
+                    $a = '-'; #homozygous deletion
+                    $c = 1;
+                }elsif (!defined $f[8] && defined $f[9]) {
+                    $a = "$f[9]/-";
+                }elsif (defined $f[8] && !defined $f[9]) {
+                    $a = "$f[8]/-";
+                }
+            }elsif ($f[6] eq "sub") { #multiple nt substitutions
+                if ($f[8] eq $f[9]) {
+                    $a = $f[8];
+                    $c = 1;
+                }else {
+                    $a = "$f[8]/$f[9]";
+                }
+            }elsif ($f[6] eq "complex") { #treat same as multi-nt sub
+                if ($f[5] =~ /het-alt/ && !defined $f[8]) { $f[8] = '-'; }
+                if ($f[5] =~ /het-alt/ && !defined $f[9]) { $f[9] = '-'; }
+                if (defined $f[8] && defined $f[9] && $f[8] eq $f[9]) {
+                    $c = 1;
+                    $a = $f[8];
+                }elsif (defined $f[8] && defined $f[9]) {
+                    $a = "$f[8]/$f[9]";
+                }
+            }
+            my $sc = '';
+            my $freq = '';
+            if (defined $f[10] && $c == 1) {
+                $sc = $f[10];
+            }elsif (defined $f[10] && defined $f[11] && $c == 2) {
+                $sc = "$f[10],$f[11]";
+            }
+            if (defined $f[16] && $c == 1) {
+                $freq = $f[16];
+            }elsif (defined $f[16] && defined $f[17] && $c == 2) {
+                $freq = "$f[16],$f[17]";
+            }
+            if ($a) {
+                print "$f[2]\t$f[3]\t$f[4]\t$a\t$c\t$freq\t$sc\n";
+            }
+        }
+    }elsif ($f[5] eq 'hap' && $f[6] eq 'snp' && $snpsOnly) {
+        my $c = 1;
+        #as in the diploid branches: column 10 is the score, column 16 the allele count
+        my $sc = '';
+        if (defined $f[10]) { $sc = $f[10]; }
+        my $freq = '';
+        if (defined $f[16]) { $freq = $f[16]; }
+        if ($f[8]) {
+            print "$f[2]\t$f[3]\t$f[4]\t$f[8]\t$c\t$freq\t$sc\n";
+        }
+    }elsif ($f[5] eq 'hap' && !$snpsOnly && $f[6] =~ /(del|ins|sub)/) {
+        if ($f[8] =~ /^\s*$/) { undef $f[8]; }
+        my $a;
+        my $c = 1;
+        #do indels
+        if ($f[6] eq "ins") {
+            $a = $f[8];
+        }elsif ($f[6] eq "del") {
+            $a = '-'; #deletion
+        }elsif ($f[6] eq "sub") { #multiple nt substitutions
+            $a = $f[8];
+        }
+        my $sc = '';
+        my $freq = '';
+        if (defined $f[10]) { $sc = $f[10]; }
+        if (defined $f[16]) { $freq = $f[16]; }
+        if ($a) {
+            print "$f[2]\t$f[3]\t$f[4]\t$a\t$c\t$freq\t$sc\n";
+        }
+    }
+}
+
+close FH or die "Couldn't close $in, $!\n";
+
+exit;
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/master2pg.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/master2pg.xml Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,66 @@ +<tool id="master2pgSnp" name="MasterVar to pgSnp" hidden="false" version="1.0.0"> + <description>Convert from MasterVar to pgSnp format</description> + <command interpreter="perl"> + master2pg.pl $indel $input1 > $out_file1 + </command> + <inputs> + <param format="tab" name="input1" type="data" label="Complete Genomics MasterVar dataset" /> + <param name="indel" type="select" label="Convert indels"> + <option value="" selected="true">no</option> + <option value="indel">yes</option> + </param> + </inputs> + <outputs> + <data format="interval" name="out_file1" /> + </outputs> + <tests> + <test> + <param name='input1' value='masterVarTest.txt' ftype='tab' /> + <param name='indel' value="" /> + <output name="output" file="masterVar_output.txt" /> + </test> + </tests> + + <help> +**Dataset formats** + +The input dataset is in the MasterVar_ format provided by the Complete Genomics +analysis process (Galaxy considers this to be tabular_, but it must have the +columns specified for MasterVar). +The output dataset is in pgSnp_ format. (`Dataset missing?`_) + +.. _Dataset missing?: ./static/formatHelp.html +.. _pgSnp: ./static/formatHelp.html#pgSnp +.. _MasterVar: ./static/formatHelp.html#mastervar +.. _tabular: ./static/formatHelp.html#tab + +----- + +**What it does** + +This converts a Complete Genomics MasterVar file to pgSnp format, +so it can be viewed in browsers or used with the phenotype association and +interval operations tools. +Positions homozygous for the reference are skipped. + +----- + +**Examples** + +- input MasterVar file:: + + 934 2 chr1 41980 41981 hom snp A G G 76 97 dbsnp.86:rs806721 425 1 1 1 2 -170 ERVL-E-int:ERVL:47.4 2 1.17 N + 935 2 chr1 41981 42198 hom ref = = = -170 1.17 N + 1102 2 chr1 53205 53206 het-ref snp G C G 93 127 dbsnp.100:rs2854676 477 7 30 0 37 -127 2 1.17 N + etc. + +- output:: + + chr1 41980 41981 G 1 1 76 + chr1 51672 51673 C 1 1 53 + chr1 52237 52238 G 1 7 63 + chr1 53205 53206 C/G 2 7,30 93,127 + etc. + +</help> +</tool> |
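Reading the pgSnp rows this tool produces is a one-liner per field. A minimal
parsing sketch (hypothetical helper; the column layout matches the example
output above)::

    # Sketch: parse a pgSnp row: chrom, start, end, slash-separated
    # alleles, allele count, comma-separated counts and scores.
    def parse_pgsnp(line):
        chrom, start, end, alleles, count, freqs, scores = \
            line.rstrip("\n").split("\t")
        return {
            "chrom": chrom,
            "start": int(start),
            "end": int(end),
            "alleles": alleles.split("/"),
            "allele_count": int(count),
            "freqs": freqs.split(",") if freqs else [],
            "scores": scores.split(",") if scores else [],
        }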
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/mergeSnps.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/mergeSnps.pl Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,57 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+#this merges the significance output with the SNPs so users get more than an index
+
+my($out, $snp) = @ARGV;
+
+if (!$out or !$snp) { die "missing args\n"; }
+
+#merge SNP data with results
+merge();
+
+exit;
+
+########################################
+
+#merge the input and output files so the results include SNP data
+sub merge {
+    open(FH, $out) or die "Couldn't open $out, $!\n";
+    my %res;
+    my @ind;
+    while (<FH>) {
+        chomp;
+        my $line = $_;
+        #0: 10 score= 14.224153 , df= 2 , p= 0.040760 , N=50
+        if ($line =~ /^(\d+):\s+(.*)/) { $res{$1} = $2; push(@ind, $1); }
+    }
+    close FH;
+    if (!@ind) { return; } #no results, leave alone
+    @ind = sort { $a <=> $b } @ind;
+    #read input file to get SNP data
+    open(FH, $snp) or die "Couldn't open $snp, $!\n";
+    my $i = 0; #0 based, not counting the ID line
+    my $c = shift @ind;
+    while (<FH>) {
+        chomp;
+        if (/^ID/) { next; }
+        my @f = split(/\s+/);
+        if ($i == $c) {
+            $res{$i} = "$f[0]\t$f[1]\t$f[2]\t$res{$i}";
+            if (!@ind) { last; }
+            $c = shift @ind;
+        }
+        $i++;
+    }
+    close FH;
+    #now reprint the results with SNP data included, in input order
+    open(FH, ">", $out) or die "Couldn't write to $out, $!\n";
+    print FH "ID\tchr\tposition\tresults\n";
+    foreach my $i (sort { $a <=> $b } keys %res) {
+        print FH $res{$i}, "\n";
+    }
+    close FH;
+}
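The result-line format this script consumes is shown in its comment. A
minimal sketch of the same parse in Python (hypothetical helper, same regex
idea)::

    # Sketch: parse "index: stats" lines such as
    # "0: 10 score= 14.224153 , df= 2 , p= 0.040760 , N=50"
    import re

    RESULT = re.compile(r"^(\d+):\s+(.*)")

    def parse_result(line):
        m = RESULT.match(line)
        return (int(m.group(1)), m.group(2)) if m else None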
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/pagetag.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phenotype_association/pagetag.py Mon Apr 30 01:37:51 2018 -0400 |
@@ -0,0 +1,313 @@
+#!/usr/bin/env python
+"""
+This accepts as input a file of the following format:
+
+    Site Sample Allele1 Allele2
+
+for example:
+
+    000834  D001    G   G
+    000834  D002    G   G
+    000834  D003    G   G
+    000834  D004    G   G
+    000834  D005    N   N
+    000834  E001    G   G
+    000834  E002    G   G
+    000834  E003    G   G
+    000834  E004    G   G
+    000834  E005    G   G
+    000963  D001    T   T
+    000963  D002    T   T
+    000963  D003    T   T
+    000963  D004    T   T
+    000963  D005    N   N
+    000963  E001    T   T
+    000963  E002    N   N
+    000963  E003    G   T
+    000963  E004    G   G
+    000963  E005    G   T
+
+and an rsquare threshold, and outputs two files:
+
+a) a file of input snps (one on each line). A SNP is identified by the "Site"
+column in the input file
+
+b) a file where each line has the following:
+    SNP list
+where SNP is one of the SNPs and the "list" is a comma separated list of SNPs
+that exceed the rsquare threshold with the first SNP.
+"""
+from __future__ import print_function
+
+from getopt import getopt, GetoptError
+from sys import argv, exit, stderr
+
+__author__ = "Aakrosh Ratan"
+__email__ = "ratan@bx.psu.edu"
+
+# do we want the debug information to be printed?
+debug_flag = False
+
+# denote the different combos of alleles in code
+HOMC = str(1)
+HOMR = str(2)
+HETE = str(3)
+OTHER = str(4)
+
+indexcalculator = {(HOMC, HOMC): 0,
+                   (HOMC, HOMR): 1,
+                   (HOMC, HETE): 2,
+                   (HOMR, HOMC): 3,
+                   (HOMR, HOMR): 4,
+                   (HOMR, HETE): 5,
+                   (HETE, HOMC): 6,
+                   (HETE, HOMR): 7,
+                   (HETE, HETE): 8}
+
+
+def read_inputfile(filename, samples):
+    input = {}
+
+    file = open(filename, "r")
+
+    for line in file:
+        position, sample, allele1, allele2 = line.split()
+
+        # if the user specified a list of samples, then only use those samples
+        if samples is not None and sample not in samples:
+            continue
+
+        if position in input:
+            v = input[position]
+            v[sample] = (allele1, allele2)
+        else:
+            v = {sample: (allele1, allele2)}
+            input[position] = v
+
+    file.close()
+    return input
+
+
+def annotate_locus(input, minorallelefrequency, snpsfile):
+    locus = {}
+    for k, v in input.items():
+        genotypes = v.values()
+        alleles = [y for x in genotypes for y in x]
+        alleleset = list(set(alleles) - set(["N", "X"]))
+
+        if len(alleleset) == 2:
+            genotypevec = ""
+            num1 = len([x for x in alleles if x == alleleset[0]])
+            num2 = len([x for x in alleles if x == alleleset[1]])
+
+            if num1 > num2:
+                major = alleleset[0]
+                minor = alleleset[1]
+                minorfreq = (num2 * 1.0) / (num1 + num2)
+            else:
+                major = alleleset[1]
+                minor = alleleset[0]
+                minorfreq = (num1 * 1.0) / (num1 + num2)
+
+            if minorfreq < minorallelefrequency:
+                continue
+
+            for gen in genotypes:
+                if gen == (major, major):
+                    genotypevec += HOMC
+                elif gen == (minor, minor):
+                    genotypevec += HOMR
+                elif gen == (major, minor) or gen == (minor, major):
+                    genotypevec += HETE
+                else:
+                    genotypevec += OTHER
+
+            locus[k] = genotypevec, minorfreq
+        elif len(alleleset) > 2:
+            print(k, file=snpsfile)
+    return locus
+
+
+def calculateLD(loci, rsqthreshold):
+    snps = list(loci)
+    rsquare = {}
+
+    for index, loc1 in enumerate(snps):
+        for loc2 [...]
+[...]2)] = rsq
+
+    return rsquare
+
+
+def main(inputfile, snpsfile, neighborhoodfile,
+         rsquare, minorallelefrequency, samples):
+    # read the input file
+    input = read_inputfile(inputfile, samples)
+    print("Read %d locations" % len(input), file=stderr)
+
+    # open the snpsfile to print
+    file = open(snpsfile, "w")
+
+    # annotate the inputs, remove the abnormal loci (those that do not have
+    # 2 alleles), and add the major and minor allele to each locus
+    loci = annotate_locus(input, minorallelefrequency, file)
+    print("Read %d interesting locations" % len(loci), file=stderr)
+
+    # print all the interesting loci as candidate snps
+    for k in loci.keys():
+        print(k, file=file)
+    file.close()
+    print("Finished creating the snpsfile", file=stderr)
+
+    # calculate the LD values and store them if they exceed the threshold
+    lds = calculateLD(loci, rsquare)
+    print("Calculated all the LD values", file=stderr)
+
+    # create a list of SNPs
+    snps = {}
+    ldvals = {}
+    for k, v in lds.items():
+        s1, s2 = k.split()
+        if s1 in snps:
+            snps[s1].append(s2)
+        else:
+            snps[s1] = [s2]
+        if s2 in snps:
+            snps[s2].append(s1)
+        else:
+            snps[s2] = [s1]
+
+        if s1 in ldvals:
+            ldvals[s1].append(str(v))
+        else:
+            ldvals[s1] = [str(v)]
+        if s2 in ldvals:
+            ldvals[s2].append(str(v))
+        else:
+            ldvals[s2] = [str(v)]
+
+    # print the snps to the output file
+    file = open(neighborhoodfile, "w")
+
+    for k, v in snps.items():
+        ldv = ldvals[k]
+        if debug_flag is True:
+            print("%s\t%s\t%s" % (k, ",".join(v), ",".join(ldv)), file=file)
+        else:
+            print("%s\t%s" % (k, ",".join(v)), file=file)
+
+    file.close()
+
+
+def read_list(filename):
+    file = open(filename, "r")
+    list = {}
+
+    for line in file:
+        list[line.strip()] = 1
+
+    file.close()
+    return list
+
+
+def usage():
+    f = stderr
+    print("usage:", file=f)
+    print("pagetag [options] input.txt snps.txt neighborhood.txt", file=f)
+    print("where input.txt is the prettybase file", file=f)
+    print("where snps.txt is the first output file with the snps", file=f)
+    print("where neighborhood.txt is the output neighborhood file", file=f)
+    print("where the options are:", file=f)
+    print("-h,--help : print usage and quit", file=f)
+    print("-d,--debug: print debug information", file=f)
+    print("-r,--rsquare: the rsquare threshold (default : 0.64)", file=f)
+    print("-f,--freq : the minimum MAF required (default: 0.0)", file=f)
+    print("-s,--sample : a list of samples to be clustered", file=f)
+
+
+if __name__ == "__main__":
+    try:
+        opts, args = getopt(argv[1:], "hds:r:f:",
+                            ["help", "debug", "rsquare=", "freq=", "sample="])
+    except GetoptError as err:
+        print(str(err))
+        usage()
+        exit(2)
+
+    rsquare = 0.64
+    minorallelefrequency = 0.0
+    samples = None
+
+    for o, a in opts:
+        if o in ("-h", "--help"):
+            usage()
+            exit()
+        elif o in ("-d", "--debug"):
+            debug_flag = True
+        elif o in ("-r", "--rsquare"):
+            rsquare = float(a)
+        elif o in ("-f", "--freq"):
+            minorallelefrequency = float(a)
+        elif o in ("-s", "--sample"):
+            samples = read_list(a)
+        else:
+            assert False, "unhandled option"
+
+    if rsquare < 0.00 or rsquare > 1.00:
+        print("input value of rsquare should be in [0.00, 1.00]", file=stderr)
+        exit(3)
+
+    if minorallelefrequency < 0.0 or minorallelefrequency > 0.5:
+        print("input value of MAF should be in [0.00, 0.50]", file=stderr)
+        exit(4)
+
+    if len(args) != 3:
+        usage()
+        exit(5)
+
+    main(args[0], args[1], args[2], rsquare, minorallelefrequency, samples)
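The body of calculateLD is truncated above, but the quantity it computes can
still be illustrated. A standard correlation-based r^2 on pagetag's coded
genotype vectors would look like the sketch below (a textbook estimate via
minor-allele dosages, not necessarily pagetag's exact formula; the function
name is hypothetical)::

    # Sketch: r^2 between two loci from genotype vectors in pagetag's
    # coding (1=HOMC, 2=HOMR, 3=HETE, 4=OTHER); OTHER pairs are skipped.
    DOSE = {"1": 0, "2": 2, "3": 1}  # copies of the minor allele

    def rsquare(vec1, vec2):
        pairs = [(DOSE[a], DOSE[b]) for a, b in zip(vec1, vec2)
                 if a in DOSE and b in DOSE]
        if not pairs:
            return 0.0
        n = float(len(pairs))
        m1 = sum(a for a, _ in pairs) / n
        m2 = sum(b for _, b in pairs) / n
        cov = sum((a - m1) * (b - m2) for a, b in pairs) / n
        v1 = sum((a - m1) ** 2 for a, _ in pairs) / n
        v2 = sum((b - m2) ** 2 for _, b in pairs) / n
        return (cov * cov) / (v1 * v2) if v1 and v2 else 0.0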
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/pass.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phenotype_association/pass.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,130 @@ +<tool id="hgv_pass" name="PASS" version="1.0.0"> + <description>significant transcription factor binding sites from ChIP data</description> + + <command interpreter="bash"> + pass_wrapper.sh "$input" "$min_window" "$max_window" "$false_num" "$output" + </command> + + <inputs> + <param format="gff" name="input" type="data" label="Dataset"/> + <param name="min_window" label="Smallest window size (by # of probes)" type="integer" value="2" /> + <param name="max_window" label="Largest window size (by # of probes)" type="integer" value="6" /> + <param name="false_num" label="Expected total number of false positive intervals to be called" type="float" value="5.0" help="N.B.: this is a <em>count</em>, not a rate." /> + </inputs> + + <outputs> + <data format="tabular" name="output" /> + </outputs> + + <requirements> + <requirement type="package">pass</requirement> + <requirement type="binary">sed</requirement> + </requirements> + + <!-- we need to be able to set the seed for the random number generator + <tests> + <test> + <param name="input" ftype="gff" value="pass_input.gff"/> + <param name="min_window" value="2"/> + <param name="max_window" value="6"/> + <param name="false_num" value="5"/> + <output name="output" file="pass_output.tab"/> + </test> + </tests> + --> + + <help> +**Dataset formats** + +The input is in GFF_ format, and the output is tabular_. +(`Dataset missing?`_) + +.. _GFF: ${static_path}/formatHelp.html#gff +.. _tabular: ${static_path}/formatHelp.html#tab +.. _Dataset missing?: ${static_path}/formatHelp.html + +----- + +**What it does** + +PASS (Poisson Approximation for Statistical Significance) detects +significant transcription factor binding sites in the genome from +ChIP data. This is probably the only peak-calling method that +accurately controls the false-positive rate and FDR in ChIP data, +which is important given the huge discrepancy in results obtained +from different peak-calling algorithms. At the same time, this +method achieves a similar or better power than previous methods. + +<!-- we don't have wrapper support for the "prior" file yet +Another unique feature of this method is that it allows varying +thresholds to be used for peak calling at different genomic +locations. For example, if a position lies in an open chromatin +region, is depleted of nucleosome positioning, or a co-binding +protein has been detected within the neighborhood, then the position +is more likely to be bound by the target protein of interest, and +hence a lower threshold will be used to call significant peaks. +As a result, weak but real binding sites can be detected. +--> + +----- + +**Hints** + +- ChIP-Seq data: + + If the data is from ChIP-Seq, you need to convert the ChIP-Seq values + into z-scores before using this program. It is also recommended that + you group read counts within a neighborhood together, e.g. in tiled + windows of 30bp. In this way, the ChIP-Seq data will resemble + ChIP-chip data in format. + +- Choosing window size options: + + The window size is related to the probe tiling density. For example, + if the probes are tiled at every 100bp, then setting the smallest + window = 2 and largest window = 6 is appropriate, because the DNA + fragment size is around 300-500bp. + +----- + +**Example** + +- input file:: + + chr7 Nimblegen ID 40307603 40307652 1.668944 . . . + chr7 Nimblegen ID 40307703 40307752 0.8041307 . . . + chr7 Nimblegen ID 40307808 40307865 -1.089931 . . . + chr7 Nimblegen ID 40307920 40307969 1.055044 . . . 
+ chr7 Nimblegen ID 40308005 40308068 2.447853 . . . + chr7 Nimblegen ID 40308125 40308174 0.1638694 . . . + chr7 Nimblegen ID 40308223 40308275 -0.04796628 . . . + chr7 Nimblegen ID 40308318 40308367 0.9335709 . . . + chr7 Nimblegen ID 40308526 40308584 0.5143972 . . . + chr7 Nimblegen ID 40308611 40308660 -1.089931 . . . + etc. + + In GFF, a value of dot '.' is used to mean "not applicable". + +- output file:: + + ID Chr Start End WinSz PeakValue # of FPs FDR + 1 chr7 40310931 40311266 4 1.663446 0.248817 0.248817 + +----- + +**References** + +Zhang Y. (2008) +Poisson approximation for significance in genome-wide ChIP-chip tiling arrays. +Bioinformatics. 24(24):2825-31. Epub 2008 Oct 25. + +Chen KB, Zhang Y. (2010) +A varying threshold method for ChIP peak calling using multiple sources of information. +Submitted. + + </help> + <citations> + <citation type="doi">10.1093/bioinformatics/btn549</citation> + <citation type="doi">10.1093/bioinformatics/btq379</citation> + </citations> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/pass_wrapper.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phenotype_association/pass_wrapper.sh	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+export PATH=$PATH:$(dirname $0)
+
+input=$1
+min_window=$2
+max_window=$3
+false_num=$4
+output=$5
+
+pass "$input" "$min_window" "$max_window" "$false_num" "$output" >/dev/null
+sed -i -e 's/\t\t*/\t/g' "$output"
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/senatag.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phenotype_association/senatag.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,258 @@ +#!/usr/bin/env python +""" +This tool takes the following file pairs as input: +a) input_snp : A file with identifiers for SNPs (one on each line) +b) ldfile : A file where each line has the following + snp list + where "snp" is an identifier for one SNP and the "list" is a + comma separated list of all the other snps that are in LD with + it (as per some threshold of rsquare) + +The output is a set of tag SNPs for the given datasets + +The algorithm is as follows: + +a) Construct a graph for each population, where each node is a SNP and two nodes +are connected using an edge iff they are in LD. +b) For each SNP, count the total number of connected nodes, which have not yet +been visited. +c) Find the SNP with the highest count and assign it to be a tag SNP. +d) Mark that SNP and all the snps connected to it as "visited". This should be +done for each population. +e) Continue steps b-e until all SNPs, in all populations have been visited. +""" +from __future__ import print_function + +import heapq +import os + +from getopt import getopt, GetoptError +from sys import argv, exit, stderr + +__author__ = "Aakrosh Ratan" +__email__ = "ratan@bx.psu.edu" + +# do we want the debug information to be printed? +debug_flag = False + + +class node: + def __init__(self, name): + self.name = name + self.edges = [] + self.visited = False + + # return the number of nodes connected to this node, that have yet to be + # visited + def num_not_visited(self): + num = 0 + for n in self.edges: + if n.visited is False: + num += 1 + return num + + def __cmp__(self, other): + return other.num_not_visited() - self.num_not_visited() + + def __str__(self): + return self.name + + +class graph: + def __init__(self): + self.nodes = {} + + def __str__(self): + string = "" + for n1 in self.nodes.values(): + n2s = [x.name for x in n1.edges] + string += "%s %s\n" % (n1.name, ",".join(n2s)) + return string[:-1] + + def add_node(self, n): + self.nodes[n.name] = n + + def add_edges(self, n1, n2): + assert n1.name in self.nodes + assert n2.name in self.nodes + n1.edges.append(n2) + n2.edges.append(n1) + + def check_graph(self): + for n in self.nodes.values(): + ms = [x for x in n.edges] + for m in ms: + if n not in m.edges: + print("check : %s - %s" % (n, m), file=stderr) + + +def construct_graph(ldfile, snpfile): + # construct the initial graph. 
add all the SNPs as nodes + g = graph() + file = open(snpfile, "r") + + for line in file: + # ignore empty lines and add the remainder to the graph + if len(line.strip()) == 0: + continue + n = node(line.strip()) + g.add_node(n) + + file.close() + print("Added %d nodes to a graph" % len(g.nodes), file=stderr) + + # now add all the edges + file = open(ldfile, "r") + + for line in file: + tokens = line.split() + assert len(tokens) == 2 + + # if this node is in the graph, then we need to construct an edge from + # this node to all the nodes which are highly related to it + if tokens[0] in g.nodes: + n1 = g.nodes[tokens[0]] + n2s = [g.nodes[x] for x in tokens[1].split(",")] + + for n2 in n2s: + g.add_edges(n1, n2) + + file.close() + print("Added all edges to the graph", file=stderr) + + return g + + +def check_output(g, tagsnps): + # find all the nodes in the graph + allsnps = [x.name for x in g.nodes.values()] + + # find the nodes that are covered by our tagsnps + mysnps = [x.name for x in tagsnps] + + for n in tagsnps: + for m in n.edges: + mysnps.append(m.name) + + mysnps = list(set(mysnps)) + + if set(allsnps) != set(mysnps): + diff = list(set(allsnps) - set(mysnps)) + print("%s are not covered" % ",".join(diff), file=stderr) + + +def main(ldfile, snpsfile, required, excluded): + # construct the graph + g = construct_graph(ldfile, snpsfile) + if debug_flag is True: + g.check_graph() + + tagsnps = [] + neighbors = {} + + # take care of the SNPs that are required to be TagSNPs + for s in required: + t = g.nodes[s] + + t.visited = True + ns = [] + + for n in t.edges: + if n.visited is False: + ns.append(n.name) + n.visited = True + + tagsnps.append(t) + neighbors[t.name] = list(set(ns)) + + # find the tag SNPs for this graph + data = g.nodes.values()[:] + heapq.heapify(data) + + while data: + s = heapq.heappop(data) + + if s.visited is True or s.name in excluded: + continue + + s.visited = True + ns = [] + + for n in s.edges: + if n.visited is False: + ns.append(n.name) + n.visited = True + + tagsnps.append(s) + neighbors[s.name] = list(set(ns)) + + heapq.heapify(data) + + for s in tagsnps: + if len(neighbors[s.name]) > 0: + print("%s\t%s" % (s, ",".join(neighbors[s.name]))) + continue + print(s) + + if debug_flag is True: + check_output(g, tagsnps) + + +def read_list(filename): + assert os.path.exists(filename) + file = open(filename, "r") + list = {} + + for line in file: + list[line.strip()] = 1 + + file.close() + return list + + +def usage(): + f = stderr + print("usage:", file=f) + print("senatag [options] neighborhood.txt inputsnps.txt", file=f) + print("where inputsnps.txt is a file of snps from one population", file=f) + print("where neighborhood.txt is neighborhood details for the pop.", file=f) + print("where the options are:", file=f) + print("-h,--help : print usage and quit", file=f) + print("-d,--debug: print debug information", file=f) + print("-e,--excluded : file with names of SNPs that cannot be TagSNPs", file=f) + print("-r,--required : file with names of SNPs that should be TagSNPs", file=f) + + +if __name__ == "__main__": + try: + opts, args = getopt(argv[1:], "hdr:e:", + ["help", "debug", "required=", "excluded="]) + except GetoptError as err: + print(str(err)) + usage() + exit(2) + + required = {} + excluded = {} + + for o, a in opts: + if o in ("-h", "--help"): + usage() + exit() + elif o in ("-d", "--debug"): + debug_flag = True + elif o in ("-r", "--required"): + required = read_list(a) + elif o in ("-e", "--excluded"): + excluded = read_list(a) + else: + assert 
False, "unhandled option" + + if len(args) != 2: + usage() + exit(3) + + assert os.path.exists(args[0]) + assert os.path.exists(args[1]) + + main(args[0], args[1], required, excluded) |
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/sift.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phenotype_association/sift.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,180 @@
+<tool id="hgv_sift" name="SIFT" version="1.0.0">
+  <description>predictions of functional sites</description>
+
+  <command interpreter="bash">
+    sift_variants_wrapper.sh "$input" "$output" "${input.metadata.dbkey}" "${GALAXY_DATA_INDEX_DIR}/sift_db.loc" "$chrom_col" "$pos_col" "$base" "$allele_col" "$strand_source.strand_col" "$comment_source.comment_col" "$output_opts"
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Dataset">
+      <validator type="unspecified_build"/>
+      <validator type="dataset_metadata_in_file" filename="sift_db.loc" metadata_name="dbkey" metadata_column="0" message="Data is currently not available for the specified build."/>
+    </param>
+    <param name="chrom_col" type="data_column" data_ref="input" label="Column with chromosome"/>
+    <param name="pos_col" type="data_column" data_ref="input" numerical="true" label="Column with position"/>
+    <param name="base" type="select" label="Position coordinates are">
+      <option value="1" selected="true">one-based</option>
+      <option value="0">zero-based</option>
+    </param>
+    <param name="allele_col" type="data_column" data_ref="input" label="Column with allele"/>
+    <conditional name="strand_source">
+      <param name="strand_choice" type="select" label="Strand info">
+        <option value="data_column" selected="true">a column in the dataset</option>
+        <option value="all_pos">all on sense/forward/+ strand</option>
+        <option value="all_neg">all on antisense/reverse/- strand</option>
+      </param>
+      <when value="data_column">
+        <param name="strand_col" type="data_column" data_ref="input" label="Column with strand"/>
+      </when>
+      <when value="all_pos">
+        <param name="strand_col" type="hidden" value="+"/>
+      </when>
+      <when value="all_neg">
+        <param name="strand_col" type="hidden" value="-"/>
+      </when>
+    </conditional>
+    <conditional name="comment_source">
+      <param name="comment_choice" type="select" label="Include comment column">
+        <option value="no" selected="true">no</option>
+        <option value="yes">yes</option>
+      </param>
+      <when value="no">
+        <param name="comment_col" type="hidden" value="-"/>
+      </when>
+      <when value="yes">
+        <param name="comment_col" type="data_column" data_ref="input" label="Column with comment"/>
+      </when>
+    </conditional>
+    <param name="output_opts" type="select" multiple="true" display="checkboxes" label="Include the following additional fields in the output">
+      <option value="A">Ensembl Gene ID</option>
+      <option value="B">Gene Name</option>
+      <option value="C">Gene Description</option>
+      <option value="D">Ensembl Protein Family ID</option>
+      <option value="E">Ensembl Protein Family Description</option>
+      <option value="F">Ensembl Transcript Status (Known / Novel)</option>
+      <option value="G">Protein Family Size</option>
+      <option value="H">Ka/Ks (Human-mouse)</option>
+      <option value="I">Ka/Ks (Human-macaque)</option>
+      <option value="J">OMIM Disease</option>
+      <option value="K">Allele Frequencies (All Hapmap Populations - weighted average)</option>
+      <option value="L">Allele Frequencies (CEU Hapmap population)</option>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data format="tabular" name="output" />
+  </outputs>
+
+  <requirements>
+    <requirement type="binary">awk</requirement>
+    <requirement type="binary">rm</requirement>
+    <requirement type="binary">sed</requirement>
+  </requirements>
+
+  <tests>
+    <test>
+      <param name="input" value="sift_variants.tab" ftype="tabular" dbkey="hg18"/>
+      <param name="chrom_col" value="1"/>
+      <param name="pos_col" value="3"/>
+      <param name="base" value="1"/>
+      <param name="allele_col" value="5"/>
+      <param name="strand_choice" value="data_column"/>
+      <param name="st[...]g?: ${static_path}/formatHelp.html
+
+-----
+
+**What it does**
+
+SIFT predicts whether an amino-acid substitution affects protein function,
+based on sequence homology and the physical properties of amino acids.
+SIFT can be applied to naturally occurring non-synonymous polymorphisms
+and laboratory-induced missense mutations. This tool uses SQLite databases
+containing pre-computed SIFT scores and annotations for all possible nucleotide
+substitutions at each position in the human exome. Allele frequency data
+are from the HapMap frequency database, and additional transcript and
+gene-level data are from Ensembl BioMart.
+
+The input dataset must contain columns for the chromosome, position, and
+alleles. The alleles must be two nucleotides separated by '/',
+usually the reference allele and the allele of interest.
+The strand must either be in another column or all the same.
+The output contains a standard set of columns plus the additional ones that
+have been selected from the list above.
+
+Website: http://sift.jcvi.org/
+
+-----
+
+**Example**
+
+- input file::
+
+    chr3 81780820 + T/C
+    chr2 230341630 + G/A
+    chr2 43881517 + A/T
+    chr2 43857514 + T/C
+    chr6 88375602 + G/A
+    chr22 29307353 - T/A
+    chr10 115912482 - G/T
+    chr10 115900918 - C/T
+    chr16 69875502 + G/T
+    etc.
+
+- output file::
+
+    #Chrom Position Strand Allele Codons Transcript ID Protein ID Substitution Region dbSNP ID SNP Type Prediction Score Median Info Num seqs at position User Comment
+    chr3 81780820 + T/C AGA-gGA ENST00000264326 ENSP00000264326 R190G EXON CDS rs2229519:C Nonsynonymous DAMAGING 0.04 3.06 149
+    chr2 230341630 + G/T - ENST00000389045 ENSP00000373697 NA EXON CDS rs1803846:A Unknown Not scored NA NA NA
+    chr2 43881517 + A/T ATA-tTA ENST00000260605 ENSP00000260605 I230L EXON CDS rs11556157:T Nonsynonymous TOLERATED 0.47 3.19 7
+    chr2 43857514 + T/C TTT-TcT ENST00000260605 ENSP00000260605 F33S EXON CDS rs2288709:C Nonsynonymous TOLERATED 0.61 3.33 6
+    chr6 88375602 + G/A GTT-aTT ENST00000257789 ENSP00000257789 V217I EXON CDS rs2307389:A Nonsynonymous TOLERATED 0.75 3.17 13
+    chr22 29307353 + T/A ACC-tCC ENST00000335214 ENSP00000334612 T264S EXON CDS rs42942:A Nonsynonymous TOLERATED 0.4 3.14 23
+    chr10 115912482 + C/A CGA-CtA ENST00000369285 ENSP00000358291 R179L EXON CDS rs12782946:T Nonsynonymous TOLERATED 0.06 4.32 2
+    chr10 115900918 + G/A CAA-tAA ENST00000369287 ENSP00000358293 Q271* EXON CDS rs7095762:T Nonsynonymous N/A N/A N/A N/A
+    chr16 69875502 + G/T ACA-AaA ENST00000338099 ENSP00000337512 T608K EXON CDS rs3096381:T Nonsynonymous TOLERATED 0.12 3.41 3
+    etc.
+
+-----
+
+**References**
+
+Ng PC, Henikoff S. (2001) Predicting deleterious amino acid substitutions.
+Genome Res. 11(5):863-74.
+
+Ng PC, Henikoff S. (2002) Accounting for human polymorphisms predicted to affect protein function.
+Genome Res. 12(3):436-46.
+
+Ng PC, Henikoff S. (2003) SIFT: Predicting amino acid changes that affect protein function.
+Nucleic Acids Res. 31(13):3812-4.
+
+Kumar P, Henikoff S, Ng PC. (2009) Predicting the effects of coding non-synonymous variants
+on protein function using the SIFT algorithm.
+Nat Protoc. 4(7):1073-81. Epub 2009 Jun 25.
+
+  </help>
+  <citations>
+    <citation type="doi">10.1101/gr.176601</citation>
+    <citation type="doi">10.1101/gr.212802</citation>
+    <citation type="doi">10.1093/nar/gkg509</citation>
+    <citation type="doi">10.1038/nprot.2009.86</citation>
+  </citations>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/sift_variants_wrapper.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phenotype_association/sift_variants_wrapper.sh	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,184 @@ +#!/usr/bin/env bash + +input_file=$1 +output_file=$2 +org=$3 +db_loc=$4 +chrom_col=$5 +pos_col=$6 +base=$7 +allele_col=$8 +strand_col=$9 +comment_col=${10} +output_opts=${11} + +working_dir=$PWD +sift_input="$working_dir/sift_input.txt" +sift_output="$working_dir/sift_output.txt" + +################################################################################ +## make sure input file column selections are mutually exclusive ## +################################################################################ +ERROR=0 +declare -a col_use + +function check_col () { + local col=$1 + local use=$2 + local int=$3 + + if [ -n "${col//[0-9]}" ]; then + if [ $int -eq 1 ]; then + echo "ERROR: invalid value for $use column: $col" 1>&2 + ERROR=1 + fi + return + fi + + local cur=${col_use[$col]} + if [ -n "$cur" ]; then + echo "ERROR: $use column is the same as $cur column" 1>&2 + col_use[$col]="${cur},$use" + ERROR=1 + else + col_use[$col]=$use + fi +} + +check_col $chrom_col 'chromosome' 1 +check_col $pos_col 'position' 1 +check_col $allele_col 'allele' 1 +check_col $strand_col 'strand' 0 +check_col $comment_col 'comment' 0 + +if [ $ERROR -ne 0 ]; then + exit 1 +fi + +################################################################################ +## get/check the db directory from the argument org,db_loc ## +################################################################################ +db_dir=$( awk '$1 == org { print $2 }' org=$org $db_loc ) + +if [ -z "$db_dir" ]; then + echo "Can't find dbkey \"$org\" in loc file \"$db_loc\"" 1>&2 + exit 1 +fi + +if [ ! -d "$db_dir" ]; then + echo "Can't access SIFT database directory \"$db_dir\"" 1>&2 + exit 1 +fi + +################################################################################ +## create input file for SIFT_exome_nssnvs.pl ## +################################################################################ +if [ ! -r "$input_file" ]; then + echo "Can't read input file \"$input_file\"" 1>&2 + exit 1 +fi + +if [ $base -eq 0 ]; then + beg_col="$pos_col" + end_col="$pos_col + 1" + pos_adj='$2 = $2 - 1' +else + beg_col="$pos_col - 1" + end_col="$pos_col" + pos_adj='' +fi + +strand_cvt='' +if [ \( "$strand_col" = "+" \) ]; then + strand='"1"' +elif [ \( "$strand_col" = "-" \) ]; then + strand='"-1"' +else + strand="\$$strand_col" + strand_cvt='if ('"${strand}"' == "+") {'"${strand}"' = "1"} else if ('"${strand}"' == "-") {'"${strand}"' = "-1"}' +fi + +print_row='print $'"${chrom_col}"', $'"${beg_col}"', $'"${end_col}"', '"${strand}"', $'"${allele_col}"'' +if [ "$comment_col" != "-" ]; then + print_row=''"${print_row}"', $'"${comment_col}"'' +fi + +awk ' +BEGIN {FS="\t";OFS=","} +$'"${chrom_col}"' ~ /^[cC][hH][rR]/ {$'"${chrom_col}"' = substr($'"${chrom_col}"',4)} +{ + '"${strand_cvt}"' + '"${print_row}"' +} +' "$input_file" > "$sift_input" + +################################################################################ +## run SIFT_exome_nssnvs.pl command line program ## +################################################################################ +if [ "$output_opts" = "None" ]; then + output_opts="" +else + output_opts=$( echo "$output_opts" | sed -e 's/,/ 1 -/g' ) + output_opts="-$output_opts 1" +fi + +SIFT_exome_nssnvs.pl -i "$sift_input" -d "$db_dir" -o "$working_dir" $output_opts &> "$sift_output" +if [ $? 
-ne 0 ]; then + echo "failed: SIFT_exome_nssnvs.pl -i \"$sift_input\" -d \"$db_dir\" -o \"$working_dir\" $output_opts" + exit 1 +fi + +################################################################################ +## locate the SIFT_exome_nssnvs.pl output file ## +################################################################################ +sift_pid=$( sed -n -e 's/^.*Your job id is \([0-9][0-9]*\) and is currently running.*$/\1/p' "$sift_output" ) + +if [ -z "$sift_pid" ]; then + echo "Can't find SIFT pid in \"$sift_output\"" 1>&2 + exit 1 +fi + +sift_outdir="$working_dir/$sift_pid" +if [ ! -d "$sift_outdir" ]; then + echo "Can't access SIFT output directory \"$sift_outdir\"" 1>&2 + exit 1 +fi + +sift_outfile="$sift_outdir/${sift_pid}_predictions.tsv" +if [ ! -r "$sift_outfile" ]; then + echo "Can't access SIFT output file \"$sift_outfile\"" 1>&2 + exit 1 +fi + +################################################################################ +## create galaxy output file ## +################################################################################ +awk ' +BEGIN {FS="\t";OFS="\t"} +NR == 1 { + $12 = "Num seqs at position" + $1 = "Chrom\tPosition\tStrand\tAllele" +} +NR != 1 { + $1 = "chr" $1 + gsub(/,/, "\t", $1) +} +' "$sift_outfile" | awk ' +BEGIN {FS="\t";OFS="\t"} +NR == 1 { + print "#" $0 +} +NR != 1 { + if ($3 == "1") {$3 = "+"} else if ($3 == "-1") {$3 = "-"} + '"${pos_adj}"' +} +' > "$output_file" + +################################################################################ +## cleanup ## +################################################################################ +rm -rf "$sift_outdir" "$sift_input" "$sift_output" + |
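The awk program that sift_variants_wrapper.sh assembles from shell fragments performs a per-row rewrite of the input dataset. A hypothetical Python equivalent of that transformation, for readability (the function name and argument layout are illustrative; column numbers are the wrapper's 1-based arguments)::

    # Sketch (assumption: Python stand-in for the generated awk).  Converts
    # one tab-separated input row into the comma-separated row expected by
    # SIFT_exome_nssnvs.pl: chrom,start,end,strand,allele
    def to_sift_row(fields, chrom_col, pos_col, allele_col, strand, base=1):
        chrom = fields[chrom_col - 1]
        if chrom.lower().startswith("chr"):
            chrom = chrom[3:]                       # strip leading "chr"
        pos = int(fields[pos_col - 1])
        beg, end = (pos, pos + 1) if base == 0 else (pos - 1, pos)
        strand = {"+": "1", "-": "-1"}.get(strand, strand)
        return ",".join([chrom, str(beg), str(end), strand,
                         fields[allele_col - 1]])

    print(to_sift_row(["chr3", "x", "81780820", "x", "T/C"], 1, 3, 5, "+"))
    # -> 3,81780819,81780820,1,T/C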
diff -r 000000000000 -r 7621d36a4e9c phenotype_association/vcf2pgSnpMult.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phenotype_association/vcf2pgSnpMult.pl	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,81 @@ +#!/usr/bin/perl -w +use strict; + +#convert from a vcf file to a pgSnp file with multiple sets of the allele +# specific columns +#frequency count = chromosome count + +my $in; +my $stCol = 9; +my $endCol; +if (@ARGV && scalar @ARGV == 1) { + $in = shift @ARGV; +}else { + print "usage: vcf2pgSnpMult.pl file.vcf > file.pgSnpMult\n"; + exit; +} + +if ($in =~ /.gz$/) { + open(FH, "zcat $in |") or die "Couldn't open $in, $!\n"; +}else { + open(FH, $in) or die "Couldn't open $in, $!\n"; +} +while (<FH>) { + chomp; + if (/^\s*#/) { next; } #skip comments/headers + if (/^\s*$/) { next; } #skip blank lines + my @f = split(/\t/); + #chr pos1base ID refNt altNt[,|D#|Int] quality filter info format geno1 ... + my $a; + my %nt; + my %all; + my $cnt = 0; + my $var; + if ($f[3] eq 'N') { next; } #ignore ref=N + if ($f[4] =~ /[DI]/ or $f[3] =~ /[DI]/) { next; } #don't do microsatellite + if ($f[6] && !($f[6] eq '.' or $f[6] eq 'PASS')) { next; } #filtered for some reason + my $ind = 0; + if ($f[8] ne 'GT') { #more than just genotype + my @t = split(/:/, $f[8]); + foreach (@t) { if ($_ eq 'GT') { last; } $ind++; } + if ($ind == 0 && $f[8] !~ /^GT/) { die "ERROR couldn't find genotype in format $f[8]\n"; } + } + if (!$endCol) { $endCol = $#f; } + #put f[3] => nt{0} and split f[4] for rest of nt{} + $nt{0} = $f[3]; + my @t = split(/,/, $f[4]); + for (my $i=0; $i<=$#t; $i++) { + my $j = $i + 1; + $nt{$j} = $t[$i]; + } + if ($f[0] !~ /chr/) { $f[0] = "chr$f[0]"; } + print "$f[0]\t", ($f[1]-1), "\t$f[1]"; #position info + foreach my $col ($stCol .. $endCol) { #add each individual (4 columns) + if ($ind > 0) { + my @t = split(/:/, $f[$col]); + $f[$col] = $t[$ind] . ":"; #only keep genotype part + } + print "\t"; + if ($f[$col] =~ /^(\d).(\d)/) { + my $a1 = $1; + my $a2 = $2; + if (!exists $nt{$a1}) { die "ERROR bad allele $a1 in $f[3] $f[4]\n"; } + if (!exists $nt{$a2}) { die "ERROR bad allele $a2 in $f[3] $f[4]\n"; } + if ($a1 eq $a2) { #homozygous + print "$nt{$a1}\t1\t2\t0"; + }else { #heterozygous + print "$nt{$a1}/$nt{$a2}\t2\t1,1\t0,0"; + } + }elsif ($f[$col] =~ /^(\d):/) { #chrY or male chrX, single + my $a1 = $1; + if (!exists $nt{$a1}) { die "ERROR bad allele $a1 in $f[3] $f[4]\n"; } + print "$nt{$a1}\t1\t1\t0"; + }else { #don't know how to parse + die "ERROR unknown genotype $f[$col]\n"; + } + } + print "\n"; #end this SNP +} +close FH; + +exit; |
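The genotype branch of vcf2pgSnpMult.pl maps each VCF genotype field to the four pgSnp allele columns: a homozygote becomes one allele with count 2, a heterozygote becomes "A/G" with counts "1,1", and a single haploid call (chrY or male chrX) becomes one allele with count 1. A sketch of the same mapping in Python (illustrative stand-in, not the shipped code; nt mirrors the script's %nt hash of allele index to nucleotide)::

    # Sketch (assumption) of the Perl genotype branch above.
    def pgsnp_cols(gt, nt):
        a = [int(c) for c in gt.replace("|", "/").split("/") if c.isdigit()]
        if len(a) == 2 and a[0] == a[1]:                 # homozygous
            return "%s\t1\t2\t0" % nt[a[0]]
        if len(a) == 2:                                  # heterozygous
            return "%s/%s\t2\t1,1\t0,0" % (nt[a[0]], nt[a[1]])
        return "%s\t1\t1\t0" % nt[a[0]]                  # haploid call

    print(pgsnp_cols("0/1", {0: "T", 1: "G"}))  # -> "T/G", 2, "1,1", "0,0"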
diff -r 000000000000 -r 7621d36a4e9c plotting/bar_chart.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/plotting/bar_chart.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+"""
+histogram_gnuplot.py <datafile> <xtic_column> <column_list> <title> <ylabel> <yrange_min> <yrange_max> <graph_file> <img_size>
+a generic histogram builder based on a gnuplot backend
+
+ data_file - tab delimited file with data
+ xtic_column - column containing labels for x ticks [integer, 0 means no ticks]
+ column_list - comma separated list of columns to plot
+ title - title for the entire histogram
+ ylabel - y axis label
+ yrange_min - minimal value at the y axis (integer)
+ yrange_max - maximal value at the y axis (integer)
+ to set yrange to autoscaling assign 0 to yrange_min and yrange_max
+ graph_file - file to write histogram image to
+ img_size - as X,Y pair in pixels (e.g., 800,600 or 600,800 etc.)
+
+
+ This tool requires gnuplot and gnuplot.py
+
+anton nekrutenko | anton@bx.psu.edu
+"""
+
+import string
+import sys
+import tempfile
+
+import Gnuplot
+import Gnuplot.funcutils
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+
+def main(tmpFileName):
+    skipped_lines_count = 0
+    skipped_lines_index = []
+    gf = open(tmpFileName, 'w')
+
+    try:
+        in_file = open( sys.argv[1], 'r' )
+        xtic = int( sys.argv[2] )
+        col_list = string.split( sys.argv[3], "," )
+        title = 'set title "' + sys.argv[4] + '"'
+        ylabel = 'set ylabel "' + sys.argv[5] + '"'
+        ymin = sys.argv[6]
+        ymax = sys.argv[7]
+        img_file = sys.argv[8]
+        img_size = sys.argv[9]
+    except:
+        stop_err("Check arguments\n")
+
+    try:
+        int( col_list[0] )
+    except:
+        stop_err('You forgot to set columns for plotting\n')
+
+    for i, line in enumerate( in_file ):
+        valid = True
+        line = line.rstrip('\r\n')
+        if line and not line.startswith( '#' ):
+            row = []
+            try:
+                fields = line.split( '\t' )
+                for col in col_list:
+                    row.append( str( float( fields[int( col ) - 1] ) ) )
+            except:
+                valid = False
+                skipped_lines_count += 1
+                skipped_lines_index.append(i)
+        else:
+            valid = False
+            skipped_lines_count += 1
+            skipped_lines_index.append(i)
+
+        if valid and xtic > 0:
+            row.append( fields[xtic - 1] )
+        elif valid and xtic == 0:
+            row.append( str( i ) )
+
+        if valid:
+            gf.write( '\t'.join( row ) )
+            gf.write( '\n' )
+
+    if skipped_lines_count < i:
+        # Prepare 'using' clause of plot statement
+        g_plot_command = ' '
+
+        # Set the first column
+        if xtic > 0:
+            g_plot_command = "'%s' using 1:xticlabels(%s) ti 'Column %s', " % ( tmpFileName, str( len( row ) ), col_list[0] )
+        else:
+            g_plot_command = "'%s' using 1 ti 'Column %s', " % ( tmpFileName, col_list[0] )
+
+        # Set subsequent columns
+        for i in range(1, len(col_list)):
+            g_plot_command += "'%s' using %s t 'Column %s', " % ( tmpFileName, str(i + 1), col_list[i] )
+
+        g_plot_command = g_plot_command.rstrip( ', ' )
+
+        yrange = 'set yrange [' + ymin + ":" + ymax + ']'
+
+        try:
+            g = Gnuplot.Gnuplot()
+            g('reset')
+            g('set boxwidth 0.9 absolute')
+            g('set style fill solid 1.00 border -1')
+            g('set style histogram clustered gap 5 title offset character 0, 0, 0')
+            g('set xtics border in scale 1,0.5 nomirror rotate by 90 offset character 0, 0, 0')
+            g('set key invert reverse Left outside')
+            if xtic == 0:
+                g('unset xtics')
+            g(title)
+            g(ylabel)
+            g_term = 'set terminal png tiny size ' + img_size
+            g(g_term)
+            g_out = 'set output "' + img_file + '"'
+            if ymin != ymax:
+                g(yrange)
+            g(g_out)
+            g('set style data histograms')
+            g.plot(g_plot_command)
+        except:
+            stop_err("Gnuplot error: Data cannot be plotted")
+    else:
+        sys.stderr.write('Column(s) %s of your dataset do not contain valid numeric data' % sys.argv[3])
+
+    if skipped_lines_count > 0:
+        sys.stdout.write('\nWARNING: Your dataset contains %d invalid lines starting with line #%d. These lines were skipped while building the graph.\n' % ( skipped_lines_count, skipped_lines_index[0] + 1 ) )
+
+
+if __name__ == "__main__":
+    # The tempfile initialization is here because doing it inside main() seems to create a condition
+    # where the file is removed before gnuplot has a chance of accessing it
+    gp_data_file = tempfile.NamedTemporaryFile('w')
+    Gnuplot.gp.GnuplotOpts.default_term = 'png'
+    main(gp_data_file.name)
diff -r 000000000000 -r 7621d36a4e9c plotting/bar_chart.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/plotting/bar_chart.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,57 @@ +<tool id="barchart_gnuplot" name="Bar chart" version="1.0.0"> + <description>for multiple columns</description> + <command interpreter="python"> + #if $xtic.userSpecified == "Yes" #bar_chart.py $input $xtic.xticColumn $colList "$title" "$ylabel" $ymin $ymax $out_file1 "$pdf_size" + #else #bar_chart.py $input 0 $colList "$title" "$ylabel" $ymin $ymax $out_file1 "$pdf_size" + #end if + </command> + <inputs> + <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? See TIP below"/> + <conditional name="xtic"> + <param name="userSpecified" type="select" label="Use X Tick labels?" help="see example below"> + <option value="Yes">Yes</option> + <option value="No">No</option> + </param> + <when value="Yes"> + <param name="xticColumn" type="data_column" data_ref="input" numerical="False" label="Use this column for X Tick labels" /> + </when> + <when value="No"> + </when> + </conditional> + <param name="colList" label="Numerical columns" type="data_column" numerical="True" multiple="True" data_ref="input" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" /> + <param name="title" type="text" size="30" value="Bar Chart" label="Plot title"/> + <param name="ylabel" type="text" size="30" value="V1" label="Label for Y axis"/> + <param name="ymin" type="integer" size="4" value="0" label="Minimal value on Y axis" help="set to 0 for autoscaling"/> + <param name="ymax" type="integer" size="4" value="0" label="Maximal value on Y axis" help="set to 0 for autoscaling"/> + <param name="pdf_size" type="select" label="Choose chart size (pixels)"> + <option value="800,600">Normal: 800 by 600</option> + <option value="640,480">Small: 640 by 480</option> + <option value="1480,800">Large: 1480 by 800</option> + <option value="600,800">Normal Flipped: 600 by 800</option> + <option value="480,640">Small Flipped: 480 by 640</option> + <option value="800,1480">Large Flipped: 800 by 1480</option> + </param> + </inputs> + <outputs> + <data format="png" name="out_file1" /> + </outputs> + <requirements> + <requirement type="package">gnuplot-py</requirement> + </requirements> + <help> +**What it does** + +This tool builds a bar chart on one or more columns. Suppose you have dataset like this one:: + + Gene1 10 15 + Gene2 20 14 + Gene3 67 45 + Gene4 55 12 + +Graphing columns 2 and 3 while using column 1 for X Tick Labels will produce the following plot: + +.. image:: ${static_path}/images/bar_chart.png + :height: 324 + :width: 540 +</help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c plotting/boxplot.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/plotting/boxplot.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,111 @@ +<tool id="qual_stats_boxplot" name="Boxplot" version="1.0.0"> + <description>of quality statistics</description> + <command>gnuplot < '$gnuplot_commands' 2>&1 || echo "Error running gnuplot." >&2</command> + <requirements> + <requirement type="package" version="4.6">gnuplot</requirement> + </requirements> + <inputs> + <param name="input_file" type="data" format="tabular" label="Quality Statistics File"/> + <param name="title" type="text" value="Box plot in Galaxy" label="Title for plot" size="50"/> + <param name="graph_size" type="text" value="2048,768" label="Dimensions of Graph"/> + <param name="xlabel" type="text" value="X Axis Label" label="X axis label" size="50"/> + <param name="ylabel" type="text" value="Score Value" label="Y axis label" size="50"/> + <param name="xcol" type="data_column" data_ref="input_file" label="Column for X axis position" default_value="1" help="A unique number; c1 if plotting output of FASTQ summary"/> + <param name="q1col" type="data_column" data_ref="input_file" label="Column for Q1" default_value="7" help="c7 if plotting output of FASTQ summary"/> + <param name="medcol" type="data_column" data_ref="input_file" label="Column for Median" default_value="8" help="c8 if plotting output of FASTQ summary"/> + <param name="q3col" type="data_column" data_ref="input_file" label="Column for Q3" default_value="9" help="c9 if plotting output of FASTQ summary"/> + <param name="lwcol" type="data_column" data_ref="input_file" label="Column for left whisker" default_value="11" help="c11 if plotting output of FASTQ summary"/> + <param name="rwcol" type="data_column" data_ref="input_file" label="Column for right whisker" default_value="12" help="c12 if plotting output of FASTQ summary"/> + <conditional name="use_outliers"> + <param name="use_outliers_type" type="select" label="Plot Outliers"> + <option value="use_outliers" selected="true">Plot Outliers</option> + <option value="dont_use_outliers">Don't Plot Outliers</option> + </param> + <when value="use_outliers"> + <param name="outliercol" type="data_column" data_ref="input_file" label="Column for Outliers" default_value="13" help="c13 if plotting output of FASTQ summary"/> + </when> + <when value="dont_use_outliers"> + </when> + </conditional> + </inputs> + <configfiles> + <configfile name="gnuplot_commands"> +set output '$output_file' +set term png size ${graph_size} +set boxwidth 0.8 +set key right tmargin +set xlabel "${xlabel}" +set ylabel "${ylabel}" +set title "${title}" +set xtics 1 +set ytics 1 +set grid ytics +set offsets 1, 1, 1, 1 +plot '${input_file}' using ${xcol}:${q1col}:${lwcol}:${rwcol}:${q3col} with candlesticks lt 1 lw 1 title 'Quartiles' whiskerbars, \ + '' using ${xcol}:${medcol}:${medcol}:${medcol}:${medcol} with candlesticks lt -1 lw 2 title 'Medians'\ +#if str( $use_outliers['use_outliers_type'] ) == 'use_outliers': +, "< python -c \"for xval, yvals in [ ( fields[${xcol} - 1], fields[${use_outliers['outliercol']} - 1].split( ',' ) ) for fields in [ line.rstrip( '\\n\\r' ).split( '\\t' ) for line in open( '${input_file}' ) if not line.startswith( '#' ) ] if len( fields ) > max( ${xcol} - 1, ${use_outliers['outliercol']} - 1 ) ]: print '\\n'.join( [ '%s\\t%s' % ( xval, yval ) for yval in yvals if yval ] )\"" using 1:2 with points pt 29 title 'Outliers' +#end if + </configfile> + </configfiles> + <outputs> + <data name="output_file" format="png" /> + </outputs> + <tests> + <test> + <param name="input_file" value="fastq_stats_1_out.tabular" ftype="tabular" /> + <param name="title" 
value="Boxplot of Summary Statistics for Sanger Reads" /> + <param name="graph_size" value="2048,768" /> + <param name="xlabel" value="Read Column" /> + <param name="ylabel" value="Quality Score Value" /> + <param name="xcol" value="1" /> + <param name="q1col" value="7" /> + <param name="medcol" value="8" /> + <param name="q3col" value="9" /> + <param name="lwcol" value="11" /> + <param name="rwcol" value="12" /> + <param name="use_outliers_type" value="use_outliers" /> + <param name="outliercol" value="13" /> + <output name="output_file" file="boxplot_summary_statistics_out.png" /> + </test> + </tests> + <help> + +**What it does** + +Creates a boxplot graph. Its main purpose is to display a distribution of quality scores produced by *NGS: QC and maniupulation -> FASTQ Summary Statistics* tool. + +.. class:: warningmark + +**TIP:** If you want to display a distribution of quality scores produced by *NGS: QC and maniupulation -> FASTQ Summary Statistics* and the column assignments within the tool's interface are not automatically set (they will all read "c1" in that case) set columns manually to the following values:: + + Column for X axis c1 + Column for Q1 c7 + Column for Median c8 + Column for Q3 c9 + Column for left whisker c11 + Column for right whisker c12 + Column for Outliers c13 + +----- + +**Output Example** + +* Black horizontal lines are medians +* Rectangular red boxes show the Inter-quartile Range (IQR) (top value is Q3, bottom value is Q1) +* Whiskers show outliers at max. 1.5*IQR + +.. image:: ${static_path}/images/solid_qual.png + +------ + +**Citation** + +If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ + + + </help> + <citations> + <citation type="doi">10.1093/bioinformatics/btq281</citation> + </citations> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c solid_tools/maq_cs_wrapper.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/solid_tools/maq_cs_wrapper.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,273 @@
+#!/usr/bin/env python
+# Guruprasad Ananda
+# MAQ mapper for SOLiD colourspace-reads
+from __future__ import print_function
+
+import os
+import subprocess
+import sys
+import tempfile
+
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+
+def __main__():
+    out_fname = sys.argv[1].strip()
+    out_f2 = open(sys.argv[2].strip(), 'r+')
+    ref_fname = sys.argv[3].strip()
+    f3_read_fname = sys.argv[4].strip()
+    f3_qual_fname = sys.argv[5].strip()
+    paired = sys.argv[6]
+    if paired == 'yes':
+        r3_read_fname = sys.argv[7].strip()
+        r3_qual_fname = sys.argv[8].strip()
+    min_mapqual = int(sys.argv[9].strip())
+    max_mismatch = int(sys.argv[10].strip())
+    out_f3name = sys.argv[11].strip()
+    subprocess_dict = {}
+
+    ref_csfa = tempfile.NamedTemporaryFile()
+    ref_bfa = tempfile.NamedTemporaryFile()
+    ref_csbfa = tempfile.NamedTemporaryFile()
+    cmd2_1 = 'maq fasta2csfa %s > %s 2>&1' % (ref_fname, ref_csfa.name)
+    cmd2_2 = 'maq fasta2bfa %s %s 2>&1' % (ref_csfa.name, ref_csbfa.name)
+    cmd2_3 = 'maq fasta2bfa %s %s 2>&1' % (ref_fname, ref_bfa.name)
+    try:
+        os.system(cmd2_1)
+        os.system(cmd2_2)
+        os.system(cmd2_3)
+    except Exception as erf:
+        stop_err(str(erf) + "Error processing reference sequence")
+
+    if paired == 'yes':  # paired end reads
+        tmpf = tempfile.NamedTemporaryFile()  # forward reads
+        tmpr = tempfile.NamedTemporaryFile()  # reverse reads
+        tmps = tempfile.NamedTemporaryFile()  # single reads
+        tmpffastq = tempfile.NamedTemporaryFile()
+        tmprfastq = tempfile.NamedTemporaryFile()
+        tmpsfastq = tempfile.NamedTemporaryFile()
+
+        cmd1 = "solid2fastq_modified.pl 'yes' %s %s %s %s %s %s %s 2>&1" % (tmpf.name, tmpr.name, tmps.name, f3_read_fname, f3_qual_fname, r3_read_fname, r3_qual_fname)
+        try:
+            os.system(cmd1)
+            os.system('gunzip -c %s >> %s' % (tmpf.name, tmpffastq.name))
+            os.system('gunzip -c %s >> %s' % (tmpr.name, tmprfastq.name))
+            os.system('gunzip -c %s >> %s' % (tmps.name, tmpsfastq.name))
+
+        except Exception as eq:
+            stop_err("Error converting data to fastq format." + str(eq))
+
+        # Make a temp directory where the split fastq files will be stored
+        try:
+            split_dir = tempfile.mkdtemp()
+            split_file_prefix_f = tempfile.mktemp(dir=split_dir)
+            split_file_prefix_r = tempfile.mktemp(dir=split_dir)
+            splitcmd_f = 'split -a 2 -l %d %s %s' % (32000000, tmpffastq.name, split_file_prefix_f)  # 32M lines correspond to 8M reads
+            splitcmd_r = 'split -a 2 -l %d %s %s' % (32000000, tmprfastq.name, split_file_prefix_r)  # 32M lines correspond to 8M reads
+
+            os.system(splitcmd_f)
+            os.system(splitcmd_r)
+            os.chdir(split_dir)
+            ii = 0
+            for fastq in os.listdir(split_dir):
+                if not fastq.startswith(split_file_prefix_f.split("/")[-1]):
+                    continue
+                fastq_r = split_file_prefix_r + fastq.split(split_file_prefix_f.split("/")[-1])[1]  # find the reverse strand fastq corresponding to forward strand fastq
+                tmpbfq_f = tempfile.NamedTemporaryFile()
+                tmpbfq_r = tempfile.NamedTemporaryFile()
+                cmd3 = 'maq fastq2bfq %s %s 2>&1; maq fastq2bfq %s %s 2>&1; maq map -c %s.csmap %s %s %s 1>/dev/null 2>&1; maq mapview %s.csmap > %s.txt' % (fastq, tmpbfq_f.name, fastq_r, tmpbfq_r.name, fastq, ref_csbfa.name, tmpbfq_f.name, tmpbfq_r.name, fastq, fastq)
+                subprocess_dict['sp' + str(ii + 1)] = subprocess.Popen([cmd3], shell=True, stdout=subprocess.PIPE)
+                ii += 1
+            while True:
+                all_done = True
+                for j, k in enumerate(subprocess_dict.keys()):
+                    if subprocess_dict['sp' + str(j + 1)].wait() != [...]eup.name)
+        os.system(cmdpileup)
+        tmppileup.seek(0)
+        print("#chr\tposition\tref_nt\tcoverage\tSNP_count\tA_count\tT_count\tG_count\tC_count", file=out_f2)
+        for line in open(tmppileup.name):
+            elems = line.strip().split()
+            ref_nt = elems[2].capitalize()
+            read_nt = elems[4]
+            coverage = int(elems[3])
+            a, t, g, c = 0, 0, 0, 0
+            ref_nt_count = 0
+            for ch in read_nt:
+                ch = ch.capitalize()
+                if ch not in ['A', 'T', 'G', 'C', ',', '.']:
+                    continue
+                if ch in [',', '.']:
+                    ch = ref_nt
+                    ref_nt_count += 1
+                try:
+                    nt_ind = ['A', 'T', 'G', 'C'].index(ch)
+                    if nt_ind == 0:
+                        a += 1
+                    elif nt_ind == 1:
+                        t += 1
+                    elif nt_ind == 2:
+                        g += 1
+                    else:
+                        c += 1
+                except:
+                    pass
+            print("%s\t%s\t%s\t%s\t%s\t%s" % ("\t".join(elems[:4]), coverage - ref_nt_count, a, t, g, c), file=out_f2)
+    except Exception as er2:
+        stop_err("Encountered error while mapping: %s" % (str(er2)))
+
+    # Build custom track from pileup
+    chr_list = []
+    out_f2.seek(0)
+    fcov = tempfile.NamedTemporaryFile()
+    fout_a = tempfile.NamedTemporaryFile()
+    fout_t = tempfile.NamedTemporaryFile()
+    fout_g = tempfile.NamedTemporaryFile()
+    fout_c = tempfile.NamedTemporaryFile()
+    fcov.write('''track type=wiggle_0 name="Coverage track" description="Coverage track (from Galaxy)" color=0,0,0 visibility=2\n''')
+    fout_a.write('''track type=wiggle_0 name="Track A" description="Track A (from Galaxy)" color=255,0,0 visibility=2\n''')
+    fout_t.write('''track type=wiggle_0 name="Track T" description="Track T (from Galaxy)" color=0,255,0 visibility=2\n''')
+    fout_g.write('''track type=wiggle_0 name="Track G" description="Track G (from Galaxy)" color=0,0,255 visibility=2\n''')
+    fout_c.write('''track type=wiggle_0 name="Track C" description="Track C (from Galaxy)" color=255,0,255 visibility=2\n''')
+
+    for line in out_f2:
+        if line.startswith("#"):
+            continue
+        elems = line.split()
+        chr = elems[0]
+
+        if chr not in chr_list:
+            chr_list.append(chr)
+            if not (chr.startswith('chr') or chr.startswith('scaffold')):
+                chr = 'chr'
+            header = "variableStep chrom=%s" % (chr)
+            fcov.write("%s\n" % (header))
+            fout_a.write("%s\n" % (header))
+            fout_t.write("%s\n" % (header))
+            fout_g.write("%s\n" % (header))
+            fout_c.write("%s\n" % (header))
+        try:
+            pos = int(elems[1])
+            cov = int(elems[3])
+            a = int(elems[5])
+            t = int(elems[6])
+            g = int(elems[7])
+            c = int(elems[8])
+        except:
+            continue
+        fcov.write("%s\t%s\n" % (pos, cov))
+        try:
+            a_freq = a * 100. / cov
+            t_freq = t * 100. / cov
+            g_freq = g * 100. / cov
+            c_freq = c * 100. / cov
+        except ZeroDivisionError:
+            a_freq = t_freq = g_freq = c_freq = 0
+        fout_a.write("%s\t%s\n" % (pos, a_freq))
+        fout_t.write("%s\t%s\n" % (pos, t_freq))
+        fout_g.write("%s\t%s\n" % (pos, g_freq))
+        fout_c.write("%s\t%s\n" % (pos, c_freq))
+
+    fcov.seek(0)
+    fout_a.seek(0)
+    fout_g.seek(0)
+    fout_t.seek(0)
+    fout_c.seek(0)
+    os.system("cat %s %s %s %s %s | cat > %s" % (fcov.name, fout_a.name, fout_t.name, fout_g.name, fout_c.name, out_f3name))
+
+
+if __name__ == "__main__":
+    __main__()
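The pileup loop in maq_cs_wrapper.py tallies per-base calls at each position, counting ',' and '.' as matches to the reference base; the SNP count it reports is coverage minus those reference matches. A compact sketch of that tally (illustrative helper, not the shipped code)::

    # Count A/T/G/C calls at one pileup position, mapping the pileup match
    # symbols ',' and '.' to the reference base before counting.
    def count_bases(ref_nt, read_nt):
        counts = {"A": 0, "T": 0, "G": 0, "C": 0}
        ref_matches = 0
        for ch in read_nt.upper():
            if ch in (",", "."):
                ch = ref_nt.upper()
                ref_matches += 1
            if ch in counts:
                counts[ch] += 1
        return counts, ref_matches

    print(count_bases("g", "..,aT"))
    # -> ({'A': 1, 'T': 1, 'G': 3, 'C': 0}, 3)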
diff -r 000000000000 -r 7621d36a4e9c solid_tools/maq_cs_wrapper.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/solid_tools/maq_cs_wrapper.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,120 @@ +<tool id="maq_cs_wrapper" name="MAQ for SOLiD" version="1.0.0"> + <description> </description> + <command interpreter="python"> + maq_cs_wrapper.py + $output1 + $output2 + $ref + $library_type.f3_reads + $library_type.f3_qual + $library_type.is_paired + #if $library_type.is_paired == "yes": + $library_type.r3_reads + $library_type.r3_qual + #else: + "None" + "None" + #end if + $min_mapqual + $max_mismatch + $output3 + + </command> + + <inputs> + <param name="ref" type="data" format="fasta" label="Target Genome"/> + <conditional name="library_type"> + <param name="is_paired" type="select" label="Is the library mate-paired?" multiple="false"> + <option value="no">No</option> + <option value="yes">Yes</option> + </param> + <when value="no"> + <param name="f3_reads" type="data" format="csfasta" label="F3 reads file"/> + <param format="qualsolid" name="f3_qual" type="data" label="F3 quality file" help="If your dataset doesn't show up in the menu, click the pencil icon next to your dataset and set the datatype to 'qualsolid'" /> + </when> + <when value="yes"> + <param name="f3_reads" type="data" format="csfasta" label="F3 reads file"/> + <param format="qualsolid" name="f3_qual" type="data" label="F3 quality file" help="If your dataset doesn't show up in the menu, click the pencil icon next to your dataset and set the datatype to 'qualsolid'" /> + <param name="r3_reads" type="data" format="csfasta" label="R3 reads file"/> + <param format="qualsolid" name="r3_qual" type="data" label="R3 quality file" help="If your dataset doesn't show up in the menu, click the pencil icon next to your dataset and set the datatype to 'qualsolid'" /> + </when> + </conditional> + <param name="min_mapqual" type="integer" size="3" value="0" label="Minimum mapping quality allowed for a read to be used" help="Reads below the specified mapping quality will not be considered in coverage and SNP analysis."/> + <param name="max_mismatch" type="integer" size="3" value="7" label="Maximum number of mismatches allowed for a read to be used" help="Reads above the specified threshold will not be considered in coverage and SNP analysis."/> + </inputs> + <outputs> + <data format="tabular" name="output1" metadata_source="ref" /> + <data format="tabular" name="output2" metadata_source="ref" /> + <data format="customtrack" name="output3" metadata_source="ref" /> + </outputs> + + <!-- "ToolTestCase does not deal with multiple outputs properly yet." + <tests> + + <test> + <param name="ref" value="phiX_mod.fasta" /> + <param name="is_paired" value="no" /> + <param name="f3_reads" value="phiX_solid.csfasta" /> + <param name="f3_qual" value="phiX_solid.qualsolid" /> + <param name="min_mapqual" value="0" /> + <param name="max_mismatch" value="7" /> + <output name="output1" file="phiX_solid_maq.map" /> + <output name="output2" file="phiX_solid_maq.pileup" /> + <output name="output3" file="phiX_solid_maq.ctrack" /> + + </test> + </tests> + --> +<help> + +.. class:: infomark + +**What it does** + +This tool maps SOLiD color-space reads against the target genome using MAQ. It produces three output datasets: + + +**ALIGNMENT INFO** : contains the read alignment information, + +**PILEUP** : contains the coverage and SNP statistics for every nucleotide of the target genome, + +**CUSTOM TRACK** : contains the coverage and SNP statistics as custom tracks displayable in the UCSC browser. 
+ +----- + +**The ALIGNMENT INFO dataset will contain the following fields:** + +* column 1 = read name +* column 2 = chromosome +* column 3 = position +* column 4 = strand +* column 5 = insert size from the outer coordinates of a pair +* column 6 = paired flag +* column 7 = mapping quality +* column 8 = single-end mapping quality +* column 9 = alternative mapping quality +* column 10 = number of mismatches of the best hit +* column 11 = sum of qualities of mismatched bases of the best hit +* column 12 = number of 0-mismatch hits of the first 24bp +* column 13 = number of 1-mismatch hits of the first 24bp on the reference +* column 14 = length of the read +* column 15 = read sequence +* column 16 = read quality + + +**The PILEUP dataset will contain the following fields:** + +* column 1 = chromosome +* column 2 = position +* column 3 = reference nucleotide +* column 4 = coverage (number of reads that cover this position) +* column 5 = number of SNPs +* column 6 = number of As +* column 7 = number of Ts +* column 8 = number of Gs +* column 9 = number of Cs + +</help> +<code file="maq_cs_wrapper_code.py"/> + +</tool>
diff -r 000000000000 -r 7621d36a4e9c solid_tools/maq_cs_wrapper_code.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/solid_tools/maq_cs_wrapper_code.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,4 @@
+def exec_before_job(app, inp_data, out_data, param_dict, tool):
+    out_data['output1'].name = out_data['output1'].name + " [ ALIGNMENT INFO ]"
+    out_data['output2'].name = out_data['output2'].name + " [ PILEUP ]"
+    out_data['output3'].name = out_data['output3'].name + " [ CUSTOM TRACK ]"
diff -r 000000000000 -r 7621d36a4e9c solid_tools/qualsolid_boxplot_graph.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/solid_tools/qualsolid_boxplot_graph.sh	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,94 @@ +#!/bin/sh + +# Modified fastq_quality_boxplot_graph.sh from FASTX-toolkit - FASTA/FASTQ preprocessing tools. +# Copyright (C) 2009 A. Gordon (gordon@cshl.edu) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +function usage() +{ + echo "SOLiD-Quality BoxPlot plotter" + echo "Generates a SOLiD quality score box-plot graph " + echo + echo "Usage: $0 [-i INPUT.TXT] [-t TITLE] [-p] [-o OUTPUT]" + echo + echo " [-p] - Generate PostScript (.PS) file. Default is PNG image." + echo " [-i INPUT.TXT] - Input file. Should be the output of \"solid_qual_stats\" program." + echo " [-o OUTPUT] - Output file name. default is STDOUT." + echo " [-t TITLE] - Title (usually the solid file name) - will be plotted on the graph." + echo + exit +} + +# +# Input Data columns: #pos cnt min max sum mean Q1 med Q3 IQR lW rW +# As produced by "solid_qual_stats" program + +TITLE="" # default title is empty +FILENAME="" +OUTPUTTERM="set term png size 800,600" +OUTPUTFILE="/dev/stdout" # Default output file is simply "stdout" +while getopts ":t:i:o:ph" Option + do + case $Option in + # w ) CMD=$OPTARG; FILENAME="PIMSLogList.txt"; TARGET="logfiles"; ;; + t ) TITLE="for $OPTARG" ;; + i ) FILENAME=$OPTARG ;; + o ) OUTPUTFILE="$OPTARG" ;; + p ) OUTPUTTERM="set term postscript enhanced color \"Helvetica\" 4" ;; + h ) usage ;; + * ) echo "unrecognized argument. use '-h' for usage information."; exit -1 ;; + esac +done +shift $(($OPTIND - 1)) + + +if [ "$FILENAME" == "" ]; then + usage +fi + +if [ ! -r "$FILENAME" ]; then + echo "Error: can't open input file ($1)." >&2 + exit 1 +fi + +#Read number of cycles from the stats file (each line is a cycle, minus the header line) +#But for the graph, I want xrange to reach (num_cycles+1), so I don't subtract 1 now. +NUM_CYCLES=$(cat "$FILENAME" | wc -l) + +GNUPLOTCMD=" +$OUTPUTTERM +set boxwidth 0.8 +set size 1,1 +set key Left inside +set xlabel \"read position\" +set ylabel \"Quality Score \" +set title \"Quality Scores $TITLE\" +#set auto x +set bars 4.0 +set xrange [ 0: $NUM_CYCLES ] +set yrange [-2:45] +set y2range [-2:45] +set xtics 1 +set x2tics 1 +set ytics 2 +set y2tics 2 +set tics out +set grid ytics +set style fill empty +plot '$FILENAME' using 1:7:11:12:9 with candlesticks lt 1 lw 1 title 'Quartiles' whiskerbars, \ + '' using 1:8:8:8:8 with candlesticks lt -1 lw 2 title 'Medians' +" + +echo "$GNUPLOTCMD" | gnuplot > "$OUTPUTFILE" |
diff -r 000000000000 -r 7621d36a4e9c solid_tools/solid_qual_boxplot.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/solid_tools/solid_qual_boxplot.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,40 @@ +<tool id="solid_qual_boxplot" name="Draw quality score boxplot" version="1.0.0"> + <description>for SOLiD data</description> + + <command interpreter="bash">qualsolid_boxplot_graph.sh -t '$input.name' -i $input -o $output</command> + + <inputs> + <param format="txt" name="input" type="data" label="Statistics report file (output of 'Quality Statistics for SOLiD data' tool)" /> + </inputs> + + <outputs> + <data format="png" name="output" metadata_source="input" /> + </outputs> +<help> + +**What it does** + +Creates a boxplot graph for the quality scores in the library. + +.. class:: infomark + +**TIP:** Use the **Quality Statistics for SOLiD data** tool to generate the report file needed for this tool. + +----- + +**Output Example** + +* Black horizontal lines are medians +* Rectangular red boxes show the Inter-quartile Range (IQR) (top value is Q3, bottom value is Q1) +* Whiskers show outliers at max. 1.5*IQR + + +.. image:: ${static_path}/images/solid_qual.png + +------ + +This tool is based on `FASTX-toolkit`__ by Assaf Gordon. + + .. __: http://hannonlab.cshl.edu/fastx_toolkit/ +</help> +</tool> |
diff -r 000000000000 -r 7621d36a4e9c solid_tools/solid_qual_stats.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/solid_tools/solid_qual_stats.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+# Guruprasad Ananda
+from __future__ import print_function
+
+import sys
+import tempfile
+import zipfile
+
+QUAL_UPPER_BOUND = 41
+QUAL_LOWER_BOUND = 1
+
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+
+def unzip( filename ):
+    zip_file = zipfile.ZipFile( filename, 'r' )
+    tmpfilename = tempfile.NamedTemporaryFile().name
+    for name in zip_file.namelist():
+        # append in binary mode: ZipFile.read() returns bytes
+        open( tmpfilename, 'ab' ).write( zip_file.read( name ) )
+    zip_file.close()
+    return tmpfilename
+
+
+def __main__():
+    infile_score_name = sys.argv[1].strip()
+    # the report is written from scratch ('r+w' is not a valid mode)
+    fout = open(sys.argv[2].strip(), 'w')
+
+    if zipfile.is_zipfile( infile_score_name ):
+        infile_name = unzip( infile_score_name )
+    else:
+        infile_name = infile_score_name
+
+    # First pass: determine the read length from up to 10 valid lines.
+    readlen = None
+    invalid_lines = 0
+    j = 0
+    for line in open( infile_name ):
+        line = line.strip()
+        if not line or line.startswith("#") or line.startswith(">"):
+            continue
+        elems = line.split()
+        try:
+            for item in elems:
+                int(item)
+            if not readlen:
+                readlen = len(elems)
+            if len(elems) != readlen:
+                print("Note: Reads in the input dataset are of variable lengths.")
+            j += 1
+        except ValueError:
+            invalid_lines += 1
+        if j > 10:
+            break
+
+    # Second pass: build a per-position histogram of quality score counts.
+    position_dict = {}
+    print("column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW", file=fout)
+    for k, line in enumerate(open( infile_name )):
+        line = line.strip()
+        if not line or line.startswith("#") or line.startswith(">"):
+            continue
+        elems = line.split()
+        if position_dict == {}:
+            for pos in range(readlen):
+                position_dict[pos] = [0] * QUAL_UPPER_BOUND
+        if len(elems) != readlen:
+            invalid_lines += 1
+            continue
+        for ind, item in enumerate(elems):
+            try:
+                item = int(item)
+                position_dict[ind][item] += 1
+            except (ValueError, IndexError):  # non-integer or out-of-range score
+                pass
+
+    invalid_positions = 0
+    for pos in position_dict:
+        carr = position_dict[pos]  # count array for position pos
+        total = sum(carr)  # number of bases found in this column
+        med_elem = int(round(total / 2.0))
+        lowest = None  # lowest quality score value found in this column
+        highest = None  # highest quality score value found in this column
+        median = None  # median quality score value found in this column
+        qsum = 0.0  # sum of quality score values for this column
+        q1 = None  # 1st quartile quality score
+        q3 = None  # 3rd quartile quality score
+        q1_elem = int(round((total + 1) / 4.0))
+        q3_elem = int(round((total + 1) * 3 / 4.0))
+
+        try:
+            for ind, cnt in enumerate(carr):
+                qsum += ind * cnt
+
+                if cnt != 0:
+                    highest = ind
+
+                if lowest is None and cnt != 0:  # first non-zero count
+                    lowest = ind
+
+                if q1 is None:
+                    if sum(carr[:ind + 1]) >= q1_elem:
+                        q1 = ind
+
+                if median is None:
+                    if sum(carr[:ind + 1]) < med_elem:
+                        continue
+                    median = ind
+                    if total % 2 == 0:  # even number of elements
+                        median2 = median
+                        if sum(carr[:ind + 1]) < med_elem + 1:
+                            for ind2, elem in enumerate(carr[ind + 1:]):
+                                if elem != 0:
+                                    median2 = ind + ind2 + 1
+                                    break
+                        median = (median + median2) / 2.0
+
+                if q3 is None:
+                    if sum(carr[:ind + 1]) >= q3_elem:
+                        q3 = ind
+
+            mean = qsum / total  # mean quality score value for this column
+            iqr = q3 - q1
+            left_whisker = max(q1 - 1.5 * iqr, lowest)
+            right_whisker = min(q3 + 1.5 * iqr, highest)
+
+            print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (pos + 1, total, lowest, highest, qsum, mean, q1, median, q3, iqr, left_whisker, right_whisker), file=fout)
+        except Exception:  # e.g. an empty column leaves q1/q3 as None
+            invalid_positions += 1
+            nullvals = ['NA'] * 11
+            print("%s\t%s" % (pos + 1, '\t'.join(nullvals)), file=fout)
+
+    if invalid_lines:
+        print("Skipped %d reads as invalid." % invalid_lines)
+    if invalid_positions:
+        print("Skipped stats computation for %d read positions." % invalid_positions)
+
+
+if __name__ == "__main__":
+    __main__()
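The script derives its quartiles from a per-position count array (a histogram indexed by quality score) rather than from a sorted list of raw values. The same five-number summary can be sketched far more compactly; a hypothetical helper, assuming Python 3.8+ for statistics.quantiles and ignoring the script's exact rounding convention::

    import statistics

    def five_number_summary(carr):
        """Quartiles and 1.5*IQR whiskers from a count array of quality scores."""
        scores = [s for s, c in enumerate(carr) for _ in range(c)]  # expand histogram
        q1, med, q3 = statistics.quantiles(scores, n=4)
        iqr = q3 - q1
        left = max(q1 - 1.5 * iqr, scores[0])     # whiskers clipped to the observed
        right = min(q3 + 1.5 * iqr, scores[-1])   # range, as in the script above
        return q1, med, q3, iqr, left, right

    # e.g. one read position where 10 reads scored 5 and 30 reads scored 9:
    counts = [0] * 41
    counts[5], counts[9] = 10, 30
    print(five_number_summary(counts))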
diff -r 000000000000 -r 7621d36a4e9c solid_tools/solid_qual_stats.xml
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/solid_tools/solid_qual_stats.xml  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,69 @@
+<tool id="solid_qual_stats" name="Compute quality statistics" version="1.0.0">
+  <description>for SOLiD data</description>
+  <command interpreter="python">solid_qual_stats.py $input $output1</command>
+
+  <inputs>
+    <param format="qualsolid" name="input" type="data" label="SOLiD qual file" help="If your dataset doesn't show up in the menu, click the pencil icon next to your dataset and set the datatype to 'qualsolid'" />
+  </inputs>
+  <outputs>
+    <data format="txt" name="output1" metadata_source="input" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="qualscores.qualsolid" />
+      <output name="output1" file="qualsolid.stats" />
+    </test>
+  </tests>
+
+<help>
+
+**What it does**
+
+Creates a quality statistics report for the given SOLiD quality score file.
+
+.. class:: infomark
+
+**TIP:** This statistics report can be used as input for the **Quality Boxplot for SOLiD data** and **Nucleotides Distribution** tools.
+
+-----
+
+**The output file will contain the following fields:**
+
+* column = column number (position on the read)
+* count = number of bases found in this column
+* min = lowest quality score value found in this column
+* max = highest quality score value found in this column
+* sum = sum of quality score values for this column
+* mean = mean quality score value for this column
+* Q1 = 1st quartile quality score
+* med = median quality score
+* Q3 = 3rd quartile quality score
+* IQR = inter-quartile range (Q3-Q1)
+* lW = 'left whisker' value (for boxplotting)
+* rW = 'right whisker' value (for boxplotting)
+
+**Output Example**::
+
+    column  count    min  max  sum        mean   Q1  med  Q3  IQR  lW  rW
+    1       6362991  2    32   250734117  20.41  5   9    28  23   2   31
+    2       6362991  2    32   250531036  21.37  10  26   30  20   5   31
+    3       6362991  2    34   248722469  19.09  10  26   30  20   5   31
+    4       6362991  2    34   247654797  18.92  10  26   30  20   5   31
+    .
+    .
+    32      6362991  2    31   143436943  16.54  3   10   25  22   2   31
+    33      6362991  2    32   114269843  16.96  3   10   25  22   2   31
+    34      6362991  2    29   140638447  12.10  3   10   25  22   2   29
+    35      6362991  2    29   138910532  11.83  3   10   25  22   2   29
+
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/
+</help>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c splicescope/README
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/splicescope/README  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,9 @@
+1. Install the R package Splicescope:
+   R CMD INSTALL Splicescope
+## The ggplot2 and betareg packages need to be installed first.
+
+2. Add perl_lib/czplib to PERL5LIB.
+
+3. Edit the mm10.conf file in annotation to point to the correct locations of the splicing event annotation files.
+
+4. Run "Rscript commanline/splicescope.R" to check the usage.
diff -r 000000000000 -r 7621d36a4e9c splicescope/annotation/mm10/Mm.seq.all.devcortex.cass.chrom.can.id2gene2symbol
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/splicescope/annotation/mm10/Mm.seq.all.devcortex.cass.chrom.can.id2gene2symbol  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,3818 @@
+CA-68281-8346-9937-9997-13828[INC][1/39]   68281   4930430F08Rik
+CA-68281-8346-9937-9997-13828[SKIP][1/39]  68281   4930430F08Rik
+[... 3,818 tab-separated lines mapping each cassette-exon event ID (one [INC]/[SKIP] pair per event) to its Entrez gene ID and gene symbol; truncated in this view ...]
diff -r 000000000000 -r 7621d36a4e9c splicescope/annotation/mm10/Mm.seq.devcortex.cass.chrom.can.bed
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/splicescope/annotation/mm10/Mm.seq.devcortex.cass.chrom.can.bed  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,3818 @@
+chr10  100578314  100584009  CA-68281-8346-9937-9997-13828[INC][1/39]   0  -  100578314  100584009  255,0,0  3  117,60,96  0,3948,5599
+chr10  100578314  100584009  CA-68281-8346-9937-9997-13828[SKIP][1/39]  0  -  100578314  100584009  0,0,255  2  117,96     0,5599
+[... 3,818 BED12 lines, one [INC]/[SKIP] pair per cassette-exon event; truncated in this view ...]
diff -r 000000000000 -r 7621d36a4e9c splicescope/annotation/mm10/mm10.conf
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/splicescope/annotation/mm10/mm10.conf  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,7 @@
+mm10 as /home/hf2304/galaxy/splicescope/source/annotation/mm10/Mm.seq.devcortex.cass.chrom.can.bed cass
+#mm10 as annotation/Mm.seq.all.cass.chrom.can.bed cass
+mm10 as /home/hf2304/galaxy/splicescope/source/annotation/mm10/Mm.seq.all.taca.chrom.can.bed taca
+mm10 as /home/hf2304/galaxy/splicescope/source/annotation/mm10/Mm.seq.all.alt5.chrom.can.bed alt5
+mm10 as /home/hf2304/galaxy/splicescope/source/annotation/mm10/Mm.seq.all.alt3.chrom.can.bed alt3
+mm10 as /home/hf2304/galaxy/splicescope/source/annotation/mm10/Mm.seq.all.mutx.chrom.can.bed mutx
+mm10 as /home/hf2304/galaxy/splicescope/source/annotation/mm10/Mm.seq.all.iret.chrom.can.bed iret
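Each non-comment line of the .conf file has the form "<genome> as <annotation bed> <AS type>". A minimal parsing sketch, assuming exactly that four-field layout (hypothetical helper, not part of Splicescope)::

    def read_conf(path):
        """Map each alternative-splicing type (cass, taca, ...) to its BED file."""
        annotations = {}
        with open(path) as fh:
            for line in fh:
                line = line.strip()
                if not line or line.startswith("#"):  # skip blanks and comments
                    continue
                genome, _as, bed_path, as_type = line.split()
                annotations[as_type] = (genome, bed_path)
        return annotations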
diff -r 000000000000 -r 7621d36a4e9c splicescope/splicescope4maturation.xml
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/splicescope/splicescope4maturation.xml  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,107 @@
+<tool id="splicescope4maturation" name="Predicting neuronal maturation" version="0.1.0">
+  <description>based on splicing profiles.</description>
+  <command interpreter="bash"> splicescope_wrapper.sh '$output'
+    #if $file_type.source_type == '-s':
+      -s "$file_type.matrix_input"
+    #end if
+
+    #if $file_type.source_type == '-b':
+      -b
+      #for $bed in $file_type.bed_files:
+        "$bed.bed_input" "$bed.sample_name"
+      #end for
+    #end if
+  </command>
+
+  <inputs>
+    <!--<param format="txt" name="input" type="data" label="Source file"/>-->
+    <conditional name="file_type">
+      <param name="source_type" type="select" label="Input File Type">
+        <option value="-s" selected="true">Splicing Matrix (single file)</option>
+        <option value="-b">Bed Files (multiple files allowed)</option>
+      </param>
+      <when value="-s">
+        <param format="txt" name="matrix_input" type="data" label="Splicing Matrix File"/>
+      </when>
+      <when value="-b">
+        <repeat name="bed_files" title="BED File for sample" min="1">
+          <param format="tabular" name="bed_input" type="data" label="BED File"/>
+          <param name="sample_name" type="text" label="Label for this sample" value=""/>
+        </repeat>
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data format="zip" name="output" />
+  </outputs>
+
+<!--
+  <tests>
+    <test>
+      <param name="input" value="test.cass.mat.txt"/>
+      <output name="out_file1" file="NMPre.res.zip"/>
+    </test>
+  </tests>
+-->
+
+  <help>
+**What it does**
+
+This tool is part of the Splicescope pipeline for predicting neuronal maturation from splicing profiles. It takes either an exon inclusion ratio matrix file or junction BED files from multiple samples as input, and outputs a zip file containing the prediction results, the PCA results, a PCA plot against the reference samples, and an HTML summary of the prediction.
+
+------
+
+**Input formats**
+
+- Exon splicing matrix file output by Quantas. Please refer to https://zhanglab.c2b2.columbia.edu/index.php/Quantas_Documentation for how to generate this file format.
+
+Example for splicing matrix file::
+
+  event_id                                                  NAME                Sample1  Sample2  Sample3
+  CA-100036521-14294-117618-117680-170683[INC][40/1][DNT]   100036521//Gm16039  1        0.938    1
+  CA-100036521-117680-144789-144867-170683[INC][3/42][DNT]  100036521//Gm16039  0.05     0.014    0
+  etc.
+
+- Junction BED file from mapping software such as TopHat. Please note that the two input formats cannot be specified at the same time. A sample label must be specified for each BED file; give replicates the same label if you want biological or technical replicates merged.
+
+Example for junction BED file::
+
+  chr1  3207264  3213485  JUNC00000001  1  -  3207264  3213485  255,0,0  2  53,47  0,6174
+  chr1  3216873  3421784  JUNC00000002  5  -  3216873  3421784  255,0,0  2  95,83  0,204828
+  etc.
+
+------
+
+**Output formats**
+
+The output file for download is a zip file containing:
+
+- res_pre.txt, which has 10 columns giving the predicted maturation stage and the corresponding prediction confidence score, computed against both the whole set of developmentally regulated exons and the RBP-specific target sets (Ptbp, Rbfox, Mbnl, Nova).
+
+Example::
+
+           Maturation  ConfidenceScore  PtbpMaturation  PtbpConfidenceScore  RbfoxMaturation  RbfoxConfidenceScore  MbnlMaturation  MbnlConfidenceScore  NovaMaturation  NovaConfidenceScore
+  Sample1  6           0.119            2               0.149                6                0.387                 6               0.534                1               0.323
+  Sample2  2           0.080            2               0.142                6                0.361                 6               0.483                1               0.293
+  etc.
+
+- res_pca.txt, which has the PC1 and PC2 values used in the plot for both the user-defined samples and the reference cortex samples.
+
+Example::
+
+           PC1     PC2
+  Sample1  -1.350  2.817
+  Sample2  -1.907  3.265
+  etc.
+
+- dataPCA.pdf, the PCA plot of the user-defined samples against the reference samples <!--(figure below as an example)-->.
+
+<!--.. image:: $PATH_TO_IMAGES/dataPCA.png -->
+
+- index.html, which summarizes the prediction results and the PCA plot.
+  </help>
+
+</tool>
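For illustration, reading the Quantas splicing matrix described above comes down to one tab-separated table whose blank cells mean the event was not quantified in that sample. A minimal sketch (hypothetical helper, not part of the tool)::

    import csv

    def read_splicing_matrix(path):
        """Yield (event_id, gene, {sample: inclusion ratio or None}) per event."""
        with open(path) as fh:
            reader = csv.reader(fh, delimiter="\t")
            header = next(reader)  # event_id, NAME, Sample1, Sample2, ...
            samples = header[2:]
            for row in reader:
                ratios = {s: (float(v) if v else None)  # blank = not quantified
                          for s, v in zip(samples, row[2:])}
                yield row[0], row[1], ratios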
diff -r 000000000000 -r 7621d36a4e9c splicescope/splicescope_wrapper.amazon.sh
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/splicescope/splicescope_wrapper.amazon.sh  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+## set perl lib location
+export PERL5LIB=/usr/local/lib:/usr/local/lib/czplib
+export CACHEHOME=/home/galaxy/cache
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+args=("$@")
+ELEMENTS=${#args[@]}
+
+sample_label_file_name="/home/galaxy/cache/samplename_$$.txt"
+
+if [ "${args[1]}" == "-s" ]; then
+    Rscript ${DIR}/splicescope.R -s "${args[2]}" -o "${args[0]}"
+
+else
+    bed_files=""
+
+    # arguments from index 2 on come in pairs: bed file, then its sample label
+    for (( i=2; i<$ELEMENTS; i++ )); do
+        if [ $((i%2)) -eq 0 ]; then
+            beddir=`echo ${args[${i}]} | cut -d "." -f 1`
+            #printf '%s\t' "${args[${i}]}" >> ${DIR}/samplename.txt
+            printf '%s\t' "$beddir" >> "$sample_label_file_name"
+            bed_files="${bed_files},${args[${i}]}"
+        else
+            printf '%s\n' "${args[${i}]}" >> "$sample_label_file_name"
+        fi
+    done
+
+    # the sample-name file has been written; strip the leading comma from bed_files
+    bed_files="${bed_files:1}"
+
+    Rscript ${DIR}/splicescope.R -b "${bed_files}" -n "$sample_label_file_name" -o "${args[0]}"
+
+    # remove the temporary sample-name file
+    rm "$sample_label_file_name"
+
+fi
diff -r 000000000000 -r 7621d36a4e9c splicescope/splicescope_wrapper.intron.sh
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/splicescope/splicescope_wrapper.intron.sh  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+## set perl lib location
+#export PERL5LIB=/home/hf2304/galaxy/czlab/NMPre/perl_lib:/home/hf2304/hf_tools/vcftools/src/perl:/home/hf2304/perl_lib_galaxy:/home/hf2304/perl_lib2:/ifs/data/c2b2/cz_lab/tools/R-2.7.1/R.framework/Resources/library/RSPerl/perl/darwin-thread-multi-ld-2level:/home/hf2304/czlab_src/CASE
+export PERL5LIB=/home/hf2304/galaxy/splicescope/source/perl_lib/czplib:$PERL5LIB
+export CACHEHOME=/data/galaxy/cache
+quantasDir="/home/hf2304/galaxy/splicescope/source/quantas/countit/"
+annotationDir="/home/hf2304/galaxy/splicescope/source/annotation/mm10/"
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+args=("$@")
+ELEMENTS=${#args[@]}
+
+sample_label_file_name="/data/galaxy/cache/samplename_$$.txt"
+
+if [ "${args[1]}" == "-s" ]; then
+    /ifs/data/c2b2/cz_lab/tools/R-3.0.1/bin/Rscript ${DIR}/splicescope.R -s "${args[2]}" -o "${args[0]}"
+
+else
+    bed_files=""
+
+    # arguments from index 2 on come in pairs: bed file, then its sample label
+    for (( i=2; i<$ELEMENTS; i++ )); do
+        if [ $((i%2)) -eq 0 ]; then
+            beddir=`echo ${args[${i}]} | cut -d "." -f 1`
+            #printf '%s\t' "${args[${i}]}" >> ${DIR}/samplename.txt
+            printf '%s\t' "$beddir" >> "$sample_label_file_name"
+            bed_files="${bed_files},${args[${i}]}"
+        else
+            printf '%s\n' "${args[${i}]}" >> "$sample_label_file_name"
+        fi
+    done
+
+    # the sample-name file has been written; strip the leading comma from bed_files
+    bed_files="${bed_files:1}"
+
+    /ifs/data/c2b2/cz_lab/tools/R-3.0.1/bin/Rscript ${DIR}/splicescope.R -q $quantasDir -a $annotationDir -b "${bed_files}" -n "$sample_label_file_name" -o "${args[0]}"
+
+    # remove the temporary sample-name file
+    rm "$sample_label_file_name"
+
+fi
diff -r 000000000000 -r 7621d36a4e9c splicescope/splicescope_wrapper.sh
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/splicescope/splicescope_wrapper.sh  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+## set perl lib location
+#export PERL5LIB=/home/hf2304/galaxy/czlab/NMPre/perl_lib:/home/hf2304/hf_tools/vcftools/src/perl:/home/hf2304/perl_lib_galaxy:/home/hf2304/perl_lib2:/ifs/data/c2b2/cz_lab/tools/R-2.7.1/R.framework/Resources/library/RSPerl/perl/darwin-thread-multi-ld-2level:/home/hf2304/czlab_src/CASE
+export PERL5LIB=/home/hf2304/galaxy/splicescope/source/perl_lib/czplib:$PERL5LIB
+export CACHEHOME=/data/galaxy/cache
+quantasDir="/home/hf2304/galaxy/splicescope/source/quantas/countit/"
+annotationDir="/home/hf2304/galaxy/splicescope/source/annotation/mm10/"
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+args=("$@")
+ELEMENTS=${#args[@]}
+
+sample_label_file_name="/data/galaxy/cache/samplename_$$.txt"
+
+if [ "${args[1]}" == "-s" ]; then
+    /ifs/data/c2b2/cz_lab/tools/R-3.0.1/bin/Rscript ${DIR}/splicescope.R -s "${args[2]}" -o "${args[0]}"
+
+else
+    bed_files=""
+
+    # arguments from index 2 on come in pairs: bed file, then its sample label
+    for (( i=2; i<$ELEMENTS; i++ )); do
+        if [ $((i%2)) -eq 0 ]; then
+            beddir=`echo ${args[${i}]} | cut -d "." -f 1`
+            #printf '%s\t' "${args[${i}]}" >> ${DIR}/samplename.txt
+            printf '%s\t' "$beddir" >> "$sample_label_file_name"
+            bed_files="${bed_files},${args[${i}]}"
+        else
+            printf '%s\n' "${args[${i}]}" >> "$sample_label_file_name"
+        fi
+    done
+
+    # the sample-name file has been written; strip the leading comma from bed_files
+    bed_files="${bed_files:1}"
+
+    /ifs/data/c2b2/cz_lab/tools/R-3.0.1/bin/Rscript ${DIR}/splicescope.R -q $quantasDir -a $annotationDir -b "${bed_files}" -n "$sample_label_file_name" -o "${args[0]}"
+
+    # remove the temporary sample-name file
+    rm "$sample_label_file_name"
+
+fi
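The wrapper's positional layout — output zip first, then a mode flag, then either a single matrix file or alternating bed-file/label pairs — mirrors the command template in splicescope4maturation.xml. A hypothetical caller honoring that layout (illustration only, not part of the tool)::

    import subprocess

    def run_splicescope(output_zip, matrix=None, bed_samples=()):
        """bed_samples: iterable of (bed_path, sample_label) pairs."""
        cmd = ["bash", "splicescope_wrapper.sh", output_zip]
        if matrix is not None:
            cmd += ["-s", matrix]           # taken by the args[1] == "-s" branch
        else:
            cmd.append("-b")                # pairs start at args[2]
            for bed, label in bed_samples:  # even index: bed file, odd: label
                cmd += [bed, label]
        subprocess.check_call(cmd)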
diff -r 000000000000 -r 7621d36a4e9c splicescope/test.sh
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/splicescope/test.sh  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+
+sample_name_file="/data/galaxy/cache/samplename_$$.txt"
+
+echo "$sample_name_file"
+
diff -r 000000000000 -r 7621d36a4e9c splicescope/test/DGN.cass.mat.txt
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/splicescope/test/DGN.cass.mat.txt  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,16035 @@
+event_id  NAME  DGN1  DGN2  DGN3
+CA-100036521-14294-117618-117680-170683[INC][40/1][DNT]   100036521//Gm16039  1     0.938461538461538   1
+CA-100036521-117680-144789-144867-170683[INC][3/42][DNT]  100036521//Gm16039  0.05  0.0140845070422535  0
+[... 16,035 lines of the test splicing matrix (exon inclusion ratios for samples DGN1-DGN3; blank cells where an event was not quantified); truncated in this view ...]
diff -r 000000000000 -r 7621d36a4e9c splicescope/test/DRG1.bed
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/splicescope/test/DRG1.bed  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,136220 @@
+chr1  3207264  3213485  JUNC00000001  1  -  3207264  3213485  255,0,0  2  53,47  0,6174
+chr1  3216873  3421784  JUNC00000002  5  -  3216873  3421784  255,0,0  2  95,83  0,204828
+[... 136,220 junction BED12 lines for test sample DRG1; truncated in this view ...]
diff -r 000000000000 -r 7621d36a4e9c splicescope/test/DRG2.bed
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/splicescope/test/DRG2.bed  Mon Apr 30 01:37:51 2018 -0400
b'@@ -0,0 +1,121087 @@\n+track name=junctions description="TopHat junctions"\n+chr1\t3216925\t3421781\tJUNC00000001\t6\t-\t3216925\t3421781\t255,0,0\t2\t43,80\t0,204776\n+chr1\t3421818\t3670581\tJUNC00000002\t11\t-\t3421818\t3670581\t255,0,0\t2\t83,30\t0,248733\n+chr1\t4492646\t4493178\tJUNC00000003\t3\t-\t4492646\t4493178\t255,0,0\t2\t22,79\t0,453\n+chr1\t4776704\t4777617\tJUNC00000004\t19\t-\t4776704\t4777617\t255,0,0\t2\t97,93\t0,820\n+chr1\t4777562\t4782665\tJUNC00000005\t24\t-\t4777562\t4782665\t255,0,0\t2\t86,98\t0,5005\n+chr1\t4782634\t4784043\tJUNC00000006\t36\t-\t4782634\t4784043\t255,0,0\t2\t99,93\t0,1316\n+chr1\t4782707\t4785646\tJUNC00000007\t2\t-\t4782707\t4785646\t255,0,0\t2\t26,74\t0,2865\n+chr1\t4784007\t4785664\tJUNC00000008\t34\t-\t4784007\t4785664\t255,0,0\t2\t98,92\t0,1565\n+chr1\t4807893\t4808486\tJUNC00000009\t18\t+\t4807893\t4808486\t255,0,0\t2\t89,32\t0,561\n+chr1\t4808454\t4828649\tJUNC00000010\t15\t+\t4808454\t4828649\t255,0,0\t2\t32,66\t0,20129\n+chr1\t4828583\t4830316\tJUNC00000011\t37\t+\t4828583\t4830316\t255,0,0\t2\t66,49\t0,1684\n+chr1\t4830267\t4832381\tJUNC00000012\t36\t+\t4830267\t4832381\t255,0,0\t2\t48,71\t0,2043\n+chr1\t4832310\t4837074\tJUNC00000013\t31\t+\t4832310\t4837074\t255,0,0\t2\t71,74\t0,4690\n+chr1\t4837000\t4839476\tJUNC00000014\t40\t+\t4837000\t4839476\t255,0,0\t2\t74,90\t0,2386\n+chr1\t4839391\t4841041\tJUNC00000015\t27\t+\t4839391\t4841041\t255,0,0\t2\t97,86\t0,1564\n+chr1\t4841033\t4845059\tJUNC00000016\t9\t+\t4841033\t4845059\t255,0,0\t2\t99,97\t0,3929\n+chr1\t4889561\t4890793\tJUNC00000017\t2\t+\t4889561\t4890793\t255,0,0\t2\t41,54\t0,1178\n+chr1\t4891970\t4893479\tJUNC00000018\t31\t+\t4891970\t4893479\t255,0,0\t2\t99,63\t0,1446\n+chr1\t5083179\t5084513\tJUNC00000019\t99\t+\t5083179\t5084513\t255,0,0\t2\t99,97\t0,1237\n+chr1\t5083218\t5089048\tJUNC00000020\t2\t+\t5083218\t5089048\t255,0,0\t2\t60,40\t0,5790\n+chr1\t5083568\t5084500\tJUNC00000021\t16\t+\t5083568\t5084500\t255,0,0\t2\t57,84\t0,848\n+chr1\t5084464\t5089105\tJUNC00000022\t108\t+\t5084464\t5089105\t255,0,0\t2\t99,97\t0,4544\n+chr1\t5089015\t5093452\tJUNC00000023\t100\t+\t5089015\t5093452\t255,0,0\t2\t96,90\t0,4347\n+chr1\t5093359\t5095713\tJUNC00000024\t100\t+\t5093359\t5095713\t255,0,0\t2\t93,99\t0,2255\n+chr1\t5093463\t5095713\tJUNC00000025\t23\t+\t5093463\t5095713\t255,0,0\t2\t30,99\t0,2151\n+chr1\t5095632\t5098127\tJUNC00000026\t126\t+\t5095632\t5098127\t255,0,0\t2\t96,99\t0,2396\n+chr1\t5098042\t5101123\tJUNC00000027\t32\t+\t5098042\t5101123\t255,0,0\t2\t91,54\t0,3027\n+chr1\t5098035\t5117487\tJUNC00000028\t101\t+\t5098035\t5117487\t255,0,0\t2\t98,98\t0,19354\n+chr1\t5101069\t5117482\tJUNC00000029\t40\t+\t5101069\t5117482\t255,0,0\t2\t54,93\t0,16320\n+chr1\t5117390\t5124375\tJUNC00000030\t232\t+\t5117390\t5124375\t255,0,0\t2\t97,99\t0,6886\n+chr1\t5124371\t5133180\tJUNC00000031\t115\t+\t5124371\t5133180\t255,0,0\t2\t98,97\t0,8712\n+chr1\t5133164\t5135905\tJUNC00000032\t193\t+\t5133164\t5135905\t255,0,0\t2\t98,94\t0,2647\n+chr1\t5135842\t5143845\tJUNC00000033\t228\t+\t5135842\t5143845\t255,0,0\t2\t95,96\t0,7907\n+chr1\t5143761\t5150046\tJUNC00000034\t172\t+\t5143761\t5150046\t255,0,0\t2\t90,99\t0,6186\n+chr1\t5149962\t5162202\tJUNC00000035\t173\t+\t5149962\t5162202\t255,0,0\t2\t99,98\t0,12142\n+chr1\t6214860\t6228024\tJUNC00000036\t17\t+\t6214860\t6228024\t255,0,0\t2\t97,85\t0,13079\n+chr1\t6227955\t6230053\tJUNC00000037\t17\t+\t6227955\t6230053\t255,0,0\t2\t94,95\t0,2003\n+chr1\t6229974\t6234048\tJUNC00000038\t6\t+\t6229974\t6234048\t255,0,0\t2\t99,88\t0,3986\n+chr1\t623
3992\t6234326\tJUNC00000039\t32\t+\t6233992\t6234326\t255,0,0\t2\t95,98\t0,236\n+chr1\t6234300\t6238358\tJUNC00000040\t36\t+\t6234300\t6238358\t255,0,0\t2\t99,97\t0,3961\n+chr1\t6238317\t6240005\tJUNC00000041\t4\t+\t6238317\t6240005\t255,0,0\t2\t92,54\t0,1634\n+chr1\t6244180\t6244824\tJUNC00000042\t12\t+\t6244180\t6244824\t255,0,0\t2\t95,30\t0,614\n+chr1\t6244893\t6245233\tJUNC00000043\t23\t+\t6244893\t6245233\t255,0,0\t2\t86,76\t0,264\n+chr1\t6245245\t6245487\tJUNC00000044\t18\t+\t6245245\t6245487\t255,0,0\t2\t99,63\t0,179\n+chr1\t6245439\t6245659\tJUNC00000045\t19\t+\t6245439\t6245659\t255,0,0\t2\t67,62\t0,158\n+chr1\t6245597\t6247735\tJUNC00000046\t9\t+\t6245597\t6247735\t255,0,0\t2\t62,30\t0,2108\n+chr1\t6248093\t6248374\tJUNC00000047\t40\t+\t6248093\t6248374\t255,0,0\t2\t99,99\t0,182\n+chr1\t6250072\t6261014\tJUNC00000048\t41\t+\t6250072\t6261014\t255,0,0\t2\t95,85\t0,10857\n+chr1\t6260996\t6262931\tJUNC00000049\t33\t+\t6260996\t6262931\t255,0,0\t2\t99,98\t0,1837\n+chr1\t6262846\t6263112\tJUNC00000050\t39\t+\t6262846\t6263'..b'037\t7\t-\t1284335\t1286568\t255,0,0\t2\t47,90\t0,2143\n+chrY\t2792573\t2793132\tJUNC00121038\t2\t-\t2792573\t2793132\t255,0,0\t2\t24,88\t0,471\n+chrY\t2833579\t2834138\tJUNC00121039\t2\t+\t2833579\t2834138\t255,0,0\t2\t88,24\t0,535\n+chrY\t2833579\t2865789\tJUNC00121040\t2\t+\t2833579\t2865789\t255,0,0\t2\t88,24\t0,32186\n+chrY\t2833579\t2904712\tJUNC00121041\t2\t+\t2833579\t2904712\t255,0,0\t2\t88,24\t0,71109\n+chrY\t2833579\t2936320\tJUNC00121042\t2\t+\t2833579\t2936320\t255,0,0\t2\t88,24\t0,102717\n+chrY\t2865230\t2865789\tJUNC00121043\t2\t+\t2865230\t2865789\t255,0,0\t2\t88,24\t0,535\n+chrY\t2865230\t2904712\tJUNC00121044\t2\t+\t2865230\t2904712\t255,0,0\t2\t88,24\t0,39458\n+chrY\t2865230\t2936320\tJUNC00121045\t2\t+\t2865230\t2936320\t255,0,0\t2\t88,24\t0,71066\n+chrY\t2904153\t2904712\tJUNC00121046\t2\t+\t2904153\t2904712\t255,0,0\t2\t88,24\t0,535\n+chrY\t2904153\t2936308\tJUNC00121047\t1\t+\t2904153\t2936308\t255,0,0\t2\t88,12\t0,32143\n+chrY\t2935761\t2936320\tJUNC00121048\t2\t+\t2935761\t2936320\t255,0,0\t2\t88,24\t0,535\n+chrY\t3302886\t3303445\tJUNC00121049\t2\t-\t3302886\t3303445\t255,0,0\t2\t24,88\t0,471\n+chrY\t3302886\t3334989\tJUNC00121050\t2\t-\t3302886\t3334989\t255,0,0\t2\t24,88\t0,32015\n+chrY\t3302886\t3375407\tJUNC00121051\t2\t-\t3302886\t3375407\t255,0,0\t2\t24,88\t0,72433\n+chrY\t3302886\t3407046\tJUNC00121052\t2\t-\t3302886\t3407046\t255,0,0\t2\t24,88\t0,104072\n+chrY\t3334430\t3334989\tJUNC00121053\t2\t-\t3334430\t3334989\t255,0,0\t2\t24,88\t0,471\n+chrY\t3334430\t3375407\tJUNC00121054\t2\t-\t3334430\t3375407\t255,0,0\t2\t24,88\t0,40889\n+chrY\t3334430\t3407046\tJUNC00121055\t2\t-\t3334430\t3407046\t255,0,0\t2\t24,88\t0,72528\n+chrY\t3374848\t3375407\tJUNC00121056\t2\t-\t3374848\t3375407\t255,0,0\t2\t24,88\t0,471\n+chrY\t3374848\t3407046\tJUNC00121057\t2\t-\t3374848\t3407046\t255,0,0\t2\t24,88\t0,32110\n+chrY\t3406487\t3407034\tJUNC00121058\t1\t-\t3406487\t3407034\t255,0,0\t2\t24,76\t0,471\n+chrY\t90752879\t90754476\tJUNC00121059\t2\t-\t90752879\t90754476\t255,0,0\t2\t44,56\t0,1541\n+chrY\t90752879\t90838832\tJUNC00121060\t2\t-\t90752879\t90838832\t255,0,0\t2\t44,56\t0,85897\n+chrY\t90753131\t90757367\tJUNC00121061\t1\t+\t90753131\t90757367\t255,0,0\t2\t33,67\t0,4169\n+chrY\t90757400\t90758592\tJUNC00121062\t3\t+\t90757400\t90758592\t255,0,0\t2\t49,51\t0,1141\n+chrY\t90757400\t90842948\tJUNC00121063\t3\t+\t90757400\t90842948\t255,0,0\t2\t49,51\t0,85497\n+chrY\t90759618\t90760528\tJUNC00121064\t2\t+\t90759618\t90760528\t255,0,0\t2\t47,76
\t0,834\n+chrY\t90760539\t90762103\tJUNC00121065\t1\t+\t90760539\t90762103\t255,0,0\t2\t75,25\t0,1539\n+chrY\t90762228\t90763205\tJUNC00121066\t2\t+\t90762228\t90763205\t255,0,0\t2\t58,69\t0,908\n+chrY\t90784311\t90784607\tJUNC00121067\t23\t+\t90784311\t90784607\t255,0,0\t2\t95,99\t0,197\n+chrY\t90784612\t90784928\tJUNC00121068\t8\t+\t90784612\t90784928\t255,0,0\t2\t66,73\t0,243\n+chrY\t90784612\t90784989\tJUNC00121069\t3\t+\t90784612\t90784989\t255,0,0\t2\t66,53\t0,324\n+chrY\t90784580\t90785683\tJUNC00121070\t19\t+\t90784580\t90785683\t255,0,0\t2\t98,94\t0,1009\n+chrY\t90784774\t90785047\tJUNC00121071\t31\t+\t90784774\t90785047\t255,0,0\t2\t97,95\t0,178\n+chrY\t90784870\t90785673\tJUNC00121072\t18\t+\t90784870\t90785673\t255,0,0\t2\t67,84\t0,719\n+chrY\t90784981\t90785673\tJUNC00121073\t2\t+\t90784981\t90785673\t255,0,0\t2\t37,84\t0,608\n+chrY\t90785703\t90816442\tJUNC00121074\t11\t+\t90785703\t90816442\t255,0,0\t2\t16,94\t0,30645\n+chrY\t90785703\t90822379\tJUNC00121075\t11\t+\t90785703\t90822379\t255,0,0\t2\t16,94\t0,36582\n+chrY\t90785884\t90793394\tJUNC00121076\t95\t+\t90785884\t90793394\t255,0,0\t2\t95,99\t0,7411\n+chrY\t90785920\t90793365\tJUNC00121077\t1\t+\t90785920\t90793365\t255,0,0\t2\t59,41\t0,7404\n+chrY\t90785968\t90816437\tJUNC00121078\t3\t+\t90785968\t90816437\t255,0,0\t2\t11,89\t0,30380\n+chrY\t90785968\t90822374\tJUNC00121079\t3\t+\t90785968\t90822374\t255,0,0\t2\t11,89\t0,36317\n+chrY\t90793322\t90816442\tJUNC00121080\t78\t+\t90793322\t90816442\t255,0,0\t2\t95,94\t0,23026\n+chrY\t90793322\t90822379\tJUNC00121081\t78\t+\t90793322\t90822379\t255,0,0\t2\t95,94\t0,28963\n+chrY\t90793590\t90816437\tJUNC00121082\t22\t+\t90793590\t90816437\t255,0,0\t2\t90,89\t0,22758\n+chrY\t90793590\t90822374\tJUNC00121083\t22\t+\t90793590\t90822374\t255,0,0\t2\t90,89\t0,28695\n+chrY\t90837235\t90838832\tJUNC00121084\t2\t-\t90837235\t90838832\t255,0,0\t2\t44,56\t0,1541\n+chrY\t90837487\t90841723\tJUNC00121085\t1\t+\t90837487\t90841723\t255,0,0\t2\t33,67\t0,4169\n+chrY\t90841756\t90842948\tJUNC00121086\t3\t+\t90841756\t90842948\t255,0,0\t2\t49,51\t0,1141\n' |
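The records above are standard BED12 junction lines: columns 11 and 12 (blockSizes and blockStarts) give the two anchor blocks flanking each splice junction, so the intron coordinates can be recovered arithmetically. A minimal sketch of that conversion, assuming standard BED12 semantics; the input file name is hypothetical::

    # Sketch: derive 0-based, half-open intron coordinates from BED12 junctions.
    def intron_from_junction(line):
        f = line.rstrip('\n').split('\t')
        chrom, chrom_start = f[0], int(f[1])
        block_sizes = [int(x) for x in f[10].rstrip(',').split(',')]
        block_starts = [int(x) for x in f[11].rstrip(',').split(',')]
        # the intron spans the gap between the two anchor blocks
        return chrom, chrom_start + block_sizes[0], chrom_start + block_starts[1]

    with open('junctions.bed') as fh:  # hypothetical file name
        for line in fh:
            if not line.startswith('track'):
                print(intron_from_junction(line))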
diff -r 000000000000 -r 7621d36a4e9c sr_assembly/velvetg.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sr_assembly/velvetg.xml	Mon Apr 30 01:37:51 2018 -0400
b'@@ -0,0 +1,301 @@\n+<tool id="velvetg" name="velvetg" version="1.0.0">\n+ <description>Velvet sequence assembler for very short reads</description>\n+ <version_command>velvetg 2>&1 | grep "Version" | sed -e \'s/Version //\'</version_command>\n+ <command interpreter="python">\n+ velvetg_wrapper.py \n+ \'$input.extra_files_path\'\n+ #if $generate_amos.afg == "yes":\n+ -amos_file $generate_amos.afg\n+ #end if\n+ #if $unused_reads.generate_unused == "yes":\n+ -unused_reads $unused_reads.generate_unused\n+ #end if\n+ $read_trkg\n+ #if $coverage.cutoff == "auto":\n+ -cov_cutoff auto\n+ #elif $coverage.cutoff == "value":\n+ -cov_cutoff $coverage.cov_cutoff\n+ #end if\n+ #if $expected.coverage == "auto":\n+ -exp_cov auto\n+ #elif $expected.coverage == "value":\n+ -exp_cov $expected.exp_cov\n+ #end if\n+ #if $contig_lgth.use_contig_lgth == "yes":\n+ -min_contig_lgth $contig_lgth.min_contig_lgth\n+ #end if\n+ #if $reads.paired == "yes":\n+ #if int($reads.ins_length) > 0:\n+ -ins_length $reads.ins_length\n+ #end if\n+ #if $reads.options.advanced == "yes":\n+ #if int($reads.options.ins_length_sd) > 0:\n+ -ins_length_sd $reads.options.ins_length_sd\n+ #end if\n+ #if int($reads.options.ins_length2) > 0:\n+ -ins_length2 $reads.options.ins_length2\n+ #end if\n+ #if int($reads.options.ins_length2_sd) > 0:\n+ -ins_length2_sd $reads.options.ins_length2_sd\n+ #end if\n+ #if int($reads.options.ins_length_long) > 0:\n+ -ins_length_long $reads.options.ins_length_long\n+ #end if\n+ #if int($reads.options.ins_length_long_sd) > 0:\n+ -ins_length_long_sd $reads.options.ins_length_long_sd\n+ #end if\n+ #if int($reads.options.max_branch_length) > 0:\n+ -max_branch_length $reads.options.max_branch_length\n+ #end if\n+ #if int($reads.options.max_divergence) > 0:\n+ -max_divergence $reads.options.max_divergence\n+ #end if\n+ #if int($reads.options.max_gap_count) > 0:\n+ -max_gap_count $reads.options.max_gap_count\n+ #end if\n+ #if int($reads.options.min_pair_count) > 0:\n+ -min_pair_count $reads.options.min_pair_count\n+ #end if\n+ #if int($reads.options.max_coverage) > 0:\n+ -max_coverage $reads.options.max_coverage\n+ #end if\n+ #if int($reads.options.long_mult_cutoff) > 0:\n+ -long_mult_cutoff $reads.options.long_mult_cutoff\n+ #end if\n+ $reads.options.scaffolding\n+ #end if\n+ #end if\n+ </command>\n+ <inputs>\n+ <param name="input" type="data" format="velvet" label="Velvet Dataset" help="Prepared by velveth."/>\n+ <conditional name="generate_amos">\n+ <param name="afg" type="select" label="Generate a AMOS.afg file">\n+ <option value="no">No</option>\n+ <option value="yes">Yes</option>\n+ </param>\n+ <when value="no"/>\n+ <when value="yes"/>\n+ </conditional>\n+\n+ <conditional name="unused_reads">\n+ <param name="generate_unused" type="select" label="Generate a UnusedReads fasta file">\n+ <option value="no">No</option>\n+ <option value="yes">Yes</option>\n+ </param>\n+ <when value="no"/>\n+ <when value="yes"/>\n+ </conditional>\n+\n+ <conditional name="last_graph">\n+ '..b"at node lengths are given in k-mers. To obtain the length in nucleotides of each node you simply need to add k - 1, where k is the word-length used in velveth.\n+The in and out columns correspond to the number of arcs on the 5' and 3' ends of the contig respectively.\n+The coverages in columns short1 cov, short1 Ocov, short2 cov, and short2 Ocov are provided in k-mer coverage (5.1).\n+Also, the difference between # cov and # Ocov is the way these values are computed. 
In the first count, slightly divergent sequences are added to the coverage tally. However, in the second, stricter count, only the sequences which map perfectly onto the consensus sequence are taken into account.\n+\n+**LastGraph**\n+\n+The *LastGraph* file. \n+This file describes in its entirety the graph produced by Velvet. \n+\n+**AMOS.afg**\n+\n+The *velvet_asm.afg* file. \n+This file is mainly designed to be read by the open-source AMOS genome assembly package. Nonetheless, a number of programs are available to transform this kind of file into other assembly file formats (namely ACE, TIGR, Arachne and Celera). See http://amos.sourceforge.net/ for more information.\n+The file describes all the contigs contained in the contigs.fa file (cf 4.2.1).\n+\n+------\n+\n+**Velvet parameter list**\n+\n+This is a list of implemented Velvetg options::\n+\n+ Standard options:\n+ -cov_cutoff floating-point|auto : removal of low coverage nodes AFTER tour bus or allow the system to infer it\n+ (default: no removal)\n+ -ins_length integer : expected distance between two paired end reads (default: no read pairing)\n+ -read_trkg yes|no : tracking of short read positions in assembly (default: no tracking)\n+ -min_contig_lgth integer : minimum contig length exported to contigs.fa file (default: hash length * 2)\n+ -amos_file yes|no : export assembly to AMOS file (default: no export)\n+ -exp_cov floating point|auto : expected coverage of unique regions or allow the system to infer it\n+ (default: no long or paired-end read resolution)\n+ \n+ Advanced options:\n+ -ins_length2 integer : expected distance between two paired-end reads in the second short-read dataset (default: no read pairing)\n+ -ins_length_long integer : expected distance between two long paired-end reads (default: no read pairing)\n+ -ins_length*_sd integer : est. standard deviation of respective dataset (default: 10% of corresponding length)\n+ [replace '*' by nothing, '2' or '_long' as necessary]\n+ -scaffolding yes|no : scaffolding of contigs used paired end information (default: on)\n+ -max_branch_length integer : maximum length in base pair of bubble (default: 100)\n+ -max_divergence floating-point : maximum divergence rate between two branches in a bubble (default: 0.2)\n+ -max_gap_count integer : maximum number of gaps allowed in the alignment of the two branches of a bubble (default: 3)\n+ -min_pair_count integer : minimum number of paired end connections to justify the scaffolding of two long contigs (default: 10)\n+ -max_coverage floating point : removal of high coverage nodes AFTER tour bus (default: no removal)\n+ -long_mult_cutoff int : minimum number of long reads required to merge contigs (default: 2)\n+ -unused_reads yes|no : export unused reads in UnusedReads.fa file (default: no)\n+ \n+ Output:\n+ directory/contigs.fa : fasta file of contigs longer than twice hash length\n+ directory/stats.txt : stats file (tab-spaced) useful for determining appropriate coverage cutoff\n+ directory/LastGraph : special formatted file with all the information on the final graph\n+ directory/velvet_asm.afg : (if requested) AMOS compatible assembly file\n+\n+ </help>\n+</tool>\n" |
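The two unit conversions described in the help above (node length in k-mers plus k - 1 for nucleotides, and k-mer versus nucleotide coverage) are easy to get wrong when reading stats.txt. A minimal sketch, assuming the k and read length used in the corresponding velveth run; the values here are hypothetical::

    K = 21         # hash length passed to velveth (hypothetical)
    READ_LEN = 36  # read length (hypothetical)

    def node_length_nt(kmer_length):
        # stats.txt node lengths are in k-mers; add k - 1 for nucleotides
        return kmer_length + K - 1

    def nucleotide_coverage(kmer_coverage):
        # invert Ck = C * (L - k + 1) / L from the velveth help
        return kmer_coverage * READ_LEN / (READ_LEN - K + 1.0)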
diff -r 000000000000 -r 7621d36a4e9c sr_assembly/velvetg_wrapper.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sr_assembly/velvetg_wrapper.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+"""
+Wrapper script that runs the velvetg assembler.
+James E Johnson - University of Minnesota
+"""
+from __future__ import print_function
+
+import os
+import subprocess
+import sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+
+def __main__():
+    # Parse Command Line
+    working_dir = sys.argv[1]
+    inputs = ' '.join(sys.argv[2:])
+    for _ in ('Roadmaps', 'Sequences'):
+        os.symlink(os.path.join(working_dir, _), _)
+    cmdline = 'velvetg . %s' % (inputs)
+    print("Command to be executed: %s" % cmdline)
+    try:
+        proc = subprocess.Popen( args=cmdline, shell=True, stderr=subprocess.PIPE )
+        returncode = proc.wait()
+        # get stderr, allowing for case where it's very large
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += proc.stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        if returncode != 0:
+            raise Exception(stderr)
+    except Exception as e:
+        stop_err( 'Error running velvetg ' + str( e ) )
+
+
+if __name__ == "__main__":
+    __main__()
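For reference, the velvetg.xml command block above invokes this wrapper with the velveth dataset's extra_files_path first and the selected velvetg flags after it, along the lines of (path hypothetical)::

    python velvetg_wrapper.py /galaxy/datasets/dataset_42_files -cov_cutoff auto -exp_cov auto -amos_file yes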
diff -r 000000000000 -r 7621d36a4e9c sr_assembly/velveth.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sr_assembly/velveth.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,129 @@
+<tool id="velveth" name="velveth" version="1.0.0">
+    <description>Prepare a dataset for the Velvet velvetg Assembler</description>
+    <version_command>velveth 2>&1 | grep "Version" | sed -e 's/Version //'</version_command>
+    <command interpreter="python">
+        velveth_wrapper.py
+        '$out_file1' '$out_file1.extra_files_path'
+        $hash_length
+        $strand_specific
+        #for $i in $inputs
+            ${i.file_format}
+            ${i.read_type}
+            ${i.input}
+        #end for
+    </command>
+    <inputs>
+        <param label="Hash Length" name="hash_length" type="select" help="k-mer length in base pairs of the words being hashed.">
+            <option value="11">11</option>
+            <option value="13">13</option>
+            <option value="15">15</option>
+            <option value="17">17</option>
+            <option value="19">19</option>
+            <option value="21" selected="yes">21</option>
+            <option value="23">23</option>
+            <option value="25">25</option>
+            <option value="27">27</option>
+            <option value="29">29</option>
+        </param>
+        <param name="strand_specific" type="boolean" checked="false" truevalue="-strand_specific" falsevalue="" label="Use strand specific transcriptome sequencing" help="If you are using a strand specific transcriptome sequencing protocol, you may wish to use this option for better results."/>
+        <repeat name="inputs" title="Input Files">
+            <param label="file format" name="file_format" type="select">
+                <option value="-fasta" selected="yes">fasta</option>
+                <option value="-fastq">fastq</option>
+                <option value="-eland">eland</option>
+                <option value="-gerald">gerald</option>
+            </param>
+            <param label="read type" name="read_type" type="select">
+                <option value="-short" selected="yes">short reads</option>
+                <option value="-shortPaired">shortPaired reads</option>
+                <option value="-short2">short2 reads</option>
+                <option value="-shortPaired2">shortPaired2 reads</option>
+                <option value="-long">long reads</option>
+                <option value="-longPaired">longPaired reads</option>
+            </param>
+
+            <param name="input" type="data" format="fasta,fastq,eland,gerald" label="Dataset"/>
+        </repeat>
+    </inputs>
+    <outputs>
+        <data format="velvet" name="out_file1" />
+    </outputs>
+    <requirements>
+        <requirement type="package">velvet</requirement>
+    </requirements>
+    <tests>
+        <test>
+            <param name="hash_length" value="21" />
+            <param name="read_type" value="-shortPaired" />
+            <!-- <repeat name="inputs"> -->
+            <param name="file_format" value="fasta" />
+            <param name="read_type" value="shortPaired reads" />
+            <param name="input" value="velvet_test_reads.fa" ftype="fasta" />
+            <!-- </repeat> -->
+            <param name="strand_specific" value="" />
+            <output name="out_file1" file="velveth_test1/output.html" lines_diff="4">
+                <extra_files type="file" name='Sequences' value="velveth_test1/Sequences" compare="diff" />
+                <extra_files type="file" name='Roadmaps' value="velveth_test1/Roadmaps" compare="diff" />
+            </output>
+        </test>
+    </tests>
+    <help>
+**Velvet Overview**
+
+Velvet_ is a de novo genomic assembler specially designed for short read sequencing technologies, such as Solexa or 454, developed by Daniel Zerbino and Ewan Birney at the European Bioinformatics Institute (EMBL-EBI), near Cambridge, in the United Kingdom.
+
+Velvet currently takes in short read sequences, removes errors, then produces high-quality unique contigs. It then uses paired-end read and long read information, when available, to retrieve the repeated areas between contigs.
+
+Read the Velvet `documentation`__ for details on using the Velvet Assembler.
+
+.. _Velvet: http://www.ebi.ac.uk/~zerbino/velvet/
+
+.. __: http://www.ebi.ac.uk/~zerbino/velvet/Manual.pdf
+
+------
+
+**Velveth**
+
+Velveth takes in a number of sequence files, produces a hashtable, then outputs two files in an output directory (creating it if necessary), Sequences and Roadmaps, which are necessary to velvetg.
+
+------
+
+**Hash Length**
+
+The hash length, also known as k-mer length, corresponds to the length, in base pairs, of the words being hashed.
+
+The hash length is the length of the k-mers being entered in the hash table. Firstly, you must observe three technical constraints::
+
+# it must be an odd number, to avoid palindromes. If you put in an even number, Velvet will just decrement it and proceed.
+# it must be below or equal to MAXKMERHASH length (cf. 2.3.3, by default 31bp), because it is stored on 64 bits
+# it must be strictly less than the read length, otherwise you simply will not observe any overlaps between reads, for obvious reasons.
+
+Now you still have quite a lot of possibilities. As is often the case, it's a trade-off between specificity and sensitivity. Longer k-mers bring you more specificity (i.e. fewer spurious overlaps) but lower coverage (cf. below), so there's a sweet spot to be found with time and experience.
+We like to think in terms of "k-mer coverage", i.e. how many times a k-mer has been seen among the reads. The relation between k-mer coverage Ck and standard (nucleotide-wise) coverage C is Ck = C * (L - k + 1)/L where k is your hash length and L your read length.
+Experience shows that this k-mer coverage should be above 10 to start getting decent results. If Ck is above 20, you might be "wasting" coverage. Experience also shows that empirical tests with different values for k are not that costly to run!
+
+**Input Files**
+
+Velvet works mainly with fasta and fastq formats. For paired-end reads, the assumption is that each read is next to its mate
+read. In other words, if the reads are indexed from 0, then reads 0 and 1 are paired, 2 and 3, 4 and 5, etc.
+
+Supported file formats are::
+
+  fasta
+  fastq
+  fasta.gz
+  fastq.gz
+  eland
+  gerald
+
+Read categories are::
+
+  short (default)
+  shortPaired
+  short2 (same as short, but for a separate insert-size library)
+  shortPaired2 (see above)
+  long (for Sanger, 454 or even reference sequences)
+  longPaired
+
+    </help>
+</tool>
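As a concrete check of the k-mer coverage formula in the help above: with nucleotide coverage C = 30, read length L = 36 and k = 21, Ck = 30 * (36 - 21 + 1) / 36 ≈ 13.3, which clears the suggested floor of 10. A sketch for scanning the odd k values the tool offers; C and L are hypothetical::

    C, L = 30.0, 36  # hypothetical nucleotide coverage and read length
    for k in range(11, 31, 2):  # velveth requires odd k
        ck = C * (L - k + 1) / L
        print(k, round(ck, 1), 'ok' if ck >= 10 else 'too low')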
diff -r 000000000000 -r 7621d36a4e9c sr_assembly/velveth_wrapper.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sr_assembly/velveth_wrapper.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+"""
+Wrapper script that runs velveth.
+James E Johnson - University of Minnesota
+"""
+import os
+import string
+import subprocess
+import sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+
+def __main__():
+    # Parse command line
+    html_file = sys.argv[1]
+    working_dir = sys.argv[2]
+    try:  # for test - needs this done
+        os.makedirs(working_dir)
+    except Exception as e:
+        stop_err( 'Error running velveth ' + str( e ) )
+    hash_length = sys.argv[3]
+    inputs = string.join(sys.argv[4:], ' ')
+    cmdline = 'velveth %s %s %s > /dev/null' % (working_dir, hash_length, inputs)
+    try:
+        proc = subprocess.Popen( args=cmdline, shell=True, stderr=subprocess.PIPE )
+        returncode = proc.wait()
+        # get stderr, allowing for case where it's very large
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += proc.stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        if returncode != 0:
+            raise Exception(stderr)
+    except Exception as e:
+        stop_err( 'Error running velveth ' + str( e ) )
+    sequences_path = os.path.join(working_dir, 'Sequences')
+    roadmaps_path = os.path.join(working_dir, 'Roadmaps')
+    rval = ['<html><head><title>Velvet Galaxy Composite Dataset </title></head><p/>']
+    rval.append('<div>%s<p/></div>' % (cmdline) )
+    rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
+    rval.append( '<li><a href="%s" type="text/plain">%s </a>%s</li>' % (sequences_path, 'Sequences', 'Sequences' ) )
+    rval.append( '<li><a href="%s" type="text/plain">%s </a>%s</li>' % (roadmaps_path, 'Roadmaps', 'Roadmaps' ) )
+    rval.append( '</ul></div></html>' )
+    with open(html_file, 'w') as f:
+        f.write("\n".join( rval ))
+        f.write('\n')
+
+
+if __name__ == "__main__":
+    __main__()
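Analogously, the velveth.xml command block calls this wrapper with the HTML output path, the working directory, the hash length, the optional -strand_specific flag, and then one file_format/read_type/dataset triple per repeat, e.g. (names hypothetical)::

    python velveth_wrapper.py output.html dataset_42_files 21 -fasta -shortPaired reads.fa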
diff -r 000000000000 -r 7621d36a4e9c sr_mapping/PerM.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sr_mapping/PerM.xml	Mon Apr 30 01:37:51 2018 -0400
b'@@ -0,0 +1,369 @@\n+<tool id="PerM" name="Map with PerM" version="1.1.2">\n+ <description>for SOLiD and Illumina</description>\n+ <!-- works with PerM version 0.2.6 -->\n+ <requirements>\n+ <requirement type="package">perm</requirement>\n+ </requirements>\n+ <command>\n+ echo -n "PerM "; PerM 2>&1 | grep "Version";\n+ PerM\n+ #if $s.sourceOfRef.refSource == "history"\n+ $s.sourceOfRef.ref\n+ #else\n+ #if $s.space == "color"\n+ "${s.sourceOfRef.index.fields.path}"\n+ #elif $s.space == "base"\n+ "${s.sourceOfRef.index.fields.path}"\n+ #end if\n+ #end if\n+ #if $s.mate.singleOrPairs == "single":\n+ $s.mate.reads\n+ #else:\n+ -1 $s.mate.reads1 -2 $s.mate.reads2\n+ -U $s.mate.upperbound\n+ -L $s.mate.lowerbound\n+ $s.mate.excludeAmbiguousPairs\n+ #end if\n+ #if $s.space == "color":\n+ --readFormat "csfastq"\n+ #else:\n+ --readFormat "fastq"\n+ #end if\n+ #if $int($str($valAlign)) >= 0\n+ -v $valAlign\n+ #end if\n+ #if $align.options == "full":\n+ --seed $align.seed\n+ -$align.alignments\n+ #if $str($align.delimiter) != "None"\n+ --delimiter $align.delimiter\n+ #end if\n+ -T $align.sTrimL\n+ $align.includeReadsWN\n+ $align.statsOnly\n+ $align.ignoreQS\n+ #end if\n+ #if $str($bUnmappedRead) == "true" and $s.space == "color"\n+ -u $unmappedReadOutCS\n+ #elif $str($bUnmappedRead) == "true" and $s.space == "base"\n+ -u $unmappedReadOut\n+ #end if\n+ -o $output\n+ --outputFormat sam\n+ --noSamHeader | tr \'\\r\' \'\\n\' | tr -cd "[:print:]\\t\\n " | grep "Reads\\|Sub0\\|Pairs\\|single" | sed \'s/.*Reads:,//\' | sed \'s/\\/.*dat,_ Sub0/Sub0/\'\n+ </command>\n+ <inputs>\n+ <conditional name="s">\n+ <param name="space" label="Is your data color space (SOLiD) or base space (Illumina)?" type="select">\n+ <option value="color">Color space</option>\n+ <option value="base">Base space</option>\n+ </param>\n+ <when value="color">\n+ <conditional name="sourceOfRef">\n+ <param name="refSource" label="Will you provide your own reference file from the history or use a built-in index?" type="select">\n+ <option value="indexed">Built-in index</option>\n+ <option value="history">Fasta file from history</option>\n+ </param>\n+ <when value="indexed">\n+ <param name="index" type="select" label="Select a reference genome (with seed and read length)" help="if your genome of interest is not listed - contact Galaxy team">\n+ <options from_data_table="perm_color_indexes"/>\n+ </param>\n+ </when>\n+ <when value="history">\n+ <param name="ref" format="fasta" type="data" label="Reference" />\n+ </when>\n+ </conditional>\n+ <conditional name="mate">\n+ <param name="singleOrPairs" label="Mate-paired?" 
type="select">\n+ <option value="single">Single-end</option>\n+ <option value="paired">Mate pairs</option>\n+ </param>\n+ <when value="single">\n+ <param format="fastqcssanger" name="reads" type="data" label="Reads" />\n+ </when>\n+ <when value="paired">\n+ <param name="reads1" format="fastqcssanger" label="Forward FASTQ file" type="data" />\n+ <param name="reads2" format="fastqcssanger" label="Reverse FASTQ file" type="data" />\n+ <param label="Upperbound of pairs separation (-U)" name="upperbound" type="integer" size="8" value="100000" />\n+ <param label="Lowerbound of pairs separation (-L)" name="lowerbound" type="integer" size="8" value="0" />\n+ <param label="Exclude ambiguous pairs (-e)" name="excludeAmbiguousPairs" type="boolean" checked="false" truevalue="-e" falsevalue="" />\n+ </when>\n+ </conditional>\n+ </when>\n+ <when value="base">\n+ '..b'------------------------------------\n+ 0x0001 the read is paired in sequencing\n+ 0x0002 the read is mapped in a proper pair\n+ 0x0004 the query sequence itself is unmapped\n+ 0x0008 the mate is unmapped\n+ 0x0010 strand of the query (1 for reverse)\n+ 0x0020 strand of the mate\n+ 0x0040 the read is the first read in a pair\n+ 0x0080 the read is the second read in a pair\n+ 0x0100 the alignment is not primary\n+\n+Here is some sample output::\n+\n+ Qname\tFLAG\tRname\tPOS\tMAPQ\tCIAGR\tMRNM\tMPOS\tISIZE\tSEQ\tQUAL\tNM\tCS\tCQ\n+ 491_28_332_F3 16 ref-1 282734 255 35M * 0 0 AGTCAAACTCCGAATGCCAATGACTTATCCTTAGG #%%%%%%%!!%%%!!%%%%%%%%!!%%%%%%%%%% NM:i:3 CS:Z:C0230202330012130103100230121001212 CQ:Z:###################################\n+ 491_28_332_F3 16 ref-1 269436 255 35M * 0 0 AGTCAAACTCCGAATGCCAATGACTTATCCTTAGG #%%%%%%%!!%%%!!%%%%%%%%!!%%%%%%%%%% NM:i:3 CS:Z:C0230202330012130103100230121001212 CQ:Z:###################################\n+\n+The user can check a checkbox for optional output containing the unmmaped reads in fastqsanger or fastqcssanger. The default is to produce it.\n+\n+**PerM parameter list**\n+\n+Below is a list of PerM command line options for PerM. Not all of these are relevant to Galaxy\'s implementation, but are included for completeness.\n+\n+The command for single-end::\n+\n+ PerM [ref_or_index] [read] [options]\n+\n+The command for paired-end::\n+\n+ PerM [ref_or_index] -1 [read1] -2 [read1] [options]\n+\n+The command-line options::\n+\n+ -A Output all alignments within the given mismatch threshold, end-to-end.\n+ -B Output best alignments in terms of mismatches in the given mismatch threshold. [Default]\n+ -E Output only the uniquely mapped reads in the given mismatch threshold.\n+ -m Create the reference index, without reusing the saved index.\n+ -s PATH Save the reference index to accelerate the mapping in the future. If PATH is not specified, the default path will be used.\n+ -v INT Where INT is the number of mismatches allowed in one end. [Default=2]\n+ -T INT Where INT is the length to truncate read length to, so 30 means use only first 30 bases (signals). Leave blank if the full read is meant to be used.\n+ -o PATH Where PATH is for output the mapping of one read set. PerM\'s output are in .mapping or .sam format, determined by the ext name of PATH. Ex: -o out.sam will output in SAM format; -o out.mapping will output in .mapping format.\n+ -d PATH Where PATH is the directory for multiple read sets.\n+ -u PATH Print the fastq file of those unmapped reads to the file in PATH.\n+ --noSamHeader Print no SAM header so it is convenient to concatenate multiple SAM output files.\n+ --includeReadsWN Encodes N or "." 
with A or 3, respectively.\n+ --statsOnly Output the mapping statistics in stdout only, without saving alignments to files.\n+ --ignoreQS Ignore the quality scores in fastq or QUAL files.\n+ --seed {F2 | S11 | F3 | F4} Specify the seed pattern, which has a specific full sensitivity. Check the algorithm page (link below) for seed patterns to balance the sensitivity and running time.\n+ --readFormat {fasta | fastq | csfasta | csfastq} Read in reads in the specified format, instead of guessing according to the extension name.\n+ --delimiter CHAR Which is a character used as the delimiter to separate the the read id, and the additional info in the line with ">" in fasta or csfasta.\n+\n+Paired reads options::\n+\n+ -e Exclude ambiguous paired.\n+ -L INT Mate-paired separate lower bound.\n+ -U INT Mate-paired separate upper bound.\n+ -1 PATH The forward reads file path.\n+ -2 PATH The reversed reads file path.\n+\n+See the PerM `algorithm page`__ for information on algorithms and seeds.\n+\n+ .. __: http://code.google.com/p/perm/wiki/Algorithms\n+ </help>\n+</tool>\n' |
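The FLAG column in the sample SAM output above is a bit field; a minimal sketch of decoding it, assuming the standard SAM flag semantics listed in the help::

    FLAGS = [
        (0x0001, 'paired in sequencing'),
        (0x0002, 'mapped in a proper pair'),
        (0x0004, 'query unmapped'),
        (0x0008, 'mate unmapped'),
        (0x0010, 'query on reverse strand'),
        (0x0020, 'mate on reverse strand'),
        (0x0040, 'first read in pair'),
        (0x0080, 'second read in pair'),
        (0x0100, 'not primary alignment'),
    ]

    def describe_flag(flag):
        return [label for bit, label in FLAGS if flag & bit]

    print(describe_flag(16))  # the 491_28_332_F3 records above: reverse strand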
diff -r 000000000000 -r 7621d36a4e9c sr_mapping/bfast_wrapper.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sr_mapping/bfast_wrapper.py	Mon Apr 30 01:37:51 2018 -0400
b'@@ -0,0 +1,351 @@\n+#!/usr/bin/env python\n+"""\n+Runs BFAST on single-end or paired-end data.\n+TODO: more documentation\n+\n+TODO:\n+ - auto-detect gzip or bz2\n+ - split options (?)\n+ - queue lengths (?)\n+ - assumes reference always has been indexed\n+ - main and secondary indexes\n+ - scoring matrix file ?\n+ - read group file ?\n+\n+usage: bfast_wrapper.py [options]\n+ -r, --ref=r: The reference genome to use or index\n+ -f, --fastq=f: The fastq file to use for the mapping\n+ -F, --output=u: The file to save the output (SAM format)\n+ -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history)\n+ -p, --params=p: Parameter setting to use (pre_set or full)\n+ -n, --numThreads=n: The number of threads to use\n+ -A, --space=A: The encoding space (0: base 1: color)\n+ -o, --offsets=o: The offsets for \'match\'\n+ -l, --loadAllIndexes=l: Load all indexes into memory\n+ -k, --keySize=k: truncate key size in \'match\'\n+ -K, --maxKeyMatches=K: the maximum number of matches to allow before a key is ignored\n+ -M, --maxNumMatches=M: the maximum number of matches to allow before the read is discarded\n+ -w, --whichStrand=w: the strands to consider (0: both 1: forward 2: reverse)\n+ -t, --timing=t: output timing information to stderr\n+ -u, --ungapped=u: performed ungapped local alignment\n+ -U, --unconstrained=U: performed local alignment without mask constraints\n+ -O, --offset=O: the number of bases before and after each hit to consider in local alignment\n+ -q, --avgMismatchQuality=q: average mismatch quality\n+ -a, --algorithm=a: post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all)\n+ -P, --disallowPairing=P: do not choose alignments based on pairing\n+ -R, --reverse=R: paired end reads are given on reverse strands\n+ -z, --random=z: output a random best scoring alignment\n+ -D, --dbkey=D: Dbkey for reference genome\n+ -H, --suppressHeader=H: Suppress the sam header\n+"""\n+\n+import optparse\n+import os\n+import shutil\n+import subprocess\n+import sys\n+import tempfile\n+\n+\n+def stop_err( msg ):\n+ sys.stderr.write( \'%s\\n\' % msg )\n+ sys.exit()\n+\n+\n+def __main__():\n+ parser = optparse.OptionParser()\n+ parser.add_option( \'-r\', \'--ref\', dest=\'ref\', help=\'The reference genome to index and use\' )\n+ parser.add_option( \'-f\', \'--fastq\', dest=\'fastq\', help=\'The fastq file to use for the mapping\' )\n+ parser.add_option( \'-F\', \'--output\', dest=\'output\', help=\'The file to save the output (SAM format)\' )\n+ parser.add_option( \'-A\', \'--space\', dest=\'space\', type="choice", default=\'0\', choices=(\'0\', \'1\'), help=\'The encoding space (0: base 1: color)\' )\n+ parser.add_option( \'-H\', \'--suppressHeader\', action="store_true", dest=\'suppressHeader\', default=False, help=\'Suppress header\' )\n+ parser.add_option( \'-n\', \'--numThreads\', dest=\'numThreads\', type="int", default="1", help=\'The number of threads to use\' )\n+ parser.add_option( \'-t\', \'--timing\', action="store_true", default=False, dest=\'timing\', help=\'output timming information to stderr\' )\n+ parser.add_option( \'-l\', \'--loadAllIndexes\', action="store_true", default=False, dest=\'loadAllIndexes\', help=\'Load all indexes into memory\' )\n+ parser.add_option( \'-m\', \'--indexMask\', dest=\'indexMask\', help=\'String containing info on how to build custom indexes\' )\n+ parser.add_option( "-b", "--buildIndex", action="store_true", dest="buildIndex", 
default=False, help=\'String containing info on how to build custom indexes\' )\n+ parser.add_option( "--indexRepeatMasker", action="store_true", dest="indexRepeatMasker", default=False, help=\'Do not index lower case sequences. Such as those created by RepeatMasker\' )\n+ parser.add_option( \'--indexContigOptions\', dest=\'indexContigOptions\', default="", help=\'The contig range options to use for the indexing\' )\n+ parser.add_option( \'--indexExonsFileName\', dest=\'ind'..b' if not stderr or len( stderr ) % buffsize != 0:\n+ break\n+ except OverflowError:\n+ pass\n+ tmp_stderr.close()\n+ if returncode != 0:\n+ raise Exception(stderr)\n+ except Exception as e:\n+ raise Exception(\'Error in \\\'bfast match\\\'. \\n\' + str( e ))\n+ # bfast \'localalign\'\n+ try:\n+ tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name\n+ tmp_stderr = open( tmp, \'wb\' )\n+ proc = subprocess.Popen( args=bfast_localalign_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )\n+ returncode = proc.wait()\n+ tmp_stderr.close()\n+ # get stderr, allowing for case where it\'s very large\n+ tmp_stderr = open( tmp, \'rb\' )\n+ stderr = \'\'\n+ try:\n+ while True:\n+ stderr += tmp_stderr.read( buffsize )\n+ if not stderr or len( stderr ) % buffsize != 0:\n+ break\n+ except OverflowError:\n+ pass\n+ tmp_stderr.close()\n+ if returncode != 0:\n+ raise Exception(stderr)\n+ except Exception as e:\n+ raise Exception(\'Error in \\\'bfast localalign\\\'. \\n\' + str( e ))\n+ # bfast \'postprocess\'\n+ try:\n+ tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name\n+ tmp_stderr = open( tmp, \'wb\' )\n+ proc = subprocess.Popen( args=bfast_postprocess_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )\n+ returncode = proc.wait()\n+ tmp_stderr.close()\n+ # get stderr, allowing for case where it\'s very large\n+ tmp_stderr = open( tmp, \'rb\' )\n+ stderr = \'\'\n+ try:\n+ while True:\n+ stderr += tmp_stderr.read( buffsize )\n+ if not stderr or len( stderr ) % buffsize != 0:\n+ break\n+ except OverflowError:\n+ pass\n+ tmp_stderr.close()\n+ if returncode != 0:\n+ raise Exception(stderr)\n+ except Exception as e:\n+ raise Exception(\'Error in \\\'bfast postprocess\\\'. \\n\' + str( e ))\n+ # remove header if necessary\n+ if options.suppressHeader:\n+ tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)\n+ tmp_out_name = tmp_out.name\n+ tmp_out.close()\n+ try:\n+ shutil.move( options.output, tmp_out_name )\n+ except Exception as e:\n+ raise Exception(\'Error moving output file before removing headers. \\n\' + str( e ))\n+ fout = open( options.output, \'w\' )\n+ for line in open( tmp_out.name, \'r\' ):\n+ if len( line ) < 3 or line[0:3] not in [ \'@HD\', \'@SQ\', \'@RG\', \'@PG\', \'@CO\' ]:\n+ fout.write( line )\n+ fout.close()\n+ # check that there are results in the output file\n+ if os.path.getsize( options.output ) > 0:\n+ if "0" == options.space:\n+ sys.stdout.write( \'BFAST run on Base Space data\' )\n+ else:\n+ sys.stdout.write( \'BFAST run on Color Space data\' )\n+ else:\n+ raise Exception(\'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.\')\n+ except Exception as e:\n+ stop_err( \'The alignment failed.\\n\' + str( e ) )\n+ finally:\n+ # clean up temp dir\n+ if os.path.exists( tmp_dir ):\n+ shutil.rmtree( tmp_dir )\n+\n+\n+if __name__ == "__main__":\n+ __main__()\n' |
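The wrapper drives BFAST's three stages in sequence, failing fast on a non-zero return code. Stripped of the temp-file stderr handling, the flow is roughly the following sketch; the file names and option sets are hypothetical, and real runs pass many more flags::

    import subprocess

    def run(cmd):
        proc = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE)
        _, stderr = proc.communicate()
        if proc.returncode != 0:
            raise Exception(stderr)

    run('bfast match -f ref.fa -r reads.fastq > matches.bmf')
    run('bfast localalign -f ref.fa -m matches.bmf > aligned.baf')
    run('bfast postprocess -f ref.fa -i aligned.baf > out.sam')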
diff -r 000000000000 -r 7621d36a4e9c sr_mapping/bfast_wrapper.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sr_mapping/bfast_wrapper.xml	Mon Apr 30 01:37:51 2018 -0400
b'@@ -0,0 +1,384 @@\n+<tool id="bfast_wrapper" name="Map with BFAST" version="0.1.3">\n+ <description></description>\n+ <command interpreter="python">bfast_wrapper.py\n+ --numThreads="\\${GALAXY_SLOTS:-4}"\n+ --fastq="$input1"\n+ #if $input1.extension.startswith( "fastqcs" ):\n+ ##if extention starts with fastqcs, then we have a color space file\n+ --space="1" ##color space\n+ #else\n+ --space="0"\n+ #end if\n+ --output="$output"\n+ $suppressHeader\n+ \n+ #if $refGenomeSource.refGenomeSource_type == "history":\n+ ##build indexes on the fly\n+ --buildIndex\n+ --ref="${refGenomeSource.ownFile}"\n+ --indexMask="${",".join( [ "%s:%s" % ( str( custom_index.get( \'mask\' ) ).strip(), str( custom_index.get( \'hash_width\' ) ).strip() ) for custom_index in $refGenomeSource.custom_index ] )}"\n+ ${refGenomeSource.indexing_repeatmasker}\n+ #if $refGenomeSource.indexing_option.indexing_option_selector == "contig_offset":\n+ --indexContigOptions="${refGenomeSource.indexing_option.start_contig},${refGenomeSource.indexing_option.start_pos},${refGenomeSource.indexing_option.end_contig},${refGenomeSource.indexing_option.end_pos}"\n+ #elif $refGenomeSource.indexing_option.indexing_option_selector == "exons_file":\n+ --indexExonsFileName="${refGenomeSource.indexing_option.exons_file}"\n+ #end if\n+ #else:\n+ ##use precomputed indexes\n+ --ref="${ refGenomeSource.indices.fields.path }"\n+ #end if\n+ \n+ #if $params.source_select == "full":\n+ --offsets="$params.offsets"\n+ --keySize="$params.keySize"\n+ --maxKeyMatches="$params.maxKeyMatches"\n+ --maxNumMatches="$params.maxNumMatches"\n+ --whichStrand="$params.whichStrand"\n+ \n+ #if str( $params.scoringMatrixFileName ) != \'None\':\n+ --scoringMatrixFileName="$params.scoringMatrixFileName"\n+ #end if\n+ ${params.ungapped}\n+ ${params.unconstrained}\n+ --offset="${params.offset}"\n+ --avgMismatchQuality="${params.avgMismatchQuality}"\n+ \n+ --algorithm="${params.localalign_params.algorithm}"\n+ ${params.unpaired}\n+ ${params.reverseStrand}\n+ #if $params.localalign_params.algorithm == "3":\n+ ${params.localalign_params.pairedEndInfer}\n+ ${params.localalign_params.randomBest}\n+ #end if\n+ #end if\n+ </command>\n+ <inputs>\n+ <param name="input1" type="data" format="fastqsanger,fastqcssanger" label="FASTQ file" help="Must have Sanger-scaled quality values with ASCII offset 33"/>\n+ <conditional name="refGenomeSource">\n+ <param name="refGenomeSource_type" type="select" label="Will you select a reference genome from your history or use a built-in index?">\n+ <option value="indexed">Use a built-in index</option>\n+ <option value="history">Use one from the history</option>\n+ </param>\n+ <when value="indexed">\n+ <param name="indices" type="select" label="Select a reference genome index set">\n+ <options from_data_table="bfast_indexes">\n+ <filter type="multiple_splitter" column="2" separator=","/>\n+ <filter type="param_value" column="2" ref="input1" ref_attribute="extension"/>\n+ <filter type="sort_by" column="3"/>\n+ <validator type="no_options" message="No indexes are available for the selected input dataset"/>\n+ </options>\n+ </param>\n+ </when>\n+ <when value="history">\n+ <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />\n+ <repeat name="custom_index" title="Custom indice" min="1" >\n+ <param name="mask" type="text" value="" label="Specify the mask" size="20">\n+ <!-- <validator type="no_options" message="No indexes are available for the selected input dataset"/> need is int validator here 
or regex all 01s-->\n+ </param>\n+ <param name="hash_width" type="integer" value="" label="Hash Width" />\n+ '..b'es the maximum total number of matches to consider\n+ before the read is discarded [384]\n+ -q INT Specifies the average mismatch quality\n+ -n INT Specifies the number of threads to use [1] \n+ -t Specifies to output timing information\n+\n+For **postprocess**::\n+\n+ -a INT Specifies the algorithm to choose the alignment for each end of the read:\n+\n+ 0: No filtering will occur.\n+ 1: All alignments that pass the filters will be output\n+ 2: Only consider reads that have been aligned uniquely\n+ 3: Choose uniquely the alignment with the best score\n+ 4: Choose all alignments with the best score\n+ \n+ -A INT 0: NT space 1: Color space [0]\n+ -U Specifies that pairing should not be performed\n+ -R Specifies that paired reads are on opposite strands\n+ -q INT Specifies the average mismatch quality\n+ -x FILE Specifies the file name storing the scoring matrix\n+ -z Specifies to output a random best scoring alignment (with -a 3)\n+ -r FILE Specifies to add the RG in the specified file to the SAM\n+ header and updates the RG tag (and LB/PU tags if present) in\n+ the reads (SAM only)\n+ -n INT Specifies the number of threads to use [1] \n+ -t Specifies to output timing information\n+\n+ </help>\n+ <requirements>\n+ <requirement type="package">bfast</requirement>\n+ </requirements>\n+ <tests>\n+ <test>\n+ <param name="input1" ftype="fastqsanger" value="random_phiX_1.fastqsanger" />\n+ <param name="refGenomeSource_type" value="history" />\n+ <param name="ownFile" ftype="fasta" value="phiX.fasta" />\n+ <param name="mask" value="111111111111111111" />\n+ <param name="hash_width" value="14" />\n+ <param name="source_select" value="pre_set" />\n+ <param name="indexing_repeatmasker" value="False" />\n+ <param name="indexing_option_selector" value="default" />\n+ <param name="suppressHeader" value="" />\n+ <output name="output" ftype="sam" file="bfast_out1.sam" />\n+ </test>\n+ <test>\n+ <param name="input1" ftype="fastqsanger" value="random_phiX_1.fastqsanger"/>\n+ <param name="refGenomeSource_type" value="history" />\n+ <param name="ownFile" ftype="fasta" value="phiX.fasta" />\n+ <param name="mask" value="111111111111111111" />\n+ <param name="hash_width" value="14" />\n+ <param name="source_select" value="pre_set" />\n+ <param name="indexing_repeatmasker" value="False" />\n+ <param name="indexing_option_selector" value="default" />\n+ <param name="suppressHeader" value="--suppressHeader" />\n+ <output name="output" ftype="sam" file="bfast_out1.sam" lines_diff="3" /><!-- 3 headers exist in compare file, but headers are suppressed -->\n+ </test>\n+ <test>\n+ <param name="input1" ftype="fastqcssanger" value="random_phiX_1.fastqcssanger" />\n+ <param name="refGenomeSource_type" value="history" />\n+ <param name="ownFile" ftype="fasta" value="phiX.fasta" />\n+ <param name="mask" value="111111111111111111" />\n+ <param name="hash_width" value="14" />\n+ <param name="source_select" value="pre_set" />\n+ <param name="indexing_repeatmasker" value="False" />\n+ <param name="indexing_option_selector" value="default" />\n+ <param name="suppressHeader" value="" />\n+ <output name="output" ftype="sam" file="bfast_out2.sam" />\n+ </test>\n+ <!-- test of pre-indexed data now -->\n+ <test>\n+ <param name="input1" ftype="fastqsanger" value="random_phiX_1.fastqsanger" />\n+ <param name="refGenomeSource_type" value="indexed" />\n+ <param name="indices" value="phiX_nt_50" />\n+ <param name="source_select" 
value="pre_set" />\n+ <param name="suppressHeader" value="" />\n+ <output name="output" ftype="sam" file="bfast_out3.sam" lines_diff="2" /><!-- MD:Z:11T38 instead of MD:Z:50 on one line-->\n+ </test>\n+ </tests>\n+\n+ <citations>\n+ <citation type="doi">10.1371/journal.pone.0007767</citation>\n+ </citations>\n+\n+</tool>\n' |
diff -r 000000000000 -r 7621d36a4e9c sr_mapping/fastq_statistics.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sr_mapping/fastq_statistics.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,94 @@
+<tool id="cshl_fastq_statistics" name="FASTQ Statistics" version="1.0.0">
+    <description>for Solexa files</description>
+    <command>cat $input | solexa_quality_statistics -o $output</command>
+    <inputs>
+        <param format="fastqsolexa" name="input" type="data" label="Library to analyze" />
+    </inputs>
+    <outputs>
+        <data format="txt" name="output" />
+    </outputs>
+    <help>
+
+**What it does**
+
+Creates a quality statistics report for the given Solexa/FASTQ library.
+
+-----
+
+**The output file will contain the following fields:**
+
+* column = column number (1 to 36 for a 36-cycles read solexa file)
+* count = number of bases found in this column.
+* min = Lowest quality score value found in this column.
+* max = Highest quality score value found in this column.
+* sum = Sum of quality score values for this column.
+* mean = Mean quality score value for this column.
+* Q1 = 1st quartile quality score.
+* med = Median quality score.
+* Q3 = 3rd quartile quality score.
+* IQR = Inter-Quartile range (Q3-Q1).
+* lW = 'Left-Whisker' value (for boxplotting).
+* rW = 'Right-Whisker' value (for boxplotting).
+* A_Count = Count of 'A' nucleotides found in this column.
+* C_Count = Count of 'C' nucleotides found in this column.
+* G_Count = Count of 'G' nucleotides found in this column.
+* T_Count = Count of 'T' nucleotides found in this column.
+* N_Count = Count of 'N' nucleotides found in this column.
+
+
+.. class:: infomark
+
+**TIP:** This statistics report can be used as input for **Quality Score** and **Nucleotides Distribution** tools.
+
+
+
+
+
+**Output Example**::
+
+  column count min max sum mean Q1 med Q3 IQR lW rW A_Count C_Count G_Count T_Count N_Count
+  1 6362991 -4 40 250734117 39.41 40 40 40 0 40 40 1396976 1329101 678730 2958184 0
+  2 6362991 -5 40 250531036 39.37 40 40 40 0 40 40 1786786 1055766 1738025 1782414 0
+  3 6362991 -5 40 248722469 39.09 40 40 40 0 40 40 2296384 984875 1443989 1637743 0
+  4 6362991 -5 40 247654797 38.92 40 40 40 0 40 40 1683197 1410855 1722633 1546306 0
+  5 6362991 -4 40 248214827 39.01 40 40 40 0 40 40 2536861 1167423 1248968 1409739 0
+  6 6362991 -5 40 248499903 39.05 40 40 40 0 40 40 1598956 1236081 1568608 1959346 0
+  7 6362991 -4 40 247719760 38.93 40 40 40 0 40 40 1692667 1822140 1496741 1351443 0
+  8 6362991 -5 40 245745205 38.62 40 40 40 0 40 40 2230936 1343260 1529928 1258867 0
+  9 6362991 -5 40 245766735 38.62 40 40 40 0 40 40 1702064 1306257 1336511 2018159 0
+  10 6362991 -5 40 245089706 38.52 40 40 40 0 40 40 1519917 1446370 1450995 1945709 0
+  11 6362991 -5 40 242641359 38.13 40 40 40 0 40 40 1717434 1282975 1387804 1974778 0
+  12 6362991 -5 40 242026113 38.04 40 40 40 0 40 40 1662872 1202041 1519721 1978357 0
+  13 6362991 -5 40 238704245 37.51 40 40 40 0 40 40 1549965 1271411 1973291 1566681 1643
+  14 6362991 -5 40 235622401 37.03 40 40 40 0 40 40 2101301 1141451 1603990 1515774 475
+  15 6362991 -5 40 230766669 36.27 40 40 40 0 40 40 2344003 1058571 1440466 1519865 86
+  16 6362991 -5 40 224466237 35.28 38 40 40 2 35 40 2203515 1026017 1474060 1651582 7817
+  17 6362991 -5 40 219990002 34.57 34 40 40 6 25 40 1522515 1125455 2159183 1555765 73
+  18 6362991 -5 40 214104778 33.65 30 40 40 10 15 40 1479795 2068113 1558400 1249337 7346
+  19 6362991 -5 40 212934712 33.46 30 40 40 10 15 40 1432749 1231352 1769799 1920093 8998
+  20 6362991 -5 40 212787944 33.44 29 40 40 11 13 40 1311657 1411663 2126316 1513282 73
+  21 6362991 -5 40 211369187 33.22 28 40 40 12 10 40 1887985 1846300 1300326 1318380 10000
+  22 6362991 -5 40 213371720 33.53 30 40 40 10 15 40 542299 3446249 516615 1848190 9638
+  23 6362991 -5 40 221975899 34.89 36 40 40 4 30 40 347679 1233267 926621 3855355 69
+  24 6362991 -5 40 194378421 30.55 21 40 40 19 -5 40 433560 674358 3262764 1992242 67
+  25 6362991 -5 40 199773985 31.40 23 40 40 17 -2 40 944760 325595 1322800 3769641 195
+  26 6362991 -5 40 179404759 28.20 17 34 40 23 -5 40 3457922 156013 1494664 1254293 99
+  27 6362991 -5 40 163386668 25.68 13 28 40 27 -5 40 1392177 281250 3867895 821491 178
+  28 6362991 -5 40 156230534 24.55 12 25 40 28 -5 40 907189 981249 4174945 299437 171
+  29 6362991 -5 40 163236046 25.65 13 28 40 27 -5 40 1097171 3418678 1567013 280008 121
+  30 6362991 -5 40 151309826 23.78 12 23 40 28 -5 40 3514775 2036194 566277 245613 132
+  31 6362991 -5 40 141392520 22.22 10 21 40 30 -5 40 1569000 4571357 124732 97721 181
+  32 6362991 -5 40 143436943 22.54 10 21 40 30 -5 40 1453607 4519441 38176 351107 660
+  33 6362991 -5 40 114269843 17.96 6 14 30 24 -5 40 3311001 2161254 155505 734297 934
+  34 6362991 -5 40 140638447 22.10 10 20 40 30 -5 40 1501615 1637357 18113 3205237 669
+  35 6362991 -5 40 138910532 21.83 10 20 40 30 -5 40 1532519 3495057 23229 1311834 352
+  36 6362991 -5 40 117158566 18.41 7 15 30 23 -5 40 4074444 1402980 63287 822035 245
+
+
+</help>
+
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btq281</citation>
+    </citations>
+
+</tool>
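Because the report is a plain whitespace-delimited table with one row per cycle, downstream checks stay short. A sketch that flags cycles whose median quality falls below a cutoff; the file name and threshold are hypothetical::

    with open('fastq_stats.txt') as fh:  # hypothetical file name
        med = fh.readline().split().index('med')
        for line in fh:
            fields = line.split()
            if fields and int(fields[med]) < 20:  # hypothetical cutoff
                print('cycle %s: median quality %s' % (fields[0], fields[med]))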
diff -r 000000000000 -r 7621d36a4e9c sr_mapping/mosaik.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sr_mapping/mosaik.xml	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,129 @@
+<?xml version="1.0"?>
+<tool id="mosaik_wrapper" name="Map with Mosaik" version="1.1.2">
+    <description/>
+    <requirements>
+        <requirement type="package" version="1.1.0021">mosaik</requirement>
+        <requirement type="package" version="0.1.18">samtools</requirement>
+    </requirements>
+    <version_command>MosaikAligner | sed -e 's/\x1b\[[[:digit:]]\{1,2\}\(;[[:digit:]]\{1,2\}\)\{0,1\}m//g' | grep -o 'MosaikAligner [[:digit:].]\{1,\}'</version_command>
+    <command>
+        #set $processors = '-p ${GALAXY_SLOTS:-4}'
+        MosaikBuild -fr
+        #if $genomeSource.refGenomeSource == 'indexed':
+            ${genomeSource.indexReference.fields.path}
+        #else:
+            ${genomeSource.historyReference}
+        #end if
+        -oa mosaik_ref_file;
+        MosaikBuild -q $reads
+        #if $paired.kind == 'single'
+            #set $ls_string = ''
+        #else
+            -q2 ${paired.reads2}
+            -mfl ${paired.mfl}
+            #set $ls_string = '-ls %s' % $paired.ls
+        #end if
+        -st $st -out mosaik_reads_file;
+        MosaikAligner -ia mosaik_ref_file -in mosaik_reads_file -out mosaik_aligned_file $ls_string -mm $mm -mhp $mhp -act $act -bw $bw $processors -hs 15;
+        MosaikText -in mosaik_aligned_file -$outFormat sam_bam_file;
+        #if str($outFormat) == 'bam':
+            samtools sort sam_bam_file sorted_bam;
+            mv sorted_bam.bam $output
+        #else:
+            gunzip sam_bam_file.gz;
+            mv sam_bam_file $output
+        #end if
+    </command>
+    <inputs>
+        <conditional name="genomeSource">
+            <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?">
+                <option value="indexed">Use a built-in index</option>
+                <option value="history">Use one from the history</option>
+            </param>
+            <when value="indexed">
+                <param name="indexReference" type="select" label="Select a reference genome">
+                    <options from_data_table="mosaik_indexes">
+                        <filter type="sort_by" column="2"/>
+                        <validator type="no_options" message="No indexes are available" />
+                    </options>
+                </param>
+            </when>
+            <when value="history">
+                <param format="fasta" name="historyReference" type="data" metadata_name="dbkey" label="Select a reference from history"/>
+            </when>
+        </conditional>
+        <param format="fastq" name="reads" type="data" label="FASTQ reads file" />
+        <param name="outFormat" type="select" label="Output format">
+            <option value="sam">SAM</option>
+            <option value="bam">BAM</option>
+        </param>
+        <param name="st" type="select" label="Sequencing technology used">
+            <option value="454">454</option>
+            <option value="illumina">Illumina</option>
+            <option value="solid">Solid</option>
+            <option value="sanger">Sanger</option>
+            <option value="helicos">Helicos</option>
+        </param>
+        <conditional name="paired">
+            <param name="kind" type="select" label="Is this library mate-paired?">
+                <option value="single">Single-end</option>
+                <option value="paired">Paired-end</option>
+            </param>
+            <when value="single"/>
+            <when value="paired">
+                <param format="fastq" name="reads2" type="data" label="FASTQ 2nd mate" />
+                <param name="mfl" type="integer" value="200" label="Median fragment length" />
+                <param name="ls" type="integer" min="0" value="50" label="Local alignment search radius to rescue mates" help="A large value slows down performances" />
+            </when>
+        </conditional>
+        <param name="mm" type="integer" value="6" label="Number of mismatches allowed per sequence" />
+        <param name="act" type="integer" value="35" label="Alignment candidate threshold" help="Determines which hash regions will be aligned with Smith-Waterman" />
+        <param name="bw" type="integer" value="9" label="Smith-Waterman band width" />
+        <param name="mhp" type="integer" value="100" label="Maximum number of positions stored per seed" help="Number of places in the reference the aligner will try to place a particular hash" />
+    </inputs>
+    <outputs>
+        <data format="sam" name="output">
+            <change_format>
+                <when input="outFormat" value="bam" format="bam" />
+            </change_format>
+            <actions>
+                <conditional name="genomeSource.refGenomeSource">
+                    <when value="indexed">
+                        <action type="metadata" name="dbkey">
+                            <option type="from_data_table" name="mosaik_indexes" column="1">
+                                <filter type="param_value" column="0" value="#" compare="startswith" keep="False" />
+                                <filter type="param_value" ref="genomeSource.indexReference" column="0" />
+                            </option>
+                        </action>
+                    </when>
+                    <when value="history">
+                        <action type="metadata" name="dbkey">
+                            <option type="from_param" name="genomeSource.historyReference" param_attribute="dbkey" />
+                        </action>
+                    </when>
+                </conditional>
+            </actions>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="refGenomeSource" value="history"/>
+            <param name="historyReference" ftype="fasta" value="mosaik_test_ref.fasta"/>
+            <param name="reads" ftype="fastq" value="mosaik_test_input.fastq"/>
+            <param name="outFormat" value="sam"/>
+            <param name="st" value="454"/>
+            <param name="kind" value="single"/>
+            <param name="mm" value="6"/>
+            <param name="act" value="35"/>
+            <param name="bw" value="19"/>
+            <param name="mhp" value="100"/>
+            <output name="output" file="mosaik_test_out.sam" compare="sim_size" delta="0"/>
+        </test>
+    </tests>
+    <help>
+This tool uses Mosaik to align reads to a reference sequence.
+    </help>
+    <citations>
+        <citation type="doi">10.1371/journal.pone.0090581</citation>
+    </citations>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c sr_mapping/srma_wrapper.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sr_mapping/srma_wrapper.py	Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,201 @@
+#!/usr/bin/env python
+"""
+Runs SRMA on a SAM/BAM file;
+TODO: more documentation
+
+usage: srma_wrapper.py [options]
+
+See below for options
+"""
+
+import optparse
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+
+def parseRefLoc( refLoc, refUID ):
+    for line in open( refLoc ):
+        if not line.startswith( '#' ):
+            fields = line.strip().split( '\t' )
+            if len( fields ) >= 3:
+                if fields[0] == refUID:
+                    return fields[1]
+    return None
+
+
+def __main__():
+    parser = optparse.OptionParser()
+    parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to index and use' )
+    parser.add_option( '-u', '--refUID', dest='refUID', help='The pre-indexed reference genome unique identifier' )
+    parser.add_option( '-i', '--input', dest='input', help='The SAM/BAM input file' )
+    parser.add_option( '-I', '--inputIndex', dest='inputIndex', help='The SAM/BAM input index file' )
+    parser.add_option( '-o', '--output', dest='output', help='The SAM/BAM output file' )
+    parser.add_option( '-O', '--offset', dest='offset', help='The alignment offset' )
+    parser.add_option( '-Q', '--minMappingQuality', dest='minMappingQuality', help='The minimum mapping quality' )
+    parser.add_option( '-P', '--minAlleleProbability', dest='minAlleleProbability', help='The minimum allele probability conditioned on coverage (for the binomial quantile).' )
+    parser.add_option( '-C', '--minAlleleCoverage', dest='minAlleleCoverage', help='The minimum haploid coverage for the consensus' )
+    parser.add_option( '-R', '--range', dest='range', help='A range to examine' )
+    parser.add_option( '-c', '--correctBases', dest='correctBases', help='Correct bases' )
+    parser.add_option( '-q', '--useSequenceQualities', dest='useSequenceQualities', help='Use sequence qualities' )
+    parser.add_option( '-M', '--maxHeapSize', dest='maxHeapSize', help='The maximum number of nodes on the heap before re-alignment is ignored' )
+    parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one from history (indexed or history)' )
+    parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' )
+    parser.add_option( '-j', '--jarBin', dest='jarBin', default='', help='The path to where jars are stored' )
+    parser.add_option( '-f', '--jarFile', dest='jarFile', help='The file name of the jar file to use' )
+    (options, args) = parser.parse_args()
+
+    # make temp directory for srma
+    tmp_dir = tempfile.mkdtemp()
+    buffsize = 1048576
+
+    # set up reference filenames
+    reference_filepath_name = None
+    # need to create SRMA dict and Samtools fai files for custom genome
+    if options.fileSource == 'history':
+        try:
+            reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' )
+            reference_filepath_name = reference_filepath.name
+            reference_filepath.close()
+            dict_filepath_name = reference_filepath_name.replace( '.fa', '.dict' )
+            os.symlink( options.ref, reference_filepath_name )
+            # create fai file using Samtools
+            index_fai_cmd = 'samtools faidx %s' % reference_filepath_name
+            try:
+                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+                tmp_stderr = open( tmp, 'wb' )
+                proc = subprocess.Popen( args=index_fai_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                # get stderr, allowing for case where it's very large
+                tmp_stderr = open( tmp, 'rb' )
+                stderr = ''
+                try:
+                    while True:
[...]
+            if os.path.exists( tmp_dir ):
+                shutil.rmtree( tmp_dir )
+            stop_err( 'Problem handling SRMA index (dict file) for custom genome file: %s\n' % str( e ) )
+    # using built-in dict/index files
+    else:
+        if options.ref:
+            reference_filepath_name = options.ref
+        else:
+            reference_filepath_name = parseRefLoc( options.refLocation, options.refUID )
+        if reference_filepath_name is None:
+            raise ValueError( 'A valid genome reference was not provided.' )
+
+    # set up aligning and generate aligning command options
+    if options.params == 'pre_set':
+        srma_cmds = ''
+    else:
+        ranges = 'null'
+        if options.range == 'None':
+            range = 'null'
+        else:
+            range = options.range
+        srma_cmds = "OFFSET=%s MIN_MAPQ=%s MINIMUM_ALLELE_PROBABILITY=%s MINIMUM_ALLELE_COVERAGE=%s RANGES=%s RANGE=%s CORRECT_BASES=%s USE_SEQUENCE_QUALITIES=%s MAX_HEAP_SIZE=%s" % ( options.offset, options.minMappingQuality, options.minAlleleProbability, options.minAlleleCoverage, ranges, range, options.correctBases, options.useSequenceQualities, options.maxHeapSize )
+
+    srma_cmds = "%s VALIDATION_STRINGENCY=LENIENT" % srma_cmds
+
+    # perform alignments
+    buffsize = 1048576
+    try:
+        # symlink input bam and index files due to the naming conventions required by srma here
+        input_bam_filename = os.path.join( tmp_dir, '%s.bam' % os.path.split( options.input )[-1] )
+        os.symlink( options.input, input_bam_filename )
+        input_bai_filename = "%s.bai" % os.path.splitext( input_bam_filename )[0]
+        os.symlink( options.inputIndex, input_bai_filename )
+
+        # create a temp output name, ending in .bam due to required naming conventions? unknown if required
+        output_bam_filename = os.path.join( tmp_dir, "%s.bam" % os.path.split( options.output )[-1] )
+        # generate commandline
+        java_opts = ''
+        if '_JAVA_OPTIONS' not in os.environ:
+            java_opts = '-Xmx2048m'
+        cmd = 'java %s -jar %s I=%s O=%s R=%s %s' % ( java_opts, os.path.join( options.jarBin, options.jarFile ), input_bam_filename, output_bam_filename, reference_filepath_name, srma_cmds )
+        # need to nest try-except in try-finally to handle 2.4
+        try:
+            try:
+                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+                tmp_stderr = open( tmp, 'wb' )
+                proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                # get stderr, allowing for case where it's very large
+                tmp_stderr = open( tmp, 'rb' )
+                stderr = ''
+                try:
+                    while True:
+                        stderr += tmp_stderr.read( buffsize )
+                        if not stderr or len( stderr ) % buffsize != 0:
+                            break
+                except OverflowError:
+                    pass
+                tmp_stderr.close()
+                if returncode != 0:
+                    raise Exception(stderr)
+            except Exception as e:
+                raise Exception('Error executing SRMA. ' + str( e ))
+            # move file from temp location (with .bam name) to provided path
+            shutil.move( output_bam_filename, options.output )
+            # check that there are results in the output file
+            if os.path.getsize( options.output ) <= 0:
+                raise Exception('The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.')
+        except Exception as e:
+            stop_err( 'The re-alignment failed.\n' + str( e ) )
+    finally:
+        # clean up temp dir
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree( tmp_dir )
+
+
+if __name__ == "__main__":
+    __main__()
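In the script above, ``parseRefLoc`` resolves a built-in reference by scanning a tab-separated ``.loc`` file: it matches the requested unique ID against the first column of each non-comment row (rows need at least three fields) and returns the path stored in the second. A minimal sketch of that lookup, assuming a hypothetical ``srma_index.loc`` row; the actual ``.loc`` layout is not shown in this changeset::

    # hypothetical tab-separated row in srma_index.loc:
    # hg19	/data/srma/hg19.fa	Human (hg19)
    ref_path = parseRefLoc( 'srma_index.loc', 'hg19' )
    # -> '/data/srma/hg19.fa'; returns None when no row's first field matches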
diff -r 000000000000 -r 7621d36a4e9c sr_mapping/srma_wrapper.xml
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/sr_mapping/srma_wrapper.xml  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,221 @@
+<tool id="srma_wrapper" name="Re-align with SRMA" version="0.2.5">
+  <description></description>
+  <command interpreter="python">srma_wrapper.py
+    #if $refGenomeSource.refGenomeSource_type == "history":
+      --ref=$refGenomeSource.ownFile
+    #else:
+      --ref="${refGenomeSource.ref.fields.path}"
+      --refUID=$refGenomeSource.ref
+      ##--refLocations=${GALAXY_DATA_INDEX_DIR}/srma_index.loc
+    #end if
+    --input=$input
+    --inputIndex=${input.metadata.bam_index}
+    --output=$output
+    --params=$params.source_select
+    --fileSource=$refGenomeSource.refGenomeSource_type
+    --jarBin="${GALAXY_DATA_INDEX_DIR}/shared/jars"
+    #if $params.source_select == "full":
+      --offset=$params.offset
+      --minMappingQuality=$params.minMappingQuality
+      --minAlleleProbability=$params.minAlleleProbability
+      --minAlleleCoverage=$params.minAlleleCoverage
+      --range=$params.range
+      --correctBases=$params.correctBases
+      --useSequenceQualities=$params.useSequenceQualities
+      --maxHeapSize=$params.maxHeapSize
+    #end if
+    --jarFile="srma.jar"
+  </command>
+  <inputs>
+    <conditional name="refGenomeSource">
+      <param name="refGenomeSource_type" type="select" label="Will you select a reference genome from your history or use a built-in reference?">
+        <option value="built-in">Use a built-in reference</option>
+        <option value="history">Use one from the history</option>
+      </param>
+      <when value="built-in">
+        <param name="ref" type="select" label="Select a reference genome">
+          <options from_data_table="srma_indexes">
+            <filter type="sort_by" column="2" />
+            <validator type="no_options" message="No indexes are available" />
+          </options>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
+      </when>
+    </conditional>
+    <param name="input" type="data" format="bam" label="Input BAM file" help="The input BAM file to re-align"/>
+    <conditional name="params">
+      <param name="source_select" type="select" label="SRMA settings to use" help="For most re-alignment needs, use Commonly Used settings. If you want full control use Full Parameter List">
+        <option value="pre_set">Commonly Used</option>
+        <option value="full">Full Parameter List</option>
+      </param>
+      <when value="pre_set" />
+      <when value="full">
+        <param name="offset" type="integer" value="20" label="Offset" help="The alignment offset" />
+        <param name="minMappingQuality" type="integer" value="0" label="Minimum mapping quality" help="The minimum mapping quality" />
+        <param name="minAlleleProbability" type="float" value="0.1" label="Minimum allele probability" help="The minimum allele probability conditioned on coverage (for the binomial quantile)." />
+        <param name="minAlleleCoverage" type="integer" value="2" label="Minimum allele coverage" help="The minimum haploid coverage for the consensus. Default value: 3. This option can be set to 'null' to clear the default value." />
+        <param name="range" type="text" value="null" label="Range" help="A range to examine" />
+        <param name="correctBases" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Correct bases" help="Correct bases" />
+        <param name="useSequenceQualities" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Use sequence qualities" help="Use sequence qualities" />
+        <param name="maxHeapSize" type="integer" value="8192" label="Maximum heap size" help="The maximum number of nodes on the heap before re-alignment is ignored" />
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="bam" name="output" label="${tool.name} on ${on_string}: re-aligned reads">
+      <actions>
+        <conditional name="refGenomeSource.refGenomeSou
[...]
+      ...am" lines_diff="2" /><!-- allows tag with version number to be different -->
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+SRMA is a short read micro re-aligner for next-generation high throughput sequencing data.
+
+Sequence alignment algorithms examine each read independently. When indels occur towards the ends of reads, the alignment can lead to false SNPs as well as improperly placed indels. This tool aims to perform a re-alignment of each read to a graphical representation of all alignments within a local region to provide a better overall base-resolution consensus.
+
+Currently this tool works well with and has been tested on 30x diploid coverage genome sequencing data from Illumina and ABI SOLiD technology. This tool may not work well with 454 data, as indels are a significant error mode for 454 data.
+
+------
+
+Please cite the website "http://srma.sourceforge.net" as well as:
+
+Homer N and Nelson SF. SRMA: short read micro re-aligner. 2010.
+
+------
+
+**Know what you are doing**
+
+.. class:: warningmark
+
+There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.
+
+.. __: http://srma.sourceforge.net/
+
+------
+
+**Input formats**
+
+SRMA accepts a BAM input file. Note that this file should have been generated from a SAM file which contains the header.
+
+------
+
+**Outputs**
+
+The output is in BAM format, see http://samtools.sourceforge.net for more details.
+
+------
+
+**SRMA settings**
+
+All of the options have a default value. You can change any of them. Most of the options in SRMA have been implemented here.
+
+------
+
+**SRMA parameter list**
+
+This is an exhaustive list of SRMA options:
+
+For **SRMA**::
+
+  INPUT=File
+  I=File                  The input SAM or BAM file. Required.
+
+  OUTPUT=File
+  O=File                  The output SAM or BAM file. Default value: null.
+
+  REFERENCE=File
+  R=File                  The reference FASTA file. Required.
+
+  OFFSET=Integer          The alignment offset. Default value: 20. This option can be set to 'null' to clear the
+                          default value.
+
+  MIN_MAPQ=Integer        The minimum mapping quality. Default value: 0. This option can be set to 'null' to clear
+                          the default value.
+
+  MINIMUM_ALLELE_PROBABILITY=Double
+                          The minimum allele probability conditioned on coverage (for the binomial quantile).
+                          Default value: 0.1. This option can be set to 'null' to clear the default value.
+
+  MINIMUM_ALLELE_COVERAGE=Integer
+                          The minimum haploid coverage for the consensus. Default value: 3. This option can be set
+                          to 'null' to clear the default value.
+
+  RANGE=String            A range to examine. Default value: null.
+
+  CORRECT_BASES=Boolean   Correct bases. Default value: false. This option can be set to 'null' to clear the
+                          default value. Possible values: {true, false}
+
+  USE_SEQUENCE_QUALITIES=Boolean
+                          Use sequence qualities. Default value: true. This option can be set to 'null' to clear the
+                          default value. Possible values: {true, false}
+
+  MAX_HEAP_SIZE=Integer   The maximum number of nodes on the heap before re-alignment is ignored. Default value:
+                          8192. This option can be set to 'null' to clear the default value.
+
+  </help>
+  <citations>
+    <citation type="doi">10.1093/bioinformatics/bts286</citation>
+  </citations>
+</tool>
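For the "Full Parameter List" branch, the wrapper assembles a single SRMA invocation from these parameters. A sketch with illustrative values substituted into the ``srma_cmds`` and ``cmd`` templates from srma_wrapper.py above; the jar path and file names are hypothetical::

    srma_cmds = ( 'OFFSET=20 MIN_MAPQ=0 MINIMUM_ALLELE_PROBABILITY=0.1 '
                  'MINIMUM_ALLELE_COVERAGE=2 RANGES=null RANGE=null '
                  'CORRECT_BASES=false USE_SEQUENCE_QUALITIES=false '
                  'MAX_HEAP_SIZE=8192 VALIDATION_STRINGENCY=LENIENT' )
    cmd = 'java %s -jar %s I=%s O=%s R=%s %s' % (
        '-Xmx2048m',                           # default when _JAVA_OPTIONS is unset
        '/galaxy/shared/jars/srma.jar',        # hypothetical jarBin + jarFile
        'input.bam', 'output.bam', 'hg19.fa',  # hypothetical symlinked inputs
        srma_cmds )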
diff -r 000000000000 -r 7621d36a4e9c stats/aggregate_binned_scores_in_intervals.xml
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/stats/aggregate_binned_scores_in_intervals.xml  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,114 @@
+<tool id="aggregate_scores_in_intervals2" name="Aggregate datapoints" version="1.1.3">
+  <description>Appends the average, min, max of datapoints per interval</description>
+  <command>
+python '$__tool_directory__/aggregate_scores_in_intervals.py'
+#if $score_source_type.score_source == "user"
+  '$score_source_type.input2' '$input1' '${input1.metadata.chromCol}' '${input1.metadata.startCol}' '${input1.metadata.endCol}' '$out_file1' --chrom_buffer=3
+#else
+  '$score_source_type.datasets' '$input1' '${input1.metadata.chromCol}' '${input1.metadata.startCol}' '${input1.metadata.endCol}' '$out_file1' -b
+#end if
+  </command>
+  <inputs>
+    <param format="interval" name="input1" type="data" label="Interval file"/>
+    <conditional name="score_source_type">
+      <param name="score_source" type="select" label="Score Source">
+        <option value="cached" selected="true">Locally Cached Scores</option>
+        <option value="user">Scores in Your History</option>
+      </param>
+      <when value="cached">
+        <param name="datasets" type="select" label="Available datasets" display="radio">
+          <options from_file="binned_scores.loc">
+            <column name="name" index="1"/>
+            <column name="value" index="2"/>
+            <column name="dbkey" index="0"/>
+            <filter type="data_meta" ref="input1" key="dbkey" column="0" />
+          </options>
+        </param>
+      </when>
+      <when value="user">
+        <param format="wig" name="input2" type="data" label="Score file">
+          <options>
+            <filter type="data_meta" ref="input1" key="dbkey" />
+          </options>
+        </param>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="interval" name="out_file1" metadata_source="input1"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="6.bed" dbkey="hg17" ftype="bed"/>
+      <param name="score_source" value="cached"/>
+      <param name="datasets" value="/galaxy/data/binned_scores/hg17/phastcons_encode_sep2005_tba" />
+      <output name="out_file1" file="aggregate_binned_scores_in_intervals.out" />
+    </test>
+    <test>
+      <param name="input1" value="9_hg18.bed" dbkey="hg18" ftype="bed"/>
+      <param name="score_source" value="cached"/>
+      <param name="datasets" value="/galaxy/data/binned_scores/hg18/phastCons17way/ba" />
+      <output name="out_file1" file="aggregate_binned_scores_in_intervals2.interval" />
+    </test>
+    <test>
+      <param name="input1" value="6.bed" dbkey="hg17" ftype="bed"/>
+      <param name="score_source" value="user"/>
+      <param name="input2" value="aggregate_binned_scores_3.wig" dbkey="hg17" ftype="wig"/>
+      <output name="out_file1" file="aggregate_binned_scores_in_intervals3.out"/>
+    </test>
+  </tests>
+  <help>
+.. class:: warningmark
+
+This tool currently only has cached data for genome builds hg16, hg17 and hg18. However, you may use your own data point (wiggle) data, such as those available from UCSC. If you are trying to use your own data point file and it is not appearing as an option, make sure that the builds for your history items are the same.
+
+.. class:: warningmark
+
+This tool assumes that the input dataset is in interval format and contains at least a chrom column, a start column and an end column. These 3 columns can be dispersed throughout any number of other data columns.
+
+-----
+
+.. class:: infomark
+
+**TIP:** Computing summary information may throw exceptions if the data type (e.g., string, integer) in every line of the columns is not appropriate for the computation (e.g., attempting numerical calculations on strings). If an exception is thrown when computing summary information for a line, that line is skipped as invalid for the computation. The number of invalid skipped lines is documented in the resulting history item as a "Data issue".
+
+-----
+
+**Syntax**
+
+This tool appends columns of summary information for each interval matched against a selected dataset. For each interval, the average, minimum and maximum of the data falling within the interval are computed.
+
+- Several quantitative scores are provided for the ENCODE regions.
+
+  - Various Scores
+      - Regulatory Potential
+      - Neutral rate (Ancestral Repeats)
+      - GC fraction
+  - Conservation Scores
+      - PhastCons
+      - binCons
+      - GERP
+
+-----
+
+**Example**
+
+If your original data has the following format:
+
++------+-----+-----+---+------+
+|other1|chrom|start|end|other2|
++------+-----+-----+---+------+
+
+and you choose to aggregate phastCons scores, your output will look like this:
+
++------+-----+-----+---+------+---+---+---+
+|other1|chrom|start|end|other2|avg|min|max|
++------+-----+-----+---+------+---+---+---+
+
+where:
+
+* **avg** - average phastCons score for each region
+* **min** - minimum phastCons score for each region
+* **max** - maximum phastCons score for each region
+  </help>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c stats/aggregate_scores_in_intervals.py
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/stats/aggregate_scores_in_intervals.py  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,252 @@
+#!/usr/bin/env python
+# Greg Von Kuster
+"""
+usage: %prog score_file interval_file chrom start stop [out_file] [options]
+    -b, --binned: 'score_file' is actually a directory of binned array files
+    -m, --mask=FILE: bed file containing regions not to consider valid
+    -c, --chrom_buffer=INT: number of chromosomes (default is 3) to keep in memory when using a user supplied score file
+"""
+
+from __future__ import division, print_function
+
+import os
+import os.path
+import struct
+import sys
+import tempfile
+from math import isnan
+from UserDict import DictMixin
+
+import bx.wiggle
+from bx.binned_array import BinnedArray, FileBinnedArray
+from bx.bitset_builders import binned_bitsets_from_file
+from bx.cookbook import doc_optparse
+
+from galaxy.util.ucsc import UCSCLimitException, UCSCOutWrapper
+
+
+class PositionalScoresOnDisk:
+    fmt = 'f'
+    fmt_size = struct.calcsize( fmt )
+    default_value = float( 'nan' )
+
+    def __init__( self ):
+        self.file = tempfile.TemporaryFile( 'w+b' )
+        self.length = 0
+
+    def __getitem__( self, i ):
+        if i < 0:
+            i = self.length + i
+        if i < 0 or i >= self.length:
+            return self.default_value
+        try:
+            self.file.seek( i * self.fmt_size )
+            return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
+        except Exception as e:
+            raise IndexError(e)
+
+    def __setitem__( self, i, value ):
+        if i < 0:
+            i = self.length + i
+        if i < 0:
+            raise IndexError('Negative assignment index out of range')
+        if i >= self.length:
+            self.file.seek( self.length * self.fmt_size )
+            self.file.write( struct.pack( self.fmt, self.default_value ) * ( i - self.length ) )
+            self.length = i + 1
+        self.file.seek( i * self.fmt_size )
+        self.file.write( struct.pack( self.fmt, value ) )
+
+    def __len__( self ):
+        return self.length
+
+    def __repr__( self ):
+        i = 0
+        repr = "[ "
+        for i in range( self.length ):
+            repr = "%s %s," % ( repr, self[i] )
+        return "%s ]" % ( repr )
+
+
+class FileBinnedArrayDir( DictMixin ):
+    """
+    Adapter that makes a directory of FileBinnedArray files look like
+    a regular dict of BinnedArray objects.
+    """
+    def __init__( self, dir ):
+        self.dir = dir
+        self.cache = dict()
+
+    def __getitem__( self, key ):
+        value = None
+        if key in self.cache:
+            value = self.cache[key]
+        else:
+            fname = os.path.join( self.dir, "%s.ba" % key )
+            if os.path.exists( fname ):
+                value = FileBinnedArray( open( fname ) )
+                self.cache[key] = value
+        if value is None:
+            raise KeyError( "File does not exist: " + fname )
+        return value
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+
+def load_scores_wiggle( fname, chrom_buffer_size=3 ):
+    """
+    Read a wiggle file and return a dict of BinnedArray objects keyed
+    by chromosome.
+    """
+    scores_by_chrom = dict()
+    try:
+        for chrom, pos, val in bx.wiggle.Reader( UCSCOutWrapper( open( fname ) ) ):
+            if chrom not in scores_by_chrom:
+                if chrom_buffer_size:
+                    scores_by_chrom[chrom] = BinnedArray()
+                    chrom_buffer_size -= 1
+                else:
+                    scores_by_chrom[chrom] = PositionalScoresOnDisk()
+            scores_by_chrom[chrom][pos] = val
+    except UCSCLimitException:
+        # Wiggle data was truncated, at the very least need to warn the user.
+        print('Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.')
+    except IndexError:
+        stop_err('Data error: one or more column data values is missing in "%s"' % fname)
+    except ValueError:
+        st...
[...]
+    except:
+        doc_optparse.exit()
+
+    if score_fname == 'None':
+        stop_err( 'This tool works with data from genome builds hg16, hg17 or hg18. Click the pencil icon in your history item to set the genome build if appropriate.' )
+
+    try:
+        chrom_col = int(chrom_col) - 1
+        start_col = int(start_col) - 1
+        stop_col = int(stop_col) - 1
+    except:
+        stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
+
+    if chrom_col < 0 or start_col < 0 or stop_col < 0:
+        stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
+
+    if binned:
+        scores_by_chrom = load_scores_ba_dir( score_fname )
+    else:
+        try:
+            chrom_buffer = int( options.chrom_buffer )
+        except:
+            chrom_buffer = 3
+        scores_by_chrom = load_scores_wiggle( score_fname, chrom_buffer )
+
+    if mask_fname:
+        masks = binned_bitsets_from_file( open( mask_fname ) )
+    else:
+        masks = None
+
+    skipped_lines = 0
+    first_invalid_line = 0
+    invalid_line = ''
+
+    for i, line in enumerate( open( interval_fname )):
+        valid = True
+        line = line.rstrip('\r\n')
+        if line and not line.startswith( '#' ):
+            fields = line.split()
+
+            try:
+                chrom, start, stop = fields[chrom_col], int( fields[start_col] ), int( fields[stop_col] )
+            except:
+                valid = False
+                skipped_lines += 1
+                if not invalid_line:
+                    first_invalid_line = i + 1
+                    invalid_line = line
+            if valid:
+                total = 0
+                count = 0
+                min_score = 100000000
+                max_score = -100000000
+                for j in range( start, stop ):
+                    if chrom in scores_by_chrom:
+                        try:
+                            # Skip if base is masked
+                            if masks and chrom in masks:
+                                if masks[chrom][j]:
+                                    continue
+                            # Get the score, only count if not 'nan'
+                            score = scores_by_chrom[chrom][j]
+                            if not isnan( score ):
+                                total += score
+                                count += 1
+                                max_score = max( score, max_score )
+                                min_score = min( score, min_score )
+                        except:
+                            continue
+                if count > 0:
+                    avg = total / count
+                else:
+                    avg = "nan"
+                    min_score = "nan"
+                    max_score = "nan"
+
+                # Build the resulting line of data
+                out_line = []
+                for k in range(0, len(fields)):
+                    out_line.append(fields[k])
+                out_line.append(avg)
+                out_line.append(min_score)
+                out_line.append(max_score)
+
+                print("\t".join( map( str, out_line ) ), file=out_file)
+            else:
+                skipped_lines += 1
+                if not invalid_line:
+                    first_invalid_line = i + 1
+                    invalid_line = line
+        elif line.startswith( '#' ):
+            # We'll save the original comments
+            print(line, file=out_file)
+
+    out_file.close()
+
+    if skipped_lines > 0:
+        print('Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % ( skipped_lines, first_invalid_line, invalid_line ))
+        if skipped_lines == i:
+            print('Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.')
+
+
+if __name__ == "__main__":
+    main()
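``PositionalScoresOnDisk`` above spills per-base scores to a temporary file once the in-memory chromosome buffer is exhausted; positions never assigned read back as NaN instead of raising. A minimal usage sketch of that behaviour::

    scores = PositionalScoresOnDisk()
    scores[5] = 0.8   # pads positions 0-4 with NaN, then writes position 5
    scores[5]         # -> ~0.8 (stored as a 4-byte float, so approximate)
    scores[2]         # -> nan (padding, never assigned)
    scores[100]       # -> nan (past the end returns default_value, not IndexError)
    len(scores)       # -> 6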
diff -r 000000000000 -r 7621d36a4e9c stats/filtering.py
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/stats/filtering.py  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,263 @@
+#!/usr/bin/env python
+# This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties.
+# The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
+from __future__ import division, print_function
+
+import re
+import sys
+from ast import Module, parse, walk
+
+AST_NODE_TYPE_WHITELIST = [
+    'Expr', 'Load', 'Str', 'Num', 'BoolOp', 'Compare', 'And', 'Eq', 'NotEq',
+    'Or', 'GtE', 'LtE', 'Lt', 'Gt', 'BinOp', 'Add', 'Div', 'Sub', 'Mult', 'Mod',
+    'Pow', 'LShift', 'RShift', 'BitAnd', 'BitOr', 'BitXor', 'UnaryOp', 'Invert',
+    'Not', 'NotIn', 'In', 'Is', 'IsNot', 'List', 'Index', 'Subscript',
+    # Further checks
+    'Name', 'Call', 'Attribute',
+]
+
+
+BUILTIN_AND_MATH_FUNCTIONS = 'abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor'.split('|')
+STRING_AND_LIST_METHODS = [ name for name in dir('') + dir([]) if not name.startswith('_') ]
+VALID_FUNCTIONS = BUILTIN_AND_MATH_FUNCTIONS + STRING_AND_LIST_METHODS
+
+
+def __check_name( ast_node ):
+    name = ast_node.id
+    if re.match(r'^c\d+$', name):
+        return True
+    return name in VALID_FUNCTIONS
+
+
+def __check_attribute( ast_node ):
+    attribute_name = ast_node.attr
+    if attribute_name not in STRING_AND_LIST_METHODS:
+        return False
+    return True
+
+
+def __check_call( ast_node ):
+    # If we are calling a function or method, it better be a math,
+    # string or list function.
+    ast_func = ast_node.func
+    ast_func_class = ast_func.__class__.__name__
+    if ast_func_class == 'Name':
+        if ast_func.id not in BUILTIN_AND_MATH_FUNCTIONS:
+            return False
+    elif ast_func_class == 'Attribute':
+        if not __check_attribute( ast_func ):
+            return False
+    else:
+        return False
+
+    return True
+
+
+def check_expression( text ):
+    """
+
+    >>> check_expression("c1=='chr1' and c3-c2>=2000 and c6=='+'")
+    True
+    >>> check_expression("eval('1+1')")
+    False
+    >>> check_expression("import sys")
+    False
+    >>> check_expression("[].__str__")
+    False
+    >>> check_expression("__builtins__")
+    False
+    >>> check_expression("'x' in globals")
+    False
+    >>> check_expression("'x' in [1,2,3]")
+    True
+    >>> check_expression("c3=='chr1' and c5>5")
+    True
+    >>> check_expression("c3=='chr1' and d5>5")  # Invalid d5 reference
+    False
+    >>> check_expression("c3=='chr1' and c5>5 or exec")
+    False
+    >>> check_expression("type(c1) != type(1)")
+    True
+    >>> check_expression("c1.split(',')[1] == '1'")
+    True
+    >>> check_expression("exec 1")
+    False
+    >>> check_expression("str(c2) in [\\"a\\",\\"b\\"]")
+    True
+    """
+    try:
+        module = parse( text )
+    except SyntaxError:
+        return False
+
+    if not isinstance(module, Module):
+        return False
+    statements = module.body
+    if not len( statements ) == 1:
+        return False
+    expression = statements[0]
+    if expression.__class__.__name__ != 'Expr':
+        return False
+
+    for ast_node in walk( expression ):
+        ast_node_class = ast_node.__class__.__name__
+
+        # Toss out everything that is not a "simple" expression,
+        # imports, error handling, etc...
+        if ast_node_class not in AST_NODE_TYPE_WHITELIST:
+            return False
+
+        # White-list more potentially dangerous types AST elements.
+        if ast_node_class == 'Name':
+            # In order to prevent loading 'exec', 'eval', etc...
+            # put string restriction on names allowed.
+            if not __check_name( ast_node ):
+                return False
+        # Check only valid, white-listed functions are called.
+        elif ast_node_class == 'Call':
+            if not __check_call( ast_node ):
+                return False
[...]
+    ... = int( sys.argv[4] )
+    assert sys.argv[5]  # check to see that the column types variable isn't null
+    in_column_types = sys.argv[5].split( ',' )
+except:
+    stop_err( "Data does not appear to be tabular. This tool can only be used with tab-delimited data." )
+num_header_lines = int( sys.argv[6] )
+
+# Unescape if input has been escaped
+mapped_str = {
+    '__lt__': '<',
+    '__le__': '<=',
+    '__eq__': '==',
+    '__ne__': '!=',
+    '__gt__': '>',
+    '__ge__': '>=',
+    '__sq__': '\'',
+    '__dq__': '"',
+    '__ob__': '[',
+    '__cb__': ']',
+}
+for key, value in mapped_str.items():
+    cond_text = cond_text.replace( key, value )
+
+# Attempt to determine if the condition includes executable stuff and, if so, exit
+secured = dir()
+operands = get_operands(cond_text)
+for operand in operands:
+    try:
+        check = int( operand )
+    except:
+        if operand in secured:
+            stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
+
+if not check_expression(cond_text):
+    stop_err( "Illegal/invalid in condition '%s'" % ( cond_text ) )
+
+# Work out which columns are used in the filter (save using 1 based counting)
+used_cols = sorted(set(int(match.group()[1:])
+                       for match in re.finditer('c(\d)+', cond_text)))
+largest_col_index = max(used_cols)
+
+# Prepare the column variable names and wrappers for column data types. Only
+# cast columns used in the filter.
+cols, type_casts = [], []
+for col in range( 1, largest_col_index + 1 ):
+    col_name = "c%d" % col
+    cols.append( col_name )
+    col_type = in_column_types[ col - 1 ]
+    if col in used_cols:
+        type_cast = "%s(%s)" % ( col_type, col_name )
+    else:
+        # If we don't use this column, don't cast it.
+        # Otherwise we get errors on things like optional integer columns.
+        type_cast = col_name
+    type_casts.append( type_cast )
+
+col_str = ', '.join( cols )              # 'c1, c2, c3, c4'
+type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
+assign = "%s, = line.split( '\\t' )[:%i]" % ( col_str, largest_col_index )
+wrap = "%s = %s" % ( col_str, type_cast_str )
+skipped_lines = 0
+invalid_lines = 0
+first_invalid_line = 0
+invalid_line = None
+lines_kept = 0
+total_lines = 0
+out = open( out_fname, 'wt' )
+
+# Read and filter input file, skipping invalid lines
+code = '''
+for i, line in enumerate( open( in_fname ) ):
+    total_lines += 1
+    line = line.rstrip( '\\r\\n' )
+
+    if i < num_header_lines:
+        lines_kept += 1
+        print( line, file=out )
+        continue
+
+    if not line or line.startswith( '#' ):
+        skipped_lines += 1
+        continue
+    try:
+        %s
+        %s
+        if %s:
+            lines_kept += 1
+            print( line, file=out )
+    except:
+        invalid_lines += 1
+        if not invalid_line:
+            first_invalid_line = i + 1
+            invalid_line = line
+''' % ( assign, wrap, cond_text )
+valid_filter = True
+try:
+    exec(code)
+except Exception as e:
+    out.close()
+    if str( e ).startswith( 'invalid syntax' ):
+        valid_filter = False
+        stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % cond_text )
+    else:
+        stop_err( str( e ) )
+
+if valid_filter:
+    out.close()
+    valid_lines = total_lines - skipped_lines
+    print('Filtering with %s, ' % cond_text)
+    if valid_lines > 0:
+        print('kept %4.2f%% of %d valid lines (%d total lines).' % ( 100.0 * lines_kept / valid_lines, valid_lines, total_lines ))
+    else:
+        print('Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text)
+    if invalid_lines:
+        print('Skipped %d invalid line(s) starting at line #%d: "%s"' % ( invalid_lines, first_invalid_line, invalid_line ))
+    if skipped_lines:
+        print('Skipped %i comment (starting with #) or blank line(s)' % skipped_lines)
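The filter works by splicing the validated condition into a code template and ``exec``-ing it. For a condition such as ``c1=='chr1' and c3>5`` over column types ``str,int,int``, the helper strings built by the ``assign``/``wrap`` templates above would be (illustrative values, not part of the changeset)::

    # used_cols -> [1, 3]; largest_col_index -> 3
    assign = "c1, c2, c3, = line.split( '\t' )[:3]"
    wrap = "c1, c2, c3 = str(c1), c2, int(c3)"   # unused c2 is left uncast
    # each data line is then kept when:  c1=='chr1' and c3>5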
diff -r 000000000000 -r 7621d36a4e9c stats/filtering.xml
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/stats/filtering.xml  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,90 @@
+<tool id="Filter1" name="Filter" version="1.1.0">
+  <description>data on any column using simple expressions</description>
+  <edam_operations>
+    <edam_operation>operation_0335</edam_operation>
+  </edam_operations>
+  <command interpreter="python">
+    filtering.py $input $out_file1 "$cond" ${input.metadata.columns} "${input.metadata.column_types}" $header_lines
+  </command>
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Filter" help="Dataset missing? See TIP below."/>
+    <param name="cond" size="40" type="text" value="c1=='chr22'" label="With following condition" help="Double equal signs, ==, must be used as shown above. To filter for an arbitrary string, use the Select tool.">
+      <validator type="empty_field" message="Enter a valid filtering condition, see syntax and examples below."/>
+    </param>
+    <param name="header_lines" type="integer" value="0" label="Number of header lines to skip"/>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="1.bed"/>
+      <param name="cond" value="c1=='chr22'"/>
+      <param name="header_lines" value="0"/>
+      <output name="out_file1" file="filter1_test1.bed"/>
+    </test>
+    <test>
+      <param name="input" value="7.bed"/>
+      <param name="cond" value="c1=='chr1' and c3-c2>=2000 and c6=='+'"/>
+      <param name="header_lines" value="0"/>
+      <output name="out_file1" file="filter1_test2.bed"/>
+    </test>
+    <!-- Test filtering of file with a variable number of columns. -->
+    <test>
+      <param name="input" value="filter1_in3.sam"/>
+      <param name="cond" value="c3=='chr1' and c5>5"/>
+      <param name="header_lines" value="0"/>
+      <output name="out_file1" file="filter1_test3.sam"/>
+    </test>
+    <test>
+      <param name="input" value="filter1_inbad.bed"/>
+      <param name="cond" value="c1=='chr22'"/>
+      <param name="header_lines" value="0"/>
+      <output name="out_file1" file="filter1_test4.bed"/>
+    </test>
+    <test>
+      <param name="input" value="filter1_in5.tab"/>
+      <param name="cond" value="c8>500"/>
+      <param name="header_lines" value="1"/>
+      <output name="out_file1" file="filter1_test5.tab"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+Double equal signs, ==, must be used as *"equal to"* (e.g., **c1 == 'chr22'**)
+
+.. class:: infomark
+
+**TIP:** Attempting to apply a filtering condition may throw exceptions if the data type (e.g., string, integer) in every line of the columns being filtered is not appropriate for the condition (e.g., attempting certain numerical calculations on strings). If an exception is thrown when applying the condition to a line, that line is skipped as invalid for the filter condition. The number of invalid skipped lines is documented in the resulting history item as a "Condition/data issue".
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
+
+-----
+
+**Syntax**
+
+The filter tool allows you to restrict the dataset using simple conditional statements.
+
+- Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file
+- Make sure that multi-character operators contain no white space ( e.g., **&lt;=** is valid while **&lt; =** is not valid )
+- When using the 'equal-to' operator, the **double equal sign '==' must be used** ( e.g., **c1=='chr1'** )
+- Non-numerical values must be included in single or double quotes ( e.g., **c6=='+'** )
+- The filtering condition can include logical operators, but **make sure operators are all lower case** ( e.g., **(c1!='chrX' and c1!='chrY') or not c6=='+'** )
+
+-----
+
+**Example**
+
+- **c1=='chr1'** selects lines in which the first column is chr1
+- **c3-c2&lt;100*c4** selects lines where subtracting column 2 from column 3 is less than the value of column 4 times 100
+- **len(c2.split(',')) &lt; 4** will select lines where the second column has fewer than four comma separated elements
+- **c2>=1** selects lines in which the value of column 2 is greater than or equal to 1
+- Numbers should not contain commas - **c2&lt;=44,554,350** will not work, but **c2&lt;=44554350** will
+- Some words in the data can be used, but must be single or double quoted ( e.g., **c3=='exon'** )
+
+</help>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c stats/grouping.py
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/stats/grouping.py  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+# Guruprasad Ananda
+# Refactored 2011 to use numpy instead of rpy, Kanwei Li
+"""
+This tool provides the SQL "group by" functionality.
+"""
+from __future__ import print_function
+
+import random
+import subprocess
+import sys
+import tempfile
+from itertools import groupby
+
+import numpy
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+
+def mode(data):
+    counts = {}
+    for x in data:
+        counts[x] = counts.get(x, 0) + 1
+    maxcount = max(counts.values())
+    modelist = []
+    for x in counts:
+        if counts[x] == maxcount:
+            modelist.append( str(x) )
+    return ','.join(modelist)
+
+
+def main():
+    inputfile = sys.argv[2]
+    ignorecase = int(sys.argv[4])
+    ops = []
+    cols = []
+    round_val = []
+
+    if sys.argv[5] != "None":
+        asciitodelete = sys.argv[5]
+        if asciitodelete:
+            oldfile = open(inputfile, 'r')
+            newinputfile = "input_cleaned.tsv"
+            newfile = open(newinputfile, 'w')
+            asciitodelete = asciitodelete.split(',')
+            for i in range(len(asciitodelete)):
+                asciitodelete[i] = chr(int(asciitodelete[i]))
+            for line in oldfile:
+                if line[0] not in asciitodelete:
+                    newfile.write(line)
+            oldfile.close()
+            newfile.close()
+            inputfile = newinputfile
+
+    for var in sys.argv[6:]:
+        op, col, do_round = var.split()
+        ops.append(op)
+        cols.append(col)
+        round_val.append(do_round)
+    """
+    At this point, ops, cols and round_val will look something like this:
+    ops:       ['mean', 'min', 'c']
+    cols:      ['1', '3', '4']
+    round_val: ['no', 'yes', 'no']
+    """
+
+    try:
+        group_col = int(sys.argv[3]) - 1
+    except:
+        stop_err( "Group column not specified." )
+
+    tmpfile = tempfile.NamedTemporaryFile()
+
+    try:
+        """
+        The -k option for the Posix sort command is as follows:
+        -k, --key=POS1[,POS2]
+            start a key at POS1, end it at POS2 (origin 1)
+        In other words, column positions start at 1 rather than 0, so
+        we need to add 1 to group_col.
+        If POS2 is not specified, newer versions of sort will consider the entire line for sorting. To prevent this, we set POS2=POS1.
+        """
+        case = ''
+        if ignorecase == 1:
+            case = '-f'
+        command_line = "sort -t '\t' %s -k%s,%s -o %s %s" % (case, group_col + 1, group_col + 1, tmpfile.name, inputfile)
+    except Exception as exc:
+        stop_err( 'Initialization error -> %s' % str(exc) )
+
+    try:
+        subprocess.check_output(command_line, stderr=subprocess.STDOUT, shell=True)
+    except subprocess.CalledProcessError as e:
+        stop_err( "Sorting input dataset resulted in error: %s: %s" % ( e.returncode, e.output ))
+
+    fout = open(sys.argv[1], "w")
+
+    def is_new_item(line):
+        try:
+            item = line.strip().split("\t")[group_col]
+        except IndexError:
+            stop_err( "The following line didn't have %s columns: %s" % (group_col + 1, line) )
+
+        if ignorecase == 1:
+            return item.lower()
+        return item
+
+    for key, line_list in groupby(tmpfile, key=is_new_item):
+        op_vals = [ [] for _ in ops ]
+        out_str = key
+
+        for line in line_list:
+            fields = line.strip().split("\t")
+            for i, col in enumerate(cols):
+                col = int(col) - 1  # cXX from galaxy is 1-based
+                try:
+                    val = fields[col].strip()
+                    op_vals[i].append(val)
+                except IndexError:
+                    sys.stderr.write( 'Could not access the value for column %s on line: "%s". Make sure file is tab-delimited.\n' % (col + 1, line) )
+                    sys.exit( 1 )
+
+        # Generate string for each op for this group
+        for i, op in enumerate( ops ):
+            data = op_vals[i]
+            rval = ""
+            if op == "mode":
+                rval = mode( data )
+            elif op == "length":
+                rval = len( data )
+            elif op == "random":
+                rval = random.choice(data)
+            elif op in ['cat', 'cat_uniq']:
+                if op == 'cat_uniq':
+                    data = numpy.unique(data)
+                rval = ','.join(data)
+            elif op == "unique":
+                rval = len( numpy.unique(data) )
+            else:
+                # some kind of numpy fn
+                try:
+                    data = [float(_) for _ in data]
+                except ValueError:
+                    sys.stderr.write( "Operation %s expected number values but got %s instead.\n" % (op, data) )
+                    sys.exit( 1 )
+                rval = getattr(numpy, op)( data )
+                if round_val[i] == 'yes':
+                    rval = int(round(rval))
+                else:
+                    rval = '%g' % rval
+            out_str += "\t%s" % rval
+
+        fout.write(out_str + "\n")
+
+    # Generate a useful info message.
+    msg = "--Group by c%d: " % (group_col + 1)
+    for i, op in enumerate(ops):
+        if op == 'cat':
+            op = 'concat'
+        elif op == 'cat_uniq':
+            op = 'concat_distinct'
+        elif op == 'length':
+            op = 'count'
+        elif op == 'unique':
+            op = 'count_distinct'
+        elif op == 'random':
+            op = 'randomly_pick'
+
+        msg += op + "[c" + cols[i] + "] "
+
+    print(msg)
+    fout.close()
+    tmpfile.close()
+
+
+if __name__ == "__main__":
+    main()
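grouping.py receives one quoted ``'op col round'`` triple per aggregate operation after five fixed positional arguments. A hypothetical argv matching the parsing in ``main()`` above (file names are illustrative)::

    sys.argv = [ 'grouping.py', 'out.tabular', 'in.tabular',
                 '1',     # group on column 1
                 '0',     # do not ignore case
                 'None',  # no leading-character ASCII codes to strip
                 'mean 2 no', 'length 1 no' ]
    # -> ops = ['mean', 'length'], cols = ['2', '1'], round_val = ['no', 'no']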
diff -r 000000000000 -r 7621d36a4e9c stats/grouping.xml
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/stats/grouping.xml  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,142 @@
+<tool id="Grouping1" name="Group" version="2.1.1">
+  <description>data by a column and perform aggregate operation on other columns.</description>
+  <command interpreter="python">
+    grouping.py
+    "${out_file1}"
+    "${input1}"
+    "${groupcol}"
+    "${ignorecase}"
+    "${ignorelines}"
+    #for $op in $operations
+     '${op.optype}
+      ${op.opcol}
+      ${op.opround}'
+    #end for
+  </command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/>
+    <param name="groupcol" label="Group by column" type="data_column" data_ref="input1" />
+    <param name="ignorecase" type="boolean" truevalue="1" falsevalue="0">
+      <label>Ignore case while grouping?</label>
+    </param>
+    <param name="ignorelines" type="select" display="checkboxes" multiple="True" label="Ignore lines beginning with these characters" help="lines beginning with these are not grouped">
+      <option value="62">&gt;</option>
+      <option value="64">@</option>
+      <option value="43">+</option>
+      <option value="60">&lt;</option>
+      <option value="42">*</option>
+      <option value="45">-</option>
+      <option value="61">=</option>
+      <option value="124">|</option>
+      <option value="63">?</option>
+      <option value="36">$</option>
+      <option value="46">.</option>
+      <option value="58">:</option>
+      <option value="38">&amp;</option>
+      <option value="37">%</option>
+      <option value="94">^</option>
+      <option value="35">#</option>
+    </param>
+    <repeat name="operations" title="Operation">
+      <param name="optype" type="select" label="Type">
+        <option value="mean">Mean</option>
+        <option value="median">Median</option>
+        <option value="mode">Mode</option>
+        <option value="max">Maximum</option>
+        <option value="min">Minimum</option>
+        <option value="sum">Sum</option>
+        <option value="length">Count</option>
+        <option value="unique">Count Distinct</option>
+        <option value="cat">Concatenate</option>
+        <option value="cat_uniq">Concatenate Distinct</option>
+        <option value="random">Randomly pick</option>
+        <option value="std">Standard deviation</option>
+      </param>
+      <param name="opcol" label="On column" type="data_column" data_ref="input1" />
+      <param name="opround" type="select" label="Round result to nearest integer?">
+        <option value="no">NO</option>
+        <option value="yes">YES</option>
+      </param>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">numpy</requirement>
+  </requirements>
+  <tests>
+    <!-- Test valid data -->
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="groupcol" value="1"/>
+      <param name="ignorecase" value="true"/>
+      <param name="optype" value="mean"/>
+      <param name="opcol" value="2"/>
+      <param name="opround" value="no"/>
+      <output name="out_file1" file="groupby_out1.dat"/>
+    </test>
+    <!-- Long case but test framework doesn't allow yet
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="groupcol" value="1"/>
+      <param name="ignorecase" value="false"/>
+      <param name="operations" value='[{"opcol": "2", "__index__": 0, "optype": "mean", "opround": "no"}, {"opcol": "2", "__index__": 1, "optype": "median", "opround": "no"}, {"opcol": "6", "__index__": 2, "optype": "mode", "opround": "no"}, {"opcol": "2", "__index__": 3, "optype": "max", "opround": "no"}, {"opcol": "2", "__index__": 4, "optype": "min", "opround": "no"}, {"opcol": "2", "__index__": 5, "optype": "sum", "opround": "no"}, {"opcol": "1", "__index__": 6, "optype": "length", "opround": "no"}, {"opcol": "1", "__index__": 7, "optype": "unique", "opround": "no"}, {"opcol": "1", "__index__": 8, "optype": "cat", "opround": "no"}, {"opcol": "6", "__index__": 9, "optype": "cat_uniq", "opround": "no"}, {"opcol": "2", "__index__": 10, "optype": "random", "opround": "no"}, {"opcol": "2", "__index__": 11, "optype": "std", "opround": "no"}]'/>
+      <output name="out_file1" file="groupby_out3.tabular"/>
+    </test>
+    -->
+    <!-- Test data with an invalid value in a column. Can't do it because test framework doesn't allow testing of errors
+    <test>
+      <param name="input1" value="1.tabular"/>
+      <param name="groupcol" value="1"/>
+      <param name="ignorecase" value="true"/>
+      <param name="optype" value="mean"/>
+      <param name="opcol" value="2"/>
+      <param name="opround" value="no"/>
+      <output name="out_file1" file="groupby_out2.dat"/>
+    </test>
+    -->
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
+
+-----
+
+**Syntax**
+
+This tool allows you to group the input dataset by a particular column and perform aggregate functions on other column(s): Mean, Median, Mode, Maximum, Minimum, Sum, Count, Count Distinct, Concatenate, Concatenate Distinct, Randomly pick, and Standard deviation.
+
+The Concatenate function will take, for each group, each item in the specified column and build a comma delimited list. Concatenate Distinct will do the same but will build a list of unique items with no repetition.
+
+Count and Count Distinct are equivalent to Concatenate and Concatenate Distinct, but will only count the number of items and will return an integer.
+
+- If multiple modes are present, all are reported.
+
+-----
+
+**Example**
+
+- For the following input::
+
+    chr22  1000  1003  TTT
+    chr22  2000  2003  aaa
+    chr10  2200  2203  TTT
+    chr10  1200  1203  ttt
+    chr22  1600  1603  AAA
+
+- **Grouping on column 4** while ignoring case, and performing operation **Count on column 1** will return::
+
+    AAA    2
+    TTT    3
+
+- **Grouping on column 4** while not ignoring case, and performing operation **Count on column 1** will return::
+
+    aaa    1
+    AAA    1
+    ttt    1
+    TTT    2
+  </help>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c stats/gsummary.py
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/stats/gsummary.py  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+from __future__ import print_function
+
+import re
+import sys
+import tempfile
+try:
+    from rpy2.rpy_classic import BASIC_CONVERSION, NO_CONVERSION, r, RException, set_default_mode
+except:
+    # RPy isn't maintained, and doesn't work with R>3.0, use it as a fallback
+    from rpy import BASIC_CONVERSION, NO_CONVERSION, r, RException, set_default_mode
+
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+
+def S3_METHODS( all="key" ):
+    Group_Math = [ "abs", "sign", "sqrt", "floor", "ceiling", "trunc", "round", "signif",
+                   "exp", "log", "cos", "sin", "tan", "acos", "asin", "atan", "cosh", "sinh", "tanh",
+                   "acosh", "asinh", "atanh", "lgamma", "gamma", "gammaCody", "digamma", "trigamma",
+                   "cumsum", "cumprod", "cummax", "cummin", "c" ]
+    Group_Ops = [ "+", "-", "*", "/", "^", "%%", "%/%", "&", "|", "!", "==", "!=", "<", "<=", ">=", ">", "(", ")", "~", "," ]
+    if all is "key":
+        return { 'Math': Group_Math, 'Ops': Group_Ops }
+
+
+def main():
+    try:
+        datafile = sys.argv[1]
+        outfile_name = sys.argv[2]
+        expression = sys.argv[3]
+    except:
+        stop_err( 'Usage: python gsummary.py input_file output_file expression' )
+
+    math_allowed = S3_METHODS()[ 'Math' ]
+    ops_allowed = S3_METHODS()[ 'Ops' ]
+
+    # Check for invalid expressions
+    for word in re.compile( '[a-zA-Z]+' ).findall( expression ):
+        if word and word not in math_allowed:
+            stop_err( "Invalid expression '%s': term '%s' is not recognized or allowed" % ( expression, word ) )
+    symbols = set()
+    for symbol in re.compile( '[^a-z0-9\s]+' ).findall( expression ):
+        if symbol and symbol not in ops_allowed:
+            stop_err( "Invalid expression '%s': operator '%s' is not recognized or allowed" % ( expression, symbol ) )
+        else:
+            symbols.add( symbol )
+    if len( symbols ) == 1 and ',' in symbols:
+        # User may have entered a comma-separated list of r_data_frame columns
+        stop_err( "Invalid columns '%s': this tool requires a single column or expression" % expression )
+
+    # Find all column references in the expression
+    cols = []
+    for col in re.compile( 'c[0-9]+' ).findall( expression ):
+        try:
+            cols.append( int( col[1:] ) - 1 )
+        except:
+            pass
+
+    tmp_file = tempfile.NamedTemporaryFile( 'w+b' )
+    # Write the R header row to the temporary file
+    hdr_str = "\t".join( "c%s" % str( col + 1 ) for col in cols )
+    tmp_file.write( "%s\n" % hdr_str )
+    skipped_lines = 0
+    first_invalid_line = 0
+    i = 0
+    for i, line in enumerate( open( datafile ) ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ):
+            valid = True
+            fields = line.split( '\t' )
+            # Write the R data row to the temporary file
+            for col in cols:
+                try:
+                    float( fields[ col ] )
+                except:
+                    skipped_lines += 1
+                    if not first_invalid_line:
+                        first_invalid_line = i + 1
+                    valid = False
+                    break
+            if valid:
+                data_str = "\t".join( fields[ col ] for col in cols )
+                tmp_file.write( "%s\n" % data_str )
+    tmp_file.flush()
+
+    if skipped_lines == i + 1:
+        stop_err( "Invalid column or column data values invalid for computation. See tool tips and syntax for data requirements." )
+    else:
+        # summary function and return labels
+        set_default_mode( NO_CONVERSION )
+        summary_func = r( "function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }" )
+        headings = [ 'sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%' ]
+        headings_str = "\t".join( headings )
+
+        r_data_frame = r.read_table( tmp_file.name, header=True, sep="\t" )
+
+        outfile = open( outfile_name, 'w' )
+
+        for col in re.compile( 'c[0-9]+' ).findall( expression ):
+            r.assign( col, r[ "$" ]( r_data_frame, col ) )
+        try:
+            summary = summary_func( r( expression ) )
+        except RException as s:
+            outfile.close()
+            stop_err( "Computation resulted in the following error: %s" % str( s ) )
+        summary = summary.as_py( BASIC_CONVERSION )
+        outfile.write( "#%s\n" % headings_str )
+        if type(summary) is dict:
+            # using rpy
+            outfile.write( "%s\n" % "\t".join( [ "%g" % summary[k] for k in headings ] ) )
+        else:
+            # using rpy2
+            outfile.write( "%s\n" % "\t".join( [ "%g" % k for k in summary ] ) )
+        outfile.close()
+
+    if skipped_lines:
+        print("Skipped %d invalid lines beginning with line #%d. See tool tips for data requirements." % ( skipped_lines, first_invalid_line ))
+
+
+if __name__ == "__main__":
+    main()
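gsummary.py validates the expression with two regex passes before handing it to R: alphabetic words must belong to the ``Math`` group and non-alphanumeric symbols to the ``Ops`` group returned by ``S3_METHODS()``. Illustrative outcomes of those checks (error messages follow the ``stop_err`` formats above)::

    # accepted: 'log(c5)', 'sqrt(c5+c9)', '(c5 + c6 + c7) / 3'
    # rejected: "system('ls')"  -> term 'system' is not recognized or allowed
    # rejected: 'c5 ; c6'       -> operator ';' is not recognized or allowed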
diff -r 000000000000 -r 7621d36a4e9c stats/gsummary.xml
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/stats/gsummary.xml  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,82 @@
+<tool id="Summary_Statistics1" name="Summary Statistics" version="1.1.1">
+  <description>for any numerical column</description>
+  <edam_topics>
+    <edam_topic>topic_2269</edam_topic>
+  </edam_topics>
+  <requirements>
+    <requirement type="package" version="2.7.8">rpy2</requirement>
+  </requirements>
+  <stdio>
+    <exit_code range="1" level="fatal" />
+  </stdio>
+  <command>python $__tool_directory__/gsummary.py "$input" "$out_file1" "$cond"</command>
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Summary statistics on" help="Dataset missing? See TIP below"/>
+    <param name="cond" size="30" type="text" value="c5" label="Column or expression" help="See syntax below">
+      <validator type="empty_field" message="Enter a valid column or expression, see syntax below for examples"/>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="1.bed"/>
+      <output name="out_file1" file="gsummary_out1.tabular"/>
+      <param name="cond" value="c2"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+This tool expects input datasets consisting of tab-delimited columns (blank or comment lines beginning with a # character are automatically skipped).
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert delimiters to TAB*
+
+.. class:: infomark
+
+**TIP:** Computing summary statistics may throw exceptions if the data value in every line of the columns being summarized is not numerical. If a line is missing a value or contains a non-numerical value in the column being summarized, that line is skipped and the value is not included in the statistical computation. The number of invalid skipped lines is documented in the resulting history item.
+
+.. class:: infomark
+
+**USING R FUNCTIONS:** Most functions (like *abs*) take only a single expression. *log* can take one or two parameters, like *log(expression,base)*
+
+Currently, these R functions are supported: *abs, sign, sqrt, floor, ceiling, trunc, round, signif, exp, log, cos, sin, tan, acos, asin, atan, cosh, sinh, tanh, acosh, asinh, atanh, lgamma, gamma, gammaCody, digamma, trigamma, cumsum, cumprod, cummax, cummin*
+
+-----
+
+**Syntax**
+
+This tool computes basic summary statistics on a given column, or on a valid expression containing one or more columns.
+
+- Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file.
+
+- For example:
+
+  - **log(c5)** calculates the summary statistics for the natural log of column 5
+  - **(c5 + c6 + c7) / 3** calculates the summary statistics on the average of columns 5-7
+  - **log(c5,10)** summary statistics of the base 10 log of column 5
+  - **sqrt(c5+c9)** summary statistics of the square root of column 5 + column 9
+
+-----
+
+**Examples**
+
+- Input Dataset::
+
+    c1   c2    c3       c4       c5          c6
+    586  chrX  161416   170887   41108_at    16990
+    73   chrX  505078   532318   35073_at    1700
+    595  chrX  1361578  1388460  33665_s_at  1960
+    74   chrX  1420620  1461919  1185_at     8600
+
+- Summary Statistics on column c6 of the above input dataset::
+
+    #sum       mean      stdev     0%        25%       50%       75%        100%
+    29250.000  7312.500  7198.636  1700.000  1895.000  5280.000  10697.500  16990.000
+
+</help>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c stats/gsummary.xml.groups
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/stats/gsummary.xml.groups  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,62 @@
+<tool id="Summary Statistics1" name="Summary Statistics">
+  <description>of a column in a tab delimited file according to an expression</description>
+  <command interpreter="python">gsummary.py $input $out_file1 "$cond" "$groups"</command>
+  <inputs>
+    <param name="cond" size="40" type="text" value="c5" label="expression"/>
+    <param name="groups" size="40" type="text" value="none" label="group terms (c1,c4,etc.)"/>
+    <param format="txt" name="input" type="data" label="summary statistics on"/>
+  </inputs>
+  <outputs>
+    <data format="txt" name="out_file1" />
+  </outputs>
+  <help>
+
+.. class:: warningmark
+
+This tool expects input datasets to consist of tab-delimited columns (blank or comment lines beginning with a # character are automatically skipped).
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
+
+.. class:: infomark
+
+**TIP:** Computing summary statistics may throw exceptions if the data value in every line of the columns being summarized is not numerical. If a line is missing a value or contains a non-numerical value in the column being summarized, that line is skipped and the value is not included in the statistical computation. The number of invalid skipped lines is documented in the resulting history item.
+
+**Syntax**
+
+This tool computes basic summary statistics on a given column, or on an expression containing those columns
+
+- Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file
+- To group the summary by the values in a column or columns, specify in the **group terms** box...
+
+  **c1** *group by the values in column 1*
+
+  **c1,c4** *group by the values in column 1, then by the values in column 4*
+
+-----
+
+**Expression examples**
+
+- **log(c5)** calculates the summary statistics for the natural log of column 5
+- **(c5 + c6 + c7) / 3** calculates the summary statistics on the average of columns 5-7
+- **log(c5,10)** summary statistics of the base 10 log of column 5
+- **sqrt(c5+c9)** summary statistics of the square root of column 5 + column 9
+
+**Group examples**
+
+- **c1** group by the values in column 1
+- **c1,c4** group by the values in column 1, then by the values in column 4
+
+-----
+
+.. class:: infomark
+
+**TIP:** Most functions (like *abs*) take only a single expression. *log* can take one or two parameters, like *log(expression,base)*
+
+Currently, these R functions are supported: *abs, sign, sqrt, floor, ceiling, trunc, round, signif, exp, log, cos, sin, tan, acos, asin, atan, cosh, sinh, tanh, acosh, asinh, atanh, lgamma, gamma, gammaCody, digamma, trigamma, cumsum, cumprod, cummax, cummin*
+
+.. |INFO| image:: ./static/images/icon_info_sml.gif
+
+</help>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c stats/r_wrapper.sh
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/stats/r_wrapper.sh  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+### Run R providing the R script in $1 as standard input and passing
+### the remaining arguments on the command line
+
+# Function that writes a message to stderr and exits
+fail()
+{
+    echo "$@" >&2
+    exit 1
+}
+
+# Ensure R executable is found
+which R > /dev/null || fail "'R' is required by this tool but was not found on path"
+
+# Extract first argument
+infile=$1; shift
+
+# Ensure the file exists
+test -f $infile || fail "R input file '$infile' does not exist"
+
+# Invoke R passing file named by first argument to stdin
+R --vanilla --slave $* < $infile
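r_wrapper.sh treats its first argument as the R script (fed to R on stdin) and forwards any remaining arguments to R itself. A hypothetical call from Python, with 'histogram.R' standing in for a real script::

    import subprocess
    # '--args 5' is forwarded to R after the wrapper shifts off the script name
    subprocess.check_call( [ 'sh', 'r_wrapper.sh', 'histogram.R', '--args', '5' ] )
    # equivalent to:  R --vanilla --slave --args 5 < histogram.R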
diff -r 000000000000 -r 7621d36a4e9c visualization/LAJ.py
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/visualization/LAJ.py  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+
+"""
+Copies LAV file over to new file for use with LAJ
+"""
+import shutil
+import sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+shutil.copyfile(sys.argv[1], sys.argv[2])
diff -r 000000000000 -r 7621d36a4e9c visualization/LAJ.xml
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/visualization/LAJ.xml  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,42 @@
+<tool id="laj_1" name="LAJ" version="1.0.0">
+  <description>Pairwise Alignment Viewer</description>
+  <command interpreter="python">LAJ.py $maf_input $out_file1</command>
+  <inputs>
+    <param name="maf_input" type="data" format="lav" label="Alignment File" optional="False"/>
+    <param name="seq_file1" type="data" format="fasta" label="First Sequence File" optional="True"/>
+    <param name="seq_file2" type="data" format="fasta" label="Second Sequence File" optional="True"/>
+    <param name="exonfile" type="data" format="txt" label="Exon File" optional="True"/>
+    <param name="repeatfile" type="data" format="txt" label="Repeat File" optional="True"/>
+    <param name="annotationfile" type="data" format="txt" label="Annotation File" optional="True"/>
+    <param name="underlayfile" type="data" format="txt" label="Underlay File" optional="True"/>
+    <param name="highlightfile" type="data" format="txt" label="Highlight File" optional="True"/>
+  </inputs>
+  <outputs>
+    <data name="out_file1" format="laj"/>
+  </outputs>
+  <help>
+You can use this tool to view a set of LAV alignments. You may include FASTA formatted sequences for both species.
+
+For detailed information on LAJ, click here_.
+
+.. _here: http://globin.cse.psu.edu/dist/laj/
+
+Laj is a tool for viewing and manipulating the output from pairwise alignment programs such as blastz. It can display interactive dotplot, pip, and text representations of the alignments, a diagram showing the locations of exons and repeats, and annotation links to other web sites containing additional information about particular regions.
+
+.. class:: infomark
+
+**Note:** If you save output from the applet, you will need to manually refresh your history.
+
+  </help>
+  <code file="LAJ_code.py"/>
+  <citations>
+    <citation type="bibtex">
+      @misc{Miller2005,
+        author = {Miller Lab},
+        year = {2005},
+        title = {Laj},
+        url = {http://globin.bx.psu.edu/dist/laj/},
+      }
+    </citation>
+  </citations>
+</tool>
diff -r 000000000000 -r 7621d36a4e9c visualization/LAJ_code.py
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/visualization/LAJ_code.py  Mon Apr 30 01:37:51 2018 -0400
@@ -0,0 +1,42 @@
+# post processing: add sequence and additional annotation info if available
+from six.moves.urllib.parse import urlencode
+
+from galaxy.datatypes.images import create_applet_tag_peek
+
+
+def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
+    primary_data = next(iter(out_data.values()))
+
+    # default params for LAJ type
+    params = {
+        "alignfile1": "display?id=%s" % primary_data.id,
+        "buttonlabel": "Launch LAJ",
+        "title": "LAJ in Galaxy",
+        "posturl": "history_add_to?%s" % urlencode( { 'history_id': primary_data.history_id, 'ext': 'lav', 'name': 'LAJ Output', 'info': 'Added by LAJ', 'dbkey': primary_data.dbkey } )
+    }
+    for name, data in inp_data.items():
+        if name == "maf_input":
+            params["alignfile1"] = "display?id=%s" % data.id
+        elif name == "seq_file1" and data.state == data.states.OK and data.has_data():
+            params["file1seq1"] = "display?id=%s" % data.id
+        elif name == "seq_file2" and data.state == data.states.OK and data.has_data():
+            params["file1seq2"] = "display?id=%s" % data.id
+        elif name == "exonfile" and data.state == data.states.OK and data.has_data():
+            params["exonfile"] = "display?id=%s" % data.id
+        elif name == "repeatfile" and data.state == data.states.OK and data.has_data():
+            params["repeatfile"] = "display?id=%s" % data.id
+        elif name == "annotationfile" and data.state == data.states.OK and data.has_data():
+            params["annotationfile"] = "display?id=%s" % data.id
+        elif name == "underlayfile" and data.state == data.states.OK and data.has_data():
+            params["underlayfile"] = "display?id=%s" % data.id
+        elif name == "highlightfile" and data.state == data.states.OK and data.has_data():
+            params["highlightfile"] = "display?id=%s" % data.id
+
+    if "file1seq1" not in params and "file1seq2" not in params:
+        params["noseq"] = "true"
+
+    class_name = "edu.psu.cse.bio.laj.LajApplet.class"
+    archive = "/static/laj/laj.jar"
+    primary_data.peek = create_applet_tag_peek( class_name, archive, params )
+    app.model.context.add( primary_data )
+    app.model.context.flush()