Mercurial > repos > iuc > snpsift_dbnsfp_datatypes
changeset 0:0c4372b93e85 draft default tip
Uploaded
author | iuc |
---|---|
date | Thu, 22 Jan 2015 08:04:59 -0500 |
parents | |
children | |
files | datatypes_conf.xml snpsift_dbnsfp.py tabular_to_dbnsfp.py tabular_to_dbnsfp.xml |
diffstat | 4 files changed, 144 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes_conf.xml Thu Jan 22 08:04:59 2015 -0500 @@ -0,0 +1,13 @@ +<?xml version="1.0"?> +<datatypes> + <datatype_files> + <datatype_file name="snpsift_dbnsfp.py"/> + </datatype_files> + <registration> + <datatype extension="snpsiftdbnsfp" type="galaxy.datatypes.snpsift_dbnsfp:SnpSiftDbNSFP" display_in_upload="True"/> + <datatype extension="dbnsfp.tabular" type="galaxy.datatypes.tabular:Tabular" subclass="True" display_in_upload="True"> + <converter file="tabular_to_dbnsfp.xml" target_datatype="snpsiftdbnsfp"/> + </datatype> + </registration> +</datatypes> +
# File: snpsift_dbnsfp.py
"""
SnpSift dbNSFP datatypes
"""
import gzip
import logging
import os
import os.path
import re
import sys
import traceback

import galaxy.datatypes.data
from galaxy.datatypes.data import Text
from galaxy.datatypes.metadata import MetadataElement

log = logging.getLogger(__name__)


class SnpSiftDbNSFP(Text):
    """Class describing a dbNSFP database prepared for use by SnpSift dbnsfp.

    The dbNSFP file is a tabular file with 1 header line.
    The first 4 columns are required to be: chrom pos ref alt
    These match columns 1,2,4,5 of the VCF file.
    SnpSift requires the file to be block-gzipped and then indexed with
    samtools tabix.

    Example:
        ## Compress using block-gzip algorithm
        bgzip dbNSFP2.3.txt
        ## Create tabix index
        tabix -s 1 -b 2 -e 2 dbNSFP2.3.txt.gz
    """
    MetadataElement(name='reference_name', default='dbSNFP', desc='Reference Name', readonly=True, visible=True, set_in_upload=True, no_value='dbSNFP')
    MetadataElement(name="bgzip", default=None, desc="dbNSFP bgzip", readonly=True, visible=True, no_value=None)
    MetadataElement(name="index", default=None, desc="Tabix Index File", readonly=True, visible=True, no_value=None)
    MetadataElement(name="annotation", default=[], desc="Annotation Names", readonly=True, visible=True, no_value=[])
    file_ext = "snpsiftdbnsfp"
    composite_type = 'auto_primary_file'
    allow_datatype_change = False

    def __init__(self, **kwd):
        """Initialise the datatype and register its two composite files."""
        Text.__init__(self, **kwd)
        self.add_composite_file('%s.grp', description='Group File', substitute_name_with_metadata='reference_name', is_binary=False)
        self.add_composite_file('%s.ti', description='', substitute_name_with_metadata='reference_name', is_binary=False)

    def init_meta(self, dataset, copy_from=None):
        """Delegate metadata initialisation to the ``Text`` base class."""
        Text.init_meta(self, dataset, copy_from=copy_from)

    def generate_primary_file(self, dataset=None):
        """
        This is called only at upload to write the html file
        cannot rename the datasets here - they come with the default unfortunately
        """
        # The original called ``regenerate_primary_file(self, dataset)`` as a
        # bare name, which is not defined at module scope and raised NameError.
        # NOTE(review): the framework may expect this method to return the
        # primary file contents as a string - confirm against the Galaxy
        # upload code.
        if dataset is not None:
            self.regenerate_primary_file(dataset)

    def regenerate_primary_file(self, dataset):
        """
        cannot do this until we are setting metadata

        Appends the bgzip file name (when set) and the comma-joined
        annotation names to the dataset's primary file.  The file is opened
        in append mode, so every call adds another copy of these lines.
        """
        annotations = "dbNSFP Annotations: %s\n" % ','.join(dataset.metadata.annotation)
        with open(dataset.file_name, 'a') as f:
            if dataset.metadata.bgzip:
                f.write(dataset.metadata.bgzip)
                f.write('\n')
            f.write(annotations)

    def _read_annotation_names(self, path):
        """Return the header column names after the first four from *path*.

        *path* is a gzip-compressed tab-separated file.  Only the first line
        is read; it is read in full so that long headers are not truncated.
        """
        fh = gzip.open(path, 'rb')
        try:
            header = fh.readline()
        finally:
            fh.close()
        # gzip yields bytes on Python 3 and str on Python 2; only decode
        # when the value is not already a native string.
        if not isinstance(header, str):
            header = header.decode('utf-8', 'replace')
        return header.rstrip('\r\n').split('\t')[4:]

    def set_meta(self, dataset, overwrite=True, **kwd):
        """Populate bgzip/index/annotation metadata from the extra files dir.

        Scans ``dataset.extra_files_path``: a ``.gz`` entry is recorded as
        the bgzip file and its header supplies the annotation names; a
        ``.tbi`` entry is recorded as the tabix index.  Afterwards the
        primary file is regenerated.  All errors are logged, never raised.
        """
        try:
            efp = dataset.extra_files_path
            if os.path.exists(efp):
                for fname in os.listdir(efp):
                    if fname.endswith('.gz'):
                        dataset.metadata.bgzip = fname
                        try:
                            dataset.metadata.annotation = self._read_annotation_names(os.path.join(efp, fname))
                        except Exception as e:
                            # Best effort: an unreadable archive must not stop
                            # the scan of the remaining files.
                            log.warning("set_meta fname: %s %s" % (fname, str(e)))
                            traceback.print_exc(file=sys.stderr)
                    if fname.endswith('.tbi'):
                        dataset.metadata.index = fname
            self.regenerate_primary_file(dataset)
        except Exception as e:
            log.warning("set_meta fname: %s %s" % (dataset.file_name if dataset and dataset.file_name else 'Unkwown', str(e)))
            traceback.print_exc(file=sys.stderr)


if __name__ == '__main__':
    import doctest
    doctest.testmod(sys.modules[__name__])
#!/usr/bin/env python
# File: tabular_to_dbnsfp.py

"""
Uses pysam to bgzip a tabular file and build a tabix index for it

usage: %prog [options] in_file out_file
"""

from galaxy import eggs
import pkg_resources
pkg_resources.require("pysam")
import ctabix
import subprocess
import tempfile
import sys
import optparse
import os.path


def main(argv=None):
    """Compress a tabular file with bgzip and tabix-index the result.

    *argv* is the list of command-line arguments (without the program
    name); when ``None`` the arguments are taken from ``sys.argv``.
    Exits with status 1 after printing the usage line unless exactly two
    positional arguments (input file, output file) are given.
    """
    # Read options, args.
    usage = "Usage: %prog [options] tabular_input_file bgzip_output_file"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-c', '--chr-col', type='int', default=0, dest='chrom_col')
    parser.add_option('-s', '--start-col', type='int', default=1, dest='start_col')
    parser.add_option('-e', '--end-col', type='int', default=1, dest='end_col')
    (options, args) = parser.parse_args(argv)
    if len(args) != 2:
        parser.print_usage()
        sys.exit(1)
    input_fname, output_fname = args

    # dirname is '' for a bare file name; os.makedirs('') would raise.
    output_dir = os.path.dirname(output_fname)
    if output_dir and not os.path.isdir(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError:
            # Another process may have created it in the meantime.
            if not os.path.isdir(output_dir):
                raise

    ctabix.tabix_compress(input_fname, output_fname, force=True)
    # Column indices are 0-based.
    ctabix.tabix_index(output_fname, seq_col=options.chrom_col, start_col=options.start_col, end_col=options.end_col)


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tabular_to_dbnsfp.xml Thu Jan 22 08:04:59 2015 -0500 @@ -0,0 +1,12 @@ +<tool id="tabular_to_dbnsfp" name="Convert tabular to dbnsfp" version="1.0.0"> + <description></description> + <command interpreter="python">tabular_to_dbnsfp.py $input $dbnsfp.extra_files_path/dbNSFP.gz</command> + <inputs> + <param format="tabular" name="input" type="data" label="Choose a dbnsfp tabular file"/> + </inputs> + <outputs> + <data format="snpsiftdbnsfp" name="dbnsfp"/> + </outputs> + <help> + </help> +</tool>