comparison data_manager/data_manager_snpsift_dbnsfp.py @ 0:0e9e3bb5776a draft

planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/data_managers/data_manager_snpsift_dbnsfp commit 5316af00b4a71a7b526cbc9540d5158749cc38e4
author iuc
date Tue, 07 Jun 2016 10:23:16 -0400
parents
children 3d4cd0e3891f
comparison
equal deleted inserted replaced
-1:000000000000 0:0e9e3bb5776a
1 #!/usr/bin/env python
2
3 import gzip
4 import json
5 import optparse
6 import os
7 import os.path
8 import re
9 import shutil
10 import sys
11 import urllib
12 import zipfile
13
14 from pysam import ctabix
15
16 """
17 # Install dbNSFP databases
18 # from DbNsfp site
19 # Download dbNSFP database
20 $ wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv2.4.zip
21 # Uncompress
22 $ unzip dbNSFP2.4.zip
23 # Create a single file version
24 $ (head -n 1 dbNSFP2.4_variant.chr1 ; cat dbNSFP2.4_variant.chr* | grep -v "^#") > dbNSFP2.4.txt
25 # Compress using block-gzip algorithm
26 bgzip dbNSFP2.4.txt
27 # Create tabix index
28 tabix -s 1 -b 2 -e 2 dbNSFP2.4.txt.gz
29
30 data_table:
31
32 <table name="snpsift_dbnsfps" comment_char="#">
33 <columns>key, build, name, value, annotations</columns>
34 <file path="tool-data/snpsift_dbnsfps.loc" />
35 </table>
36
37 #id build description path annotations
38 #GRCh37_dbNSFP2.4 GRCh37 GRCh37 dbNSFP2.4 /depot/snpeff/dbNSFP2.4.gz SIFT_pred,Uniprot_acc
39 #GRCh38_dbNSFP2.7 GRCh38 GRCh38 dbNSFP2.7 /depot/snpeff/dbNSFP2.7.gz SIFT_pred,Uniprot_acc
40 """
41
42 data_table = 'snpsift_dbnsfps'
43 softgenetics_url = 'ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/'
44 dbNSFP_file_pat = '(dbNSFP(.*)_variant|dbscSNV(.*)).chr(.*)'
45 tokenize = re.compile(r'(\d+)|(\D+)').findall
46 dbNSFP_name_pat = 'dbNSFP(v|_light)?(\d*).*?'
47
48
49 def stop_err(msg):
50 sys.stderr.write(msg)
51 sys.exit(1)
52
53
54 def get_nsfp_genome_version(name):
55 genome_version = 'hg19'
56 dbNSFP_name_pat = '(dbscSNV|dbNSFP(v|_light)?)(\d*).*?'
57 m = re.match(dbNSFP_name_pat, name)
58 if m:
59 (base, mid, ver) = m.groups()
60 if base == 'dbscSNV':
61 genome_version = 'hg19'
62 else:
63 genome_version = 'hg38' if ver == '3' else 'hg19' if ver == '2' else 'hg18'
64 return genome_version
65
66
67 def get_annotations(gzip_path):
68 annotations = None
69 fh = None
70 try:
71 fh = gzip.open(gzip_path, 'r')
72 buf = fh.read(10000)
73 lines = buf.splitlines()
74 headers = lines[0].split('\t')
75 annotations = ','.join([x.strip() for x in headers[4:]])
76 except Exception as e:
77 stop_err('Error Reading annotations %s : %s' % (gzip_path, e))
78 finally:
79 if fh:
80 fh.close()
81 return annotations
82
83
84 def tabix_file(input_fname, output_fname):
85 print >> sys.stdout, "tabix_file: %s -> %s" % (input_fname, output_fname)
86 ctabix.tabix_compress(input_fname, output_fname, force=True)
87 # Column indices are 0-based.
88 ctabix.tabix_index(output_fname, seq_col=0, start_col=1, end_col=1)
89
90
91 def natural_sortkey(string):
92 return tuple(int(num) if num else alpha for num, alpha in tokenize(string))
93
94
95 def download_dbnsfp_database(url, output_file):
96 dbnsfp_tsv = None
97 file_path = 'downloaded_file'
98 urllib.urlretrieve(url, file_path)
99 with zipfile.ZipFile(file_path, 'r') as my_zip:
100 dbnsfp_tsv = output_file if output_file else 'dbnsfp_tsv'
101 wtr = open(dbnsfp_tsv, 'w')
102 allfiles = [info.filename for info in my_zip.infolist()]
103 files = [f for f in allfiles if re.match(dbNSFP_file_pat, f)]
104 files = sorted(files, key=natural_sortkey)
105 for j, file in enumerate(files):
106 fh = my_zip.open(file, 'rU')
107 for i, line in enumerate(fh):
108 if j > 0 and i == 0:
109 continue
110 wtr.write(line)
111 return dbnsfp_tsv
112
113
114 def main():
115 # Parse Command Line
116 parser = optparse.OptionParser()
117 parser.add_option('-g', '--dbkey', dest='dbkey', action='store', type="string", default=None, help='dbkey genome version')
118 parser.add_option('-n', '--db_name', dest='db_name', action='store', type="string", default=None, help='A name for a history snpsiftdbnsfp dataset')
119 parser.add_option('-s', '--softgenetics', dest='softgenetics', action='store', type="string", default=None, help='A name for softgenetics dbNSFP file')
120 parser.add_option('-H', '--snpsiftdbnsfp', dest='snpsiftdbnsfp', action='store', type="string", default=None, help='A history snpsiftdbnsfp dataset')
121 parser.add_option('-T', '--dbnsfp_tabular', dest='dbnsfp_tabular', action='store', type="string", default=None, help='A history dbnsfp_tabular dataset')
122 (options, args) = parser.parse_args()
123
124 filename = args[0]
125 params = json.loads(open(filename).read())
126 target_directory = params['output_data'][0]['extra_files_path']
127 if not os.path.exists(target_directory):
128 os.mkdir(target_directory)
129 data_manager_dict = {}
130 genome_version = options.dbkey if options.dbkey else 'unknown'
131 dbnsfp_tsv = None
132 db_name = None
133 bzip_path = None
134 if options.softgenetics:
135 dbnsfp_url = softgenetics_url + options.softgenetics
136 db_name = options.db_name if options.db_name else re.sub('\.zip$', '', options.softgenetics)
137 genome_version = get_nsfp_genome_version(options.softgenetics)
138 tsv = db_name + '.tsv'
139 dbnsfp_tsv = download_dbnsfp_database(dbnsfp_url, tsv)
140 elif options.dbnsfp_tabular:
141 db_name = options.db_name
142 dbnsfp_tsv = options.dbnsfp_tabular
143 elif options.snpsiftdbnsfp:
144 (dirpath, bgzip_name) = os.path.split(options.snpsiftdbnsfp)
145 idxpath = options.snpsiftdbnsfp + '.tbi'
146 shutil.copy(options.snpsiftdbnsfp, target_directory)
147 shutil.copy(idxpath, target_directory)
148 bzip_path = os.path.join(target_directory, bgzip_name)
149 db_name = re.sub('(.txt)?.gz$', '', bgzip_name)
150 else:
151 stop_err('Either --softgenetics or --dbnsfp_tabular required')
152 if dbnsfp_tsv:
153 bgzip_name = '%s.txt.gz' % db_name
154 bzip_path = os.path.join(target_directory, bgzip_name)
155 tabix_file(dbnsfp_tsv, bzip_path)
156 annotations = get_annotations(bzip_path)
157 # Create the SnpSift dbNSFP Reference Data
158 data_table_entry = dict(key='%s_%s' % (genome_version, db_name), build=genome_version, name='%s %s' % (genome_version, db_name), value=bgzip_name, annotations=annotations)
159 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
160 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
161 data_manager_dict['data_tables'][data_table].append(data_table_entry)
162
163 # save info to json file
164 open(filename, 'wb').write(json.dumps(data_manager_dict))
165
166 if __name__ == "__main__":
167 main()