comparison data_manager/data_manager_snpsift_dbnsfp.py @ 7:fe8a9ab8daf9 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_snpsift_dbnsfp commit 02d2967f77e3fa5a18aea63dc84aa9ab418dc165"
author iuc
date Sun, 22 Nov 2020 12:54:01 +0000
parents e57f0b0bc73b
children
comparison
equal deleted inserted replaced
6:e57f0b0bc73b 7:fe8a9ab8daf9
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 from __future__ import print_function
2 3
3 import gzip 4 import gzip
4 import json 5 import json
5 import optparse 6 import optparse
6 import os 7 import os
44 dbNSFP_file_pat = '(dbNSFP(.*)_variant|dbscSNV(.*)).chr(.*)' 45 dbNSFP_file_pat = '(dbNSFP(.*)_variant|dbscSNV(.*)).chr(.*)'
45 tokenize = re.compile(r'(\d+)|(\D+)').findall 46 tokenize = re.compile(r'(\d+)|(\D+)').findall
46 dbNSFP_name_pat = r'dbNSFP(v|_light)?(\d*).*?' 47 dbNSFP_name_pat = r'dbNSFP(v|_light)?(\d*).*?'
47 48
48 49
49 def stop_err(msg):
50 sys.stderr.write(msg)
51 sys.exit(1)
52
53
54 def get_nsfp_genome_version(name): 50 def get_nsfp_genome_version(name):
55 genome_version = 'hg19' 51 genome_version = 'hg19'
56 dbNSFP_name_pat = r'(dbscSNV|dbNSFP(v|_light)?)(\d*).*?' 52 dbNSFP_name_pat = r'(dbscSNV|dbNSFP(v|_light)?)(\d*).*?'
57 m = re.match(dbNSFP_name_pat, name) 53 m = re.match(dbNSFP_name_pat, name)
58 if m: 54 if m:
64 return genome_version 60 return genome_version
65 61
66 62
67 def get_annotations(gzip_path): 63 def get_annotations(gzip_path):
68 annotations = None 64 annotations = None
69 fh = None
70 try: 65 try:
71 fh = gzip.open(gzip_path, 'r') 66 with gzip.open(gzip_path, 'r') as fh:
72 buf = fh.read(10000) 67 buf = fh.read(10000)
73 lines = buf.splitlines() 68 lines = buf.splitlines()
74 headers = lines[0].split('\t') 69 headers = lines[0].split('\t')
75 annotations = ','.join([x.strip() for x in headers[4:]]) 70 annotations = ','.join([x.strip() for x in headers[4:]])
76 except Exception as e: 71 except Exception as e:
77 stop_err('Error Reading annotations %s : %s' % (gzip_path, e)) 72 sys.exit('Error Reading annotations %s : %s' % (gzip_path, e))
78 finally:
79 if fh:
80 fh.close()
81 return annotations 73 return annotations
82 74
83 75
84 def tabix_file(input_fname, output_fname): 76 def tabix_file(input_fname, output_fname):
85 print >> sys.stdout, "tabix_file: %s -> %s" % (input_fname, output_fname) 77 print("tabix_file: %s -> %s" % (input_fname, output_fname))
86 ctabix.tabix_compress(input_fname, output_fname, force=True) 78 ctabix.tabix_compress(input_fname, output_fname, force=True)
87 # Column indices are 0-based. 79 # Column indices are 0-based.
88 ctabix.tabix_index(output_fname, seq_col=0, start_col=1, end_col=1) 80 ctabix.tabix_index(output_fname, seq_col=0, start_col=1, end_col=1)
89 81
90 82
117 pos = int(line.split('\t')[1]) 109 pos = int(line.split('\t')[1])
118 if lastpos and pos < lastpos: 110 if lastpos and pos < lastpos:
119 tfh.close() 111 tfh.close()
120 tempfiles.append(file + "_%d" % len(tempfiles)) 112 tempfiles.append(file + "_%d" % len(tempfiles))
121 tfh = open(tempfiles[-1], 'w') 113 tfh = open(tempfiles[-1], 'w')
122 print >> sys.stderr, "%s [%d] pos: %d < %d" % (file, i, pos, lastpos) 114 print("%s [%d] pos: %d < %d" % (file, i, pos, lastpos), file=sys.stderr)
123 lastpos = pos 115 lastpos = pos
124 tfh.write(line) 116 tfh.write(line)
125 tfh.close() 117 tfh.close()
126 if len(tempfiles) == 1: 118 if len(tempfiles) == 1:
127 with open(tempfiles[0], 'r') as tfh: 119 with open(tempfiles[0], 'r') as tfh:
154 parser.add_option('-H', '--snpsiftdbnsfp', dest='snpsiftdbnsfp', action='store', type="string", default=None, help='A history snpsiftdbnsfp dataset') 146 parser.add_option('-H', '--snpsiftdbnsfp', dest='snpsiftdbnsfp', action='store', type="string", default=None, help='A history snpsiftdbnsfp dataset')
155 parser.add_option('-T', '--dbnsfp_tabular', dest='dbnsfp_tabular', action='store', type="string", default=None, help='A history dbnsfp_tabular dataset') 147 parser.add_option('-T', '--dbnsfp_tabular', dest='dbnsfp_tabular', action='store', type="string", default=None, help='A history dbnsfp_tabular dataset')
156 (options, args) = parser.parse_args() 148 (options, args) = parser.parse_args()
157 149
158 filename = args[0] 150 filename = args[0]
159 params = json.loads(open(filename).read()) 151 with open(filename) as fh:
152 params = json.load(fh)
160 target_directory = params['output_data'][0]['extra_files_path'] 153 target_directory = params['output_data'][0]['extra_files_path']
161 if not os.path.exists(target_directory): 154 if not os.path.exists(target_directory):
162 os.mkdir(target_directory) 155 os.mkdir(target_directory)
163 data_manager_dict = {} 156 data_manager_dict = {}
164 genome_version = options.dbkey if options.dbkey else 'unknown' 157 genome_version = options.dbkey if options.dbkey else 'unknown'
180 shutil.copy(options.snpsiftdbnsfp, target_directory) 173 shutil.copy(options.snpsiftdbnsfp, target_directory)
181 shutil.copy(idxpath, target_directory) 174 shutil.copy(idxpath, target_directory)
182 bzip_path = os.path.join(target_directory, bgzip_name) 175 bzip_path = os.path.join(target_directory, bgzip_name)
183 db_name = re.sub('(.txt)?.gz$', '', bgzip_name) 176 db_name = re.sub('(.txt)?.gz$', '', bgzip_name)
184 else: 177 else:
185 stop_err('Either --softgenetics or --dbnsfp_tabular required') 178 sys.exit('Either --softgenetics or --dbnsfp_tabular required')
186 if dbnsfp_tsv: 179 if dbnsfp_tsv:
187 bgzip_name = '%s.txt.gz' % db_name 180 bgzip_name = '%s.txt.gz' % db_name
188 bzip_path = os.path.join(target_directory, bgzip_name) 181 bzip_path = os.path.join(target_directory, bgzip_name)
189 tabix_file(dbnsfp_tsv, bzip_path) 182 tabix_file(dbnsfp_tsv, bzip_path)
190 annotations = get_annotations(bzip_path) 183 annotations = get_annotations(bzip_path)
193 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {}) 186 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
194 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, []) 187 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
195 data_manager_dict['data_tables'][data_table].append(data_table_entry) 188 data_manager_dict['data_tables'][data_table].append(data_table_entry)
196 189
197 # save info to json file 190 # save info to json file
198 open(filename, 'w').write(json.dumps(data_manager_dict, sort_keys=True)) 191 with open(filename, 'w') as fh:
192 json.dump(data_manager_dict, fh, sort_keys=True)
199 193
200 194
201 if __name__ == "__main__": 195 if __name__ == "__main__":
202 main() 196 main()