Mercurial > repos > iuc > data_manager_snpsift_dbnsfp
comparison data_manager/data_manager_snpsift_dbnsfp.py @ 7:fe8a9ab8daf9 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_snpsift_dbnsfp commit 02d2967f77e3fa5a18aea63dc84aa9ab418dc165"
| author | iuc |
|---|---|
| date | Sun, 22 Nov 2020 12:54:01 +0000 |
| parents | e57f0b0bc73b |
| children |
Diff legend: equal | deleted | inserted | replaced
| 6:e57f0b0bc73b | 7:fe8a9ab8daf9 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 from __future__ import print_function | |
| 2 | 3 |
| 3 import gzip | 4 import gzip |
| 4 import json | 5 import json |
| 5 import optparse | 6 import optparse |
| 6 import os | 7 import os |
| 44 dbNSFP_file_pat = '(dbNSFP(.*)_variant|dbscSNV(.*)).chr(.*)' | 45 dbNSFP_file_pat = '(dbNSFP(.*)_variant|dbscSNV(.*)).chr(.*)' |
| 45 tokenize = re.compile(r'(\d+)|(\D+)').findall | 46 tokenize = re.compile(r'(\d+)|(\D+)').findall |
| 46 dbNSFP_name_pat = r'dbNSFP(v|_light)?(\d*).*?' | 47 dbNSFP_name_pat = r'dbNSFP(v|_light)?(\d*).*?' |
| 47 | 48 |
| 48 | 49 |
| 49 def stop_err(msg): | |
| 50 sys.stderr.write(msg) | |
| 51 sys.exit(1) | |
| 52 | |
| 53 | |
| 54 def get_nsfp_genome_version(name): | 50 def get_nsfp_genome_version(name): |
| 55 genome_version = 'hg19' | 51 genome_version = 'hg19' |
| 56 dbNSFP_name_pat = r'(dbscSNV|dbNSFP(v|_light)?)(\d*).*?' | 52 dbNSFP_name_pat = r'(dbscSNV|dbNSFP(v|_light)?)(\d*).*?' |
| 57 m = re.match(dbNSFP_name_pat, name) | 53 m = re.match(dbNSFP_name_pat, name) |
| 58 if m: | 54 if m: |
| 64 return genome_version | 60 return genome_version |
| 65 | 61 |
| 66 | 62 |
| 67 def get_annotations(gzip_path): | 63 def get_annotations(gzip_path): |
| 68 annotations = None | 64 annotations = None |
| 69 fh = None | |
| 70 try: | 65 try: |
| 71 fh = gzip.open(gzip_path, 'r') | 66 with gzip.open(gzip_path, 'r') as fh: |
| 72 buf = fh.read(10000) | 67 buf = fh.read(10000) |
| 73 lines = buf.splitlines() | 68 lines = buf.splitlines() |
| 74 headers = lines[0].split('\t') | 69 headers = lines[0].split('\t') |
| 75 annotations = ','.join([x.strip() for x in headers[4:]]) | 70 annotations = ','.join([x.strip() for x in headers[4:]]) |
| 76 except Exception as e: | 71 except Exception as e: |
| 77 stop_err('Error Reading annotations %s : %s' % (gzip_path, e)) | 72 sys.exit('Error Reading annotations %s : %s' % (gzip_path, e)) |
| 78 finally: | |
| 79 if fh: | |
| 80 fh.close() | |
| 81 return annotations | 73 return annotations |
| 82 | 74 |
| 83 | 75 |
| 84 def tabix_file(input_fname, output_fname): | 76 def tabix_file(input_fname, output_fname): |
| 85 print >> sys.stdout, "tabix_file: %s -> %s" % (input_fname, output_fname) | 77 print("tabix_file: %s -> %s" % (input_fname, output_fname)) |
| 86 ctabix.tabix_compress(input_fname, output_fname, force=True) | 78 ctabix.tabix_compress(input_fname, output_fname, force=True) |
| 87 # Column indices are 0-based. | 79 # Column indices are 0-based. |
| 88 ctabix.tabix_index(output_fname, seq_col=0, start_col=1, end_col=1) | 80 ctabix.tabix_index(output_fname, seq_col=0, start_col=1, end_col=1) |
| 89 | 81 |
| 90 | 82 |
| 117 pos = int(line.split('\t')[1]) | 109 pos = int(line.split('\t')[1]) |
| 118 if lastpos and pos < lastpos: | 110 if lastpos and pos < lastpos: |
| 119 tfh.close() | 111 tfh.close() |
| 120 tempfiles.append(file + "_%d" % len(tempfiles)) | 112 tempfiles.append(file + "_%d" % len(tempfiles)) |
| 121 tfh = open(tempfiles[-1], 'w') | 113 tfh = open(tempfiles[-1], 'w') |
| 122 print >> sys.stderr, "%s [%d] pos: %d < %d" % (file, i, pos, lastpos) | 114 print("%s [%d] pos: %d < %d" % (file, i, pos, lastpos), file=sys.stderr) |
| 123 lastpos = pos | 115 lastpos = pos |
| 124 tfh.write(line) | 116 tfh.write(line) |
| 125 tfh.close() | 117 tfh.close() |
| 126 if len(tempfiles) == 1: | 118 if len(tempfiles) == 1: |
| 127 with open(tempfiles[0], 'r') as tfh: | 119 with open(tempfiles[0], 'r') as tfh: |
| 154 parser.add_option('-H', '--snpsiftdbnsfp', dest='snpsiftdbnsfp', action='store', type="string", default=None, help='A history snpsiftdbnsfp dataset') | 146 parser.add_option('-H', '--snpsiftdbnsfp', dest='snpsiftdbnsfp', action='store', type="string", default=None, help='A history snpsiftdbnsfp dataset') |
| 155 parser.add_option('-T', '--dbnsfp_tabular', dest='dbnsfp_tabular', action='store', type="string", default=None, help='A history dbnsfp_tabular dataset') | 147 parser.add_option('-T', '--dbnsfp_tabular', dest='dbnsfp_tabular', action='store', type="string", default=None, help='A history dbnsfp_tabular dataset') |
| 156 (options, args) = parser.parse_args() | 148 (options, args) = parser.parse_args() |
| 157 | 149 |
| 158 filename = args[0] | 150 filename = args[0] |
| 159 params = json.loads(open(filename).read()) | 151 with open(filename) as fh: |
| 152 params = json.load(fh) | |
| 160 target_directory = params['output_data'][0]['extra_files_path'] | 153 target_directory = params['output_data'][0]['extra_files_path'] |
| 161 if not os.path.exists(target_directory): | 154 if not os.path.exists(target_directory): |
| 162 os.mkdir(target_directory) | 155 os.mkdir(target_directory) |
| 163 data_manager_dict = {} | 156 data_manager_dict = {} |
| 164 genome_version = options.dbkey if options.dbkey else 'unknown' | 157 genome_version = options.dbkey if options.dbkey else 'unknown' |
| 180 shutil.copy(options.snpsiftdbnsfp, target_directory) | 173 shutil.copy(options.snpsiftdbnsfp, target_directory) |
| 181 shutil.copy(idxpath, target_directory) | 174 shutil.copy(idxpath, target_directory) |
| 182 bzip_path = os.path.join(target_directory, bgzip_name) | 175 bzip_path = os.path.join(target_directory, bgzip_name) |
| 183 db_name = re.sub('(.txt)?.gz$', '', bgzip_name) | 176 db_name = re.sub('(.txt)?.gz$', '', bgzip_name) |
| 184 else: | 177 else: |
| 185 stop_err('Either --softgenetics or --dbnsfp_tabular required') | 178 sys.exit('Either --softgenetics or --dbnsfp_tabular required') |
| 186 if dbnsfp_tsv: | 179 if dbnsfp_tsv: |
| 187 bgzip_name = '%s.txt.gz' % db_name | 180 bgzip_name = '%s.txt.gz' % db_name |
| 188 bzip_path = os.path.join(target_directory, bgzip_name) | 181 bzip_path = os.path.join(target_directory, bgzip_name) |
| 189 tabix_file(dbnsfp_tsv, bzip_path) | 182 tabix_file(dbnsfp_tsv, bzip_path) |
| 190 annotations = get_annotations(bzip_path) | 183 annotations = get_annotations(bzip_path) |
| 193 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {}) | 186 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {}) |
| 194 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, []) | 187 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, []) |
| 195 data_manager_dict['data_tables'][data_table].append(data_table_entry) | 188 data_manager_dict['data_tables'][data_table].append(data_table_entry) |
| 196 | 189 |
| 197 # save info to json file | 190 # save info to json file |
| 198 open(filename, 'w').write(json.dumps(data_manager_dict, sort_keys=True)) | 191 with open(filename, 'w') as fh: |
| 192 json.dump(data_manager_dict, fh, sort_keys=True) | |
| 199 | 193 |
| 200 | 194 |
| 201 if __name__ == "__main__": | 195 if __name__ == "__main__": |
| 202 main() | 196 main() |
