Mercurial > repos > iuc > data_manager_snpeff
comparison data_manager/data_manager_snpEff_download.py @ 1:85a23e2dd92b draft
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/data_managers/data_manager_snpeff commit 88c982c5adcd32b11d98428fc554a4fdfcc19584
| author | iuc |
|---|---|
| date | Tue, 07 Jun 2016 10:11:50 -0400 |
| parents | 9ac823a8b328 |
| children | 847b0f43c0e5 |
comparison
equal
deleted
inserted
replaced
| 0:9ac823a8b328 | 1:85a23e2dd92b |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 import gzip |
| 3 import sys | 3 import json |
| 4 import optparse | |
| 4 import os | 5 import os |
| 5 import re | 6 import re |
| 6 import tempfile | |
| 7 import subprocess | 7 import subprocess |
| 8 import fileinput | 8 import sys |
| 9 import shutil | 9 |
| 10 import optparse | |
| 11 import urllib2 | |
| 12 import gzip | |
| 13 from ftplib import FTP | |
| 14 import tarfile | |
| 15 | |
| 16 from galaxy.util.json import from_json_string, to_json_string | |
| 17 | 10 |
| 18 def stop_err(msg): | 11 def stop_err(msg): |
| 19 sys.stderr.write(msg) | 12 sys.stderr.write(msg) |
| 20 sys.exit(1) | 13 sys.exit(1) |
| 21 | 14 |
| 22 | 15 |
| 23 def fetch_databases(jar_path,genome_list=None): | 16 def fetch_databases(jar_path, genome_list=None): |
| 24 snpDBs = dict() | 17 snpDBs = dict() |
| 25 (snpEff_dir,snpEff_jar) = os.path.split(jar_path) | 18 (snpEff_dir, snpEff_jar) = os.path.split(jar_path) |
| 26 databases_path = 'databases.out' | 19 databases_path = 'databases.out' |
| 27 databases_output = open(databases_path,'w') | 20 databases_output = open(databases_path, 'w') |
| 28 args = [ 'java','-jar', ] | 21 args = [ 'java', '-jar' ] |
| 29 args.append( snpEff_jar ) | 22 args.append( snpEff_jar ) |
| 30 args.append( 'databases' ) | 23 args.append( 'databases' ) |
| 31 # tmp_stderr = tempfile.NamedTemporaryFile( prefix = "tmp-data-manager-snpEff-stderr" ) | 24 # tmp_stderr = tempfile.NamedTemporaryFile( prefix = "tmp-data-manager-snpEff-stderr" ) |
| 32 # databases_output = open(databases_path) | 25 # databases_output = open(databases_path) |
| 33 # proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir, stdout=databases_output.fileno(), stderr=tmp_stderr.fileno() ) | 26 # proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir, stdout=databases_output.fileno(), stderr=tmp_stderr.fileno() ) |
| 35 return_code = proc.wait() | 28 return_code = proc.wait() |
| 36 if return_code: | 29 if return_code: |
| 37 sys.exit( return_code ) | 30 sys.exit( return_code ) |
| 38 databases_output.close() | 31 databases_output.close() |
| 39 try: | 32 try: |
| 40 fh = open(databases_path,'r') | 33 fh = open(databases_path, 'r') |
| 41 for i,line in enumerate(fh): | 34 for i, line in enumerate(fh): |
| 42 fields = line.split('\t') | 35 fields = line.split('\t') |
| 43 if len(fields) >= 2: | 36 if len(fields) >= 2: |
| 44 genome_version = fields[0].strip() | 37 genome_version = fields[0].strip() |
| 45 if genome_list and genome_version not in genome_list: | 38 if genome_list and genome_version not in genome_list: |
| 46 continue | 39 continue |
| 47 if genome_version.startswith("Genome") or genome_version.startswith("-"): | 40 if genome_version.startswith("Genome") or genome_version.startswith("-"): |
| 48 continue | 41 continue |
| 49 description = fields[1].strip() | 42 description = fields[1].strip() |
| 50 snpDBs[genome_version] = description; | 43 snpDBs[genome_version] = description |
| 51 except Exception, e: | 44 except Exception as e: |
| 52 stop_err( 'Error parsing %s %s\n' % (config,str( e )) ) | 45 stop_err( 'Error parsing %s %s\n' % (databases_path, str( e )) ) |
| 53 else: | 46 else: |
| 54 fh.close() | 47 fh.close() |
| 55 return snpDBs | 48 return snpDBs |
| 56 | 49 |
| 57 def getOrganismNames(jar_path,genomes,organisms) : | 50 |
| 51 def getOrganismNames(jar_path, genomes, organisms): | |
| 58 genome_list = genomes.split(',') | 52 genome_list = genomes.split(',') |
| 59 organism_list = organisms.split(',') if organisms else [] | 53 organism_list = organisms.split(',') if organisms else [] |
| 60 if len(genome_list) != len(organism_list): | 54 if len(genome_list) != len(organism_list): |
| 61 descriptions = [] | 55 descriptions = [] |
| 62 snpDBdict = fetch_databases(jar_path,genome_list=genome_list); | 56 snpDBdict = fetch_databases(jar_path, genome_list=genome_list) |
| 63 for genome in snpDBdict: | 57 for genome in snpDBdict: |
| 64 descriptions.append(snpDBdict[genome] if genome in snpDBdict else genome) | 58 descriptions.append(snpDBdict[genome] if genome in snpDBdict else genome) |
| 65 return ','.join(descriptions) | 59 return ','.join(descriptions) |
| 66 return organisms | 60 return organisms |
| 61 | |
| 67 | 62 |
| 68 def getSnpeffVersion(jar_path): | 63 def getSnpeffVersion(jar_path): |
| 69 snpeff_version = 'SnpEff ?.?' | 64 snpeff_version = 'SnpEff ?.?' |
| 70 (snpEff_dir,snpEff_jar) = os.path.split(jar_path) | 65 (snpEff_dir, snpEff_jar) = os.path.split(jar_path) |
| 71 stderr_path = 'snpeff.err' | 66 stderr_path = 'snpeff.err' |
| 72 stderr_fh = open(stderr_path,'w') | 67 stderr_fh = open(stderr_path, 'w') |
| 73 args = [ 'java','-jar', ] | 68 args = [ 'java', '-jar' ] |
| 74 args.append( snpEff_jar ) | 69 args.append( snpEff_jar ) |
| 75 args.append( '-h' ) | 70 args.append( '-h' ) |
| 76 proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir, stderr=stderr_fh.fileno() ) | 71 proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir, stderr=stderr_fh.fileno() ) |
| 77 return_code = proc.wait() | 72 return_code = proc.wait() |
| 78 if return_code != 255: | 73 if return_code != 255: |
| 79 sys.exit( return_code ) | 74 sys.exit( return_code ) |
| 80 stderr_fh.close() | 75 stderr_fh.close() |
| 81 fh = open(stderr_path,'r') | 76 fh = open(stderr_path, 'r') |
| 82 for line in fh: | 77 for line in fh: |
| 83 m = re.match('^[Ss]npEff version (SnpEff)\s*(\d+\.\d+).*$',line) | 78 m = re.match('^[Ss]npEff version (SnpEff)\s*(\d+\.\d+).*$', line) |
| 84 if m: | 79 if m: |
| 85 snpeff_version = m.groups()[0] + m.groups()[1] | 80 snpeff_version = m.groups()[0] + m.groups()[1] |
| 86 break | 81 break |
| 87 fh.close() | 82 fh.close() |
| 88 return snpeff_version | 83 return snpeff_version |
| 89 | 84 |
| 85 | |
| 90 # Starting with SnpEff 4.1 the .bin files contain the SnpEff version: | 86 # Starting with SnpEff 4.1 the .bin files contain the SnpEff version: |
| 91 # Example - the first 3 lines of GRCh37.75/snpEffectPredictor.bin (uncompressed): | 87 # Example - the first 3 lines of GRCh37.75/snpEffectPredictor.bin (uncompressed): |
| 92 """ | 88 # |
| 93 SnpEff 4.1 | 89 # SnpEff 4.1 |
| 94 CHROMOSOME 2 1 0 179197 GL000219.1 false | 90 # CHROMOSOME 2 1 0 179197 GL000219.1 false |
| 95 CHROMOSOME 3 1 0 81347269 HSCHR17_1 false | 91 # CHROMOSOME 3 1 0 81347269 HSCHR17_1 false |
| 96 """ | |
| 97 def getSnpeffVersionFromFile(path): | 92 def getSnpeffVersionFromFile(path): |
| 98 snpeff_version = None | 93 snpeff_version = None |
| 99 try: | 94 try: |
| 100 fh = gzip.open(path, 'rb') | 95 fh = gzip.open(path, 'rb') |
| 101 buf = fh.read(100) | 96 buf = fh.read(100) |
| 102 lines = buf.splitlines() | 97 lines = buf.splitlines() |
| 103 m = re.match('^(SnpEff)\s+(\d+\.\d+).*$',lines[0].strip()) | 98 m = re.match('^(SnpEff)\s+(\d+\.\d+).*$', lines[0].strip()) |
| 104 if m: | 99 if m: |
| 105 snpeff_version = m.groups()[0] + m.groups()[1] | 100 snpeff_version = m.groups()[0] + m.groups()[1] |
| 106 fh.close() | 101 fh.close() |
| 107 except Exception, e: | 102 except Exception as e: |
| 108 stop_err( 'Error parsing SnpEff version from: %s %s\n' % (path,str( e )) ) | 103 stop_err( 'Error parsing SnpEff version from: %s %s\n' % (path, str( e )) ) |
| 109 return snpeff_version | 104 return snpeff_version |
| 110 | 105 |
| 111 """ | 106 |
| 112 # Download human database 'hg19' | 107 # Download human database 'hg19' |
| 113 java -jar snpEff.jar download -v hg19 | 108 # java -jar snpEff.jar download -v hg19 |
| 114 | 109 # |
| 115 <command>java -jar \$SNPEFF_JAR_PATH/snpEff.jar download -c \$JAVA_JAR_PATH/snpEff.config $genomeVersion > $logfile </command> | 110 # <command>java -jar \$SNPEFF_JAR_PATH/snpEff.jar download -c \$JAVA_JAR_PATH/snpEff.config $genomeVersion > $logfile </command> |
| 116 | 111 # |
| 117 snpEffectPredictor.bin | 112 # snpEffectPredictor.bin |
| 118 regulation_HeLa-S3.bin | 113 # regulation_HeLa-S3.bin |
| 119 regulation_pattern = 'regulation_(.+).bin' | 114 # regulation_pattern = 'regulation_(.+).bin' |
| 120 """ | |
| 121 def download_database(data_manager_dict, target_directory, jar_path, config, genome_version, organism): | 115 def download_database(data_manager_dict, target_directory, jar_path, config, genome_version, organism): |
| 122 ## get data_dir from config | 116 # get data_dir from config |
| 123 ##--- | 117 # --- |
| 124 ## Databases are stored here | 118 # Databases are stored here |
| 125 ## E.g.: Information for 'hg19' is stored in data_dir/hg19/ | 119 # E.g.: Information for 'hg19' is stored in data_dir/hg19/ |
| 126 ## | 120 # |
| 127 ## Note: Since version 2.1 you can use tilde ('~') as first character to refer to your home directory | 121 # Note: Since version 2.1 you can use tilde ('~') as first character to refer to your home directory |
| 128 ##--- | 122 # --- |
| 129 #data_dir = ~/snpEff/data/ | 123 # data_dir = ~/snpEff/data/ |
| 130 data_dir = target_directory | 124 data_dir = target_directory |
| 131 (snpEff_dir,snpEff_jar) = os.path.split(jar_path) | 125 (snpEff_dir, snpEff_jar) = os.path.split(jar_path) |
| 132 args = [ 'java','-jar' ] | 126 args = [ 'java', '-jar' ] |
| 133 args.append( jar_path ) | 127 args.append( jar_path ) |
| 134 args.append( 'download' ) | 128 args.append( 'download' ) |
| 135 args.append( '-c' ) | 129 args.append( '-c' ) |
| 136 args.append( config ) | 130 args.append( config ) |
| 137 args.append( '-dataDir' ) | 131 args.append( '-dataDir' ) |
| 140 args.append( genome_version ) | 134 args.append( genome_version ) |
| 141 proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir ) | 135 proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir ) |
| 142 return_code = proc.wait() | 136 return_code = proc.wait() |
| 143 if return_code: | 137 if return_code: |
| 144 sys.exit( return_code ) | 138 sys.exit( return_code ) |
| 145 ## search data_dir/genome_version for files | 139 # search data_dir/genome_version for files |
| 146 regulation_pattern = 'regulation_(.+).bin' | 140 regulation_pattern = 'regulation_(.+).bin' |
| 147 # annotation files that are included in snpEff by a flag | 141 # annotation files that are included in snpEff by a flag |
| 148 annotations_dict = {'nextProt.bin' : '-nextprot','motif.bin': '-motif'} | 142 annotations_dict = {'nextProt.bin': '-nextprot', 'motif.bin': '-motif'} |
| 149 genome_path = os.path.join(data_dir,genome_version) | 143 genome_path = os.path.join(data_dir, genome_version) |
| 150 snpeff_version = getSnpeffVersion(jar_path) | 144 snpeff_version = getSnpeffVersion(jar_path) |
| 151 key = snpeff_version + '_' + genome_version | 145 key = snpeff_version + '_' + genome_version |
| 152 if os.path.isdir(genome_path): | 146 if os.path.isdir(genome_path): |
| 153 for root, dirs, files in os.walk(genome_path): | 147 for root, dirs, files in os.walk(genome_path): |
| 154 for fname in files: | 148 for fname in files: |
| 155 if fname.startswith('snpEffectPredictor'): | 149 if fname.startswith('snpEffectPredictor'): |
| 156 # if snpEffectPredictor.bin download succeeded | 150 # if snpEffectPredictor.bin download succeeded |
| 157 name = genome_version + (' : ' + organism if organism else '') | 151 name = genome_version + (' : ' + organism if organism else '') |
| 158 # version = getSnpeffVersionFromFile(os.path.join(root,fname)) | 152 # version = getSnpeffVersionFromFile(os.path.join(root,fname)) |
| 159 data_table_entry = dict(key=key,version=snpeff_version,value=genome_version, name=name, path=data_dir) | 153 data_table_entry = dict(key=key, version=snpeff_version, value=genome_version, name=name, path=data_dir) |
| 160 _add_data_table_entry( data_manager_dict, 'snpeffv_genomedb', data_table_entry ) | 154 _add_data_table_entry( data_manager_dict, 'snpeffv_genomedb', data_table_entry ) |
| 161 else: | 155 else: |
| 162 m = re.match(regulation_pattern,fname) | 156 m = re.match(regulation_pattern, fname) |
| 163 if m: | 157 if m: |
| 164 name = m.groups()[0] | 158 name = m.groups()[0] |
| 165 data_table_entry = dict(key=key,version=snpeff_version,genome=genome_version,value=name, name=name) | 159 data_table_entry = dict(key=key, version=snpeff_version, genome=genome_version, value=name, name=name) |
| 166 _add_data_table_entry( data_manager_dict, 'snpeffv_regulationdb', data_table_entry ) | 160 _add_data_table_entry( data_manager_dict, 'snpeffv_regulationdb', data_table_entry ) |
| 167 elif fname in annotations_dict: | 161 elif fname in annotations_dict: |
| 168 value = annotations_dict[fname] | 162 value = annotations_dict[fname] |
| 169 name = value.lstrip('-') | 163 name = value.lstrip('-') |
| 170 data_table_entry = dict(key=key,version=snpeff_version,genome=genome_version,value=value, name=name) | 164 data_table_entry = dict(key=key, version=snpeff_version, genome=genome_version, value=value, name=name) |
| 171 _add_data_table_entry( data_manager_dict, 'snpeffv_annotations', data_table_entry ) | 165 _add_data_table_entry( data_manager_dict, 'snpeffv_annotations', data_table_entry ) |
| 172 return data_manager_dict | 166 return data_manager_dict |
| 167 | |
| 173 | 168 |
| 174 def _add_data_table_entry( data_manager_dict, data_table, data_table_entry ): | 169 def _add_data_table_entry( data_manager_dict, data_table, data_table_entry ): |
| 175 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) | 170 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) |
| 176 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get( data_table, [] ) | 171 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get( data_table, [] ) |
| 177 data_manager_dict['data_tables'][data_table].append( data_table_entry ) | 172 data_manager_dict['data_tables'][data_table].append( data_table_entry ) |
| 178 return data_manager_dict | 173 return data_manager_dict |
| 179 | 174 |
| 175 | |
| 180 def main(): | 176 def main(): |
| 181 #Parse Command Line | 177 # Parse Command Line |
| 182 parser = optparse.OptionParser() | 178 parser = optparse.OptionParser() |
| 183 parser.add_option( '-j', '--jar_path', dest='jar_path', action='store', type="string", default=None, help='snpEff.jar path' ) | 179 parser.add_option( '-j', '--jar_path', dest='jar_path', action='store', type="string", default=None, help='snpEff.jar path' ) |
| 184 parser.add_option( '-c', '--config', dest='config', action='store', type="string", default=None, help='snpEff.config path' ) | 180 parser.add_option( '-c', '--config', dest='config', action='store', type="string", default=None, help='snpEff.config path' ) |
| 185 parser.add_option( '-g', '--genome_version', dest='genome_version', action='store', type="string", default=None, help='genome_version' ) | 181 parser.add_option( '-g', '--genome_version', dest='genome_version', action='store', type="string", default=None, help='genome_version' ) |
| 186 parser.add_option( '-o', '--organism', dest='organism', action='store', type="string", default=None, help='organism name' ) | 182 parser.add_option( '-o', '--organism', dest='organism', action='store', type="string", default=None, help='organism name' ) |
| 187 (options, args) = parser.parse_args() | 183 (options, args) = parser.parse_args() |
| 188 | 184 |
| 189 filename = args[0] | 185 filename = args[0] |
| 190 | 186 |
| 191 params = from_json_string( open( filename ).read() ) | 187 params = json.loads( open( filename ).read() ) |
| 192 target_directory = params[ 'output_data' ][0]['extra_files_path'] | 188 target_directory = params[ 'output_data' ][0]['extra_files_path'] |
| 193 os.mkdir( target_directory ) | 189 os.mkdir( target_directory ) |
| 194 data_manager_dict = {} | 190 data_manager_dict = {} |
| 195 | 191 |
| 196 | 192 # Create SnpEff Reference Data |
| 197 #Create SnpEff Reference Data | 193 for genome_version, organism in zip(options.genome_version.split(','), getOrganismNames(options.jar_path, options.genome_version, options.organism).split(',')): |
| 198 for genome_version, organism in zip(options.genome_version.split(','), getOrganismNames(options.jar_path,options.genome_version,options.organism).split(',')): | |
| 199 download_database( data_manager_dict, target_directory, options.jar_path, options.config, genome_version, organism ) | 194 download_database( data_manager_dict, target_directory, options.jar_path, options.config, genome_version, organism ) |
| 200 | 195 |
| 201 #save info to json file | 196 # save info to json file |
| 202 open( filename, 'wb' ).write( to_json_string( data_manager_dict ) ) | 197 open( filename, 'wb' ).write( json.dumps( data_manager_dict ) ) |
| 203 | 198 |
| 204 if __name__ == "__main__": main() | 199 if __name__ == "__main__": |
| 205 | 200 main() |
