Mercurial > repos > iuc > data_manager_snpeff
comparison data_manager/data_manager_snpEff_download.py @ 1:85a23e2dd92b draft
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/data_managers/data_manager_snpeff commit 88c982c5adcd32b11d98428fc554a4fdfcc19584
author | iuc |
---|---|
date | Tue, 07 Jun 2016 10:11:50 -0400 |
parents | 9ac823a8b328 |
children | 847b0f43c0e5 |
comparison
equal
deleted
inserted
replaced
0:9ac823a8b328 | 1:85a23e2dd92b |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 import gzip |
3 import sys | 3 import json |
4 import optparse | |
4 import os | 5 import os |
5 import re | 6 import re |
6 import tempfile | |
7 import subprocess | 7 import subprocess |
8 import fileinput | 8 import sys |
9 import shutil | 9 |
10 import optparse | |
11 import urllib2 | |
12 import gzip | |
13 from ftplib import FTP | |
14 import tarfile | |
15 | |
16 from galaxy.util.json import from_json_string, to_json_string | |
17 | 10 |
18 def stop_err(msg): | 11 def stop_err(msg): |
19 sys.stderr.write(msg) | 12 sys.stderr.write(msg) |
20 sys.exit(1) | 13 sys.exit(1) |
21 | 14 |
22 | 15 |
23 def fetch_databases(jar_path,genome_list=None): | 16 def fetch_databases(jar_path, genome_list=None): |
24 snpDBs = dict() | 17 snpDBs = dict() |
25 (snpEff_dir,snpEff_jar) = os.path.split(jar_path) | 18 (snpEff_dir, snpEff_jar) = os.path.split(jar_path) |
26 databases_path = 'databases.out' | 19 databases_path = 'databases.out' |
27 databases_output = open(databases_path,'w') | 20 databases_output = open(databases_path, 'w') |
28 args = [ 'java','-jar', ] | 21 args = [ 'java', '-jar' ] |
29 args.append( snpEff_jar ) | 22 args.append( snpEff_jar ) |
30 args.append( 'databases' ) | 23 args.append( 'databases' ) |
31 # tmp_stderr = tempfile.NamedTemporaryFile( prefix = "tmp-data-manager-snpEff-stderr" ) | 24 # tmp_stderr = tempfile.NamedTemporaryFile( prefix = "tmp-data-manager-snpEff-stderr" ) |
32 # databases_output = open(databases_path) | 25 # databases_output = open(databases_path) |
33 # proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir, stdout=databases_output.fileno(), stderr=tmp_stderr.fileno() ) | 26 # proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir, stdout=databases_output.fileno(), stderr=tmp_stderr.fileno() ) |
35 return_code = proc.wait() | 28 return_code = proc.wait() |
36 if return_code: | 29 if return_code: |
37 sys.exit( return_code ) | 30 sys.exit( return_code ) |
38 databases_output.close() | 31 databases_output.close() |
39 try: | 32 try: |
40 fh = open(databases_path,'r') | 33 fh = open(databases_path, 'r') |
41 for i,line in enumerate(fh): | 34 for i, line in enumerate(fh): |
42 fields = line.split('\t') | 35 fields = line.split('\t') |
43 if len(fields) >= 2: | 36 if len(fields) >= 2: |
44 genome_version = fields[0].strip() | 37 genome_version = fields[0].strip() |
45 if genome_list and genome_version not in genome_list: | 38 if genome_list and genome_version not in genome_list: |
46 continue | 39 continue |
47 if genome_version.startswith("Genome") or genome_version.startswith("-"): | 40 if genome_version.startswith("Genome") or genome_version.startswith("-"): |
48 continue | 41 continue |
49 description = fields[1].strip() | 42 description = fields[1].strip() |
50 snpDBs[genome_version] = description; | 43 snpDBs[genome_version] = description |
51 except Exception, e: | 44 except Exception as e: |
52 stop_err( 'Error parsing %s %s\n' % (config,str( e )) ) | 45 stop_err( 'Error parsing %s %s\n' % (databases_path, str( e )) ) |
53 else: | 46 else: |
54 fh.close() | 47 fh.close() |
55 return snpDBs | 48 return snpDBs |
56 | 49 |
57 def getOrganismNames(jar_path,genomes,organisms) : | 50 |
51 def getOrganismNames(jar_path, genomes, organisms): | |
58 genome_list = genomes.split(',') | 52 genome_list = genomes.split(',') |
59 organism_list = organisms.split(',') if organisms else [] | 53 organism_list = organisms.split(',') if organisms else [] |
60 if len(genome_list) != len(organism_list): | 54 if len(genome_list) != len(organism_list): |
61 descriptions = [] | 55 descriptions = [] |
62 snpDBdict = fetch_databases(jar_path,genome_list=genome_list); | 56 snpDBdict = fetch_databases(jar_path, genome_list=genome_list) |
63 for genome in snpDBdict: | 57 for genome in snpDBdict: |
64 descriptions.append(snpDBdict[genome] if genome in snpDBdict else genome) | 58 descriptions.append(snpDBdict[genome] if genome in snpDBdict else genome) |
65 return ','.join(descriptions) | 59 return ','.join(descriptions) |
66 return organisms | 60 return organisms |
61 | |
67 | 62 |
68 def getSnpeffVersion(jar_path): | 63 def getSnpeffVersion(jar_path): |
69 snpeff_version = 'SnpEff ?.?' | 64 snpeff_version = 'SnpEff ?.?' |
70 (snpEff_dir,snpEff_jar) = os.path.split(jar_path) | 65 (snpEff_dir, snpEff_jar) = os.path.split(jar_path) |
71 stderr_path = 'snpeff.err' | 66 stderr_path = 'snpeff.err' |
72 stderr_fh = open(stderr_path,'w') | 67 stderr_fh = open(stderr_path, 'w') |
73 args = [ 'java','-jar', ] | 68 args = [ 'java', '-jar' ] |
74 args.append( snpEff_jar ) | 69 args.append( snpEff_jar ) |
75 args.append( '-h' ) | 70 args.append( '-h' ) |
76 proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir, stderr=stderr_fh.fileno() ) | 71 proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir, stderr=stderr_fh.fileno() ) |
77 return_code = proc.wait() | 72 return_code = proc.wait() |
78 if return_code != 255: | 73 if return_code != 255: |
79 sys.exit( return_code ) | 74 sys.exit( return_code ) |
80 stderr_fh.close() | 75 stderr_fh.close() |
81 fh = open(stderr_path,'r') | 76 fh = open(stderr_path, 'r') |
82 for line in fh: | 77 for line in fh: |
83 m = re.match('^[Ss]npEff version (SnpEff)\s*(\d+\.\d+).*$',line) | 78 m = re.match('^[Ss]npEff version (SnpEff)\s*(\d+\.\d+).*$', line) |
84 if m: | 79 if m: |
85 snpeff_version = m.groups()[0] + m.groups()[1] | 80 snpeff_version = m.groups()[0] + m.groups()[1] |
86 break | 81 break |
87 fh.close() | 82 fh.close() |
88 return snpeff_version | 83 return snpeff_version |
89 | 84 |
85 | |
90 # Starting with SnpEff 4.1 the .bin files contain the SnpEff version: | 86 # Starting with SnpEff 4.1 the .bin files contain the SnpEff version: |
91 # Example - the first 3 lines of GRCh37.75/snpEffectPredictor.bin (uncompressed): | 87 # Example - the first 3 lines of GRCh37.75/snpEffectPredictor.bin (uncompressed): |
92 """ | 88 # |
93 SnpEff 4.1 | 89 # SnpEff 4.1 |
94 CHROMOSOME 2 1 0 179197 GL000219.1 false | 90 # CHROMOSOME 2 1 0 179197 GL000219.1 false |
95 CHROMOSOME 3 1 0 81347269 HSCHR17_1 false | 91 # CHROMOSOME 3 1 0 81347269 HSCHR17_1 false |
96 """ | |
97 def getSnpeffVersionFromFile(path): | 92 def getSnpeffVersionFromFile(path): |
98 snpeff_version = None | 93 snpeff_version = None |
99 try: | 94 try: |
100 fh = gzip.open(path, 'rb') | 95 fh = gzip.open(path, 'rb') |
101 buf = fh.read(100) | 96 buf = fh.read(100) |
102 lines = buf.splitlines() | 97 lines = buf.splitlines() |
103 m = re.match('^(SnpEff)\s+(\d+\.\d+).*$',lines[0].strip()) | 98 m = re.match('^(SnpEff)\s+(\d+\.\d+).*$', lines[0].strip()) |
104 if m: | 99 if m: |
105 snpeff_version = m.groups()[0] + m.groups()[1] | 100 snpeff_version = m.groups()[0] + m.groups()[1] |
106 fh.close() | 101 fh.close() |
107 except Exception, e: | 102 except Exception as e: |
108 stop_err( 'Error parsing SnpEff version from: %s %s\n' % (path,str( e )) ) | 103 stop_err( 'Error parsing SnpEff version from: %s %s\n' % (path, str( e )) ) |
109 return snpeff_version | 104 return snpeff_version |
110 | 105 |
111 """ | 106 |
112 # Download human database 'hg19' | 107 # Download human database 'hg19' |
113 java -jar snpEff.jar download -v hg19 | 108 # java -jar snpEff.jar download -v hg19 |
114 | 109 # |
115 <command>java -jar \$SNPEFF_JAR_PATH/snpEff.jar download -c \$JAVA_JAR_PATH/snpEff.config $genomeVersion > $logfile </command> | 110 # <command>java -jar \$SNPEFF_JAR_PATH/snpEff.jar download -c \$JAVA_JAR_PATH/snpEff.config $genomeVersion > $logfile </command> |
116 | 111 # |
117 snpEffectPredictor.bin | 112 # snpEffectPredictor.bin |
118 regulation_HeLa-S3.bin | 113 # regulation_HeLa-S3.bin |
119 regulation_pattern = 'regulation_(.+).bin' | 114 # regulation_pattern = 'regulation_(.+).bin' |
120 """ | |
121 def download_database(data_manager_dict, target_directory, jar_path, config, genome_version, organism): | 115 def download_database(data_manager_dict, target_directory, jar_path, config, genome_version, organism): |
122 ## get data_dir from config | 116 # get data_dir from config |
123 ##--- | 117 # --- |
124 ## Databases are stored here | 118 # Databases are stored here |
125 ## E.g.: Information for 'hg19' is stored in data_dir/hg19/ | 119 # E.g.: Information for 'hg19' is stored in data_dir/hg19/ |
126 ## | 120 # |
127 ## Note: Since version 2.1 you can use tilde ('~') as first character to refer to your home directory | 121 # Note: Since version 2.1 you can use tilde ('~') as first character to refer to your home directory |
128 ##--- | 122 # --- |
129 #data_dir = ~/snpEff/data/ | 123 # data_dir = ~/snpEff/data/ |
130 data_dir = target_directory | 124 data_dir = target_directory |
131 (snpEff_dir,snpEff_jar) = os.path.split(jar_path) | 125 (snpEff_dir, snpEff_jar) = os.path.split(jar_path) |
132 args = [ 'java','-jar' ] | 126 args = [ 'java', '-jar' ] |
133 args.append( jar_path ) | 127 args.append( jar_path ) |
134 args.append( 'download' ) | 128 args.append( 'download' ) |
135 args.append( '-c' ) | 129 args.append( '-c' ) |
136 args.append( config ) | 130 args.append( config ) |
137 args.append( '-dataDir' ) | 131 args.append( '-dataDir' ) |
140 args.append( genome_version ) | 134 args.append( genome_version ) |
141 proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir ) | 135 proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir ) |
142 return_code = proc.wait() | 136 return_code = proc.wait() |
143 if return_code: | 137 if return_code: |
144 sys.exit( return_code ) | 138 sys.exit( return_code ) |
145 ## search data_dir/genome_version for files | 139 # search data_dir/genome_version for files |
146 regulation_pattern = 'regulation_(.+).bin' | 140 regulation_pattern = 'regulation_(.+).bin' |
147 # annotation files that are included in snpEff by a flag | 141 # annotation files that are included in snpEff by a flag |
148 annotations_dict = {'nextProt.bin' : '-nextprot','motif.bin': '-motif'} | 142 annotations_dict = {'nextProt.bin': '-nextprot', 'motif.bin': '-motif'} |
149 genome_path = os.path.join(data_dir,genome_version) | 143 genome_path = os.path.join(data_dir, genome_version) |
150 snpeff_version = getSnpeffVersion(jar_path) | 144 snpeff_version = getSnpeffVersion(jar_path) |
151 key = snpeff_version + '_' + genome_version | 145 key = snpeff_version + '_' + genome_version |
152 if os.path.isdir(genome_path): | 146 if os.path.isdir(genome_path): |
153 for root, dirs, files in os.walk(genome_path): | 147 for root, dirs, files in os.walk(genome_path): |
154 for fname in files: | 148 for fname in files: |
155 if fname.startswith('snpEffectPredictor'): | 149 if fname.startswith('snpEffectPredictor'): |
156 # if snpEffectPredictor.bin download succeeded | 150 # if snpEffectPredictor.bin download succeeded |
157 name = genome_version + (' : ' + organism if organism else '') | 151 name = genome_version + (' : ' + organism if organism else '') |
158 # version = getSnpeffVersionFromFile(os.path.join(root,fname)) | 152 # version = getSnpeffVersionFromFile(os.path.join(root,fname)) |
159 data_table_entry = dict(key=key,version=snpeff_version,value=genome_version, name=name, path=data_dir) | 153 data_table_entry = dict(key=key, version=snpeff_version, value=genome_version, name=name, path=data_dir) |
160 _add_data_table_entry( data_manager_dict, 'snpeffv_genomedb', data_table_entry ) | 154 _add_data_table_entry( data_manager_dict, 'snpeffv_genomedb', data_table_entry ) |
161 else: | 155 else: |
162 m = re.match(regulation_pattern,fname) | 156 m = re.match(regulation_pattern, fname) |
163 if m: | 157 if m: |
164 name = m.groups()[0] | 158 name = m.groups()[0] |
165 data_table_entry = dict(key=key,version=snpeff_version,genome=genome_version,value=name, name=name) | 159 data_table_entry = dict(key=key, version=snpeff_version, genome=genome_version, value=name, name=name) |
166 _add_data_table_entry( data_manager_dict, 'snpeffv_regulationdb', data_table_entry ) | 160 _add_data_table_entry( data_manager_dict, 'snpeffv_regulationdb', data_table_entry ) |
167 elif fname in annotations_dict: | 161 elif fname in annotations_dict: |
168 value = annotations_dict[fname] | 162 value = annotations_dict[fname] |
169 name = value.lstrip('-') | 163 name = value.lstrip('-') |
170 data_table_entry = dict(key=key,version=snpeff_version,genome=genome_version,value=value, name=name) | 164 data_table_entry = dict(key=key, version=snpeff_version, genome=genome_version, value=value, name=name) |
171 _add_data_table_entry( data_manager_dict, 'snpeffv_annotations', data_table_entry ) | 165 _add_data_table_entry( data_manager_dict, 'snpeffv_annotations', data_table_entry ) |
172 return data_manager_dict | 166 return data_manager_dict |
167 | |
173 | 168 |
174 def _add_data_table_entry( data_manager_dict, data_table, data_table_entry ): | 169 def _add_data_table_entry( data_manager_dict, data_table, data_table_entry ): |
175 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) | 170 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) |
176 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get( data_table, [] ) | 171 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get( data_table, [] ) |
177 data_manager_dict['data_tables'][data_table].append( data_table_entry ) | 172 data_manager_dict['data_tables'][data_table].append( data_table_entry ) |
178 return data_manager_dict | 173 return data_manager_dict |
179 | 174 |
175 | |
180 def main(): | 176 def main(): |
181 #Parse Command Line | 177 # Parse Command Line |
182 parser = optparse.OptionParser() | 178 parser = optparse.OptionParser() |
183 parser.add_option( '-j', '--jar_path', dest='jar_path', action='store', type="string", default=None, help='snpEff.jar path' ) | 179 parser.add_option( '-j', '--jar_path', dest='jar_path', action='store', type="string", default=None, help='snpEff.jar path' ) |
184 parser.add_option( '-c', '--config', dest='config', action='store', type="string", default=None, help='snpEff.config path' ) | 180 parser.add_option( '-c', '--config', dest='config', action='store', type="string", default=None, help='snpEff.config path' ) |
185 parser.add_option( '-g', '--genome_version', dest='genome_version', action='store', type="string", default=None, help='genome_version' ) | 181 parser.add_option( '-g', '--genome_version', dest='genome_version', action='store', type="string", default=None, help='genome_version' ) |
186 parser.add_option( '-o', '--organism', dest='organism', action='store', type="string", default=None, help='organism name' ) | 182 parser.add_option( '-o', '--organism', dest='organism', action='store', type="string", default=None, help='organism name' ) |
187 (options, args) = parser.parse_args() | 183 (options, args) = parser.parse_args() |
188 | 184 |
189 filename = args[0] | 185 filename = args[0] |
190 | 186 |
191 params = from_json_string( open( filename ).read() ) | 187 params = json.loads( open( filename ).read() ) |
192 target_directory = params[ 'output_data' ][0]['extra_files_path'] | 188 target_directory = params[ 'output_data' ][0]['extra_files_path'] |
193 os.mkdir( target_directory ) | 189 os.mkdir( target_directory ) |
194 data_manager_dict = {} | 190 data_manager_dict = {} |
195 | 191 |
196 | 192 # Create SnpEff Reference Data |
197 #Create SnpEff Reference Data | 193 for genome_version, organism in zip(options.genome_version.split(','), getOrganismNames(options.jar_path, options.genome_version, options.organism).split(',')): |
198 for genome_version, organism in zip(options.genome_version.split(','), getOrganismNames(options.jar_path,options.genome_version,options.organism).split(',')): | |
199 download_database( data_manager_dict, target_directory, options.jar_path, options.config, genome_version, organism ) | 194 download_database( data_manager_dict, target_directory, options.jar_path, options.config, genome_version, organism ) |
200 | 195 |
201 #save info to json file | 196 # save info to json file |
202 open( filename, 'wb' ).write( to_json_string( data_manager_dict ) ) | 197 open( filename, 'wb' ).write( json.dumps( data_manager_dict ) ) |
203 | 198 |
204 if __name__ == "__main__": main() | 199 if __name__ == "__main__": |
205 | 200 main() |