Mercurial > repos > charles-bernard > data_manager_build_alfa_indexes
comparison data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py @ 28:9139892d06a2 draft
Uploaded
author | charles-bernard |
---|---|
date | Thu, 08 Dec 2016 03:43:26 -0500 |
parents | 5dafa8e43d3e |
children | 0c821f76e2e5 |
comparison
equal
deleted
inserted
replaced
27:4f70c9afd89d | 28:9139892d06a2 |
---|---|
26 def get_page_content(url): | 26 def get_page_content(url): |
27 req = urllib2.Request(url) | 27 req = urllib2.Request(url) |
28 page = urllib2.urlopen(req) | 28 page = urllib2.urlopen(req) |
29 return page.read() | 29 return page.read() |
30 | 30 |
31 | |
32 def download_file(link, local_file_name): | 31 def download_file(link, local_file_name): |
33 req = urllib2.Request(link) | 32 req = urllib2.Request(link) |
34 src_file = urllib2.urlopen(req) | 33 src_file = urllib2.urlopen(req) |
35 local_file = open(local_file_name, 'wb') | 34 local_file = open(local_file_name, 'wb') |
36 local_file.write(src_file.read()) | 35 local_file.write(src_file.read()) |
49 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) | 48 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) |
50 data_manager_dict['data_tables']['alfa_indexes'] = data_manager_dict['data_tables'].get( 'alfa_indexes', data_table_entry ) | 49 data_manager_dict['data_tables']['alfa_indexes'] = data_manager_dict['data_tables'].get( 'alfa_indexes', data_table_entry ) |
51 return data_manager_dict | 50 return data_manager_dict |
52 | 51 |
53 def standardize_species_name(species_name): | 52 def standardize_species_name(species_name): |
53 # lowercase all capital letters and replace every run of separator characters (spaces, underscores, parentheses, commas, hyphens, dots, equals signs) with a single underscore | |
54 standard_species_name = re.sub(r'[)]$', '', species_name) | 54 standard_species_name = re.sub(r'[)]$', '', species_name) |
55 standard_species_name = re.sub(r'[ _),-.(=]+ *', '_', standard_species_name) | 55 standard_species_name = re.sub(r'[ _),-.(=]+ *', '_', standard_species_name) |
56 return standard_species_name.lower() | 56 return standard_species_name.lower() |
57 | 57 |
58 def get_ensembl_url_root(kingdom): | 58 def get_ensembl_url_root(kingdom): |
64 root = 'ftp://ftp.ensemblgenomes.org/pub/%s/current/' % kingdom | 64 root = 'ftp://ftp.ensemblgenomes.org/pub/%s/current/' % kingdom |
65 print("-> Determined !\n") | 65 print("-> Determined !\n") |
66 return root | 66 return root |
67 | 67 |
68 def test_ensembl_species_exists(kingdom, url, species_name): | 68 def test_ensembl_species_exists(kingdom, url, species_name): |
69 """ | |
70 Test whether a species exists on the FTP and, if so, return the species name together with its species_line. | |
71 If the species_name matches a single string, then this string is returned as the species name. | |
72 If the species_name matches several strings, then an error is reported listing all the possible species names to enter for a new run. | |
73 """ | |
69 print("____________________________________________________________") | 74 print("____________________________________________________________") |
70 print ("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom)) | 75 print ("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom)) |
71 list_species_file_name = 'species_Ensembl%s%s.txt' % (kingdom[0].upper(), kingdom[1:]) | 76 list_species_file_name = 'species_Ensembl%s%s.txt' % (kingdom[0].upper(), kingdom[1:]) |
72 if kingdom=='vertebrates': | 77 if kingdom=='vertebrates': |
73 download_file(url, list_species_file_name) | 78 download_file(url, list_species_file_name) |
109 list_species[i] = columns[1] | 114 list_species[i] = columns[1] |
110 exact_match = re.search('^%s$' % species_name, list_species[i]) | 115 exact_match = re.search('^%s$' % species_name, list_species[i]) |
111 if exact_match: | 116 if exact_match: |
112 print("-> Referenced !\n") | 117 print("-> Referenced !\n") |
113 return species_name, species_lines[i] | 118 return species_name, species_lines[i] |
114 msg = 'The string \'%s\' has been matched against the list of Ensembl Species but is not a complete species name.\nPlease retry with one of the following species names:\n%s' % (species_name, list_species[0:]) | 119 msg = ("The string \'%s\' has been matched against the list of Ensembl Species but is not a complete species name.\n" |
120 "Please retry with one of these following species names:\n" % species_name) | |
121 for s in list_species: | |
122 msg = ("%s- %s\n" % (msg, s)) | |
115 sys.exit(msg) | 123 sys.exit(msg) |
116 | 124 |
117 def get_ensembl_collection(kingdom, species_line): | 125 def get_ensembl_collection(kingdom, species_line): |
118 print("*** Extracting the %s_collection of the species" % kingdom) | 126 print("*** Extracting the %s_collection of the species" % kingdom) |
119 collection_regex = re.compile('%s_.+_collection' % kingdom.lower()) | 127 collection_regex = re.compile('%s_.+_collection' % kingdom.lower()) |
182 | 190 |
183 if options.output_filename == None: | 191 if options.output_filename == None: |
184 msg = 'No json output file specified' | 192 msg = 'No json output file specified' |
185 sys.exit(msg) | 193 sys.exit(msg) |
186 output_filename = options.output_filename | 194 output_filename = options.output_filename |
195 | |
196 # Note that the output file to return is not empty initially: | |
197 # it contains a dictionary which notably holds the path to the directory where the alfa_indexes | |
198 # are expected to be found. | |
187 params = from_json_string(open(output_filename).read()) | 199 params = from_json_string(open(output_filename).read()) |
188 target_directory = params['output_data'][0]['extra_files_path'] | 200 target_directory = params['output_data'][0]['extra_files_path'] |
189 os.mkdir(target_directory) | 201 os.mkdir(target_directory) |
190 | 202 |
191 tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='') | 203 tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='') |
207 unstranded_index_name = '%s.unstranded.index' % data_table_entry['prefix'] | 219 unstranded_index_name = '%s.unstranded.index' % data_table_entry['prefix'] |
208 add_data_table_entry(data_manager_dict, data_table_entry) | 220 add_data_table_entry(data_manager_dict, data_table_entry) |
209 | 221 |
210 print("____________________________________________________________") | 222 print("____________________________________________________________") |
211 print("*** General Info") | 223 print("*** General Info") |
212 print("TMP DIR:\t%s" % tmp_dir) | |
213 print("TARGET DIR:\t%s" % target_directory) | |
214 print("URL ROOT:\t%s" % url) | 224 print("URL ROOT:\t%s" % url) |
215 print("SPECIES:\t%s" % data_table_entry['species']) | 225 print("SPECIES:\t%s" % data_table_entry['species']) |
216 print("VERSION:\t%s" % data_table_entry['version']) | 226 print("VERSION:\t%s" % data_table_entry['version']) |
217 print("RELEASE:\t%s" % data_table_entry['release']) | 227 print("RELEASE:\t%s" % data_table_entry['release']) |
218 print("VALUE:\t%s" % data_table_entry['value']) | 228 print("VALUE:\t%s" % data_table_entry['value']) |
219 print("DBKEY:\t%s" % data_table_entry['dbkey']) | 229 print("DBKEY:\t%s" % data_table_entry['dbkey']) |
220 print("NAME:\t%s" % data_table_entry['name']) | 230 print("NAME:\t%s" % data_table_entry['name']) |
221 print("PREFIX:\t%s" % data_table_entry['prefix']) | 231 print("PREFIX:\t%s" % data_table_entry['prefix']) |
222 print("____________________________________________________________") | |
223 print("*** Intial dictionary") | |
224 print("%s" % params) | |
225 | |
226 | 232 |
227 shutil.copyfile(stranded_index_name, os.path.join(target_directory, stranded_index_name)) | 233 shutil.copyfile(stranded_index_name, os.path.join(target_directory, stranded_index_name)) |
228 shutil.copyfile(unstranded_index_name, os.path.join(target_directory, unstranded_index_name)) | 234 shutil.copyfile(unstranded_index_name, os.path.join(target_directory, unstranded_index_name)) |
229 | 235 |
230 cleanup_before_exit(tmp_dir) | 236 cleanup_before_exit(tmp_dir) |