comparison data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py @ 28:9139892d06a2 draft

Uploaded
author charles-bernard
date Thu, 08 Dec 2016 03:43:26 -0500
parents 5dafa8e43d3e
children 0c821f76e2e5
comparison
equal deleted inserted replaced
27:4f70c9afd89d 28:9139892d06a2
26 def get_page_content(url): 26 def get_page_content(url):
27 req = urllib2.Request(url) 27 req = urllib2.Request(url)
28 page = urllib2.urlopen(req) 28 page = urllib2.urlopen(req)
29 return page.read() 29 return page.read()
30 30
31
32 def download_file(link, local_file_name): 31 def download_file(link, local_file_name):
33 req = urllib2.Request(link) 32 req = urllib2.Request(link)
34 src_file = urllib2.urlopen(req) 33 src_file = urllib2.urlopen(req)
35 local_file = open(local_file_name, 'wb') 34 local_file = open(local_file_name, 'wb')
36 local_file.write(src_file.read()) 35 local_file.write(src_file.read())
49 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) 48 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
50 data_manager_dict['data_tables']['alfa_indexes'] = data_manager_dict['data_tables'].get( 'alfa_indexes', data_table_entry ) 49 data_manager_dict['data_tables']['alfa_indexes'] = data_manager_dict['data_tables'].get( 'alfa_indexes', data_table_entry )
51 return data_manager_dict 50 return data_manager_dict
52 51
53 def standardize_species_name(species_name): 52 def standardize_species_name(species_name):
53 # remove a trailing ')', replace every run of separator characters (spaces, underscores, parentheses, commas, hyphens, dots, equals signs) with one underscore, and lowercase the result
54 standard_species_name = re.sub(r'[)]$', '', species_name) 54 standard_species_name = re.sub(r'[)]$', '', species_name)
55 standard_species_name = re.sub(r'[ _),-.(=]+ *', '_', standard_species_name) 55 standard_species_name = re.sub(r'[ _),-.(=]+ *', '_', standard_species_name)
56 return standard_species_name.lower() 56 return standard_species_name.lower()
57 57
58 def get_ensembl_url_root(kingdom): 58 def get_ensembl_url_root(kingdom):
64 root = 'ftp://ftp.ensemblgenomes.org/pub/%s/current/' % kingdom 64 root = 'ftp://ftp.ensemblgenomes.org/pub/%s/current/' % kingdom
65 print("-> Determined !\n") 65 print("-> Determined !\n")
66 return root 66 return root
67 67
68 def test_ensembl_species_exists(kingdom, url, species_name): 68 def test_ensembl_species_exists(kingdom, url, species_name):
69 """
70 Test whether a species exists on the FTP server and, if so, return the species name together with its species_line.
71 If species_name matches a single string, that string is returned as the species name.
72 If species_name matches several strings, an error is printed listing all the possible species names to enter for a new run.
73 """
69 print("____________________________________________________________") 74 print("____________________________________________________________")
70 print ("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom)) 75 print ("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom))
71 list_species_file_name = 'species_Ensembl%s%s.txt' % (kingdom[0].upper(), kingdom[1:]) 76 list_species_file_name = 'species_Ensembl%s%s.txt' % (kingdom[0].upper(), kingdom[1:])
72 if kingdom=='vertebrates': 77 if kingdom=='vertebrates':
73 download_file(url, list_species_file_name) 78 download_file(url, list_species_file_name)
109 list_species[i] = columns[1] 114 list_species[i] = columns[1]
110 exact_match = re.search('^%s$' % species_name, list_species[i]) 115 exact_match = re.search('^%s$' % species_name, list_species[i])
111 if exact_match: 116 if exact_match:
112 print("-> Referenced !\n") 117 print("-> Referenced !\n")
113 return species_name, species_lines[i] 118 return species_name, species_lines[i]
114 msg = 'The string \'%s\' has been matched against the list of Ensembl Species but is not a complete species name.\nPlease retry with one of the following species names:\n%s' % (species_name, list_species[0:]) 119 msg = ("The string \'%s\' has been matched against the list of Ensembl Species but is not a complete species name.\n"
120 "Please retry with one of these following species names:\n" % species_name)
121 for s in list_species:
122 msg = ("%s- %s\n" % (msg, s))
115 sys.exit(msg) 123 sys.exit(msg)
116 124
117 def get_ensembl_collection(kingdom, species_line): 125 def get_ensembl_collection(kingdom, species_line):
118 print("*** Extracting the %s_collection of the species" % kingdom) 126 print("*** Extracting the %s_collection of the species" % kingdom)
119 collection_regex = re.compile('%s_.+_collection' % kingdom.lower()) 127 collection_regex = re.compile('%s_.+_collection' % kingdom.lower())
182 190
183 if options.output_filename == None: 191 if options.output_filename == None:
184 msg = 'No json output file specified' 192 msg = 'No json output file specified'
185 sys.exit(msg) 193 sys.exit(msg)
186 output_filename = options.output_filename 194 output_filename = options.output_filename
195
196 # Note that the output file to return is not empty initially:
197 # it contains a dictionary that notably holds the path to the directory where the alfa_indexes
198 # are expected to be found.
187 params = from_json_string(open(output_filename).read()) 199 params = from_json_string(open(output_filename).read())
188 target_directory = params['output_data'][0]['extra_files_path'] 200 target_directory = params['output_data'][0]['extra_files_path']
189 os.mkdir(target_directory) 201 os.mkdir(target_directory)
190 202
191 tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='') 203 tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='')
207 unstranded_index_name = '%s.unstranded.index' % data_table_entry['prefix'] 219 unstranded_index_name = '%s.unstranded.index' % data_table_entry['prefix']
208 add_data_table_entry(data_manager_dict, data_table_entry) 220 add_data_table_entry(data_manager_dict, data_table_entry)
209 221
210 print("____________________________________________________________") 222 print("____________________________________________________________")
211 print("*** General Info") 223 print("*** General Info")
212 print("TMP DIR:\t%s" % tmp_dir)
213 print("TARGET DIR:\t%s" % target_directory)
214 print("URL ROOT:\t%s" % url) 224 print("URL ROOT:\t%s" % url)
215 print("SPECIES:\t%s" % data_table_entry['species']) 225 print("SPECIES:\t%s" % data_table_entry['species'])
216 print("VERSION:\t%s" % data_table_entry['version']) 226 print("VERSION:\t%s" % data_table_entry['version'])
217 print("RELEASE:\t%s" % data_table_entry['release']) 227 print("RELEASE:\t%s" % data_table_entry['release'])
218 print("VALUE:\t%s" % data_table_entry['value']) 228 print("VALUE:\t%s" % data_table_entry['value'])
219 print("DBKEY:\t%s" % data_table_entry['dbkey']) 229 print("DBKEY:\t%s" % data_table_entry['dbkey'])
220 print("NAME:\t%s" % data_table_entry['name']) 230 print("NAME:\t%s" % data_table_entry['name'])
221 print("PREFIX:\t%s" % data_table_entry['prefix']) 231 print("PREFIX:\t%s" % data_table_entry['prefix'])
222 print("____________________________________________________________")
223 print("*** Intial dictionary")
224 print("%s" % params)
225
226 232
227 shutil.copyfile(stranded_index_name, os.path.join(target_directory, stranded_index_name)) 233 shutil.copyfile(stranded_index_name, os.path.join(target_directory, stranded_index_name))
228 shutil.copyfile(unstranded_index_name, os.path.join(target_directory, unstranded_index_name)) 234 shutil.copyfile(unstranded_index_name, os.path.join(target_directory, unstranded_index_name))
229 235
230 cleanup_before_exit(tmp_dir) 236 cleanup_before_exit(tmp_dir)