# HG changeset patch
# User charles-bernard
# Date 1477953611 14400
# Node ID 9bc3d77b5661f0a57d95f91923345e6acda14489
# Parent db5d2fac3a16471b88d978fa52c39ab2007590c2
Uploaded

diff -r db5d2fac3a16 -r 9bc3d77b5661 data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py
--- a/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py	Mon Oct 31 09:18:34 2016 -0400
+++ b/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py	Mon Oct 31 18:40:11 2016 -0400
@@ -12,7 +12,6 @@
 from optparse import OptionParser
 from galaxy.util.json import from_json_string, to_json_string
 
-
 def get_arg():
     parser = OptionParser()
     parser.add_option("-e", "--ensembl", dest = 'ensembl_info', action = "store", nargs = 2, metavar = ("kingdom", "species_name"), type = "str")
@@ -38,7 +37,6 @@
     local_file.write(src_file.read())
     local_file.close()
 
-
 def uncompress_gz(gz_file_name, uncompressed_file_name):
     print("____________________________________________________________")
     print("*** Uncompressing %s" % gz_file_name)
@@ -48,19 +46,16 @@
     uncompressed_file.close()
     print("-> Uncompressed !\n")
 
-
 def add_data_table_entry( data_manager_dict, data_table_entry ):
     data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
     data_manager_dict['data_tables']['alfa_indexes'] = data_manager_dict['data_tables'].get( 'alfa_indexes', data_table_entry )
     return data_manager_dict
 
-
 def standardize_species_name(species_name):
     standard_species_name = re.sub(r'[)]$', '', species_name)
     standard_species_name = re.sub(r'[ _),-.(=]+ *', '_', standard_species_name)
     return standard_species_name.lower()
 
-
 def get_ensembl_url_root(kingdom):
     print("____________________________________________________________")
     print("*** Determining Ensembl ftp root url")
@@ -71,11 +66,11 @@
     print("-> Determined !\n")
     return root
 
-
 def test_ensembl_species_exists(kingdom, url, species_name):
     print("____________________________________________________________")
     print ("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom))
     list_species_file_name = 'species_Ensembl%s%s.txt' % (kingdom[0].upper(), kingdom[1:])
+    print("%s" % kingdom)
     if kingdom=='vertebrates':
         download_file(url, list_species_file_name)
     else:
@@ -93,8 +88,13 @@
     nb_lines = len(species_lines)
 
     if nb_lines == 1:
-        columns = species_lines[0].split('\t')
-        found_species_name = columns[1]
+        if kingdom == 'vertebrates':
+            fields = species_lines[0].split(' ')
+            columns = fields[-1].split('\r')
+            found_species_name = columns[0]
+        else:
+            columns = species_lines[0].split('\t')
+            found_species_name = columns[1]
         if species_name != found_species_name:
             print('-> \'%s\' has been replace with the complete species name \'%s\'' % (species_name, found_species_name))
         return found_species_name, species_lines_matched
@@ -103,8 +103,13 @@
     else:
         list_species = [''] * nb_lines
         for i in range(0, nb_lines):
-            columns = species_lines[i].split('\t')
-            list_species[i] = columns[1]
+            if kingdom == 'vertebrates':
+                fields = species_lines[i].split(' ')
+                columns = fields[-1].split('\r')
+                list_species[i] = columns[0]
+            else:
+                columns = species_lines[i].split('\t')
+                list_species[i] = columns[1]
             exact_match = re.search('^%s$' % species_name, list_species[i])
             if exact_match:
                 print("-> Referenced !\n")
@@ -113,7 +118,6 @@
         logging.critical(msg)
         sys.exit(msg)
 
-
 def get_ensembl_collection(kingdom, species_line):
     print("*** Extracting the %s_collection of the species" % kingdom)
     collection_regex = re.compile('%s_.+_collection' % kingdom.lower())
@@ -124,7 +128,6 @@
     print("-> Extracted !\n")
     return collection_match.group(0)
 
-
 def get_ensembl_gtf_archive_name(url_dir, species_name):
     print("____________________________________________________________")
     print("*** Extracting the gtf archive name of %s" % species_name)
@@ -137,7 +140,6 @@
     print("-> Extracted !\n")
     return gtf_archive_name
 
-
 def get_ensembl_gtf_archive(kingdom, url, species_name, species_line):
     if kingdom != 'vertebrates':
         url = url + 'gtf/'
@@ -153,7 +155,6 @@
     print("-> Downloaded !\n")
     return gtf_archive_name
 
-
 def generate_alfa_indexes(path_to_alfa, gtf_file_name):
     print("____________________________________________________________")
     print("*** Generating alfa indexes from %s" % gtf_file_name)
@@ -166,7 +167,6 @@
     print("Alfa prompt:\n%s" % alfa_out)
     print("-> Generated !\n")
 
-
 def get_data_table_new_entry(gtf_archive_name):
     info_list = gtf_archive_name.split('.')
     species = info_list[0]
@@ -179,14 +179,11 @@
     entry_dict = { 'species': species, 'version': version, 'release': release, 'value': value, 'dbkey': dbkey, 'name': name, 'prefix': prefix }
     return entry_dict
 
-
 def main():
     options, args = get_arg()
 
-    galaxy_root_dir = args[0]
-    tool_dir = args[1]
+    tool_dir = args[0]
     path_to_alfa = os.path.join(tool_dir, 'ALFA.py')
-    path_to_tmp_dir = os.path.join(galaxy_root_dir, 'database/tmp/')
 
     if options.output_filename == None:
         msg = 'No json output file specified'
@@ -197,10 +194,9 @@
     target_directory = params['output_data'][0]['extra_files_path']
     os.mkdir(target_directory)
 
-    tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='', dir=path_to_tmp_dir)
+    tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='')
     os.chdir(tmp_dir)
-    #log_file_name = 'galaxy_log_report.log'
-    #logging.basicConfig(level=print, filename=log_file_name, filemode="a+", format='%(message)s')
+
     data_manager_dict = {}
 
     if options.ensembl_info:
@@ -236,8 +232,6 @@
     shutil.copyfile(stranded_index_name, os.path.join(target_directory, stranded_index_name))
     shutil.copyfile(unstranded_index_name, os.path.join(target_directory, unstranded_index_name))
 
-    #shutil.copyfile(log_file_name, os.path.join(target_directory, log_file_name))
-    #shutil.copyfile(log_file_name, options.log_filename)
 
     cleanup_before_exit(tmp_dir)
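Note on the two parsing branches this patch introduces in test_ensembl_species_exists: the non-vertebrates species listings are tab-separated tables whose second column holds the species name, while the vertebrates listing is treated as a space-separated FTP directory listing with carriage-return line endings, whose last field holds the species name. A minimal sketch of the two cases, using made-up sample lines (hypothetical, not real Ensembl output):

    # Made-up sample lines, for illustration only:
    non_vertebrates_line = '1\tescherichia_coli\tbacteria'         # tab-separated table
    vertebrates_line = 'drwxr-xr-x 2 ftp ftp 4096 homo_sapiens\r'  # FTP listing, CR-terminated

    # Non-vertebrates branch: species name is the second tab-separated column.
    columns = non_vertebrates_line.split('\t')
    assert columns[1] == 'escherichia_coli'

    # Vertebrates branch: species name is the last space-separated field;
    # split('\r') strips the trailing carriage return from the line.
    fields = vertebrates_line.split(' ')
    assert fields[-1].split('\r')[0] == 'homo_sapiens'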