# HG changeset patch # User charles-bernard # Date 1481186606 18000 # Node ID 9139892d06a20eb03a70b04c7366d76b2f7ef690 # Parent 4f70c9afd89d55431e2c9869c2d81c0ec3d873ad Uploaded diff -r 4f70c9afd89d -r 9139892d06a2 data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py --- a/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py Sat Nov 19 04:24:46 2016 -0500 +++ b/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py Thu Dec 08 03:43:26 2016 -0500 @@ -28,7 +28,6 @@ page = urllib2.urlopen(req) return page.read() - def download_file(link, local_file_name): req = urllib2.Request(link) src_file = urllib2.urlopen(req) @@ -51,6 +50,7 @@ return data_manager_dict def standardize_species_name(species_name): + # strip a trailing ')', replace every run of separator chars (space _ ( ) , - . =) with one underscore, then lowercase the result standard_species_name = re.sub(r'[)]$', '', species_name) standard_species_name = re.sub(r'[ _),-.(=]+ *', '_', standard_species_name) return standard_species_name.lower() @@ -66,6 +66,11 @@ return root def test_ensembl_species_exists(kingdom, url, species_name): + """ + Test whether a species exists on the ftp & return the species name together with its species_line if so.
+ If the species_name matches a single string, then this string is returned as the species name. + If the species_name matches several strings, then the script exits with an error message listing all the possible species to enter for a new run. + """ print("____________________________________________________________") print ("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom)) list_species_file_name = 'species_Ensembl%s%s.txt' % (kingdom[0].upper(), kingdom[1:]) @@ -111,7 +116,10 @@ if exact_match: print("-> Referenced !\n") return species_name, species_lines[i] - msg = 'The string \'%s\' has been matched against the list of Ensembl Species but is not a complete species name.\nPlease retry with one of the following species names:\n%s' % (species_name, list_species[0:]) + msg = ("The string \'%s\' has been matched against the list of Ensembl Species but is not a complete species name.\n" + "Please retry with one of these following species names:\n" % species_name) + for s in list_species: + msg = ("%s- %s\n" % (msg, s)) sys.exit(msg) def get_ensembl_collection(kingdom, species_line): @@ -184,6 +192,10 @@ msg = 'No json output file specified' sys.exit(msg) output_filename = options.output_filename + + # Interestingly the output file to return is not empty initially.
+ # It contains a dictionary that notably holds the path to the directory where the alfa_indexes + # are expected to be found. params = from_json_string(open(output_filename).read()) target_directory = params['output_data'][0]['extra_files_path'] os.mkdir(target_directory) @@ -209,8 +221,6 @@ print("____________________________________________________________") print("*** General Info") - print("TMP DIR:\t%s" % tmp_dir) - print("TARGET DIR:\t%s" % target_directory) print("URL ROOT:\t%s" % url) print("SPECIES:\t%s" % data_table_entry['species']) print("VERSION:\t%s" % data_table_entry['version']) @@ -219,10 +229,6 @@ print("DBKEY:\t%s" % data_table_entry['dbkey']) print("NAME:\t%s" % data_table_entry['name']) print("PREFIX:\t%s" % data_table_entry['prefix']) - print("____________________________________________________________") - print("*** Intial dictionary") - print("%s" % params) - shutil.copyfile(stranded_index_name, os.path.join(target_directory, stranded_index_name)) shutil.copyfile(unstranded_index_name, os.path.join(target_directory, unstranded_index_name))