Mercurial > repos > trinity_ctat > ctat_centrifuge_indexes_data_manager
comparison data_manager/add_ctat_centrifuge_index.py @ 0:b4d4f0d76e94 draft default tip
Uploaded
author | trinity_ctat |
---|---|
date | Mon, 16 Jul 2018 20:27:06 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b4d4f0d76e94 |
---|---|
1 #!/usr/bin/env python | |
2 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/ | |
3 | |
4 # Rewritten by H.E. Cicada Brokaw Dennis from a source downloaded from the toolshed and | |
5 # other example code on the web. | |
6 # This allows downloading of a centrifuge index, or specification of its disk location. | |
7 # This index is one of the input parameters needed by the ctat_metagenomics tool. | |
8 # At the moment only one index is supported by the ctat_metagenomics tool: | |
9 # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz | |
10 | |
11 import argparse | |
12 import os | |
13 #import tarfile | |
14 #import urllib | |
15 import subprocess | |
16 | |
17 # The following is used to generate a unique_id value | |
18 from datetime import * | |
19 | |
20 # Remove the following line when testing without galaxy package: | |
21 from galaxy.util.json import to_json_string | |
22 # Am not using the following: | |
23 # from galaxy.util.json import from_json_string | |
24 | |
25 # The FileListParser is used by get_ctat_genome_filenames(), | |
26 # which is called by the Data Manager interface (.xml file) to get | |
27 # the filenames that are available online at the Centrifuge site (ccb.jhu.edu). | |
28 # Not sure best way to do it. | |
29 # This object uses HTMLParser to look through the html | |
30 # searching for the filenames within anchor tags. | |
31 import urllib2 | |
32 from HTMLParser import HTMLParser | |
33 | |
34 _CTAT_CentrifugeIndexPage_URL = 'https://ccb.jhu.edu/software/centrifuge/' | |
35 _CTAT_CentrifugeDownload_URL = 'ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz' | |
36 _CTAT_CentrifugeIndexTableName = 'ctat_centrifuge_indexes' | |
37 _CTAT_CentrifugeDir_Name = 'p_compressed+h+v' | |
38 _CTAT_Centrifuge_DisplayNamePrefix = 'CTAT_CentrifugeIndex_' | |
39 _CentrifugeIndexFileExtension = 'cf' | |
40 _NumBytesNeededForIndex = 7400130287 # 6.9 GB | |
41 #_DownloadFileSize = 5790678746 # 5.4 Gigabytes. | |
42 _Download_TestFile = 'write_testfile.txt' | |
43 _DownloadSuccessFile = 'download_succeeded.txt' | |
44 | |
class FileListParser(HTMLParser):
    # Scans HTML and collects, into self.filenames, every anchor href
    # that points at a tar.gz archive (md5 checksum links are skipped).
    def __init__(self):
        # Direct base-class call instead of super():
        # in Python 2, HTMLParser is an old-style class whose inheritance
        # chain does not include object, so super() would fail.
        HTMLParser.__init__(self)
        self.filenames = set()
    def handle_starttag(self, tag, attrs):
        # Only anchor tags can carry the download links we are after.
        if tag != "a":
            return
        for name, value in attrs:
            # Keep hrefs that reference a tarball but not its md5 file.
            if name == "href" and "tar.gz" in value and "md5" not in value:
                self.filenames.add(value)
# End of class FileListParser
64 | |
def get_ctat_centrifuge_index_locations():
    # For dynamic options need to return an iterable with contents that are tuples with 3 items.
    # Item one is a string that is the display name put into the option list.
    # Item two is the value that is put into the parameter associated with the option list.
    # Item three is a True or False value, indicating whether the item is selected.
    #
    # BUG FIX: earlier versions opened _CTAT_CentrifugeIndexPage_URL with
    # urllib2 and fed the HTML through FileListParser here, but the parsed
    # filenames were never used -- the function always returned the single
    # hard-coded option below. The dead network round trip was pure
    # overhead and made this menu fail whenever the site was unreachable,
    # so it has been removed.
    #
    # For reference, this is what the page listed on 2018-04-23:
    # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed_2018_4_15.tar.gz
    # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/nt_2018_3_3.tar.gz
    # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz
    # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p+h+v.tar.gz
    # Which could be hard coded:
    # options.append(("p_compressed+h+v", "ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz", True))
    # options.append(("p+h+v", "ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p+h+v.tar.gz", False))
    # options.append(("nt_2018_3_3", "ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/nt_2018_3_3.tar.gz", False))
    # options.append(("p_compressed_2018_4_15", "ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed_2018_4_15.tar.gz", False))
    #
    # Currently, only one of the options is supported by the
    # ctat_metagenomics tool, so only that one is returned.
    options = [(_CTAT_CentrifugeDir_Name, _CTAT_CentrifugeDownload_URL, True)]
    # Single-argument parenthesized print: identical output under
    # Python 2 (the rest of this file is Python 2 only).
    print("The list of items being returned for the option menu is:")
    print(str(options))
    return options
96 | |
97 # The following was used by the example program to get input parameters through the json. | |
98 # Just leaving here for reference. | |
99 # We are getting all of our parameter values through command line arguments. | |
100 #def get_reference_id_name(params): | |
101 # genome_id = params['param_dict']['genome_id'] | |
102 # genome_name = params['param_dict']['genome_name'] | |
103 # return genome_id, genome_name | |
104 # | |
105 #def get_url(params): | |
106 # trained_url = params['param_dict']['trained_url'] | |
107 # return trained_url | |
108 | |
def download_index(src_location, destination, force_download):
    # Download and extract the Centrifuge index, unless a previous
    # download into the destination already succeeded.
    #
    # src_location: URL of the index tarball (e.g. the FTP address of
    #     p_compressed+h+v.tar.gz).
    # destination: directory in which the extracted index should live.
    #     If its last path element does not match the index's root name,
    #     a subdirectory with that name is used/created.
    # force_download: when True, download even if a success marker file
    #     from a previous run is present.
    #
    # Returns a tuple:
    #     (canonical_destination, root_index_dirname, index_was_downloaded)
    #
    # Raises ValueError, OSError, IOError, or subprocess.CalledProcessError
    # on the various failure modes (bad destination, insufficient disk
    # space, failed transfer/extraction, missing index files afterwards).
    index_was_downloaded = False
    # Get the root filename of the index directory:
    # the part after the last '/' and before the first '.'
    root_index_dirname = src_location.split("/")[-1].split(".")[0]

    # We want destination to be an absolute, fully resolved path.
    canonical_destination = os.path.realpath(destination)
    if canonical_destination.split("/")[-1] != root_index_dirname:
        canonical_destination += "/" + root_index_dirname
    if os.path.exists(canonical_destination):
        if not os.path.isdir(canonical_destination):
            raise ValueError("The destination is not a directory: " + \
                "{:s}".format(canonical_destination))
        # else all is good. It is a directory.
    else:
        # We need to create it.
        try:
            os.makedirs(canonical_destination)
        except os.error:
            print("ERROR: Trying to create the following directory path:")
            print("\t{:s}".format(canonical_destination))
            raise

    # Make sure the directory now exists and we can write to it.
    if not os.path.exists(canonical_destination):
        # It should have been created, but if it doesn't exist at this point
        # in the code, something is wrong. Raise an error.
        raise OSError("The destination directory could not be created: " + \
            "{:s}".format(canonical_destination))
    # Probe writability by actually writing (and removing) a small file.
    test_writing_file = "{:s}/{:s}".format(canonical_destination, _Download_TestFile)
    try:
        filehandle = open(test_writing_file, "w")
        filehandle.write("Testing writing to this file.")
        filehandle.close()
        os.remove(test_writing_file)
    except IOError:
        print("The destination directory could not be written into: " + \
            "{:s}".format(canonical_destination))
        raise

    # Get the list of files in the directory.
    # We use it to check for a previous download or extraction among other things.
    orig_files_in_destdir = set(os.listdir(canonical_destination))
    # See whether the file has been downloaded already.
    download_success_file_path = "{:s}/{:s}".format(canonical_destination, _DownloadSuccessFile)
    if (_DownloadSuccessFile not in orig_files_in_destdir) or force_download:
        # Check whether there is enough space on the device for the index.
        # f_bavail counts the free blocks ordinary users may use
        # (excluding root-reserved space), unlike f_bfree.
        statvfs = os.statvfs(canonical_destination)
        num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail
        if (num_avail_bytes < _NumBytesNeededForIndex):
            raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) + \
                " on the device of the destination directory: " + \
                "{:s}".format(canonical_destination))

        if (_DownloadSuccessFile in orig_files_in_destdir):
            # Since we are redoing the download,
            # the success file needs to be removed
            # until the download has succeeded.
            os.remove(download_success_file_path)
        # We transfer and untar in one pipeline so the tarball is never
        # stored on disk, which would roughly double the space needed.
        # Make curl silent so progress is not printed to stderr.
        # NOTE(review): src_location is interpolated into a shell command;
        # it comes from this tool's fixed option list, but do not pass
        # arbitrary user input here. Also, without `set -o pipefail` a curl
        # failure only surfaces through tar errors or the file check below.
        command = "curl --silent {:s} | tar -xzf - -C {:s}".format(src_location, canonical_destination)
        try:  # to send the command that downloads and extracts the file.
            # check_output is used (and its result discarded) so that the
            # pipeline's stdout is captured rather than echoed.
            subprocess.check_output(command, shell=True)
        except subprocess.CalledProcessError:
            print("ERROR: Trying to run the following command:\n\t{:s}".format(command))
            raise
        else:
            index_was_downloaded = True

    # Some output to help us if errors occur.
    print("\n*******************************\nFinished download and extraction.")
    if os.path.exists(canonical_destination) and os.path.isdir(canonical_destination):
        subprocess.check_call("ls -lad {:s}/* 2>&1".format(canonical_destination), shell=True)

    files_in_destdir = set(os.listdir(canonical_destination))
    found_filenames = set()
    for filename in files_in_destdir:
        # There should be three index files, but the OS or the user may
        # have created other files in the directory.
        # The downloaded files' names should start with root_index_dirname.
        if root_index_dirname in filename:
            found_filenames.add(filename)
    if (len(found_filenames) >= 3):
        # FIX - we could md5 the files to make sure they are correct.
        # Or at least check their sizes, to see if the download completed ok.
        # Also we could check the names of the files.
        try:
            # Create a file to indicate that the download succeeded.
            # BUG FIX: this used to shell out to "touch" via
            # subprocess.check_call, which raises CalledProcessError on
            # failure -- the IOError handler below could never catch it.
            # Writing the file directly raises IOError as intended.
            open(download_success_file_path, "w").close()
        except IOError:
            print("The download_success file could not be created: " + \
                "{:s}".format(download_success_file_path))
            raise
    else:
        print("After download, the potential index files found are:\n\t{:s}".format(str(found_filenames)))
        raise ValueError("ERROR: Could not find the extracted index files " + \
            "in the destination directory:\n\t{:s}".format(canonical_destination))

    return (canonical_destination, root_index_dirname, index_was_downloaded)
239 | |
def main():
    # Parse the command line, find (or download) the Centrifuge index,
    # and write the resulting data table entry as json to the output file.
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--download_location', default="", \
        help='This is the download location of the centrifuge index.')
    parser.add_argument('-n', '--display_name', default="", \
        help='Is used as the selector text for the entry of this Centrifuge Index in the data table.')
    parser.add_argument('-p', '--destination_path', \
        help='Full path of the Centrifuge Index location or destination, either where it is, or where it will be placed.')
    parser.add_argument('-o', '--output_filename', \
        help='Name of the output file, where the json dictionary will be written.')
    parser.add_argument('-f', '--force_download',
        help='Forces download of the Centrifuge Index, even if previously downloaded. ' + \
            'Requires download_location to be set in order to work.', action="store_true")
    args = parser.parse_args()

    # All of the input parameters are written by default to the output file prior to
    # this program being called.
    # But I do not get input values from the json file, but rather from command line.
    # Just leaving the following code as a comment, in case it might be useful to someone later.
    # params = from_json_string(open(filename).read())
    # target_directory = params['output_data'][0]['extra_files_path']
    # os.mkdir(target_directory)

    print("\ndownload_location is {:s}".format(str(args.download_location)))
    print("display_name is {:s}".format(str(args.display_name)))
    print("destination_path is {:s}\n".format(str(args.destination_path)))
    root_index_dirname = None
    # FIX - Prob don't need index_was_downloaded. Not doing anything with it.
    # But it indicates success downloading the index, so maybe should be checking it.
    index_was_downloaded = False
    if (args.download_location != ""):
        index_directory, root_index_dirname, index_was_downloaded = \
            download_index(src_location=args.download_location, \
                           destination=args.destination_path, \
                           force_download=args.force_download)
    else:
        canonical_destination = os.path.realpath(args.destination_path)
        if not os.path.exists(canonical_destination):
            # BUG FIX: this message previously formatted index_directory,
            # which is not assigned yet on this branch, so the error path
            # itself raised a NameError instead of this ValueError.
            raise ValueError("Cannot find the Centrifuge Index.\n" + \
                "The directory does not exist:\n\t{:s}".format(canonical_destination))
        # If args.destination_path is a directory containing
        # a subdirectory that contains the index files,
        # then we need to set the index_directory to be that subdirectory.
        files_in_destination_path = os.listdir(canonical_destination)
        if (len(files_in_destination_path) == 1):
            path_to_file = "{:s}/{:s}".format(canonical_destination, files_in_destination_path[0])
            if os.path.isdir(path_to_file):
                index_directory = path_to_file
            else:
                index_directory = canonical_destination
        else:
            index_directory = canonical_destination
        # Get the root_index_dirname of the index from the index_directory name.
        root_index_dirname = index_directory.split("/")[-1].split(".")[0]

    # Check if there is an actual Centrifuge Index file in the index_directory.
    print("\nThe location of the Centrifuge Index is {:s}.\n".format(index_directory))
    files_in_index_directory = set(os.listdir(index_directory))
    index_file_found = False
    index_file_path = index_directory
    for filename in files_in_index_directory:
        # The current index is split into 3 files.
        # filenames are in the form: index_root_name.#.cf,
        # where # is a numeral (1, 2, or 3)
        # indicating the order of the files.
        if filename.split(".")[-1] == _CentrifugeIndexFileExtension:
            index_file_found = True
            # The centrifuge program wants the root name of the files to be final part of the path.
            index_file_path = "{:s}/{:s}".format(index_directory, filename.split(".")[0])
    if not index_file_found:
        raise ValueError("Cannot find any Centrifuge Index files.\n" + \
            "The contents of the directory {:s} are:\n\t".format(index_directory) + \
            "\n\t".join(files_in_index_directory))

    # Set the display_name
    if (args.display_name is None) or (args.display_name == ""):
        # Use the root_index_dirname.
        if (root_index_dirname != None) and (root_index_dirname != ""):
            display_name = _CTAT_Centrifuge_DisplayNamePrefix + root_index_dirname
        else:
            display_name = _CTAT_Centrifuge_DisplayNamePrefix + _CTAT_CentrifugeDir_Name
            # BUG FIX: this warning previously referenced the undefined
            # name display_name_value, raising a NameError.
            print("WARNING: Did not set the display name. Using the default: {:s}".format(display_name))
    else:
        display_name = _CTAT_Centrifuge_DisplayNamePrefix + args.display_name
    display_name = display_name.replace(" ","_")

    # Set the unique_id; a timestamp keeps repeated runs distinct.
    datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
    if (root_index_dirname != None) and (root_index_dirname != ""):
        unique_id = root_index_dirname + datetime_stamp
    else:
        unique_id = _CTAT_CentrifugeDir_Name + datetime_stamp

    print("The Index's display_name will be set to: {:s}\n".format(display_name))
    print("Its unique_id will be set to: {:s}\n".format(unique_id))
    print("Its dir_path will be set to: {:s}\n".format(index_file_path))

    data_manager_dict = {}
    data_manager_dict['data_tables'] = {}
    data_manager_dict['data_tables'][_CTAT_CentrifugeIndexTableName] = []
    data_table_entry = dict(value=unique_id, name=display_name, path=index_file_path)
    data_manager_dict['data_tables'][_CTAT_CentrifugeIndexTableName].append(data_table_entry)

    # Temporarily the output file's dictionary is written for debugging:
    print("The dictionary for the output file is:\n\t{:s}".format(str(data_manager_dict)))
    # Save info to json file. This is used to transfer data from the DataManager tool
    # to the data manager, which then puts it into the correct .loc file (I think).
    # Use a with-statement so the handle is closed even on write errors.
    with open(args.output_filename, 'wb') as output_file:
        output_file.write(to_json_string(data_manager_dict))
352 | |
# Script entry point: run main() only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()