Mercurial > repos > iss > eurl_vtec_wgs_pt
comparison scripts/ReMatCh/modules/download.py @ 0:c6bab5103a14 draft
"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
author | iss |
---|---|
date | Mon, 21 Mar 2022 15:23:09 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c6bab5103a14 |
---|---|
1 import os.path | |
2 import multiprocessing | |
3 import sys | |
4 import functools | |
5 import time | |
6 import subprocess | |
7 | |
8 try: | |
9 import modules.utils as utils | |
10 except ImportError: | |
11 from ReMatCh.modules import utils as utils | |
12 | |
13 | |
def get_read_run_info(ena_id):
    """Fetch the ENA "read_run" file report for an accession.

    Parameters
    ----------
    ena_id : str
        Accession appended to the ENA warehouse file-report URL.

    Returns
    -------
    list of str or None
        The report split into lines (header line first), or None when the
        request fails or the report holds no more than one line.
    """
    import urllib.request

    url = 'http://www.ebi.ac.uk/ena/data/warehouse/filereport?accession=' + ena_id + '&result=read_run'

    read_run_info = None
    try:
        # The context manager closes the HTTP response even when reading or
        # decoding fails (the handle used to be left open).
        with urllib.request.urlopen(url) as response:
            read_run_info = response.read().decode("utf8").splitlines()
        if len(read_run_info) <= 1:
            read_run_info = None
    except Exception as error:
        # Best effort: any failure is reported and signalled by returning None.
        print(error)

    return read_run_info
29 | |
30 | |
def get_download_information(read_run_info):
    """Extract the download links from a tab-separated read_run report.

    Only columns named ``<kind>_<transport>`` are considered, where kind is
    'fastq', 'submitted' or 'cram_index' and transport is 'aspera' or 'ftp'.

    Parameters
    ----------
    read_run_info : list of str
        Report lines; index 0 is the header line, index 1 the data line used.

    Returns
    -------
    dict
        Maps each kind to None (no links found) or to a dict
        ``{transport: [link, ...]}`` holding only the transports that had a
        non-empty field.
    """
    header_line = read_run_info[0].split('\t')
    info_line = read_run_info[1].split('\t')

    download_information = {'fastq': None, 'submitted': None, 'cram_index': None}
    download_types = ('aspera', 'ftp')

    for column, raw_header in enumerate(header_line):
        header = raw_header.lower().rsplit('_', 1)
        # A header without '_' (e.g. a column literally named "fastq") has no
        # transport part; it used to raise IndexError on header[1].
        if len(header) != 2:
            continue
        kind, transport = header
        if kind not in download_information or transport not in download_types:
            continue
        # A data row shorter than the header row is treated as empty fields.
        if column >= len(info_line) or len(info_line[column]) == 0:
            continue

        files_path = info_line[column].split(';')
        if len(files_path) > 2:
            print('WARNING: Were found more files than expected in'
                  ' {download_information}-{download_types} download'
                  ' links!'.format(download_information=kind, download_types=transport))
        if download_information[kind] is None:
            download_information[kind] = {}
        download_information[kind][transport] = files_path

    return download_information
53 | |
54 | |
def get_sequencing_information(read_run_info):
    """Collect sequencing metadata from a tab-separated read_run report.

    The first data line fills the known fields; the run accessions of any
    further data lines are joined with ',' into 'extra_run_accession'.
    'date_download' is set to the current date (YYYY-MM-DD).
    """
    rows = [line.split('\t') for line in read_run_info]
    headers = [column.lower() for column in rows[0]]
    first_run = rows[1]

    sequencing_information = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None,
                              'library_layout': None, 'library_source': None, 'extra_run_accession': None,
                              'nominal_length': None, 'read_count': None, 'base_count': None,
                              'date_download': time.strftime("%Y-%m-%d")}

    # Copy every non-empty field whose header names a known key.
    for position, header in enumerate(headers):
        if header in sequencing_information and first_run[position]:
            sequencing_information[header] = first_run[position]

    # Run accessions found on the lines after the first data line.
    accession_columns = [position for position, header in enumerate(headers) if header == 'run_accession']
    extra_run_accession = [row[position]
                           for row in rows[2:]
                           for position in accession_columns
                           if row[position]]
    if extra_run_accession:
        sequencing_information['extra_run_accession'] = ','.join(extra_run_accession)

    return sequencing_information
83 | |
84 | |
@utils.trace_unhandled_exceptions
def download_with_aspera(aspera_file_path, aspera_key, outdir, pickle_prefix, sra, ena_id):
    """Download one file with ``ascp`` and pickle the outcome in *outdir*.

    With *sra* false the ENA path *aspera_file_path* is fetched; otherwise the
    NCBI ``.sra`` location is derived from *ena_id*. The success flag is
    saved under a pickle name starting with *pickle_prefix*.
    """
    if sra:
        port_option = ''
        source = 'anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
            a=ena_id[:3], b=ena_id[:6], c=ena_id)
        pickle = pickle_prefix + '.' + ena_id
    else:
        port_option = '-P33001'
        source = str('era-fasp@' + aspera_file_path)
        pickle = pickle_prefix + '.' + aspera_file_path.rsplit('/', 1)[1]

    # The empty port placeholder is passed through unchanged for SRA
    # downloads; presumably the utils runner discards empty items - TODO confirm.
    command = ['ascp', '-QT', '-l', '300m', port_option, '-i', aspera_key, source, outdir]
    run_successfully, _, _ = utils.run_command_popen_communicate(command, False, 3600, True)

    utils.save_variable_to_pickle(run_successfully, outdir, pickle)
100 | |
101 | |
@utils.trace_unhandled_exceptions
def download_with_wget(ftp_file_path, outdir, pickle_prefix, sra, ena_id):
    """Download one file with ``wget`` and pickle the outcome in *outdir*.

    With *sra* false *ftp_file_path* is fetched under its own file name;
    otherwise the NCBI ``.sra`` URL is derived from *ena_id*.
    """
    if sra:
        source = 'ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
            a=ena_id[:3], b=ena_id[:6], c=ena_id)
        target = os.path.join(outdir, ena_id + '.sra')
        pickle = pickle_prefix + '.' + ena_id
    else:
        source = ftp_file_path
        file_download = ftp_file_path.rsplit('/', 1)[1]
        target = os.path.join(outdir, file_download)
        pickle = pickle_prefix + '.' + file_download

    command = ['wget', '--tries=1', source, '-O', target]
    run_successfully, _, _ = utils.run_command_popen_communicate(command, False, 3600, True)

    utils.save_variable_to_pickle(run_successfully, outdir, pickle)
118 | |
119 | |
@utils.trace_unhandled_exceptions
def download_with_sra_prefetch(aspera_key, outdir, pickle_prefix, ena_id):
    """Download ``<ena_id>.sra`` with ``prefetch`` and move it into *outdir*.

    When *aspera_key* is given, the ``ascp`` path reported by ``which`` is
    handed to prefetch through its ``-a`` option. The success flag is pickled
    in *outdir* as ``<pickle_prefix>.<ena_id>``.
    """
    aspera_option = ''
    if aspera_key is not None:
        _, ascp, _ = utils.run_command_popen_communicate(['which', 'ascp'], False, None, False)
        aspera_option = '-a {ascp}|{aspera_key}'.format(ascp=ascp.splitlines()[0], aspera_key=aspera_key)

    command = ['prefetch', aspera_option, ena_id]
    run_successfully, _, _ = utils.run_command_popen_communicate(command, False, 3600, True)

    if run_successfully:
        # The shell expands $HOME to locate the directory the file is moved from.
        _, prefetch_outdir, _ = utils.run_command_popen_communicate(['echo', '$HOME/ncbi/public/sra'], True, None,
                                                                    False)
        sra_name = ena_id + '.sra'
        source = os.path.join(prefetch_outdir.splitlines()[0], sra_name)
        destination = os.path.join(outdir, sra_name)

        try:
            os.rename(source, destination)
        except OSError as e:
            print('Found the following error:'
                  '{}'.format(e))

            # Fall back to copy + delete when the rename is refused.
            from shutil import copy as shutil_copy

            shutil_copy(source, destination)
            os.remove(source)

    utils.save_variable_to_pickle(run_successfully, outdir, pickle_prefix + '.' + ena_id)
147 | |
148 | |
@utils.trace_unhandled_exceptions
def download_with_curl(ftp_file_path, outdir, pickle_prefix, sra, ena_id):
    """Download one file with ``curl`` and pickle the outcome in *outdir*.

    With *sra* false *ftp_file_path* is fetched under its own file name;
    otherwise the NCBI ``.sra`` URL is derived from *ena_id*.
    """
    if sra:
        source = 'ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
            a=ena_id[:3], b=ena_id[:6], c=ena_id)
        target = os.path.join(outdir, ena_id + '.sra')
        pickle = pickle_prefix + '.' + ena_id
    else:
        source = ftp_file_path
        file_download = ftp_file_path.rsplit('/', 1)[1]
        target = os.path.join(outdir, file_download)
        pickle = pickle_prefix + '.' + file_download

    command = ['curl', '--retry', '1', source, '-o', target]
    run_successfully, _, _ = utils.run_command_popen_communicate(command, False, 3600, True)

    utils.save_variable_to_pickle(run_successfully, outdir, pickle)
165 | |
166 | |
def get_pickle_run_successfully(directory, pickle_prefix):
    """Combine and delete the status pickles written by worker functions.

    Every ``<pickle_prefix>*.pkl`` file in *directory* is removed. Pickles
    are read until one holds a false value, which becomes the result; the
    remaining ones are deleted unread. False is returned when no pickle exists.
    """
    pickles = find_files(directory, pickle_prefix, '.pkl')
    if not pickles:
        return False

    outcome = True
    for pickle_path in pickles:
        if outcome:
            outcome = utils.extract_variable_from_pickle(pickle_path)
        os.remove(pickle_path)

    return outcome
184 | |
185 | |
def curl_installed():
    """Return the success flag of running ``which curl``."""
    found, _, _ = utils.run_command_popen_communicate(['which', 'curl'], False, None, False)
    return found
190 | |
191 | |
def _download_in_parallel(worker, file_paths, worker_args, outdir, pickle_prefix):
    """Run *worker* once per file in a two-process pool and collect the outcome.

    Each worker is called as ``worker(file_path, *worker_args)``; the combined
    result is read back from the status pickles the workers leave in *outdir*.
    """
    pool = multiprocessing.Pool(processes=2)
    for file_download in file_paths:
        pool.apply_async(worker, args=(file_download,) + tuple(worker_args))
    pool.close()
    pool.join()
    return get_pickle_run_successfully(outdir, pickle_prefix)


def download(download_information_type, aspera_key, outdir, sra, sra_opt, ena_id):
    """Download the files of one link category, falling back across tools.

    Parameters
    ----------
    download_information_type : dict or None
        Links of one category keyed by 'aspera' and/or 'ftp'; either key may
        be missing. Ignored when *sra* is true.
    aspera_key : str or None
        Aspera key file; Aspera is skipped when None.
    outdir : str
        Directory receiving the files and the temporary status pickles.
    sra : bool
        Skip the ENA links and download the SRA file directly.
    sra_opt : bool
        Try the SRA file when the ENA links could not be downloaded.
    ena_id : str
        Run accession used to build the SRA locations.

    Returns
    -------
    tuple
        ``(run_successfully, download_sra)``; *download_sra* is True only when
        the data was obtained as an SRA file.
    """
    pickle_prefix = 'download'

    run_successfully = False
    download_sra = False

    if not sra:
        links = download_information_type or {}
        # .get(): a category may provide only one transport; indexing the
        # missing one used to raise KeyError.
        aspera_files = links.get('aspera')
        ftp_files = links.get('ftp')

        if aspera_key is not None and aspera_files is not None:
            run_successfully = _download_in_parallel(download_with_aspera, aspera_files,
                                                     (aspera_key, outdir, pickle_prefix, sra, ena_id),
                                                     outdir, pickle_prefix)
        if not run_successfully and ftp_files is not None:
            ftp_args = (outdir, pickle_prefix, sra, ena_id)
            if curl_installed():
                run_successfully = _download_in_parallel(download_with_curl, ftp_files, ftp_args, outdir,
                                                         pickle_prefix)
            if not run_successfully:
                run_successfully = _download_in_parallel(download_with_wget, ftp_files, ftp_args, outdir,
                                                         pickle_prefix)

    if not run_successfully and (sra or sra_opt):
        # SRA fallback chain: Aspera, then prefetch, then curl, then wget.
        if aspera_key is not None:
            download_with_aspera(None, aspera_key, outdir, pickle_prefix, sra or sra_opt, ena_id)
            run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
        if not run_successfully:
            download_with_sra_prefetch(aspera_key, outdir, pickle_prefix, ena_id)
            run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
        if not run_successfully:
            if curl_installed():
                download_with_curl(None, outdir, pickle_prefix, sra or sra_opt, ena_id)
                run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
            if not run_successfully:
                download_with_wget(None, outdir, pickle_prefix, sra or sra_opt, ena_id)
                run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)

        if run_successfully:
            download_sra = True

    return run_successfully, download_sra
242 | |
243 | |
def download_files(download_information, aspera_key, outdir, download_cram_bam_true, sra, sra_opt, ena_id):
    """Download the reads of a run, trying fastq, then submitted files, then SRA.

    Parameters
    ----------
    download_information : dict
        Links per category ('fastq', 'submitted', 'cram_index'); a category is
        None when it has no links.
    aspera_key : str or None
        Aspera key file handed to ``download``.
    outdir : str
        Directory receiving the files.
    download_cram_bam_true : bool
        Allow submitted CRAM/BAM files; when false, submitted files are only
        used if none of them ends with '.cram' or '.bam'.
    sra, sra_opt : bool
        SRA-only mode and SRA-as-fallback mode, handed to ``download``.
    ena_id : str
        Run accession.

    Returns
    -------
    tuple
        ``(run_successfully, cram_index_run_successfully, download_sra)``.
    """
    run_successfully = False
    cram_index_run_successfully = False
    download_sra = False

    if download_information['fastq'] is not None:
        run_successfully, download_sra = download(download_information['fastq'], aspera_key, outdir, sra, sra_opt,
                                                  ena_id)

    if not run_successfully:
        submitted = download_information['submitted']
        if submitted is not None:
            if not download_cram_bam_true:
                # Only the first link of each transport is inspected.
                cram_bam = any(submitted[transport][0].endswith(('.cram', '.bam')) for transport in submitted)
                if not cram_bam:
                    run_successfully, download_sra = download(submitted, aspera_key, outdir, False, False, ena_id)
            else:
                run_successfully, download_sra = download(submitted, aspera_key, outdir, False, False, ena_id)
                if run_successfully and download_information['cram_index'] is not None:
                    # download() returns a 2-tuple; keeping the whole tuple
                    # (always truthy) made a failed index download look successful.
                    cram_index_run_successfully, _ = download(download_information['cram_index'], aspera_key,
                                                              outdir, False, False, ena_id)

    if not run_successfully and (sra or sra_opt):
        run_successfully, download_sra = download(download_information['fastq'], aspera_key, outdir, True, sra_opt,
                                                  ena_id)

    return run_successfully, cram_index_run_successfully, download_sra
277 | |
278 | |
def sort_alignment(alignment_file, output_file, sort_by_name_true, threads):
    """Sort an alignment file with ``samtools sort``.

    The output format is the lower-cased extension of *output_file*; '-n' is
    added when *sort_by_name_true* is set. Returns ``(run_successfully,
    output_file)`` with the path replaced by None on failure.
    """
    out_format_string = os.path.splitext(output_file)[1][1:].lower()
    name_option = '-n' if sort_by_name_true else ''
    command = ['samtools', 'sort', '-o', output_file, '-O', out_format_string, name_option, '-@', str(threads),
               alignment_file]
    run_successfully, _, _ = utils.run_command_popen_communicate(command, False, None, True)

    return run_successfully, (output_file if run_successfully else None)
290 | |
291 | |
def alignment_to_fastq(alignment_file, threads, pair_end_type):
    """Convert an alignment file to fastq via a name-sorted temporary BAM.

    Parameters
    ----------
    alignment_file : str
        Alignment to convert; outputs are written next to it, named after it
        without its extension.
    threads : int
        Threads handed to ``samtools sort``.
    pair_end_type : str
        'paired' or 'single' (case-insensitive).

    Returns
    -------
    tuple
        ``(run_successfully, outfiles)``; *outfiles* lists the fastq paths, or
        is None on failure or for an unrecognised *pair_end_type*.
    """
    fastq_basename = os.path.splitext(alignment_file)[0]
    outfiles = None
    temp_bam = fastq_basename + '.temp.bam'
    # The layout is normalised once: 'single' used to be compared
    # case-sensitively, so 'SINGLE' got no output option yet reported a file.
    layout = pair_end_type.lower()

    # sort cram
    run_successfully, bam_file = sort_alignment(alignment_file, temp_bam, True, threads)
    if run_successfully:
        command = ['samtools', 'fastq', '', bam_file]
        # NOTE(review): the options are packed into one list item, so the utils
        # runner presumably re-splits the command line - confirm.
        if layout == 'paired':
            command[2] = '-1 ' + str(fastq_basename + '_1.fq') + ' -2 ' + str(fastq_basename + '_2.fq')
        elif layout == 'single':
            command[2] = '-0 ' + str(fastq_basename + '.fq')

        run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True)
        if run_successfully:
            if layout == 'paired':
                outfiles = [str(fastq_basename + '_1.fq'), str(fastq_basename + '_2.fq')]
            elif layout == 'single':
                outfiles = [str(fastq_basename + '.fq')]

    # The temporary BAM is removed by its own path, so a partial file left by
    # a failed sort is cleaned up as well.
    if os.path.isfile(temp_bam):
        os.remove(temp_bam)

    return run_successfully, outfiles
316 | |
317 | |
def formart_fastq_headers(in_fastq_1, in_fastq_2):
    """Append '/1' and '/2' to the read headers of a pair of fastq files.

    Both files are read in lockstep and rewritten to ``<input>.temp``. The
    '+' separator line of the first file is written to both outputs.

    Parameters
    ----------
    in_fastq_1, in_fastq_2 : str
        Paths of the mate 1 and mate 2 fastq files.

    Returns
    -------
    tuple
        ``(number_reads, [out_fastq_1, out_fastq_2])``.

    Raises
    ------
    SystemExit
        When two header lines at the same position differ.
    """
    out_fastq_1 = in_fastq_1 + '.temp'
    out_fastq_2 = in_fastq_2 + '.temp'
    outfiles = [out_fastq_1, out_fastq_2]
    number_reads = 0

    # Mode 'rt' already gives universal newlines; the former 'rtU' raises
    # ValueError since Python 3.11. All four handles are closed by the with.
    with open(in_fastq_1, 'rt') as reader_in_fastq_1, open(in_fastq_2, 'rt') as reader_in_fastq_2, \
            open(out_fastq_1, 'wt') as writer_in_fastq_1, open(out_fastq_2, 'wt') as writer_in_fastq_2:
        # State flags: a line starting with '@' is a header only when the
        # previous record was completed (separator and quality both seen).
        plus_line = True
        quality_line = True
        for in_1, in_2 in zip(reader_in_fastq_1, reader_in_fastq_2):
            in_1 = in_1.splitlines()[0]
            in_2 = in_2.splitlines()[0]
            if in_1.startswith('@') and plus_line and quality_line:
                if in_1 != in_2:
                    sys.exit('The PE fastq files are not aligned properly!')
                writer_in_fastq_1.write(in_1 + '/1' + '\n')
                writer_in_fastq_2.write(in_2 + '/2' + '\n')
                plus_line = False
                quality_line = False
            elif in_1.startswith('+') and not plus_line:
                # Separator line: mate 1's line goes to both files.
                writer_in_fastq_1.write(in_1 + '\n')
                writer_in_fastq_2.write(in_1 + '\n')
                plus_line = True
            elif plus_line and not quality_line:
                # Quality line: the record is complete.
                writer_in_fastq_1.write(in_1 + '\n')
                writer_in_fastq_2.write(in_2 + '\n')
                number_reads += 1
                quality_line = True
            else:
                # Sequence line.
                writer_in_fastq_1.write(in_1 + '\n')
                writer_in_fastq_2.write(in_2 + '\n')

    return number_reads, outfiles
362 | |
363 | |
@utils.trace_unhandled_exceptions
def gzip_files(file_2_compress, pickle_prefix, outdir):
    """Compress one file with ``gzip --best`` and pickle the outcome.

    The archive is ``<file>.gz``, with a trailing '.temp' dropped from the
    name first. The input is deleted only after a successful run, and the
    success flag is pickled in *outdir*.
    """
    is_temp = file_2_compress.endswith('.temp')
    out_file = os.path.splitext(file_2_compress)[0] if is_temp else file_2_compress

    # '>' is a shell redirection, hence the shell flag on the call below.
    command = ['gzip', '--stdout', '--best', file_2_compress, '>', str(out_file + '.gz')]
    run_successfully, _, _ = utils.run_command_popen_communicate(command, True, None, True)
    if run_successfully:
        os.remove(file_2_compress)

    pickle_name = str(pickle_prefix + '.' + os.path.basename(file_2_compress))
    utils.save_variable_to_pickle(run_successfully, outdir, pickle_name)
378 | |
379 | |
def find_files(directory, prefix, suffix):
    """List the regular, non-hidden files of *directory* matching a name pattern.

    A file matches when its name starts with *prefix* and ends with *suffix*.
    Returns the matching paths (joined with *directory*), or None when there
    is no match.
    """
    matches = [os.path.join(directory, entry)
               for entry in os.listdir(directory)
               if not entry.startswith('.')
               and os.path.isfile(os.path.join(directory, entry))
               and entry.startswith(prefix)
               and entry.endswith(suffix)]

    return matches if matches else None
392 | |
393 | |
def compress_files(fastq_files, outdir, threads):
    """Gzip several files in parallel.

    Returns ``(run_successfully, compressed_fastq_files)``, where the second
    item lists every '.gz' file found in *outdir*, or is None on failure.
    """
    pickle_prefix = 'compress'

    workers = multiprocessing.Pool(processes=threads)
    for fastq in fastq_files:
        workers.apply_async(gzip_files, args=(fastq, pickle_prefix, outdir,))
    workers.close()
    workers.join()

    # Each worker leaves a status pickle in outdir; combine them.
    run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
    compressed_fastq_files = find_files(outdir, '', '.gz') if run_successfully else None

    return run_successfully, compressed_fastq_files
409 | |
410 | |
def bam_cram_2_fastq(alignment_file, outdir, threads, pair_end_type):
    """Convert an alignment file into gzip-compressed fastq files.

    For paired data the read headers are suffixed with '/1' and '/2' before
    compression. Returns ``(run_successfully, fastq_files)``.
    """
    converted, fastq_files = alignment_to_fastq(alignment_file, threads, pair_end_type)
    if not converted:
        return converted, fastq_files

    if pair_end_type.lower() == 'paired':
        _, fastq_files = formart_fastq_headers(fastq_files[0], fastq_files[1])

    return compress_files(fastq_files, outdir, threads)
420 | |
421 | |
def check_correct_links(download_information):
    """Normalise the download links in place.

    Aspera links starting with 'fasp.sra.ebi.ac.uk/' get the ':' that
    separates host and path, and every '#' in an FTP link is percent-encoded
    as '%23'.

    Parameters
    ----------
    download_information : dict
        Maps a category to None or to a dict of link lists keyed by 'aspera'
        and/or 'ftp'.

    Returns
    -------
    dict
        The same, modified, *download_information* object.
    """
    for links in download_information.values():
        if links is None:
            continue

        # .get(): a category may provide only one transport; indexing the
        # missing one used to raise KeyError.
        aspera_links = links.get('aspera')
        if aspera_links is not None:
            for position, link in enumerate(aspera_links):
                if link.startswith('fasp.sra.ebi.ac.uk/'):
                    aspera_links[position] = link.replace('fasp.sra.ebi.ac.uk/', 'fasp.sra.ebi.ac.uk:/', 1)

        ftp_links = links.get('ftp')
        if ftp_links is not None:
            for position, link in enumerate(ftp_links):
                if '#' in link:
                    ftp_links[position] = link.replace('#', '%23')

    return download_information
435 | |
436 | |
def get_fastq_files(download_dir, cram_index_run_successfully, threads, download_paired_type):
    """Return the read files available in *download_dir*.

    When a CRAM index was downloaded, the CRAM file found in the directory is
    converted to compressed fastq; otherwise the files already present are
    returned unchanged.

    Parameters
    ----------
    download_dir : str
        Directory holding the downloaded files.
    cram_index_run_successfully : bool
        Whether a CRAM index was downloaded, i.e. a CRAM file is expected.
    threads : int
        Threads used for the CRAM conversion.
    download_paired_type : str
        Library layout handed to the conversion.

    Returns
    -------
    tuple
        ``(run_successfully, downloaded_files)``; the file list is None when
        the directory holds no visible file.
    """
    run_successfully = False
    downloaded_files = find_files(download_dir, '', '')
    if cram_index_run_successfully:
        # find_files() returns None for an empty directory, and a missing
        # '.cram' left cram_file as None; both used to crash. Report failure.
        cram_file = None
        for file_found in downloaded_files or []:
            if file_found.endswith('.cram'):
                cram_file = file_found
        if cram_file is not None:
            run_successfully, downloaded_files = bam_cram_2_fastq(cram_file, download_dir, threads,
                                                                  download_paired_type)
    else:
        if downloaded_files is not None and len(downloaded_files) > 0:
            run_successfully = True

    return run_successfully, downloaded_files
451 | |
452 | |
def rename_move_files(list_files, new_name, outdir, download_paired_type):
    """Rename the downloaded reads to ``<new_name>[_1|_2].fq.gz`` inside *outdir*.

    Files are classified by the end of their name once the 'astq.gz' or
    'q.gz' tail is chopped off. The operation succeeds only when exactly two
    ('paired') or exactly one ('single') file is selected; the remaining
    files are then deleted. Returns ``(run_successfully, list_new_files)``
    with None instead of the list on failure.
    """
    mate_1_endings = ('_R1_001.f', '_1.f')
    mate_2_endings = ('_R2_001.f', '_2.f')

    list_new_files = {}
    run_successfully = False

    for index, file_path in enumerate(list_files):
        base_name = os.path.basename(file_path)
        temp_name = utils.rchop(base_name, 'astq.gz')
        if len(temp_name) == len(base_name):
            # Nothing was chopped: try the shorter '.fq.gz' style ending.
            temp_name = utils.rchop(base_name, 'q.gz')

        if download_paired_type.lower() == 'paired':
            if temp_name.endswith(mate_1_endings):
                list_new_files[index] = os.path.join(outdir, new_name + '_1.fq.gz')
            elif temp_name.endswith(mate_2_endings):
                list_new_files[index] = os.path.join(outdir, new_name + '_2.fq.gz')
        elif not temp_name.endswith(('_R1_001.f', '_R2_001.f')):
            list_new_files[index] = os.path.join(outdir, new_name + '.fq.gz')
            if temp_name.endswith(('_1.f', '_2.f')):
                print('WARNING: possible single-end file conflict with pair-end (' + file_path + ')!')

    if len(list_new_files) == 2 and download_paired_type.lower() == 'paired':
        run_successfully = True
    elif len(list_new_files) == 1 and download_paired_type.lower() == 'single':
        run_successfully = True

    if run_successfully:
        try:
            for index, file_path in enumerate(list_files):
                if index in list_new_files:
                    os.rename(file_path, list_new_files[index])
                elif os.path.isfile(file_path):
                    # Files that were not selected are discarded.
                    os.remove(file_path)
            list_new_files = list(list_new_files.values())
        except Exception as e:
            print(e)
            run_successfully = False

    if not run_successfully:
        list_new_files = None

    return run_successfully, list_new_files
494 | |
495 | |
496 # @utils.trace_unhandled_exceptions | |
def rename_header_sra(fastq):
    """Rewrite the read headers of a fastq file and gzip it to ``<fastq>.gz``.

    On every header line (first line of each four-line record) gawk replaces
    the second '.' with '/'; the result is compressed with ``gzip -1``. The
    input file is left in place.

    Parameters
    ----------
    fastq : str
        Path of the uncompressed fastq file.

    Returns
    -------
    bool
        True only when both gawk and gzip exit with status 0.
    """
    run_successfully = False
    gawk_process = None
    try:
        gawk_program = '{if(NR%4==1) $0=gensub(/\\./, "/", 2); print}'
        out_file = str(fastq + '.gz')
        command_text = ' '.join(['gawk', "'" + gawk_program + "'", fastq, '|', 'gzip', '-1', '>', out_file])
        print('Running: ' + command_text)

        # The two programs are chained without a shell: paths with spaces or
        # shell metacharacters are safe, and gawk's exit status is no longer
        # hidden behind gzip's (a shell pipeline only reports the last one).
        with open(out_file, 'wb') as writer:
            gawk_process = subprocess.Popen(['gawk', gawk_program, fastq], stdout=subprocess.PIPE)
            try:
                gzip_process = subprocess.Popen(['gzip', '-1'], stdin=gawk_process.stdout, stdout=writer)
            finally:
                # Drop our copy of the pipe so gawk is signalled if gzip exits.
                gawk_process.stdout.close()
            gzip_code = gzip_process.wait()
            gawk_code = gawk_process.wait()

        if gawk_code == 0 and gzip_code == 0:
            run_successfully = True
        else:
            print('Something went wrong with command: {command}'.format(command=command_text))
    except Exception as e:
        print(e)
        # Do not leave gawk running when gzip could not be started.
        if gawk_process is not None and gawk_process.poll() is None:
            gawk_process.kill()
            gawk_process.wait()

    return run_successfully
512 | |
513 | |
def sra_2_fastq(download_dir, ena_id):
    """Convert ``<ena_id>.sra`` in *download_dir* to fastq files.

    ``fastq-dump --split-files`` writes the reads into *download_dir*; every
    resulting '.fastq' file is then passed to ``rename_header_sra``.

    Parameters
    ----------
    download_dir : str
        Directory holding the SRA file and receiving the fastq files.
    ena_id : str
        Run accession, i.e. the SRA file name without extension.

    Returns
    -------
    bool
        True when fastq-dump and all header rewrites succeeded.
    """
    # os.path.join gives the same path as plain concatenation when
    # download_dir ends with a separator, and a valid one when it does not.
    sra_file = os.path.join(download_dir, '{ena_id}.sra'.format(ena_id=ena_id))
    command = ['fastq-dump', '-I', '-O', download_dir, '--split-files', sra_file]
    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, 3600, True)
    if run_successfully:
        files = [os.path.join(download_dir, f) for f in os.listdir(download_dir)
                 if not f.startswith('.') and os.path.isfile(os.path.join(download_dir, f)) and f.endswith('.fastq')]

        pool = multiprocessing.Pool(processes=2)
        try:
            # map() returns one result per file; with the former callback an
            # error left the result list empty and all([]) reported success.
            results = pool.map(rename_header_sra, files)
        except Exception as error:
            print(error)
            results = [False]
        finally:
            # Release the worker processes (they used to be left running).
            pool.close()
            pool.join()

        run_successfully = all(results)

    return run_successfully
530 | |
531 | |
download_timer = functools.partial(utils.timer, name='Download module')


@download_timer
def run_download(ena_id, download_paired_type, aspera_key, outdir, download_cram_bam_true, threads, instrument_platform,
                 sra, sra_opt):
    """Download the reads of one run and place them, renamed, in *outdir*.

    The work happens in ``<outdir>/download/``, which is recreated at the
    start and removed at the end.

    Parameters
    ----------
    ena_id : str
        Run accession to download.
    download_paired_type : str
        'paired', 'single' or 'both'; runs with another library layout are
        skipped.
    aspera_key : str or None
        Aspera key file.
    outdir : str
        Directory receiving the final fastq files.
    download_cram_bam_true : bool
        Allow submitted CRAM/BAM files.
    threads : int
        Threads used for the CRAM conversion.
    instrument_platform : str
        Platform to accept, or 'all'.
    sra, sra_opt : bool
        SRA-only mode and SRA-as-fallback mode.

    Returns
    -------
    tuple
        ``(run_successfully, downloaded_files, sequencing_information)``.
    """
    download_dir = os.path.join(outdir, 'download', '')
    utils.remove_directory(download_dir)
    os.mkdir(download_dir)

    run_successfully = False
    downloaded_files = None
    sequencing_information = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None,
                              'library_layout': None, 'library_source': None, 'extra_run_accession': None,
                              'nominal_length': None, 'read_count': None, 'base_count': None,
                              'date_download': time.strftime("%Y-%m-%d")}

    read_run_info = get_read_run_info(ena_id)
    if read_run_info is not None:
        download_information = get_download_information(read_run_info)
        download_information = check_correct_links(download_information)
        sequencing_information = get_sequencing_information(read_run_info)

        if instrument_platform.lower() == 'all' or \
                (sequencing_information['instrument_platform'] is not None and
                 sequencing_information['instrument_platform'].lower() == instrument_platform.lower()):
            if download_paired_type.lower() == 'both' or \
                    (sequencing_information['library_layout'] is not None and
                     sequencing_information['library_layout'].lower() == download_paired_type.lower()):
                run_successfully, cram_index_run_successfully, download_sra = download_files(download_information,
                                                                                             aspera_key, download_dir,
                                                                                             download_cram_bam_true,
                                                                                             sra, sra_opt, ena_id)
                if download_sra:
                    run_successfully = sra_2_fastq(download_dir, ena_id)
                if run_successfully:
                    run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully,
                                                                         threads,
                                                                         sequencing_information['library_layout'])
                if run_successfully and downloaded_files is not None:
                    run_successfully, downloaded_files = rename_move_files(downloaded_files,
                                                                           sequencing_information['run_accession'],
                                                                           outdir,
                                                                           sequencing_information['library_layout'])
    else:
        # No ENA report: only a direct SRA download can still be attempted.
        if sra or sra_opt:
            run_successfully, cram_index_run_successfully, download_sra = download_files({'fastq': None,
                                                                                          'submitted': None,
                                                                                          'cram_index': None},
                                                                                         aspera_key, download_dir,
                                                                                         download_cram_bam_true, sra,
                                                                                         sra_opt, ena_id)
            if download_sra:
                run_successfully = sra_2_fastq(download_dir, ena_id)
            if run_successfully:
                run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully, threads,
                                                                     'paired')
                if not run_successfully:
                    run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully,
                                                                         threads, 'single')
            if run_successfully and downloaded_files is not None:
                # The layout is unknown here, so 'paired' is tried before
                # 'single'. The original list is kept for the retry: the first
                # attempt returns None on failure, which used to be handed to
                # the second attempt and raised TypeError.
                files_to_rename = downloaded_files
                run_successfully, downloaded_files = rename_move_files(files_to_rename, ena_id, outdir, 'paired')
                if not run_successfully:
                    run_successfully, downloaded_files = rename_move_files(files_to_rename, ena_id, outdir, 'single')

    utils.remove_directory(download_dir)

    return run_successfully, downloaded_files, sequencing_information