comparison data_manager/data_manager_fetch_genome_all_fasta_dbkeys.py @ 8:14eb0fc65c62 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_genome_dbkeys_all_fasta commit b56485a8b386fc6f17219850e30e5656c159f231"
author iuc
date Wed, 16 Oct 2019 04:17:00 -0400
parents b1bc53e9bbc5
children
comparison
equal deleted inserted replaced
7:b1bc53e9bbc5 8:14eb0fc65c62
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # Dan Blankenberg 2 # Dan Blankenberg
3 3
4 import bz2 4 import bz2
5 import gzip 5 import gzip
6 import json
6 import optparse 7 import optparse
7 import os 8 import os
8 import shutil 9 import shutil
9 import sys 10 import sys
10 import tarfile 11 import tarfile
11 import tempfile 12 import tempfile
12 import zipfile 13 import zipfile
13 from ftplib import FTP 14 from ftplib import FTP
14 from json import dumps, loads
15 15
16 try: 16 try:
17 # For Python 3.0 and later 17 # For Python 3.0 and later
18 from io import BytesIO as StringIO 18 from io import BytesIO as StringIO
19 from io import UnsupportedOperation 19 from io import UnsupportedOperation
116 current_order = [_[1] for _ in sorted((_[1], _[0]) for _ in fasta_offsets.items())] 116 current_order = [_[1] for _ in sorted((_[1], _[0]) for _ in fasta_offsets.items())]
117 return (unsorted_filename, fasta_offsets, current_order) 117 return (unsorted_filename, fasta_offsets, current_order)
118 118
119 119
120 def _write_sorted_fasta(sorted_names, fasta_offsets, sorted_fasta_filename, unsorted_fasta_filename): 120 def _write_sorted_fasta(sorted_names, fasta_offsets, sorted_fasta_filename, unsorted_fasta_filename):
121 unsorted_fh = open(unsorted_fasta_filename) 121 with open(unsorted_fasta_filename, 'rb') as unsorted_fh, open(sorted_fasta_filename, 'wb+') as sorted_fh:
122 sorted_fh = open(sorted_fasta_filename, 'wb+') 122 for name in sorted_names:
123 123 offset = fasta_offsets[name]
124 for name in sorted_names: 124 unsorted_fh.seek(offset)
125 offset = fasta_offsets[name] 125 sorted_fh.write(unsorted_fh.readline())
126 unsorted_fh.seek(offset) 126 while True:
127 sorted_fh.write(unsorted_fh.readline()) 127 line = unsorted_fh.readline()
128 while True: 128 if not line or line.startswith(b">"):
129 line = unsorted_fh.readline() 129 break
130 if not line or line.startswith(">"): 130 sorted_fh.write(line)
131 break
132 sorted_fh.write(line)
133 unsorted_fh.close()
134 sorted_fh.close()
135 131
136 132
137 def _sort_fasta_as_is(fasta_filename, params): 133 def _sort_fasta_as_is(fasta_filename, params):
138 return 134 return
139 135
314 310
315 def download_from_url(params, tmp_dir, **kwds): 311 def download_from_url(params, tmp_dir, **kwds):
316 """ 312 """
317 Download a file from a URL and return a list of filehandles from which to read the data. 313 Download a file from a URL and return a list of filehandles from which to read the data.
318 314
319 >>> url = 'https://github.com/mvdbeek/tools-devteam/raw/data_manager/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/test.tar'
320 >>> params = {'param_dict': {'reference_source': {'user_url': url}}}
321 >>> tmp_dir = tempfile.mkdtemp() 315 >>> tmp_dir = tempfile.mkdtemp()
322 >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0] 316 >>> url = 'https://github.com/galaxyproject/tools-iuc/raw/master/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/test.tar.bz2'
323 >>> assert fh.readline().startswith('>FBtr0304171')
324 >>> url = 'https://github.com/mvdbeek/tools-devteam/raw/data_manager/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/test.tar.bz2'
325 >>> params = {'param_dict': {'reference_source': {'user_url': url}}} 317 >>> params = {'param_dict': {'reference_source': {'user_url': url}}}
326 >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0] 318 >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0]
327 >>> assert fh.readline().startswith('>FBtr0304171') 319 >>> assert fh.readline().startswith('b>FBtr0304171')
328 >>> url = 'https://github.com/mvdbeek/tools-devteam/raw/data_manager/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/test.tar.gz' 320 >>> url = 'https://github.com/galaxyproject/tools-iuc/raw/master/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/phiX174.fasta'
329 >>> params = {'param_dict': {'reference_source': {'user_url': url}}} 321 >>> params = {'param_dict': {'reference_source': {'user_url': url}}}
330 >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0] 322 >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0]
331 >>> assert fh.readline().startswith('>FBtr0304171') 323 >>> assert fh.readline().startswith('b>phiX174')
332 >>> url = 'https://github.com/mvdbeek/tools-devteam/raw/data_manager/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/test.zip'
333 >>> params = {'param_dict': {'reference_source': {'user_url': url}}}
334 >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0]
335 >>> assert fh.readline().startswith('>FBtr0304171')
336 >>> url = 'https://raw.githubusercontent.com/galaxyproject/tools-devteam/master/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/phiX174.fasta'
337 >>> params = {'param_dict': {'reference_source': {'user_url': url}}}
338 >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0]
339 >>> assert fh.readline().startswith('>phiX174')
340 """ 324 """
341 urls = filter(bool, [x.strip() for x in params['param_dict']['reference_source']['user_url'].split('\n')]) 325 urls = filter(bool, [x.strip() for x in params['param_dict']['reference_source']['user_url'].split('\n')])
342 return [get_stream_reader(urlopen(url), tmp_dir) for url in urls] 326 return [get_stream_reader(urlopen(url), tmp_dir) for url in urls]
343 327
344 328
346 # TODO: allow multiple FASTA input files 330 # TODO: allow multiple FASTA input files
347 input_filename = params['param_dict']['reference_source']['input_fasta'] 331 input_filename = params['param_dict']['reference_source']['input_fasta']
348 if isinstance(input_filename, list): 332 if isinstance(input_filename, list):
349 fasta_readers = [get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename] 333 fasta_readers = [get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename]
350 else: 334 else:
351 fasta_readers = get_stream_reader(open(input_filename), tmp_dir) 335 fasta_readers = get_stream_reader(open(input_filename, 'rb'), tmp_dir)
352 return fasta_readers 336 return fasta_readers
353 337
354 338
355 def copy_from_directory(data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir): 339 def copy_from_directory(data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir):
356 input_filename = params['param_dict']['reference_source']['fasta_filename'] 340 input_filename = params['param_dict']['reference_source']['fasta_filename']
466 parser.add_option('-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description') 450 parser.add_option('-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description')
467 (options, args) = parser.parse_args() 451 (options, args) = parser.parse_args()
468 452
469 filename = args[0] 453 filename = args[0]
470 454
471 params = loads(open(filename).read()) 455 with open(filename) as fh:
456 params = json.load(fh)
472 target_directory = params['output_data'][0]['extra_files_path'] 457 target_directory = params['output_data'][0]['extra_files_path']
473 os.mkdir(target_directory) 458 os.mkdir(target_directory)
474 data_manager_dict = {} 459 data_manager_dict = {}
475 460
476 dbkey, dbkey_name, sequence_id, sequence_name = get_dbkey_dbname_id_name(params, dbkey_description=options.dbkey_description) 461 dbkey, dbkey_name, sequence_id, sequence_name = get_dbkey_dbname_id_name(params, dbkey_description=options.dbkey_description)
502 params=params) 487 params=params)
503 488
504 finally: 489 finally:
505 cleanup_before_exit(tmp_dir) 490 cleanup_before_exit(tmp_dir)
506 # save info to json file 491 # save info to json file
507 open(filename, 'wb').write(dumps(data_manager_dict).encode()) 492 with open(filename, 'w') as fh:
493 json.dump(data_manager_dict, fh, sort_keys=True)
508 494
509 495
510 if __name__ == "__main__": 496 if __name__ == "__main__":
511 main() 497 main()