Mercurial > repos > devteam > data_manager_fetch_genome_dbkeys_all_fasta
comparison data_manager/data_manager_fetch_genome_all_fasta_dbkeys.py @ 8:14eb0fc65c62 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_genome_dbkeys_all_fasta commit b56485a8b386fc6f17219850e30e5656c159f231"
author | iuc |
---|---|
date | Wed, 16 Oct 2019 04:17:00 -0400 |
parents | b1bc53e9bbc5 |
children |
comparison
equal
deleted
inserted
replaced
7:b1bc53e9bbc5 | 8:14eb0fc65c62 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # Dan Blankenberg | 2 # Dan Blankenberg |
3 | 3 |
4 import bz2 | 4 import bz2 |
5 import gzip | 5 import gzip |
6 import json | |
6 import optparse | 7 import optparse |
7 import os | 8 import os |
8 import shutil | 9 import shutil |
9 import sys | 10 import sys |
10 import tarfile | 11 import tarfile |
11 import tempfile | 12 import tempfile |
12 import zipfile | 13 import zipfile |
13 from ftplib import FTP | 14 from ftplib import FTP |
14 from json import dumps, loads | |
15 | 15 |
16 try: | 16 try: |
17 # For Python 3.0 and later | 17 # For Python 3.0 and later |
18 from io import BytesIO as StringIO | 18 from io import BytesIO as StringIO |
19 from io import UnsupportedOperation | 19 from io import UnsupportedOperation |
116 current_order = [_[1] for _ in sorted((_[1], _[0]) for _ in fasta_offsets.items())] | 116 current_order = [_[1] for _ in sorted((_[1], _[0]) for _ in fasta_offsets.items())] |
117 return (unsorted_filename, fasta_offsets, current_order) | 117 return (unsorted_filename, fasta_offsets, current_order) |
118 | 118 |
119 | 119 |
120 def _write_sorted_fasta(sorted_names, fasta_offsets, sorted_fasta_filename, unsorted_fasta_filename): | 120 def _write_sorted_fasta(sorted_names, fasta_offsets, sorted_fasta_filename, unsorted_fasta_filename): |
121 unsorted_fh = open(unsorted_fasta_filename) | 121 with open(unsorted_fasta_filename, 'rb') as unsorted_fh, open(sorted_fasta_filename, 'wb+') as sorted_fh: |
122 sorted_fh = open(sorted_fasta_filename, 'wb+') | 122 for name in sorted_names: |
123 | 123 offset = fasta_offsets[name] |
124 for name in sorted_names: | 124 unsorted_fh.seek(offset) |
125 offset = fasta_offsets[name] | 125 sorted_fh.write(unsorted_fh.readline()) |
126 unsorted_fh.seek(offset) | 126 while True: |
127 sorted_fh.write(unsorted_fh.readline()) | 127 line = unsorted_fh.readline() |
128 while True: | 128 if not line or line.startswith(b">"): |
129 line = unsorted_fh.readline() | 129 break |
130 if not line or line.startswith(">"): | 130 sorted_fh.write(line) |
131 break | |
132 sorted_fh.write(line) | |
133 unsorted_fh.close() | |
134 sorted_fh.close() | |
135 | 131 |
136 | 132 |
137 def _sort_fasta_as_is(fasta_filename, params): | 133 def _sort_fasta_as_is(fasta_filename, params): |
138 return | 134 return |
139 | 135 |
314 | 310 |
315 def download_from_url(params, tmp_dir, **kwds): | 311 def download_from_url(params, tmp_dir, **kwds): |
316 """ | 312 """ |
317 Download a file from a URL and return a list of filehandles from which to read the data. | 313 Download a file from a URL and return a list of filehandles from which to read the data. |
318 | 314 |
319 >>> url = 'https://github.com/mvdbeek/tools-devteam/raw/data_manager/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/test.tar' | |
320 >>> params = {'param_dict': {'reference_source': {'user_url': url}}} | |
321 >>> tmp_dir = tempfile.mkdtemp() | 315 >>> tmp_dir = tempfile.mkdtemp() |
322 >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0] | 316 >>> url = 'https://github.com/galaxyproject/tools-iuc/raw/master/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/test.tar.bz2' |
323 >>> assert fh.readline().startswith('>FBtr0304171') | |
324 >>> url = 'https://github.com/mvdbeek/tools-devteam/raw/data_manager/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/test.tar.bz2' | |
325 >>> params = {'param_dict': {'reference_source': {'user_url': url}}} | 317 >>> params = {'param_dict': {'reference_source': {'user_url': url}}} |
326 >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0] | 318 >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0] |
327 >>> assert fh.readline().startswith('>FBtr0304171') | 319 >>> assert fh.readline().startswith('b>FBtr0304171') |
328 >>> url = 'https://github.com/mvdbeek/tools-devteam/raw/data_manager/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/test.tar.gz' | 320 >>> url = 'https://github.com/galaxyproject/tools-iuc/raw/master/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/phiX174.fasta' |
329 >>> params = {'param_dict': {'reference_source': {'user_url': url}}} | 321 >>> params = {'param_dict': {'reference_source': {'user_url': url}}} |
330 >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0] | 322 >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0] |
331 >>> assert fh.readline().startswith('>FBtr0304171') | 323 >>> assert fh.readline().startswith('b>phiX174') |
332 >>> url = 'https://github.com/mvdbeek/tools-devteam/raw/data_manager/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/test.zip' | |
333 >>> params = {'param_dict': {'reference_source': {'user_url': url}}} | |
334 >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0] | |
335 >>> assert fh.readline().startswith('>FBtr0304171') | |
336 >>> url = 'https://raw.githubusercontent.com/galaxyproject/tools-devteam/master/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/phiX174.fasta' | |
337 >>> params = {'param_dict': {'reference_source': {'user_url': url}}} | |
338 >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0] | |
339 >>> assert fh.readline().startswith('>phiX174') | |
340 """ | 324 """ |
341 urls = filter(bool, [x.strip() for x in params['param_dict']['reference_source']['user_url'].split('\n')]) | 325 urls = filter(bool, [x.strip() for x in params['param_dict']['reference_source']['user_url'].split('\n')]) |
342 return [get_stream_reader(urlopen(url), tmp_dir) for url in urls] | 326 return [get_stream_reader(urlopen(url), tmp_dir) for url in urls] |
343 | 327 |
344 | 328 |
346 # TODO: allow multiple FASTA input files | 330 # TODO: allow multiple FASTA input files |
347 input_filename = params['param_dict']['reference_source']['input_fasta'] | 331 input_filename = params['param_dict']['reference_source']['input_fasta'] |
348 if isinstance(input_filename, list): | 332 if isinstance(input_filename, list): |
349 fasta_readers = [get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename] | 333 fasta_readers = [get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename] |
350 else: | 334 else: |
351 fasta_readers = get_stream_reader(open(input_filename), tmp_dir) | 335 fasta_readers = get_stream_reader(open(input_filename, 'rb'), tmp_dir) |
352 return fasta_readers | 336 return fasta_readers |
353 | 337 |
354 | 338 |
355 def copy_from_directory(data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir): | 339 def copy_from_directory(data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir): |
356 input_filename = params['param_dict']['reference_source']['fasta_filename'] | 340 input_filename = params['param_dict']['reference_source']['fasta_filename'] |
466 parser.add_option('-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description') | 450 parser.add_option('-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description') |
467 (options, args) = parser.parse_args() | 451 (options, args) = parser.parse_args() |
468 | 452 |
469 filename = args[0] | 453 filename = args[0] |
470 | 454 |
471 params = loads(open(filename).read()) | 455 with open(filename) as fh: |
456 params = json.load(fh) | |
472 target_directory = params['output_data'][0]['extra_files_path'] | 457 target_directory = params['output_data'][0]['extra_files_path'] |
473 os.mkdir(target_directory) | 458 os.mkdir(target_directory) |
474 data_manager_dict = {} | 459 data_manager_dict = {} |
475 | 460 |
476 dbkey, dbkey_name, sequence_id, sequence_name = get_dbkey_dbname_id_name(params, dbkey_description=options.dbkey_description) | 461 dbkey, dbkey_name, sequence_id, sequence_name = get_dbkey_dbname_id_name(params, dbkey_description=options.dbkey_description) |
502 params=params) | 487 params=params) |
503 | 488 |
504 finally: | 489 finally: |
505 cleanup_before_exit(tmp_dir) | 490 cleanup_before_exit(tmp_dir) |
506 # save info to json file | 491 # save info to json file |
507 open(filename, 'wb').write(dumps(data_manager_dict).encode()) | 492 with open(filename, 'w') as fh: |
493 json.dump(data_manager_dict, fh, sort_keys=True) | |
508 | 494 |
509 | 495 |
510 if __name__ == "__main__": | 496 if __name__ == "__main__": |
511 main() | 497 main() |