comparison data_manager/data_manager_fetch_genome_all_fasta_dbkeys.py @ 4:60994ca04177 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_genome_dbkeys_all_fasta commit 8652f36a3a3838dca989426961561e81432acf4f
author iuc
date Tue, 04 Apr 2017 17:13:31 -0400
parents 86fa71e9b427
children b1bc53e9bbc5
comparison
equal deleted inserted replaced
3:86fa71e9b427 4:60994ca04177
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 #Dan Blankenberg 2 # Dan Blankenberg
3 3
4 import bz2
5 import gzip
6 import optparse
7 import os
8 import shutil
4 import sys 9 import sys
5 import os 10 import tarfile
6 import tempfile 11 import tempfile
7 import shutil 12 import zipfile
8 import optparse
9 from ftplib import FTP 13 from ftplib import FTP
10 import tarfile 14 from json import dumps, loads
11 import zipfile 15
12 import gzip
13 import bz2
14 try: 16 try:
15 # For Python 3.0 and later 17 # For Python 3.0 and later
16 from urllib.request import urlopen
17 from io import BytesIO as StringIO 18 from io import BytesIO as StringIO
18 from io import UnsupportedOperation 19 from io import UnsupportedOperation
20 from urllib.request import urlopen
19 except ImportError: 21 except ImportError:
20 # Fall back to Python 2's urllib2 22 # Fall back to Python 2 imports
23 from StringIO import StringIO
21 from urllib2 import urlopen 24 from urllib2 import urlopen
22 from StringIO import StringIO 25
23 UnsupportedOperation = AttributeError 26 UnsupportedOperation = AttributeError
24 from json import loads, dumps 27
25 28 CHUNK_SIZE = 2 ** 20 # 1mb
26 29
27 CHUNK_SIZE = 2**20 # 1mb 30
def cleanup_before_exit(tmp_dir):
    """Best-effort removal of the temporary working directory."""
    if not tmp_dir:
        return
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
34 35
def stop_err(msg):
    """Report *msg* on stderr and abort the program with exit status 1."""
    print(msg, end='', file=sys.stderr)
    sys.exit(1)
39 40
def get_dbkey_dbname_id_name(params, dbkey_description=None):
    """
    Resolve (dbkey, dbkey_name, sequence_id, sequence_name) from the tool's
    parameter dict, falling back to the dbkey itself wherever a value is empty.
    """
    param_dict = params['param_dict']
    dbkey_source = param_dict['dbkey_source']
    dbkey = dbkey_source['dbkey']
    # TODO: ensure sequence_id is unique and does not already appear in location file
    sequence_id = param_dict['sequence_id'] or dbkey  # uuid.uuid4() generate and use an uuid instead?
    if dbkey_source['dbkey_source_selector'] == 'new':
        # A brand-new dbkey needs a display name; default it to the key.
        dbkey_name = dbkey_source['dbkey_name'] or dbkey
    else:
        dbkey_name = None
    sequence_name = param_dict['sequence_name'] or dbkey_description or dbkey
    return dbkey, dbkey_name, sequence_id, sequence_name
61 62
62 def _get_files_in_ftp_path( ftp, path ): 63 def _get_files_in_ftp_path(ftp, path):
63 path_contents = [] 64 path_contents = []
64 ftp.retrlines( 'MLSD %s' % ( path ), path_contents.append ) 65 ftp.retrlines('MLSD %s' % (path), path_contents.append)
65 return [ line.split( ';' )[ -1 ].lstrip() for line in path_contents ] 66 return [line.split(';')[-1].lstrip() for line in path_contents]
66 67
67 68
68 def _get_stream_readers_for_tar( fh, tmp_dir ): 69 def _get_stream_readers_for_tar(fh, tmp_dir):
69 fasta_tar = tarfile.open( fileobj=fh, mode='r:*' ) 70 fasta_tar = tarfile.open(fileobj=fh, mode='r:*')
70 return [x for x in [fasta_tar.extractfile(member) for member in fasta_tar.getmembers()] if x] 71 return [x for x in [fasta_tar.extractfile(member) for member in fasta_tar.getmembers()] if x]
71 72
72 73
73 def _get_stream_readers_for_zip( fh, tmp_dir ): 74 def _get_stream_readers_for_zip(fh, tmp_dir):
74 """ 75 """
75 Unpacks all archived files in a zip file. 76 Unpacks all archived files in a zip file.
76 Individual files will be concatenated (in _stream_fasta_to_file) 77 Individual files will be concatenated (in _stream_fasta_to_file)
77 """ 78 """
78 fasta_zip = zipfile.ZipFile( fh, 'r' ) 79 fasta_zip = zipfile.ZipFile(fh, 'r')
79 rval = [] 80 rval = []
80 for member in fasta_zip.namelist(): 81 for member in fasta_zip.namelist():
81 fasta_zip.extract( member, tmp_dir ) 82 fasta_zip.extract(member, tmp_dir)
82 rval.append( open( os.path.join( tmp_dir, member ), 'rb' ) ) 83 rval.append(open(os.path.join(tmp_dir, member), 'rb'))
83 return rval 84 return rval
84 85
85 86
86 def _get_stream_readers_for_gzip( fh, tmp_dir ): 87 def _get_stream_readers_for_gzip(fh, tmp_dir):
87 return [ gzip.GzipFile( fileobj=fh, mode='rb') ] 88 return [gzip.GzipFile(fileobj=fh, mode='rb')]
88 89
89 90
90 def _get_stream_readers_for_bz2( fh, tmp_dir ): 91 def _get_stream_readers_for_bz2(fh, tmp_dir):
91 return [ bz2.BZ2File( fh.name, 'rb') ] 92 return [bz2.BZ2File(fh.name, 'rb')]
92 93
93 94
def sort_fasta(fasta_filename, sort_method, params):
    """
    Dispatch to the sorting strategy named by *sort_method*; no-op when it
    is None.

    Raises ValueError for an unknown method. (The original used `assert`,
    which is stripped under `python -O` and passed a ValueError *instance*
    as the assertion message — clearly a raise was intended.)
    """
    if sort_method is None:
        return
    if sort_method not in SORTING_METHODS:
        raise ValueError("%s is not a valid sorting option." % sort_method)
    return SORTING_METHODS[sort_method](fasta_filename, params)
100 101
101 def _move_and_index_fasta_for_sorting( fasta_filename ): 102 def _move_and_index_fasta_for_sorting(fasta_filename):
102 unsorted_filename = tempfile.NamedTemporaryFile().name 103 unsorted_filename = tempfile.NamedTemporaryFile().name
103 shutil.move( fasta_filename, unsorted_filename ) 104 shutil.move(fasta_filename, unsorted_filename)
104 fasta_offsets = {} 105 fasta_offsets = {}
105 unsorted_fh = open( unsorted_filename ) 106 unsorted_fh = open(unsorted_filename)
106 while True: 107 while True:
107 offset = unsorted_fh.tell() 108 offset = unsorted_fh.tell()
108 line = unsorted_fh.readline() 109 line = unsorted_fh.readline()
109 if not line: 110 if not line:
110 break 111 break
111 if line.startswith( ">" ): 112 if line.startswith(">"):
112 line = line.split( None, 1 )[0][1:] 113 line = line.split(None, 1)[0][1:]
113 fasta_offsets[ line ] = offset 114 fasta_offsets[line] = offset
114 unsorted_fh.close() 115 unsorted_fh.close()
115 current_order = map( lambda x: x[1], sorted( map( lambda x: ( x[1], x[0] ), fasta_offsets.items() ) ) ) 116 current_order = [_[1] for _ in sorted((_[1], _[0]) for _ in fasta_offsets.items())]
116 return ( unsorted_filename, fasta_offsets, current_order ) 117 return (unsorted_filename, fasta_offsets, current_order)
117 118
118 119
119 def _write_sorted_fasta( sorted_names, fasta_offsets, sorted_fasta_filename, unsorted_fasta_filename ): 120 def _write_sorted_fasta(sorted_names, fasta_offsets, sorted_fasta_filename, unsorted_fasta_filename):
120 unsorted_fh = open( unsorted_fasta_filename ) 121 unsorted_fh = open(unsorted_fasta_filename)
121 sorted_fh = open( sorted_fasta_filename, 'wb+' ) 122 sorted_fh = open(sorted_fasta_filename, 'wb+')
122 123
123 for name in sorted_names: 124 for name in sorted_names:
124 offset = fasta_offsets[ name ] 125 offset = fasta_offsets[name]
125 unsorted_fh.seek( offset ) 126 unsorted_fh.seek(offset)
126 sorted_fh.write( unsorted_fh.readline() ) 127 sorted_fh.write(unsorted_fh.readline())
127 while True: 128 while True:
128 line = unsorted_fh.readline() 129 line = unsorted_fh.readline()
129 if not line or line.startswith( ">" ): 130 if not line or line.startswith(">"):
130 break 131 break
131 sorted_fh.write( line ) 132 sorted_fh.write(line)
132 unsorted_fh.close() 133 unsorted_fh.close()
133 sorted_fh.close() 134 sorted_fh.close()
134 135
135 136
136 def _sort_fasta_as_is( fasta_filename, params ): 137 def _sort_fasta_as_is(fasta_filename, params):
137 return 138 return
138 139
def _sort_fasta_lexicographical(fasta_filename, params):
    """Rewrite the FASTA with its records in lexicographical name order."""
    unsorted_filename, fasta_offsets, current_order = _move_and_index_fasta_for_sorting(fasta_filename)
    target_order = sorted(fasta_offsets)
    if target_order == current_order:
        # Already sorted: restore the file untouched.
        shutil.move(unsorted_filename, fasta_filename)
    else:
        _write_sorted_fasta(target_order, fasta_offsets, fasta_filename, unsorted_filename)
147 149
def _sort_fasta_gatk(fasta_filename, params):
    """
    Rewrite the FASTA in GATK karyotypic order: mitochondrial contig first
    (chrM or MT, matching the input's naming), then 1-22, X, Y, then the
    corresponding *_random contigs, then all remaining contigs in their
    original order.
    """
    unsorted_filename, fasta_offsets, current_order = _move_and_index_fasta_for_sorting(fasta_filename)
    sorted_names = [str(i) for i in range(1, 23)] + ['X', 'Y']
    # detect if we have chrN, or just N
    has_chr = False
    for chrom in sorted_names:
        if "chr%s" % chrom in current_order:
            has_chr = True
            break

    if has_chr:
        sorted_names = ["chr%s" % x for x in sorted_names]
        sorted_names.insert(0, "chrM")
    else:
        sorted_names.insert(0, "MT")
    # Snapshot the list before extending: list.extend() consuming a generator
    # over the same list never terminates (the list keeps growing as the
    # generator reads it). The Python 2 original used map(), which
    # materialized the list first.
    sorted_names.extend(["%s_random" % x for x in sorted_names])

    existing_sorted_names = [name for name in sorted_names if name in current_order]
    for name in current_order:
        # TODO: confirm that non-canonical names do not need to be sorted specially
        if name not in existing_sorted_names:
            existing_sorted_names.append(name)

    if existing_sorted_names == current_order:
        shutil.move(unsorted_filename, fasta_filename)
    else:
        _write_sorted_fasta(existing_sorted_names, fasta_offsets, fasta_filename, unsorted_filename)
180 181
def _sort_fasta_custom(fasta_filename, params):
    """
    Rewrite the FASTA in the user-supplied identifier order; contigs that
    were not listed are kept (prepended or appended) or dropped according
    to the 'handle_not_listed_selector' setting.
    """
    unsorted_filename, fasta_offsets, current_order = _move_and_index_fasta_for_sorting(fasta_filename)
    sorting = params['param_dict']['sorting']
    sorted_names = [id_repeat['identifier'] for id_repeat in sorting['sequence_identifiers']]
    handle_not_listed = sorting['handle_not_listed_selector']
    if handle_not_listed.startswith('keep'):
        unlisted = [name for name in current_order if name not in sorted_names]
        if unlisted:
            if handle_not_listed == 'keep_append':
                sorted_names = sorted_names + unlisted
            else:
                sorted_names = unlisted + sorted_names
    if sorted_names == current_order:
        # Requested order matches the file: restore it untouched.
        shutil.move(unsorted_filename, fasta_filename)
    else:
        _write_sorted_fasta(sorted_names, fasta_offsets, fasta_filename, unsorted_filename)
202 203
203 204
204 def _download_file(start, fh): 205 def _download_file(start, fh):
205 tmp = tempfile.NamedTemporaryFile() 206 tmp = tempfile.NamedTemporaryFile()
206 tmp.write(start) 207 tmp.write(start)
221 b"\x50\x4b\x03\x04": _get_stream_readers_for_zip, 222 b"\x50\x4b\x03\x04": _get_stream_readers_for_zip,
222 } 223 }
223 start_of_file = fh.read(CHUNK_SIZE) 224 start_of_file = fh.read(CHUNK_SIZE)
224 try: 225 try:
225 fh.seek(0) 226 fh.seek(0)
226 except UnsupportedOperation: # This is if fh has been created by urlopen 227 except UnsupportedOperation: # This happens if fh has been created by urlopen
227 fh = _download_file(start_of_file, fh) 228 fh = _download_file(start_of_file, fh)
228 for k,v in magic_dict.items():
229 if start_of_file.startswith(k):
230 return v(fh, tmp_dir)
231 try: # Check if file is tar file 229 try: # Check if file is tar file
232 if tarfile.open(fileobj=StringIO(start_of_file)): 230 if tarfile.open(fileobj=StringIO(start_of_file)):
233 return _get_stream_readers_for_tar(fh, tmp_dir) 231 return _get_stream_readers_for_tar(fh, tmp_dir)
234 except tarfile.ReadError: 232 except tarfile.ReadError:
235 pass 233 pass
236 return fh 234 for k, v in magic_dict.items():
235 if start_of_file.startswith(k):
236 return v(fh, tmp_dir)
237 return [fh]
237 238
238 239
239 def _get_ucsc_download_address(params, dbkey): 240 def _get_ucsc_download_address(params, dbkey):
240 """ 241 """
241 Check if we can find the correct file for the supplied dbkey on UCSC's FTP server 242 Check if we can find the correct file for the supplied dbkey on UCSC's FTP server
264 ucsc_file_name = "%s%s%s" % (ucsc_path, ucsc_chrom_fa_filename, ext) 265 ucsc_file_name = "%s%s%s" % (ucsc_path, ucsc_chrom_fa_filename, ext)
265 return "ftp://%s%s" % (UCSC_FTP_SERVER, ucsc_file_name) 266 return "ftp://%s%s" % (UCSC_FTP_SERVER, ucsc_file_name)
266 267
267 raise Exception('Unable to determine filename for UCSC Genome for %s: %s' % (ucsc_dbkey, path_contents)) 268 raise Exception('Unable to determine filename for UCSC Genome for %s: %s' % (ucsc_dbkey, path_contents))
268 269
def add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params, **kwds):
    """
    Stream the FASTA data to disk, sort it as configured, and register the
    resulting file in the 'all_fasta' data table — plus '__dbkeys__' when a
    new dbkey (with a display name) is being created.
    """
    fasta_filename = _stream_fasta_to_file(fasta_stream=fasta_readers,
                                           target_directory=target_directory,
                                           sequence_id=sequence_id)
    sort_fasta(fasta_filename, params['param_dict']['sorting']['sort_selector'], params)
    if dbkey_name:
        add_dbkey_to_table(data_manager_dict=data_manager_dict,
                           target_directory=target_directory,
                           dbkey=dbkey,
                           dbkey_name=dbkey_name,
                           fasta_filename=fasta_filename)
    all_fasta_entry = dict(value=sequence_id, dbkey=dbkey, name=sequence_name, path=os.path.basename(fasta_filename))
    _add_data_table_entry(data_manager_dict,
                          data_table_entry=all_fasta_entry,
                          data_table_name='all_fasta')
    return fasta_filename
286
287
def add_dbkey_to_table(data_manager_dict, target_directory, dbkey, dbkey_name, fasta_filename):
    """Compute the chrom-length (.len) file for *dbkey* and register it in '__dbkeys__'."""
    # do len calc here
    len_base_name = "%s.len" % (dbkey)
    len_path = os.path.join(target_directory, len_base_name)
    compute_fasta_length(fasta_filename, len_path, keep_first_word=True)
    dbkey_entry = dict(value=dbkey, name=dbkey_name, len_path=len_base_name)
    _add_data_table_entry(data_manager_dict,
                          data_table_entry=dbkey_entry,
                          data_table_name='__dbkeys__')
296
297
def download_from_ucsc(params, dbkey, tmp_dir, **kwds):
    """Fetch the UCSC genome build for *dbkey* and return stream readers for it."""
    download_url = _get_ucsc_download_address(params, dbkey)
    return get_stream_reader(urlopen(download_url), tmp_dir)
279 302
def download_from_ncbi(params, tmp_dir, **kwds):
    """Fetch an NCBI nucleotide record by identifier and return stream readers for it."""
    NCBI_DOWNLOAD_URL = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta'  # FIXME: taken from dave's genome manager...why some japan site?
    requested_identifier = params['param_dict']['reference_source']['requested_identifier']
    download_url = NCBI_DOWNLOAD_URL % requested_identifier
    return get_stream_reader(urlopen(download_url), tmp_dir)
287 309
def download_from_url(params, tmp_dir, **kwds):
    """
    Download one or more URLs (one per line in 'user_url') and return a list
    of stream-reader lists from which to read the data. Compressed and
    archived sources (tar, zip, gzip, bz2) are handled by get_stream_reader.

    NOTE: doctest requires network access.

    >>> url = 'https://github.com/mvdbeek/tools-devteam/raw/data_manager/data_managers/data_manager_fetch_genome_dbkeys_all_fasta/test-data/test.tar'
    >>> params = {'param_dict': {'reference_source': {'user_url': url}}}
    >>> tmp_dir = tempfile.mkdtemp()
    >>> fh = download_from_url(params=params, tmp_dir=tmp_dir)[0][0]
    >>> assert fh.readline().startswith('>FBtr0304171')
    """
    raw_lines = params['param_dict']['reference_source']['user_url'].split('\n')
    urls = [line.strip() for line in raw_lines if line.strip()]
    return [get_stream_reader(urlopen(url), tmp_dir) for url in urls]
338
339
def download_from_history(params, tmp_dir, **kwds):
    """Build stream readers for the FASTA dataset(s) selected from the history."""
    # TODO: allow multiple FASTA input files
    input_filename = params['param_dict']['reference_source']['input_fasta']
    if isinstance(input_filename, list):
        return [get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename]
    # Binary mode is required: get_stream_reader compares the leading chunk
    # against *bytes* magic numbers, which raises TypeError on a text-mode
    # handle under Python 3 (the single-file branch previously opened text).
    return get_stream_reader(open(input_filename, 'rb'), tmp_dir)
303 348
304 349
def copy_from_directory(data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir):
    """
    Use a FASTA already on the server's filesystem: either symlink it into
    place and register the data table entries directly (returns None, so the
    caller skips add_fasta_to_table), or return stream readers so the caller
    copies/sorts it like any other source.
    """
    input_filename = params['param_dict']['reference_source']['fasta_filename']
    create_symlink = params['param_dict']['reference_source']['create_symlink'] == 'create_symlink'
    if not create_symlink:
        # Binary mode is required: get_stream_reader compares the leading
        # chunk against bytes magic numbers, which raises TypeError on a
        # text-mode handle under Python 3.
        if isinstance(input_filename, list):
            return [get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename]
        return get_stream_reader(open(input_filename, 'rb'), tmp_dir)
    # Symlink path: register entries here; no readers are returned.
    for data_table_name, data_table_entry in _create_symlink(input_filename, target_directory, dbkey, dbkey_name, sequence_id, sequence_name):
        if data_table_entry:
            _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name)
319 364
320 365
321 def _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name ): 366 def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name):
322 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) 367 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
323 data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get( 'all_fasta', [] ) 368 data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get('all_fasta', [])
324 data_manager_dict['data_tables'][data_table_name].append( data_table_entry ) 369 data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
325 return data_manager_dict 370 return data_manager_dict
326 371
327 372
328 def _stream_fasta_to_file( fasta_stream, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params, close_stream=True ): 373 def _stream_fasta_to_file(fasta_stream, target_directory, sequence_id, close_stream=True):
329 fasta_base_filename = "%s.fa" % sequence_id 374 fasta_base_filename = "%s.fa" % sequence_id
330 fasta_filename = os.path.join( target_directory, fasta_base_filename ) 375 fasta_filename = os.path.join(target_directory, fasta_base_filename)
331 with open( fasta_filename, 'wb+' ) as fasta_writer: 376 with open(fasta_filename, 'wb+') as fasta_writer:
332 377
333 if isinstance( fasta_stream, list ) and len( fasta_stream ) == 1: 378 if isinstance(fasta_stream, list) and len(fasta_stream) == 1:
334 fasta_stream = fasta_stream[0] 379 fasta_stream = fasta_stream[0]
335 380
336 if isinstance( fasta_stream, list ): 381 if isinstance(fasta_stream, list):
337 last_char = None 382 last_char = None
338 for fh in fasta_stream: 383 for fh in fasta_stream:
339 if last_char not in [ None, '\n', '\r', b'\n', b'\r' ]: 384 if last_char not in [None, '\n', '\r', b'\n', b'\r']:
340 fasta_writer.write( b'\n' ) 385 fasta_writer.write(b'\n')
341 while True: 386 while True:
342 data = fh.read( CHUNK_SIZE ) 387 data = fh.read(CHUNK_SIZE)
343 if data: 388 if data:
344 fasta_writer.write( data ) 389 fasta_writer.write(data)
345 last_char = data[-1] 390 last_char = data[-1]
346 else: 391 else:
347 break 392 break
348 if close_stream: 393 if close_stream:
349 fh.close() 394 fh.close()
350 else: 395 else:
351 while True: 396 while True:
352 data = fasta_stream.read( CHUNK_SIZE ) 397 data = fasta_stream.read(CHUNK_SIZE)
353 if data: 398 if data:
354 fasta_writer.write( data ) 399 fasta_writer.write(data)
355 else: 400 else:
356 break 401 break
357 if close_stream: 402 if close_stream:
358 fasta_stream.close() 403 fasta_stream.close()
359 404 return fasta_filename
360 sort_fasta( fasta_filename, params['param_dict']['sorting']['sort_selector'], params ) 405
361 406
def compute_fasta_length(fasta_file, out_file, keep_first_word=False):
    """
    Write a "<name>\\t<length>" row to *out_file* for every sequence in
    *fasta_file*. With keep_first_word, each FASTA title is truncated at its
    first whitespace.

    Fixes over the original: both files are closed via `with` (they were
    leaked), and an input containing no sequences now produces an empty
    output instead of raising IndexError (keep_first_word) or emitting a
    bogus "\\t0" row.
    """
    def _write_row(out, title, length):
        # One output row; *title* still carries its leading '>'.
        if keep_first_word:
            title = title.split()[0]
        out.write("%s\t%d\n" % (title[1:], length))

    fasta_title = ''
    seq_len = 0
    have_entry = False

    with open(fasta_file) as infile, open(out_file, 'w') as out:
        for line in infile:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            if line[0] == '>':
                if have_entry:
                    _write_row(out, fasta_title, seq_len)
                have_entry = True
                fasta_title = line
                seq_len = 0
            else:
                seq_len += len(line)

        # last fasta-entry (guarded: the input may have had none)
        if have_entry:
            _write_row(out, fasta_title, seq_len)
403 437
404 438
405 def _create_symlink( input_filename, target_directory, dbkey, dbkey_name, sequence_id, sequence_name ): 439 def _create_symlink(input_filename, target_directory, dbkey, dbkey_name, sequence_id, sequence_name):
406 fasta_base_filename = "%s.fa" % sequence_id 440 fasta_base_filename = "%s.fa" % sequence_id
407 fasta_filename = os.path.join( target_directory, fasta_base_filename ) 441 fasta_filename = os.path.join(target_directory, fasta_base_filename)
408 os.symlink( input_filename, fasta_filename ) 442 os.symlink(input_filename, fasta_filename)
409 443
410 dbkey_dict = None 444 dbkey_dict = None
411 if dbkey_name: 445 if dbkey_name:
412 #do len calc here 446 # do len calc here
413 len_base_name = "%s.len" % ( dbkey ) 447 len_base_name = "%s.len" % (dbkey)
414 compute_fasta_length( fasta_filename, os.path.join( target_directory, len_base_name ), keep_first_word=True ) 448 compute_fasta_length(fasta_filename, os.path.join(target_directory, len_base_name), keep_first_word=True)
415 dbkey_dict = dict( value=dbkey, name=dbkey_name, len_path=len_base_name ) 449 dbkey_dict = dict(value=dbkey, name=dbkey_name, len_path=len_base_name)
416 450
417 return [ ( '__dbkeys__', dbkey_dict ), ( 'all_fasta', dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename ) ) ] 451 return [('__dbkeys__', dbkey_dict), ('all_fasta', dict(value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename))]
418 452
419 453
# Dispatch tables mapping the tool's selector values onto implementations.
REFERENCE_SOURCE_TO_DOWNLOAD = {
    'ucsc': download_from_ucsc,
    'ncbi': download_from_ncbi,
    'url': download_from_url,
    'history': download_from_history,
    'directory': copy_from_directory,
}
SORTING_METHODS = {
    'as_is': _sort_fasta_as_is,
    'lexicographical': _sort_fasta_lexicographical,
    'gatk': _sort_fasta_gatk,
    'custom': _sort_fasta_custom,
}
423 457
424 458
def main():
    """
    Entry point: read the Galaxy param JSON named on the command line, fetch
    the FASTA from the configured source, register the data table entries,
    and write the result JSON back over the input file.
    """
    parser = optparse.OptionParser()
    parser.add_option('-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description')
    options, args = parser.parse_args()

    filename = args[0]

    params = loads(open(filename).read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)
    data_manager_dict = {}

    dbkey, dbkey_name, sequence_id, sequence_name = get_dbkey_dbname_id_name(params, dbkey_description=options.dbkey_description)

    if dbkey in [None, '', '?']:
        raise Exception('"%s" is not a valid dbkey. You must specify a valid dbkey.' % (dbkey))

    # Create a tmp_dir, in case a zip file needs to be uncompressed
    tmp_dir = tempfile.mkdtemp()
    try:
        # Fetch the FASTA: the source-specific downloader returns stream
        # readers, except the symlink path which registers entries itself
        # and returns None.
        reference_source = params['param_dict']['reference_source']['reference_source_selector']
        downloader = REFERENCE_SOURCE_TO_DOWNLOAD[reference_source]
        fasta_readers = downloader(data_manager_dict=data_manager_dict,
                                   params=params,
                                   target_directory=target_directory,
                                   dbkey=dbkey,
                                   dbkey_name=dbkey_name,
                                   sequence_id=sequence_id,
                                   sequence_name=sequence_name,
                                   tmp_dir=tmp_dir)
        if fasta_readers:
            add_fasta_to_table(data_manager_dict=data_manager_dict,
                               fasta_readers=fasta_readers,
                               target_directory=target_directory,
                               dbkey=dbkey,
                               dbkey_name=dbkey_name,
                               sequence_id=sequence_id,
                               sequence_name=sequence_name,
                               params=params)
    finally:
        cleanup_before_exit(tmp_dir)
    # save info to json file
    open(filename, 'wb').write(dumps(data_manager_dict).encode())


if __name__ == "__main__":
    main()