comparison data_manager/data_manager_fetch_genome_all_fasta.py @ 1:ac850912d386 draft

Uploaded
author devteam
date Tue, 03 Feb 2015 10:01:30 -0500
parents 2ebc856bce29
children cca219f2b212
comparison
equal deleted inserted replaced
0:2ebc856bce29 1:ac850912d386
12 import tarfile 12 import tarfile
13 import zipfile 13 import zipfile
14 import gzip 14 import gzip
15 import bz2 15 import bz2
16 16
17 from galaxy.util.json import from_json_string, to_json_string 17 from json import loads, dumps
18 18
19 19
20 CHUNK_SIZE = 2**20 #1mb 20 CHUNK_SIZE = 2**20 #1mb
21 21
22 def cleanup_before_exit( tmp_dir ): 22 def cleanup_before_exit( tmp_dir ):
46 ftp.retrlines( 'MLSD %s' % ( path ), path_contents.append ) 46 ftp.retrlines( 'MLSD %s' % ( path ), path_contents.append )
47 return [ line.split( ';' )[ -1 ].lstrip() for line in path_contents ] 47 return [ line.split( ';' )[ -1 ].lstrip() for line in path_contents ]
48 48
49 def _get_stream_readers_for_tar( file_obj, tmp_dir ): 49 def _get_stream_readers_for_tar( file_obj, tmp_dir ):
50 fasta_tar = tarfile.open( fileobj=file_obj, mode='r:*' ) 50 fasta_tar = tarfile.open( fileobj=file_obj, mode='r:*' )
51 return [ fasta_tar.extractfile( member ) for member in fasta_tar.getmembers() ] 51 return filter( lambda x: x is not None, [ fasta_tar.extractfile( member ) for member in fasta_tar.getmembers() ] )
52 52
53 def _get_stream_readers_for_zip( file_obj, tmp_dir ): 53 def _get_stream_readers_for_zip( file_obj, tmp_dir ):
54 fasta_zip = zipfile.ZipFile( file_obj, 'r' ) 54 fasta_zip = zipfile.ZipFile( file_obj, 'r' )
55 rval = [] 55 rval = []
56 for member in fasta_zip.namelist(): 56 for member in fasta_zip.namelist():
168 else: 168 else:
169 _write_sorted_fasta( sorted_names, fasta_offsets, fasta_filename, unsorted_filename ) 169 _write_sorted_fasta( sorted_names, fasta_offsets, fasta_filename, unsorted_filename )
170 170
171 def download_from_ucsc( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ): 171 def download_from_ucsc( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ):
172 UCSC_FTP_SERVER = 'hgdownload.cse.ucsc.edu' 172 UCSC_FTP_SERVER = 'hgdownload.cse.ucsc.edu'
173 UCSC_CHROM_FA_FILENAME = 'chromFa'
174 UCSC_DOWNLOAD_PATH = '/goldenPath/%s/bigZips/' 173 UCSC_DOWNLOAD_PATH = '/goldenPath/%s/bigZips/'
175 COMPRESSED_EXTENSIONS = [ ( '.tar.gz', _get_stream_readers_for_tar ), ( '.tar.bz2', _get_stream_readers_for_tar ), ( '.zip', _get_stream_readers_for_zip ), ( '.fa.gz', _get_stream_readers_for_gzip ), ( '.fa.bz2', _get_stream_readers_for_bz2 ) ] 174 COMPRESSED_EXTENSIONS = [ ( '.tar.gz', _get_stream_readers_for_tar ), ( '.tar.bz2', _get_stream_readers_for_tar ), ( '.zip', _get_stream_readers_for_zip ), ( '.fa.gz', _get_stream_readers_for_gzip ), ( '.fa.bz2', _get_stream_readers_for_bz2 ) ]
176 175
177 email = params['param_dict']['__user_email__'] 176 email = params['param_dict']['__user_email__']
178 if not email: 177 if not email:
179 email = 'anonymous@example.com' 178 email = 'anonymous@example.com'
180 179
181 ucsc_dbkey = params['param_dict']['reference_source']['requested_dbkey'] or dbkey 180 ucsc_dbkey = params['param_dict']['reference_source']['requested_dbkey'] or dbkey
181 UCSC_CHROM_FA_FILENAMES = [ '%s.chromFa' % ucsc_dbkey, 'chromFa' ]
182
182 ftp = FTP( UCSC_FTP_SERVER ) 183 ftp = FTP( UCSC_FTP_SERVER )
183 ftp.login( 'anonymous', email ) 184 ftp.login( 'anonymous', email )
184 185
185 ucsc_path = UCSC_DOWNLOAD_PATH % ucsc_dbkey 186 ucsc_path = UCSC_DOWNLOAD_PATH % ucsc_dbkey
186 path_contents = _get_files_in_ftp_path( ftp, ucsc_path ) 187 path_contents = _get_files_in_ftp_path( ftp, ucsc_path )
187 188
188 ucsc_file_name = None 189 ucsc_file_name = None
189 get_stream_reader = None 190 get_stream_reader = None
190 ext = None 191 ext = None
191 for ext, get_stream_reader in COMPRESSED_EXTENSIONS: 192 ucsc_chrom_fa_filename = None
192 if "%s%s" % ( UCSC_CHROM_FA_FILENAME, ext ) in path_contents: 193 for ucsc_chrom_fa_filename in UCSC_CHROM_FA_FILENAMES:
193 ucsc_file_name = "%s%s%s" % ( ucsc_path, UCSC_CHROM_FA_FILENAME, ext ) 194 for ext, get_stream_reader in COMPRESSED_EXTENSIONS:
195 if "%s%s" % ( ucsc_chrom_fa_filename, ext ) in path_contents:
196 ucsc_file_name = "%s%s%s" % ( ucsc_path, ucsc_chrom_fa_filename, ext )
197 break
198 if ucsc_file_name:
194 break 199 break
195 200
196 if not ucsc_file_name: 201 if not ucsc_file_name:
197 raise Exception( 'Unable to determine filename for UCSC Genome for %s: %s' % ( ucsc_dbkey, path_contents ) ) 202 raise Exception( 'Unable to determine filename for UCSC Genome for %s: %s' % ( ucsc_dbkey, path_contents ) )
198 203
199 204
200 tmp_dir = tempfile.mkdtemp( prefix='tmp-data-manager-ucsc-' ) 205 tmp_dir = tempfile.mkdtemp( prefix='tmp-data-manager-ucsc-' )
201 ucsc_fasta_filename = os.path.join( tmp_dir, "%s%s" % ( UCSC_CHROM_FA_FILENAME, ext ) ) 206 ucsc_fasta_filename = os.path.join( tmp_dir, "%s%s" % ( ucsc_chrom_fa_filename, ext ) )
202 207
203 fasta_base_filename = "%s.fa" % sequence_id 208 fasta_base_filename = "%s.fa" % sequence_id
204 fasta_filename = os.path.join( target_directory, fasta_base_filename ) 209 fasta_filename = os.path.join( target_directory, fasta_base_filename )
205 fasta_writer = open( fasta_filename, 'wb+' ) 210 fasta_writer = open( fasta_filename, 'wb+' )
206 211
329 parser.add_option( '-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description' ) 334 parser.add_option( '-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description' )
330 (options, args) = parser.parse_args() 335 (options, args) = parser.parse_args()
331 336
332 filename = args[0] 337 filename = args[0]
333 338
334 params = from_json_string( open( filename ).read() ) 339 params = loads( open( filename ).read() )
335 target_directory = params[ 'output_data' ][0]['extra_files_path'] 340 target_directory = params[ 'output_data' ][0]['extra_files_path']
336 os.mkdir( target_directory ) 341 os.mkdir( target_directory )
337 data_manager_dict = {} 342 data_manager_dict = {}
338 343
339 dbkey, sequence_id, sequence_name = get_dbkey_id_name( params, dbkey_description=options.dbkey_description ) 344 dbkey, sequence_id, sequence_name = get_dbkey_id_name( params, dbkey_description=options.dbkey_description )
343 348
344 #Fetch the FASTA 349 #Fetch the FASTA
345 REFERENCE_SOURCE_TO_DOWNLOAD[ params['param_dict']['reference_source']['reference_source_selector'] ]( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ) 350 REFERENCE_SOURCE_TO_DOWNLOAD[ params['param_dict']['reference_source']['reference_source_selector'] ]( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name )
346 351
347 #save info to json file 352 #save info to json file
348 open( filename, 'wb' ).write( to_json_string( data_manager_dict ) ) 353 open( filename, 'wb' ).write( dumps( data_manager_dict ) )
349 354
350 if __name__ == "__main__": main() 355 if __name__ == "__main__": main()