Mercurial > repos > devteam > data_manager_fetch_genome_all_fasta
comparison data_manager/data_manager_fetch_genome_all_fasta.py @ 1:ac850912d386 draft
Uploaded
author | devteam |
---|---|
date | Tue, 03 Feb 2015 10:01:30 -0500 |
parents | 2ebc856bce29 |
children | cca219f2b212 |
comparison
Legend: equal, deleted, inserted, replaced
0:2ebc856bce29 | 1:ac850912d386 |
---|---|
12 import tarfile | 12 import tarfile |
13 import zipfile | 13 import zipfile |
14 import gzip | 14 import gzip |
15 import bz2 | 15 import bz2 |
16 | 16 |
17 from galaxy.util.json import from_json_string, to_json_string | 17 from json import loads, dumps |
18 | 18 |
19 | 19 |
20 CHUNK_SIZE = 2**20 #1mb | 20 CHUNK_SIZE = 2**20 #1mb |
21 | 21 |
22 def cleanup_before_exit( tmp_dir ): | 22 def cleanup_before_exit( tmp_dir ): |
46 ftp.retrlines( 'MLSD %s' % ( path ), path_contents.append ) | 46 ftp.retrlines( 'MLSD %s' % ( path ), path_contents.append ) |
47 return [ line.split( ';' )[ -1 ].lstrip() for line in path_contents ] | 47 return [ line.split( ';' )[ -1 ].lstrip() for line in path_contents ] |
48 | 48 |
49 def _get_stream_readers_for_tar( file_obj, tmp_dir ): | 49 def _get_stream_readers_for_tar( file_obj, tmp_dir ): |
50 fasta_tar = tarfile.open( fileobj=file_obj, mode='r:*' ) | 50 fasta_tar = tarfile.open( fileobj=file_obj, mode='r:*' ) |
51 return [ fasta_tar.extractfile( member ) for member in fasta_tar.getmembers() ] | 51 return filter( lambda x: x is not None, [ fasta_tar.extractfile( member ) for member in fasta_tar.getmembers() ] ) |
52 | 52 |
53 def _get_stream_readers_for_zip( file_obj, tmp_dir ): | 53 def _get_stream_readers_for_zip( file_obj, tmp_dir ): |
54 fasta_zip = zipfile.ZipFile( file_obj, 'r' ) | 54 fasta_zip = zipfile.ZipFile( file_obj, 'r' ) |
55 rval = [] | 55 rval = [] |
56 for member in fasta_zip.namelist(): | 56 for member in fasta_zip.namelist(): |
168 else: | 168 else: |
169 _write_sorted_fasta( sorted_names, fasta_offsets, fasta_filename, unsorted_filename ) | 169 _write_sorted_fasta( sorted_names, fasta_offsets, fasta_filename, unsorted_filename ) |
170 | 170 |
171 def download_from_ucsc( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ): | 171 def download_from_ucsc( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ): |
172 UCSC_FTP_SERVER = 'hgdownload.cse.ucsc.edu' | 172 UCSC_FTP_SERVER = 'hgdownload.cse.ucsc.edu' |
173 UCSC_CHROM_FA_FILENAME = 'chromFa' | |
174 UCSC_DOWNLOAD_PATH = '/goldenPath/%s/bigZips/' | 173 UCSC_DOWNLOAD_PATH = '/goldenPath/%s/bigZips/' |
175 COMPRESSED_EXTENSIONS = [ ( '.tar.gz', _get_stream_readers_for_tar ), ( '.tar.bz2', _get_stream_readers_for_tar ), ( '.zip', _get_stream_readers_for_zip ), ( '.fa.gz', _get_stream_readers_for_gzip ), ( '.fa.bz2', _get_stream_readers_for_bz2 ) ] | 174 COMPRESSED_EXTENSIONS = [ ( '.tar.gz', _get_stream_readers_for_tar ), ( '.tar.bz2', _get_stream_readers_for_tar ), ( '.zip', _get_stream_readers_for_zip ), ( '.fa.gz', _get_stream_readers_for_gzip ), ( '.fa.bz2', _get_stream_readers_for_bz2 ) ] |
176 | 175 |
177 email = params['param_dict']['__user_email__'] | 176 email = params['param_dict']['__user_email__'] |
178 if not email: | 177 if not email: |
179 email = 'anonymous@example.com' | 178 email = 'anonymous@example.com' |
180 | 179 |
181 ucsc_dbkey = params['param_dict']['reference_source']['requested_dbkey'] or dbkey | 180 ucsc_dbkey = params['param_dict']['reference_source']['requested_dbkey'] or dbkey |
181 UCSC_CHROM_FA_FILENAMES = [ '%s.chromFa' % ucsc_dbkey, 'chromFa' ] | |
182 | |
182 ftp = FTP( UCSC_FTP_SERVER ) | 183 ftp = FTP( UCSC_FTP_SERVER ) |
183 ftp.login( 'anonymous', email ) | 184 ftp.login( 'anonymous', email ) |
184 | 185 |
185 ucsc_path = UCSC_DOWNLOAD_PATH % ucsc_dbkey | 186 ucsc_path = UCSC_DOWNLOAD_PATH % ucsc_dbkey |
186 path_contents = _get_files_in_ftp_path( ftp, ucsc_path ) | 187 path_contents = _get_files_in_ftp_path( ftp, ucsc_path ) |
187 | 188 |
188 ucsc_file_name = None | 189 ucsc_file_name = None |
189 get_stream_reader = None | 190 get_stream_reader = None |
190 ext = None | 191 ext = None |
191 for ext, get_stream_reader in COMPRESSED_EXTENSIONS: | 192 ucsc_chrom_fa_filename = None |
192 if "%s%s" % ( UCSC_CHROM_FA_FILENAME, ext ) in path_contents: | 193 for ucsc_chrom_fa_filename in UCSC_CHROM_FA_FILENAMES: |
193 ucsc_file_name = "%s%s%s" % ( ucsc_path, UCSC_CHROM_FA_FILENAME, ext ) | 194 for ext, get_stream_reader in COMPRESSED_EXTENSIONS: |
195 if "%s%s" % ( ucsc_chrom_fa_filename, ext ) in path_contents: | |
196 ucsc_file_name = "%s%s%s" % ( ucsc_path, ucsc_chrom_fa_filename, ext ) | |
197 break | |
198 if ucsc_file_name: | |
194 break | 199 break |
195 | 200 |
196 if not ucsc_file_name: | 201 if not ucsc_file_name: |
197 raise Exception( 'Unable to determine filename for UCSC Genome for %s: %s' % ( ucsc_dbkey, path_contents ) ) | 202 raise Exception( 'Unable to determine filename for UCSC Genome for %s: %s' % ( ucsc_dbkey, path_contents ) ) |
198 | 203 |
199 | 204 |
200 tmp_dir = tempfile.mkdtemp( prefix='tmp-data-manager-ucsc-' ) | 205 tmp_dir = tempfile.mkdtemp( prefix='tmp-data-manager-ucsc-' ) |
201 ucsc_fasta_filename = os.path.join( tmp_dir, "%s%s" % ( UCSC_CHROM_FA_FILENAME, ext ) ) | 206 ucsc_fasta_filename = os.path.join( tmp_dir, "%s%s" % ( ucsc_chrom_fa_filename, ext ) ) |
202 | 207 |
203 fasta_base_filename = "%s.fa" % sequence_id | 208 fasta_base_filename = "%s.fa" % sequence_id |
204 fasta_filename = os.path.join( target_directory, fasta_base_filename ) | 209 fasta_filename = os.path.join( target_directory, fasta_base_filename ) |
205 fasta_writer = open( fasta_filename, 'wb+' ) | 210 fasta_writer = open( fasta_filename, 'wb+' ) |
206 | 211 |
329 parser.add_option( '-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description' ) | 334 parser.add_option( '-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description' ) |
330 (options, args) = parser.parse_args() | 335 (options, args) = parser.parse_args() |
331 | 336 |
332 filename = args[0] | 337 filename = args[0] |
333 | 338 |
334 params = from_json_string( open( filename ).read() ) | 339 params = loads( open( filename ).read() ) |
335 target_directory = params[ 'output_data' ][0]['extra_files_path'] | 340 target_directory = params[ 'output_data' ][0]['extra_files_path'] |
336 os.mkdir( target_directory ) | 341 os.mkdir( target_directory ) |
337 data_manager_dict = {} | 342 data_manager_dict = {} |
338 | 343 |
339 dbkey, sequence_id, sequence_name = get_dbkey_id_name( params, dbkey_description=options.dbkey_description ) | 344 dbkey, sequence_id, sequence_name = get_dbkey_id_name( params, dbkey_description=options.dbkey_description ) |
343 | 348 |
344 #Fetch the FASTA | 349 #Fetch the FASTA |
345 REFERENCE_SOURCE_TO_DOWNLOAD[ params['param_dict']['reference_source']['reference_source_selector'] ]( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ) | 350 REFERENCE_SOURCE_TO_DOWNLOAD[ params['param_dict']['reference_source']['reference_source_selector'] ]( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ) |
346 | 351 |
347 #save info to json file | 352 #save info to json file |
348 open( filename, 'wb' ).write( to_json_string( data_manager_dict ) ) | 353 open( filename, 'wb' ).write( dumps( data_manager_dict ) ) |
349 | 354 |
350 if __name__ == "__main__": main() | 355 if __name__ == "__main__": main() |