Mercurial > repos > ieguinoa > data_manager_fetch_tx2gene
comparison data_manager/data_manager_fetch_tx2gene.py @ 3:d71f65b854de draft
Uploaded
| author | ieguinoa |
|---|---|
| date | Fri, 19 Oct 2018 07:36:02 -0400 |
| parents | a4d4da89aae1 |
| children | c380b7da5b65 |
comparison
equal
deleted
inserted
replaced
| 2:7d3ffe28ff3f | 3:d71f65b854de |
|---|---|
| 9 from ftplib import FTP | 9 from ftplib import FTP |
| 10 import tarfile | 10 import tarfile |
| 11 import zipfile | 11 import zipfile |
| 12 import gzip | 12 import gzip |
| 13 import bz2 | 13 import bz2 |
| | 14 import subprocess |
| 14 try: | 15 try: |
| 15 # For Python 3.0 and later | 16 # For Python 3.0 and later |
| 16 from urllib.request import urlopen | 17 from urllib.request import urlopen |
| 17 from io import BytesIO as StringIO | 18 from io import BytesIO as StringIO |
| 18 from io import UnsupportedOperation | 19 from io import UnsupportedOperation |
| 91 | 92 |
| 92 def _get_stream_readers_for_bz2( fh, tmp_dir ): | 93 def _get_stream_readers_for_bz2( fh, tmp_dir ): |
| 93 return [ bz2.BZ2File( fh.name, 'rb') ] | 94 return [ bz2.BZ2File( fh.name, 'rb') ] |
| 94 | 95 |
| 95 | 96 |
| 96 def convert_tx2gene( fasta_filename, file_type, params ): | 97 def convert_to_tx2gene( rscript_gff_to_tx2gene, fasta_filename, file_type, params ): |
| 97 if file_type is 'tx2gene': | 98 if file_type == 'tx2gene': |
| 98 return #no need to extract tx2gene table | 99 return #no need to extract tx2gene table |
| | 100 #print file_type |
| 99 #If the file is actually a GFF/GTF file then extract the tx2gene | 101 #If the file is actually a GFF/GTF file then extract the tx2gene |
| 100 gff_temp_filename = tempfile.NamedTemporaryFile().name | 102 gff_temp_filename = tempfile.NamedTemporaryFile().name |
| 101 shutil.move(fasta_filename, gff_temp_filename) | 103 shutil.move(fasta_filename, gff_temp_filename) |
| 102 args= ['Rscript'] | 104 args= ['Rscript'] |
| 103 args.append(RSCRIPT_GFF_TO_TX2GENE) | 105 args.append(rscript_gff_to_tx2gene) |
| 104 args.append(gff_temp_filename) | 106 args.extend(['-x',gff_temp_filename]) |
| 105 args.append(fasta_filename) | 107 args.extend(['-o',fasta_filename]) |
| 106 | 108 args.extend(['-t',file_type]) |
| 107 #assert sort_method in SORTING_METHODS, ValueError( "%s is not a valid sorting option." % sort_method ) | 109 tmp_stderr = tempfile.NamedTemporaryFile( prefix = "tmp-stderr" ) |
| 108 #return SORTING_METHODS[ sort_method ]( fasta_filename, params ) | 110 return_code = subprocess.call( args=args, shell=False, stderr=tmp_stderr.fileno() ) |
| 109 | 111 #return_code = subprocess.call( args=args, shell=False, stderr=None) |
| | 112 if return_code: |
| | 113 tmp_stderr.flush() |
| | 114 tmp_stderr.seek(0) |
| | 115 print >> sys.stderr, "Error in process call" |
| | 116 while True: |
| | 117 chunk = tmp_stderr.read( CHUNK_SIZE ) |
| | 118 if not chunk: |
| | 119 break |
| | 120 sys.stderr.write( chunk ) |
| | 121 sys.exit( return_code ) |
| | 122 tmp_stderr.close() |
| | 123 |
| | 124 |
| | 125 |
| 110 def _download_file(start, fh): | 126 def _download_file(start, fh): |
| 111 tmp = tempfile.NamedTemporaryFile() | 127 tmp = tempfile.NamedTemporaryFile() |
| 112 tmp.write(start) | 128 tmp.write(start) |
| 113 tmp.write(fh.read()) | 129 tmp.write(fh.read()) |
| 114 tmp.flush() | 130 tmp.flush() |
| 141 pass | 157 pass |
| 142 return fh | 158 return fh |
| 143 | 159 |
| 144 | 160 |
| 145 | 161 |
| 146 def add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params): | 162 def add_fasta_to_table(rscript_gff_to_tx2gene, data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params): |
| 147 for data_table_name, data_table_entry in _stream_fasta_to_file( fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ): | 163 for data_table_name, data_table_entry in _stream_fasta_to_file(rscript_gff_to_tx2gene, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ): |
| 148 if data_table_entry: | 164 if data_table_entry: |
| 149 _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name ) | 165 _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name ) |
| 150 | 166 |
| 151 | 167 |
| 152 def download_from_url( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): | 168 def download_from_url(rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): |
| 153 urls = filter( bool, map( lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split( '\n' ) ) ) | 169 urls = filter( bool, map( lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split( '\n' ) ) ) |
| 154 fasta_readers = [ get_stream_reader(urlopen( url ), tmp_dir) for url in urls ] | 170 fasta_readers = [ get_stream_reader(urlopen( url ), tmp_dir) for url in urls ] |
| 155 add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id,sequence_name, params) | 171 add_fasta_to_table(rscript_gff_to_tx2gene,data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id,sequence_name, params) |
| 156 | 172 |
| 157 | 173 |
| 158 def download_from_history( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): | 174 def download_from_history(rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): |
| 159 #TODO: allow multiple FASTA input files | 175 #TODO: allow multiple FASTA input files |
| 160 input_filename = params['param_dict']['reference_source']['input_fasta'] | 176 input_filename = params['param_dict']['reference_source']['input_fasta'] |
| 161 if isinstance( input_filename, list ): | 177 if isinstance( input_filename, list ): |
| 162 fasta_readers = [ get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename ] | 178 fasta_readers = [ get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename ] |
| 163 else: | 179 else: |
| 164 fasta_readers = get_stream_reader(open(input_filename), tmp_dir) | 180 fasta_readers = get_stream_reader(open(input_filename), tmp_dir) |
| 165 add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params) | 181 add_fasta_to_table(rscript_gff_to_tx2gene,data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params) |
| 166 | 182 |
| 167 | 183 |
| 168 def copy_from_directory( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): | 184 def copy_from_directory(rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): |
| 169 input_filename = params['param_dict']['reference_source']['fasta_filename'] | 185 input_filename = params['param_dict']['reference_source']['fasta_filename'] |
| 170 create_symlink = params['param_dict']['reference_source']['create_symlink'] == 'create_symlink' | 186 create_symlink = params['param_dict']['reference_source']['create_symlink'] == 'create_symlink' |
| 171 if create_symlink: | 187 if create_symlink: |
| 172 data_table_entries = _create_symlink( input_filename, target_directory, dbkey, dbkey_name, sequence_id, sequence_name ) | 188 data_table_entries = _create_symlink( input_filename, target_directory, dbkey, dbkey_name, sequence_id, sequence_name ) |
| 173 else: | 189 else: |
| 174 if isinstance( input_filename, list ): | 190 if isinstance( input_filename, list ): |
| 175 fasta_readers = [ get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename ] | 191 fasta_readers = [ get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename ] |
| 176 else: | 192 else: |
| 177 fasta_readers = get_stream_reader(open(input_filename), tmp_dir) | 193 fasta_readers = get_stream_reader(open(input_filename), tmp_dir) |
| 178 data_table_entries = _stream_fasta_to_file( fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ) | 194 data_table_entries = _stream_fasta_to_file(rscript_gff_to_tx2gene, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ) |
| 179 for data_table_name, data_table_entry in data_table_entries: | 195 for data_table_name, data_table_entry in data_table_entries: |
| 180 if data_table_entry: | 196 if data_table_entry: |
| 181 _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name ) | 197 _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name ) |
| 182 | 198 |
| 183 | 199 |
| 186 data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get( DATA_TABLE_NAME, [] ) | 202 data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get( DATA_TABLE_NAME, [] ) |
| 187 data_manager_dict['data_tables'][data_table_name].append( data_table_entry ) | 203 data_manager_dict['data_tables'][data_table_name].append( data_table_entry ) |
| 188 return data_manager_dict | 204 return data_manager_dict |
| 189 | 205 |
| 190 | 206 |
| 191 def _stream_fasta_to_file( fasta_stream, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params, close_stream=True ): | 207 def _stream_fasta_to_file( rscript_gff_to_tx2gene, fasta_stream, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params, close_stream=True ): |
| 192 fasta_base_filename = "%s_tx2gene.tab" % sequence_id | 208 fasta_base_filename = "%s_tx2gene.tab" % sequence_id |
| 193 fasta_filename = os.path.join( target_directory, fasta_base_filename ) | 209 fasta_filename = os.path.join( target_directory, fasta_base_filename ) |
| 194 with open( fasta_filename, 'wb+' ) as fasta_writer: | 210 with open( fasta_filename, 'wb+' ) as fasta_writer: |
| 195 | 211 |
| 196 if isinstance( fasta_stream, list ) and len( fasta_stream ) == 1: | 212 if isinstance( fasta_stream, list ) and len( fasta_stream ) == 1: |
| 218 else: | 234 else: |
| 219 break | 235 break |
| 220 if close_stream: | 236 if close_stream: |
| 221 fasta_stream.close() | 237 fasta_stream.close() |
| 222 | 238 |
| 223 convert_to_tx2gene( fasta_filename, params['param_dict']['file_type'], params ) | 239 convert_to_tx2gene( rscript_gff_to_tx2gene,fasta_filename, params['param_dict']['file_type'], params ) |
| 224 return [ ( DATA_TABLE_NAME, dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename ) ) ] | 240 return [ ( DATA_TABLE_NAME, dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename ) ) ] |
| 225 | 241 |
| 226 | 242 |
| 227 def compute_fasta_length( fasta_file, out_file, keep_first_word=False ): | 243 def compute_fasta_length( fasta_file, out_file, keep_first_word=False ): |
| 228 | 244 |
| 269 | 285 |
| 270 def main(): | 286 def main(): |
| 271 #Parse Command Line | 287 #Parse Command Line |
| 272 parser = optparse.OptionParser() | 288 parser = optparse.OptionParser() |
| 273 parser.add_option( '-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description' ) | 289 parser.add_option( '-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description' ) |
| | 290 parser.add_option( '-b', '--base_dir', dest='base_dir', action='store', type='string', default=None, help='base_dir') |
| 274 parser.add_option( '-t', '--type', dest='file_type', action='store', type='string', default=None, help='file_type') | 291 parser.add_option( '-t', '--type', dest='file_type', action='store', type='string', default=None, help='file_type') |
| 275 (options, args) = parser.parse_args() | 292 (options, args) = parser.parse_args() |
| 276 | 293 |
| 277 filename = args[0] | 294 filename = args[0] |
| 278 #global DATA_TABLE_NAME | 295 #global DATA_TABLE_NAME |
| 279 global RSCRIPT_GFF_TO_TX2GENE= os.path.join( options.base_dir, 'tximport.r') | 296 rscript_gff_to_tx2gene=os.path.join( options.base_dir, 'get_tx2gene_table.R') |
| 280 | 297 |
| 281 | 298 #input_type='gff_gtf' |
| 282 if options.file_type == 'gff_gtf': | 299 #if options.file_type != 'gff_gtf': |
| 283 #DATA_TABLE_NAME= 'representative_gff' | 300 # file_type='tx2gene' |
| 284 else: #file_type='tx2gene' | |
| 285 | 301 |
| 286 params = loads( open( filename ).read() ) | 302 params = loads( open( filename ).read() ) |
| 287 target_directory = params[ 'output_data' ][0]['extra_files_path'] | 303 target_directory = params[ 'output_data' ][0]['extra_files_path'] |
| 288 os.mkdir( target_directory ) | 304 os.mkdir( target_directory ) |
| 289 data_manager_dict = {} | 305 data_manager_dict = {} |
| 295 | 311 |
| 296 # Create a tmp_dir, in case a zip file needs to be uncompressed | 312 # Create a tmp_dir, in case a zip file needs to be uncompressed |
| 297 tmp_dir = tempfile.mkdtemp() | 313 tmp_dir = tempfile.mkdtemp() |
| 298 #Fetch the input file | 314 #Fetch the input file |
| 299 try: | 315 try: |
| 300 REFERENCE_SOURCE_TO_DOWNLOAD[ params['param_dict']['reference_source']['reference_source_selector'] ]( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir) | 316 REFERENCE_SOURCE_TO_DOWNLOAD[ params['param_dict']['reference_source']['reference_source_selector'] ]( rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir) |
| 301 finally: | 317 finally: |
| 302 cleanup_before_exit(tmp_dir) | 318 cleanup_before_exit(tmp_dir) |
| 303 #save info to json file | 319 #save info to json file |
| 304 open( filename, 'wb' ).write( dumps( data_manager_dict ).encode() ) | 320 open( filename, 'wb' ).write( dumps( data_manager_dict ).encode() ) |
| 305 | 321 |
