comparison data_manager/data_manager_fetch_tx2gene.py @ 3:d71f65b854de draft

Uploaded
author ieguinoa
date Fri, 19 Oct 2018 07:36:02 -0400
parents a4d4da89aae1
children c380b7da5b65
comparison
equal deleted inserted replaced
2:7d3ffe28ff3f 3:d71f65b854de
9 from ftplib import FTP 9 from ftplib import FTP
10 import tarfile 10 import tarfile
11 import zipfile 11 import zipfile
12 import gzip 12 import gzip
13 import bz2 13 import bz2
14 import subprocess
14 try: 15 try:
15 # For Python 3.0 and later 16 # For Python 3.0 and later
16 from urllib.request import urlopen 17 from urllib.request import urlopen
17 from io import BytesIO as StringIO 18 from io import BytesIO as StringIO
18 from io import UnsupportedOperation 19 from io import UnsupportedOperation
91 92
92 def _get_stream_readers_for_bz2( fh, tmp_dir ): 93 def _get_stream_readers_for_bz2( fh, tmp_dir ):
93 return [ bz2.BZ2File( fh.name, 'rb') ] 94 return [ bz2.BZ2File( fh.name, 'rb') ]
94 95
95 96
96 def convert_tx2gene( fasta_filename, file_type, params ): 97 def convert_to_tx2gene( rscript_gff_to_tx2gene, fasta_filename, file_type, params ):
97 if file_type is 'tx2gene': 98 if file_type == 'tx2gene':
98 return #no need to extract tx2gene table 99 return #no need to extract tx2gene table
100 #print file_type
99 #If the file is actually a GFF/GTF file then extract the tx2gene 101 #If the file is actually a GFF/GTF file then extract the tx2gene
100 gff_temp_filename = tempfile.NamedTemporaryFile().name 102 gff_temp_filename = tempfile.NamedTemporaryFile().name
101 shutil.move(fasta_filename, gff_temp_filename) 103 shutil.move(fasta_filename, gff_temp_filename)
102 args= ['Rscript'] 104 args= ['Rscript']
103 args.append(RSCRIPT_GFF_TO_TX2GENE) 105 args.append(rscript_gff_to_tx2gene)
104 args.append(gff_temp_filename) 106 args.extend(['-x',gff_temp_filename])
105 args.append(fasta_filename) 107 args.extend(['-o',fasta_filename])
106 108 args.extend(['-t',file_type])
107 #assert sort_method in SORTING_METHODS, ValueError( "%s is not a valid sorting option." % sort_method ) 109 tmp_stderr = tempfile.NamedTemporaryFile( prefix = "tmp-stderr" )
108 #return SORTING_METHODS[ sort_method ]( fasta_filename, params ) 110 return_code = subprocess.call( args=args, shell=False, stderr=tmp_stderr.fileno() )
109 111 #return_code = subprocess.call( args=args, shell=False, stderr=None)
112 if return_code:
113 tmp_stderr.flush()
114 tmp_stderr.seek(0)
115 print >> sys.stderr, "Error in process call"
116 while True:
117 chunk = tmp_stderr.read( CHUNK_SIZE )
118 if not chunk:
119 break
120 sys.stderr.write( chunk )
121 sys.exit( return_code )
122 tmp_stderr.close()
123
124
125
110 def _download_file(start, fh): 126 def _download_file(start, fh):
111 tmp = tempfile.NamedTemporaryFile() 127 tmp = tempfile.NamedTemporaryFile()
112 tmp.write(start) 128 tmp.write(start)
113 tmp.write(fh.read()) 129 tmp.write(fh.read())
114 tmp.flush() 130 tmp.flush()
141 pass 157 pass
142 return fh 158 return fh
143 159
144 160
145 161
146 def add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params): 162 def add_fasta_to_table(rscript_gff_to_tx2gene, data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params):
147 for data_table_name, data_table_entry in _stream_fasta_to_file( fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ): 163 for data_table_name, data_table_entry in _stream_fasta_to_file(rscript_gff_to_tx2gene, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ):
148 if data_table_entry: 164 if data_table_entry:
149 _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name ) 165 _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name )
150 166
151 167
152 def download_from_url( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): 168 def download_from_url(rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ):
153 urls = filter( bool, map( lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split( '\n' ) ) ) 169 urls = filter( bool, map( lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split( '\n' ) ) )
154 fasta_readers = [ get_stream_reader(urlopen( url ), tmp_dir) for url in urls ] 170 fasta_readers = [ get_stream_reader(urlopen( url ), tmp_dir) for url in urls ]
155 add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id,sequence_name, params) 171 add_fasta_to_table(rscript_gff_to_tx2gene,data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id,sequence_name, params)
156 172
157 173
158 def download_from_history( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): 174 def download_from_history(rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ):
159 #TODO: allow multiple FASTA input files 175 #TODO: allow multiple FASTA input files
160 input_filename = params['param_dict']['reference_source']['input_fasta'] 176 input_filename = params['param_dict']['reference_source']['input_fasta']
161 if isinstance( input_filename, list ): 177 if isinstance( input_filename, list ):
162 fasta_readers = [ get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename ] 178 fasta_readers = [ get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename ]
163 else: 179 else:
164 fasta_readers = get_stream_reader(open(input_filename), tmp_dir) 180 fasta_readers = get_stream_reader(open(input_filename), tmp_dir)
165 add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params) 181 add_fasta_to_table(rscript_gff_to_tx2gene,data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params)
166 182
167 183
168 def copy_from_directory( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): 184 def copy_from_directory(rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ):
169 input_filename = params['param_dict']['reference_source']['fasta_filename'] 185 input_filename = params['param_dict']['reference_source']['fasta_filename']
170 create_symlink = params['param_dict']['reference_source']['create_symlink'] == 'create_symlink' 186 create_symlink = params['param_dict']['reference_source']['create_symlink'] == 'create_symlink'
171 if create_symlink: 187 if create_symlink:
172 data_table_entries = _create_symlink( input_filename, target_directory, dbkey, dbkey_name, sequence_id, sequence_name ) 188 data_table_entries = _create_symlink( input_filename, target_directory, dbkey, dbkey_name, sequence_id, sequence_name )
173 else: 189 else:
174 if isinstance( input_filename, list ): 190 if isinstance( input_filename, list ):
175 fasta_readers = [ get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename ] 191 fasta_readers = [ get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename ]
176 else: 192 else:
177 fasta_readers = get_stream_reader(open(input_filename), tmp_dir) 193 fasta_readers = get_stream_reader(open(input_filename), tmp_dir)
178 data_table_entries = _stream_fasta_to_file( fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ) 194 data_table_entries = _stream_fasta_to_file(rscript_gff_to_tx2gene, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params )
179 for data_table_name, data_table_entry in data_table_entries: 195 for data_table_name, data_table_entry in data_table_entries:
180 if data_table_entry: 196 if data_table_entry:
181 _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name ) 197 _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name )
182 198
183 199
186 data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get( DATA_TABLE_NAME, [] ) 202 data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get( DATA_TABLE_NAME, [] )
187 data_manager_dict['data_tables'][data_table_name].append( data_table_entry ) 203 data_manager_dict['data_tables'][data_table_name].append( data_table_entry )
188 return data_manager_dict 204 return data_manager_dict
189 205
190 206
191 def _stream_fasta_to_file( fasta_stream, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params, close_stream=True ): 207 def _stream_fasta_to_file( rscript_gff_to_tx2gene, fasta_stream, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params, close_stream=True ):
192 fasta_base_filename = "%s_tx2gene.tab" % sequence_id 208 fasta_base_filename = "%s_tx2gene.tab" % sequence_id
193 fasta_filename = os.path.join( target_directory, fasta_base_filename ) 209 fasta_filename = os.path.join( target_directory, fasta_base_filename )
194 with open( fasta_filename, 'wb+' ) as fasta_writer: 210 with open( fasta_filename, 'wb+' ) as fasta_writer:
195 211
196 if isinstance( fasta_stream, list ) and len( fasta_stream ) == 1: 212 if isinstance( fasta_stream, list ) and len( fasta_stream ) == 1:
218 else: 234 else:
219 break 235 break
220 if close_stream: 236 if close_stream:
221 fasta_stream.close() 237 fasta_stream.close()
222 238
223 convert_to_tx2gene( fasta_filename, params['param_dict']['file_type'], params ) 239 convert_to_tx2gene( rscript_gff_to_tx2gene,fasta_filename, params['param_dict']['file_type'], params )
224 return [ ( DATA_TABLE_NAME, dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename ) ) ] 240 return [ ( DATA_TABLE_NAME, dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename ) ) ]
225 241
226 242
227 def compute_fasta_length( fasta_file, out_file, keep_first_word=False ): 243 def compute_fasta_length( fasta_file, out_file, keep_first_word=False ):
228 244
269 285
270 def main(): 286 def main():
271 #Parse Command Line 287 #Parse Command Line
272 parser = optparse.OptionParser() 288 parser = optparse.OptionParser()
273 parser.add_option( '-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description' ) 289 parser.add_option( '-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description' )
290 parser.add_option( '-b', '--base_dir', dest='base_dir', action='store', type='string', default=None, help='base_dir')
274 parser.add_option( '-t', '--type', dest='file_type', action='store', type='string', default=None, help='file_type') 291 parser.add_option( '-t', '--type', dest='file_type', action='store', type='string', default=None, help='file_type')
275 (options, args) = parser.parse_args() 292 (options, args) = parser.parse_args()
276 293
277 filename = args[0] 294 filename = args[0]
278 #global DATA_TABLE_NAME 295 #global DATA_TABLE_NAME
279 global RSCRIPT_GFF_TO_TX2GENE= os.path.join( options.base_dir, 'tximport.r') 296 rscript_gff_to_tx2gene=os.path.join( options.base_dir, 'get_tx2gene_table.R')
280 297
281 298 #input_type='gff_gtf'
282 if options.file_type == 'gff_gtf': 299 #if options.file_type != 'gff_gtf':
283 #DATA_TABLE_NAME= 'representative_gff' 300 # file_type='tx2gene'
284 else: #file_type='tx2gene'
285 301
286 params = loads( open( filename ).read() ) 302 params = loads( open( filename ).read() )
287 target_directory = params[ 'output_data' ][0]['extra_files_path'] 303 target_directory = params[ 'output_data' ][0]['extra_files_path']
288 os.mkdir( target_directory ) 304 os.mkdir( target_directory )
289 data_manager_dict = {} 305 data_manager_dict = {}
295 311
296 # Create a tmp_dir, in case a zip file needs to be uncompressed 312 # Create a tmp_dir, in case a zip file needs to be uncompressed
297 tmp_dir = tempfile.mkdtemp() 313 tmp_dir = tempfile.mkdtemp()
298 #Fetch the input file 314 #Fetch the input file
299 try: 315 try:
300 REFERENCE_SOURCE_TO_DOWNLOAD[ params['param_dict']['reference_source']['reference_source_selector'] ]( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir) 316 REFERENCE_SOURCE_TO_DOWNLOAD[ params['param_dict']['reference_source']['reference_source_selector'] ]( rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir)
301 finally: 317 finally:
302 cleanup_before_exit(tmp_dir) 318 cleanup_before_exit(tmp_dir)
303 #save info to json file 319 #save info to json file
304 open( filename, 'wb' ).write( dumps( data_manager_dict ).encode() ) 320 open( filename, 'wb' ).write( dumps( data_manager_dict ).encode() )
305 321