comparison gstf_preparation.py @ 4:284f64ad9d43 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
author earlhaminst
date Fri, 08 Dec 2017 05:32:12 -0500
parents 7e11a7f4bdba
children b3ba0c84667c
comparison
equal deleted inserted replaced
3:7e11a7f4bdba 4:284f64ad9d43
249 if not results: 249 if not results:
250 return None 250 return None
251 return results[0] 251 return results[0]
252 252
253 253
254 def fetch_gene_id_for_transcript(conn, transcript_id):
255 cur = conn.cursor()
256
257 cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?',
258 (transcript_id, ))
259 results = cur.fetchone()
260 if not results:
261 return None
262 return results[0]
263
264
254 def remove_id_version(s): 265 def remove_id_version(s):
255 """ 266 """
256 Remove the optional '.VERSION' from an Ensembl id. 267 Remove the optional '.VERSION' from an Ensembl id.
257 """ 268 """
258 if s.startswith('ENS'): 269 if s.startswith('ENS'):
264 def __main__(): 275 def __main__():
265 parser = optparse.OptionParser() 276 parser = optparse.OptionParser()
266 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') 277 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
267 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') 278 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
268 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') 279 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files')
280 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene')
281 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format')
269 parser.add_option('-o', '--output', help='Path of the output SQLite file') 282 parser.add_option('-o', '--output', help='Path of the output SQLite file')
270 parser.add_option('--of', help='Path of the output FASTA file') 283 parser.add_option('--of', help='Path of the output FASTA file')
271 options, args = parser.parse_args() 284 options, args = parser.parse_args()
272 if args: 285 if args:
273 raise Exception('Use options to provide inputs') 286 raise Exception('Use options to provide inputs')
285 transcript_dict = dict() 298 transcript_dict = dict()
286 exon_parent_dict = dict() 299 exon_parent_dict = dict()
287 cds_parent_dict = dict() 300 cds_parent_dict = dict()
288 five_prime_utr_parent_dict = dict() 301 five_prime_utr_parent_dict = dict()
289 three_prime_utr_parent_dict = dict() 302 three_prime_utr_parent_dict = dict()
303
290 with open(filename) as f: 304 with open(filename) as f:
291 for i, line in enumerate(f, start=1): 305 for i, line in enumerate(f, start=1):
292 line = line.strip() 306 line = line.strip()
293 if not line: 307 if not line:
294 # skip empty lines 308 # skip empty lines
323 337
324 for json_arg in options.json: 338 for json_arg in options.json:
325 with open(json_arg) as f: 339 with open(json_arg) as f:
326 write_gene_dict_to_db(conn, json.load(f)) 340 write_gene_dict_to_db(conn, json.load(f))
327 341
328 with open(options.of, 'w') as output_fasta_file: 342 if options.longestCDS:
343 gene_transcripts_dict = dict()
329 for fasta_arg in options.fasta: 344 for fasta_arg in options.fasta:
330 for entry in FASTAReader_gen(fasta_arg): 345 for entry in FASTAReader_gen(fasta_arg):
331 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id 346 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id
332 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) 347 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
348
349 gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
350 if not gene_id:
351 continue
352
353 if gene_id in gene_transcripts_dict:
354 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
355 else:
356 gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
357
358 # For each gene, select the transcript with the longest sequence
359 # If more than one transcripts have the same longest sequence for a gene, the
360 # first one to appear in the FASTA file is selected
361 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()]
362
363 with open(options.of, 'w') as output_fasta_file:
364 for fasta_arg in options.fasta:
365 for entry in FASTAReader_gen(fasta_arg):
366 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
367 if options.longestCDS and transcript_id not in selected_transcript_ids:
368 continue
369
333 species_for_transcript = fetch_species_for_transcript(conn, transcript_id) 370 species_for_transcript = fetch_species_for_transcript(conn, transcript_id)
334 if not species_for_transcript: 371 if not species_for_transcript:
335 print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr) 372 print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr)
336 continue 373 continue
337 # Remove any underscore in the species 374
338 species_for_transcript = species_for_transcript.replace('_', '') 375 if options.headers:
339 # Write the FASTA sequence using '>TranscriptId_species' as the header, as required by TreeBest 376 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
340 output_fasta_file.write(">%s_%s\n%s\n" % (transcript_id, species_for_transcript, entry.sequence)) 377 # Remove any underscore in the species
378 header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', ''))
379 else:
380 header = entry.header
381
382 output_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
341 383
342 conn.close() 384 conn.close()
343 385
344 386
345 if __name__ == '__main__': 387 if __name__ == '__main__':