comparison gstf_preparation.py @ 15:9c62ad7dd113 draft default tip

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit a4e49466bf746633ddc26d052b65ca41576d18fb"
author earlhaminst
date Thu, 29 Apr 2021 19:46:38 +0000
parents 598e9172b8e7
children
comparison
equal deleted inserted replaced
14:598e9172b8e7 15:9c62ad7dd113
134 if cols[6] == '+': 134 if cols[6] == '+':
135 d['strand'] = 1 135 d['strand'] = 1
136 elif cols[6] == '-': 136 elif cols[6] == '-':
137 d['strand'] = -1 137 d['strand'] = -1
138 else: 138 else:
139 raise Exception("Unrecognized strand '%s'" % cols[6]) 139 raise Exception(f"Unrecognized strand: {cols[6]}")
140 if parent_dict is not None and 'Parent' in d: 140 if parent_dict is not None and 'Parent' in d:
141 # a 3' UTR can be split among multiple exons 141 # a 3' UTR can be split among multiple exons
142 # a 5' UTR can be split among multiple exons 142 # a 5' UTR can be split among multiple exons
143 # a CDS can be part of multiple transcripts 143 # a CDS can be part of multiple transcripts
144 for parent in d['Parent'].split(','): 144 for parent in d['Parent'].split(','):
148 148
149 def add_gene_to_dict(cols, species, gene_dict): 149 def add_gene_to_dict(cols, species, gene_dict):
150 global gene_count 150 global gene_count
151 gene = feature_to_dict(cols) 151 gene = feature_to_dict(cols)
152 if not gene['id']: 152 if not gene['id']:
153 raise Exception("Id not found among column 9 attribute tags: %s" % cols[8]) 153 raise Exception(f"Id not found among column 9 attribute tags: {cols[8]}")
154 gene.update({ 154 gene.update({
155 'member_id': gene_count, 155 'member_id': gene_count,
156 'object_type': 'Gene', 156 'object_type': 'Gene',
157 'seq_region_name': cols[0], 157 'seq_region_name': cols[0],
158 'species': species, 158 'species': species,
215 derived_translation_end = None 215 derived_translation_end = None
216 if transcript_id in cds_parent_dict: 216 if transcript_id in cds_parent_dict:
217 cds_list = cds_parent_dict[transcript_id] 217 cds_list = cds_parent_dict[transcript_id]
218 unique_cds_ids = {cds['id'] for cds in cds_list} 218 unique_cds_ids = {cds['id'] for cds in cds_list}
219 if len(unique_cds_ids) > 1: 219 if len(unique_cds_ids) > 1:
220 msg = """Found multiple CDS IDs (%s) for transcript '%s'. 220 msg = f"""Found multiple CDS IDs ({unique_cds_ids}) for transcript '{transcript_id}'.
221 This is not supported by the Ensembl JSON format. If a CDS is split across 221 This is not supported by the Ensembl JSON format. If a CDS is split across
222 multiple discontinuous genomic locations, the GFF3 standard requires that all 222 multiple discontinuous genomic locations, the GFF3 standard requires that all
223 corresponding lines use the same ID attribute.""" 223 corresponding lines use the same ID attribute."""
224 raise Exception(msg % (unique_cds_ids, transcript_id)) 224 raise Exception(msg)
225 cds_id = unique_cds_ids.pop() 225 cds_id = unique_cds_ids.pop()
226 translation['id'] = cds_id 226 translation['id'] = cds_id
227 cds_list.sort(key=lambda _: _['start']) 227 cds_list.sort(key=lambda _: _['start'])
228 translation['CDS'] = cds_list 228 translation['CDS'] = cds_list
229 translation['start'] = cds_list[0]['start'] 229 translation['start'] = cds_list[0]['start']
290 to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) 290 to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id)
291 try: 291 try:
292 cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)', 292 cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)',
293 to_insert) 293 to_insert)
294 except Exception as e: 294 except Exception as e:
295 raise Exception("Error while inserting {} into transcript table: {}".format(str(to_insert), e)) 295 raise Exception(f"Error while inserting {to_insert} into transcript table: {e}")
296 296
297 conn.commit() 297 conn.commit()
298 298
299 299
300 def remove_id_version(s, force=False): 300 def remove_id_version(s, force=False):
333 333
334 for gff3_arg in options.gff3: 334 for gff3_arg in options.gff3:
335 try: 335 try:
336 (species, filename) = gff3_arg.split(':') 336 (species, filename) = gff3_arg.split(':')
337 except ValueError: 337 except ValueError:
338 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg) 338 raise Exception(f"Argument for --gff3 '{gff3_arg}' is not in the SPECIES:FILENAME format")
339 gene_dict = dict() 339 gene_dict = dict()
340 transcript_dict = dict() 340 transcript_dict = dict()
341 exon_parent_dict = dict() 341 exon_parent_dict = dict()
342 cds_parent_dict = dict() 342 cds_parent_dict = dict()
343 five_prime_utr_parent_dict = dict() 343 five_prime_utr_parent_dict = dict()
353 if line[0] == '#': 353 if line[0] == '#':
354 # skip comment lines 354 # skip comment lines
355 continue 355 continue
356 cols = line.split('\t') 356 cols = line.split('\t')
357 if len(cols) != 9: 357 if len(cols) != 9:
358 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line)) 358 raise Exception(f"Line {i} in file '{filename}': '{line}' does not have 9 columns")
359 feature_type = cols[2] 359 feature_type = cols[2]
360 try: 360 try:
361 if feature_type == 'gene': 361 if feature_type == 'gene':
362 add_gene_to_dict(cols, species, gene_dict) 362 add_gene_to_dict(cols, species, gene_dict)
363 elif feature_type in ('mRNA', 'transcript'): 363 elif feature_type in ('mRNA', 'transcript'):
373 elif feature_type in unimplemented_feature_nlines_dict: 373 elif feature_type in unimplemented_feature_nlines_dict:
374 unimplemented_feature_nlines_dict[feature_type] += 1 374 unimplemented_feature_nlines_dict[feature_type] += 1
375 else: 375 else:
376 unimplemented_feature_nlines_dict[feature_type] = 0 376 unimplemented_feature_nlines_dict[feature_type] = 0
377 except Exception as e: 377 except Exception as e:
378 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr) 378 print(f"Line {i} in file '{filename}': {e}", file=sys.stderr)
379 379
380 for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items(): 380 for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items():
381 print("Skipped %d lines in GFF3 file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr) 381 print(f"Skipped {nlines} lines in GFF3 file '{filename}': '{unimplemented_feature}' is not an implemented feature type", file=sys.stderr)
382 382
383 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) 383 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
384 write_gene_dict_to_db(conn, gene_dict) 384 write_gene_dict_to_db(conn, gene_dict)
385 385
386 for json_arg in options.json: 386 for json_arg in options.json:
408 transcript = fetch_transcript_and_gene(conn, transcript_id) 408 transcript = fetch_transcript_and_gene(conn, transcript_id)
409 # Remember that we need to force the removal for this file 409 # Remember that we need to force the removal for this file
410 if transcript: 410 if transcript:
411 force_remove_id_version = True 411 force_remove_id_version = True
412 force_remove_id_version_file_list.append(fasta_arg) 412 force_remove_id_version_file_list.append(fasta_arg)
413 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr) 413 print(f"Forcing removal of id version in FASTA file '{fasta_arg}'", file=sys.stderr)
414 if not transcript: 414 if not transcript:
415 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr) 415 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr)
416 continue 416 continue
417 if options.filter != 'canonical': 417 if options.filter != 'canonical':
418 break 418 break
433 # Select the transcript with the longest sequence. If more than 433 # Select the transcript with the longest sequence. If more than
434 # one transcripts have the same longest sequence for a gene, the 434 # one transcripts have the same longest sequence for a gene, the
435 # first one to appear in the FASTA file is selected. 435 # first one to appear in the FASTA file is selected.
436 selected_transcript_id = max(transcript_tuples, key=lambda transcript_tuple: transcript_tuple[2])[0] 436 selected_transcript_id = max(transcript_tuples, key=lambda transcript_tuple: transcript_tuple[2])[0]
437 elif len(canonical_transcript_ids) > 1: 437 elif len(canonical_transcript_ids) > 1:
438 raise Exception("Gene %s has more than 1 canonical transcripts" % (gene_id)) 438 raise Exception(f"Gene {gene_id} has more than 1 canonical transcripts")
439 else: 439 else:
440 selected_transcript_id = canonical_transcript_ids[0] 440 selected_transcript_id = canonical_transcript_ids[0]
441 selected_transcript_ids.append(selected_transcript_id) 441 selected_transcript_ids.append(selected_transcript_id)
442 442
443 regions = [_.strip().lower() for _ in options.regions.split(",")] 443 regions = [_.strip().lower() for _ in options.regions.split(",")]