Mercurial > repos > earlhaminst > gstf_preparation
comparison gstf_preparation.py @ 15:9c62ad7dd113 draft default tip
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit a4e49466bf746633ddc26d052b65ca41576d18fb"
author | earlhaminst |
---|---|
date | Thu, 29 Apr 2021 19:46:38 +0000 |
parents | 598e9172b8e7 |
children |
comparison
equal
deleted
inserted
replaced
14:598e9172b8e7 | 15:9c62ad7dd113 |
---|---|
134 if cols[6] == '+': | 134 if cols[6] == '+': |
135 d['strand'] = 1 | 135 d['strand'] = 1 |
136 elif cols[6] == '-': | 136 elif cols[6] == '-': |
137 d['strand'] = -1 | 137 d['strand'] = -1 |
138 else: | 138 else: |
139 raise Exception("Unrecognized strand '%s'" % cols[6]) | 139 raise Exception(f"Unrecognized strand: {cols[6]}") |
140 if parent_dict is not None and 'Parent' in d: | 140 if parent_dict is not None and 'Parent' in d: |
141 # a 3' UTR can be split among multiple exons | 141 # a 3' UTR can be split among multiple exons |
142 # a 5' UTR can be split among multiple exons | 142 # a 5' UTR can be split among multiple exons |
143 # a CDS can be part of multiple transcripts | 143 # a CDS can be part of multiple transcripts |
144 for parent in d['Parent'].split(','): | 144 for parent in d['Parent'].split(','): |
148 | 148 |
149 def add_gene_to_dict(cols, species, gene_dict): | 149 def add_gene_to_dict(cols, species, gene_dict): |
150 global gene_count | 150 global gene_count |
151 gene = feature_to_dict(cols) | 151 gene = feature_to_dict(cols) |
152 if not gene['id']: | 152 if not gene['id']: |
153 raise Exception("Id not found among column 9 attribute tags: %s" % cols[8]) | 153 raise Exception(f"Id not found among column 9 attribute tags: {cols[8]}") |
154 gene.update({ | 154 gene.update({ |
155 'member_id': gene_count, | 155 'member_id': gene_count, |
156 'object_type': 'Gene', | 156 'object_type': 'Gene', |
157 'seq_region_name': cols[0], | 157 'seq_region_name': cols[0], |
158 'species': species, | 158 'species': species, |
215 derived_translation_end = None | 215 derived_translation_end = None |
216 if transcript_id in cds_parent_dict: | 216 if transcript_id in cds_parent_dict: |
217 cds_list = cds_parent_dict[transcript_id] | 217 cds_list = cds_parent_dict[transcript_id] |
218 unique_cds_ids = {cds['id'] for cds in cds_list} | 218 unique_cds_ids = {cds['id'] for cds in cds_list} |
219 if len(unique_cds_ids) > 1: | 219 if len(unique_cds_ids) > 1: |
220 msg = """Found multiple CDS IDs (%s) for transcript '%s'. | 220 msg = f"""Found multiple CDS IDs ({unique_cds_ids}) for transcript '{transcript_id}'. |
221 This is not supported by the Ensembl JSON format. If a CDS is split across | 221 This is not supported by the Ensembl JSON format. If a CDS is split across |
222 multiple discontinuous genomic locations, the GFF3 standard requires that all | 222 multiple discontinuous genomic locations, the GFF3 standard requires that all |
223 corresponding lines use the same ID attribute.""" | 223 corresponding lines use the same ID attribute.""" |
224 raise Exception(msg % (unique_cds_ids, transcript_id)) | 224 raise Exception(msg) |
225 cds_id = unique_cds_ids.pop() | 225 cds_id = unique_cds_ids.pop() |
226 translation['id'] = cds_id | 226 translation['id'] = cds_id |
227 cds_list.sort(key=lambda _: _['start']) | 227 cds_list.sort(key=lambda _: _['start']) |
228 translation['CDS'] = cds_list | 228 translation['CDS'] = cds_list |
229 translation['start'] = cds_list[0]['start'] | 229 translation['start'] = cds_list[0]['start'] |
290 to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) | 290 to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) |
291 try: | 291 try: |
292 cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)', | 292 cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)', |
293 to_insert) | 293 to_insert) |
294 except Exception as e: | 294 except Exception as e: |
295 raise Exception("Error while inserting {} into transcript table: {}".format(str(to_insert), e)) | 295 raise Exception(f"Error while inserting {to_insert} into transcript table: {e}") |
296 | 296 |
297 conn.commit() | 297 conn.commit() |
298 | 298 |
299 | 299 |
300 def remove_id_version(s, force=False): | 300 def remove_id_version(s, force=False): |
333 | 333 |
334 for gff3_arg in options.gff3: | 334 for gff3_arg in options.gff3: |
335 try: | 335 try: |
336 (species, filename) = gff3_arg.split(':') | 336 (species, filename) = gff3_arg.split(':') |
337 except ValueError: | 337 except ValueError: |
338 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg) | 338 raise Exception(f"Argument for --gff3 '{gff3_arg}' is not in the SPECIES:FILENAME format") |
339 gene_dict = dict() | 339 gene_dict = dict() |
340 transcript_dict = dict() | 340 transcript_dict = dict() |
341 exon_parent_dict = dict() | 341 exon_parent_dict = dict() |
342 cds_parent_dict = dict() | 342 cds_parent_dict = dict() |
343 five_prime_utr_parent_dict = dict() | 343 five_prime_utr_parent_dict = dict() |
353 if line[0] == '#': | 353 if line[0] == '#': |
354 # skip comment lines | 354 # skip comment lines |
355 continue | 355 continue |
356 cols = line.split('\t') | 356 cols = line.split('\t') |
357 if len(cols) != 9: | 357 if len(cols) != 9: |
358 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line)) | 358 raise Exception(f"Line {i} in file '{filename}': '{line}' does not have 9 columns") |
359 feature_type = cols[2] | 359 feature_type = cols[2] |
360 try: | 360 try: |
361 if feature_type == 'gene': | 361 if feature_type == 'gene': |
362 add_gene_to_dict(cols, species, gene_dict) | 362 add_gene_to_dict(cols, species, gene_dict) |
363 elif feature_type in ('mRNA', 'transcript'): | 363 elif feature_type in ('mRNA', 'transcript'): |
373 elif feature_type in unimplemented_feature_nlines_dict: | 373 elif feature_type in unimplemented_feature_nlines_dict: |
374 unimplemented_feature_nlines_dict[feature_type] += 1 | 374 unimplemented_feature_nlines_dict[feature_type] += 1 |
375 else: | 375 else: |
376 unimplemented_feature_nlines_dict[feature_type] = 0 | 376 unimplemented_feature_nlines_dict[feature_type] = 0 |
377 except Exception as e: | 377 except Exception as e: |
378 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr) | 378 print(f"Line {i} in file '{filename}': {e}", file=sys.stderr) |
379 | 379 |
380 for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items(): | 380 for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items(): |
381 print("Skipped %d lines in GFF3 file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr) | 381 print(f"Skipped {nlines} lines in GFF3 file '{filename}': '{unimplemented_feature}' is not an implemented feature type", file=sys.stderr) |
382 | 382 |
383 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) | 383 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) |
384 write_gene_dict_to_db(conn, gene_dict) | 384 write_gene_dict_to_db(conn, gene_dict) |
385 | 385 |
386 for json_arg in options.json: | 386 for json_arg in options.json: |
408 transcript = fetch_transcript_and_gene(conn, transcript_id) | 408 transcript = fetch_transcript_and_gene(conn, transcript_id) |
409 # Remember that we need to force the removal for this file | 409 # Remember that we need to force the removal for this file |
410 if transcript: | 410 if transcript: |
411 force_remove_id_version = True | 411 force_remove_id_version = True |
412 force_remove_id_version_file_list.append(fasta_arg) | 412 force_remove_id_version_file_list.append(fasta_arg) |
413 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr) | 413 print(f"Forcing removal of id version in FASTA file '{fasta_arg}'", file=sys.stderr) |
414 if not transcript: | 414 if not transcript: |
415 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr) | 415 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr) |
416 continue | 416 continue |
417 if options.filter != 'canonical': | 417 if options.filter != 'canonical': |
418 break | 418 break |
433 # Select the transcript with the longest sequence. If more than | 433 # Select the transcript with the longest sequence. If more than |
434 # one transcripts have the same longest sequence for a gene, the | 434 # one transcripts have the same longest sequence for a gene, the |
435 # first one to appear in the FASTA file is selected. | 435 # first one to appear in the FASTA file is selected. |
436 selected_transcript_id = max(transcript_tuples, key=lambda transcript_tuple: transcript_tuple[2])[0] | 436 selected_transcript_id = max(transcript_tuples, key=lambda transcript_tuple: transcript_tuple[2])[0] |
437 elif len(canonical_transcript_ids) > 1: | 437 elif len(canonical_transcript_ids) > 1: |
438 raise Exception("Gene %s has more than 1 canonical transcripts" % (gene_id)) | 438 raise Exception(f"Gene {gene_id} has more than 1 canonical transcripts") |
439 else: | 439 else: |
440 selected_transcript_id = canonical_transcript_ids[0] | 440 selected_transcript_id = canonical_transcript_ids[0] |
441 selected_transcript_ids.append(selected_transcript_id) | 441 selected_transcript_ids.append(selected_transcript_id) |
442 | 442 |
443 regions = [_.strip().lower() for _ in options.regions.split(",")] | 443 regions = [_.strip().lower() for _ in options.regions.split(",")] |