Mercurial > repos > earlhaminst > gstf_preparation
changeset 5:b3ba0c84667c draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
author | earlhaminst |
---|---|
date | Mon, 16 Apr 2018 14:05:09 -0400 |
parents | 284f64ad9d43 |
children | 56bbdbfe3eaa |
files | gstf_preparation.py |
diffstat | 1 files changed, 14 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/gstf_preparation.py Fri Dec 08 05:32:12 2017 -0500 +++ b/gstf_preparation.py Mon Apr 16 14:05:09 2018 -0400 @@ -172,8 +172,9 @@ cds_list = cds_parent_dict[transcript_id] cds_ids = set(_['id'] for _ in cds_list) if len(cds_ids) > 1: - raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % parent) - translation['id'] = cds_ids.pop() + raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id) + cds_id = cds_ids.pop() + translation['id'] = cds_id cds_list.sort(key=lambda _: _['start']) translation['CDS'] = cds_list translation['start'] = cds_list[0]['start'] @@ -196,13 +197,13 @@ if derived_translation_start is not None: if found_cds: if derived_translation_start > translation['start']: - raise Exception("UTR overlaps with CDS") + raise Exception("Transcript %s has the start of CDS %s overlapping with the UTR end" % (transcript_id, cds_id)) else: translation['start'] = derived_translation_start if derived_translation_end is not None: if found_cds: if derived_translation_end < translation['end']: - raise Exception("UTR overlaps with CDS") + raise Exception("Transcript %s has the end of CDS %s overlapping with the UTR start" % (transcript_id, cds_id)) else: translation['end'] = derived_translation_end if found_cds or derived_translation_start is not None or derived_translation_end is not None: @@ -300,6 +301,7 @@ cds_parent_dict = dict() five_prime_utr_parent_dict = dict() three_prime_utr_parent_dict = dict() + unimplemented_feature_nlines_dict = dict() with open(filename) as f: for i, line in enumerate(f, start=1): @@ -327,11 +329,16 @@ feature_to_dict(cols, three_prime_utr_parent_dict) elif feature_type == 'CDS': add_cds_to_dict(cols, cds_parent_dict) + elif feature_type in unimplemented_feature_nlines_dict: + unimplemented_feature_nlines_dict[feature_type] += 1 else: - print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr) + unimplemented_feature_nlines_dict[feature_type] = 0 except Exception as e: print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr) + for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items(): + print("Skipped %d lines in file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr) + join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) write_gene_dict_to_db(conn, gene_dict) @@ -348,6 +355,7 @@ gene_id = fetch_gene_id_for_transcript(conn, transcript_id) if not gene_id: + print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) continue if gene_id in gene_transcripts_dict: @@ -369,7 +377,7 @@ species_for_transcript = fetch_species_for_transcript(conn, transcript_id) if not species_for_transcript: - print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr) + print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) continue if options.headers: