Mercurial > repos > earlhaminst > gstf_preparation

--- a/gstf_preparation.py	Fri Dec 08 05:32:12 2017 -0500
+++ b/gstf_preparation.py	Mon Apr 16 14:05:09 2018 -0400
@@ -172,8 +172,9 @@
             cds_list = cds_parent_dict[transcript_id]
             cds_ids = set(_['id'] for _ in cds_list)
             if len(cds_ids) > 1:
-                raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % parent)
-            translation['id'] = cds_ids.pop()
+                raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id)
+            cds_id = cds_ids.pop()
+            translation['id'] = cds_id
             cds_list.sort(key=lambda _: _['start'])
             translation['CDS'] = cds_list
             translation['start'] = cds_list[0]['start']
@@ -196,13 +197,13 @@
         if derived_translation_start is not None:
             if found_cds:
                 if derived_translation_start > translation['start']:
-                    raise Exception("UTR overlaps with CDS")
+                    raise Exception("Transcript %s has the start of CDS %s overlapping with the UTR end" % (transcript_id, cds_id))
             else:
                 translation['start'] = derived_translation_start
         if derived_translation_end is not None:
             if found_cds:
                 if derived_translation_end < translation['end']:
-                    raise Exception("UTR overlaps with CDS")
+                    raise Exception("Transcript %s has the end of CDS %s overlapping with the UTR start" % (transcript_id, cds_id))
             else:
                 translation['end'] = derived_translation_end
         if found_cds or derived_translation_start is not None or derived_translation_end is not None:
@@ -300,6 +301,7 @@
         cds_parent_dict = dict()
         five_prime_utr_parent_dict = dict()
         three_prime_utr_parent_dict = dict()
+        unimplemented_feature_nlines_dict = dict()

         with open(filename) as f:
             for i, line in enumerate(f, start=1):
@@ -327,11 +329,16 @@
                         feature_to_dict(cols, three_prime_utr_parent_dict)
                     elif feature_type == 'CDS':
                         add_cds_to_dict(cols, cds_parent_dict)
+                    elif feature_type in unimplemented_feature_nlines_dict:
+                        unimplemented_feature_nlines_dict[feature_type] += 1
                     else:
-                        print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr)
+                        unimplemented_feature_nlines_dict[feature_type] = 1  # the first line of a new type is itself a skipped line
                 except Exception as e:
                     print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr)

+        for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items():
+            print("Skipped %d lines in file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr)
+
         join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
         write_gene_dict_to_db(conn, gene_dict)

@@ -348,6 +355,7 @@

                 gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
                 if not gene_id:
+                    print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
                     continue

                 if gene_id in gene_transcripts_dict:
@@ -369,7 +377,7 @@

                 species_for_transcript = fetch_species_for_transcript(conn, transcript_id)
                 if not species_for_transcript:
-                    print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr)
+                    print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
                     continue

                 if options.headers: