gstf_preparation: gstf_preparation.py comparison

comparison gstf_preparation.py @ 13:51a7a2a82902 draft

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"

author	earlhaminst
date	Tue, 06 Oct 2020 17:10:37 +0000
parents	99bae410128c
children	598e9172b8e7

comparison

Legend: equal | deleted | inserted | replaced

-:99bae410128c
+:51a7a2a82902
-from __future__ import print_function
 import json
 import optparse
 import os
 import sqlite3
 import sys
 version = "0.5.0"
 gene_count = 0
-class Sequence(object):
+def asbool(val):
+if isinstance(val, str):
+val_lower = val.strip().lower()
+if val_lower in ('true', '1'):
+return True
+elif val_lower in ('false', '0'):
+return False
+else:
+raise ValueError(f"Cannot convert {val} to bool")
+else:
+return bool(val)
+class Sequence:
 def __init__(self, header, sequence_parts):
 self.header = header
 self.sequence_parts = sequence_parts
 self._sequence = None
 found_cds = False
 derived_translation_start = None
 derived_translation_end = None
 if transcript_id in cds_parent_dict:
 cds_list = cds_parent_dict[transcript_id]
-cds_ids = set(_['id'] for _ in cds_list)
+cds_ids = {_['id'] for _ in cds_list}
 if len(cds_ids) > 1:
 raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id)
 cds_id = cds_ids.pop()
 translation['id'] = cds_id
 cds_list.sort(key=lambda _: _['start'])
 else:
 derived_translation_start = three_prime_utr_list[-1]['end'] + 1
 if derived_translation_start is not None:
 if found_cds:
 if derived_translation_start > translation['start']:
-raise Exception("Transcript %s has the start of CDS %s overlapping with the UTR end" % (transcript_id, cds_id))
+raise Exception(f"Transcript {transcript_id} has the start of CDS {cds_id} overlapping with the UTR end")
 else:
 translation['start'] = derived_translation_start
 if derived_translation_end is not None:
 if found_cds:
 if derived_translation_end < translation['end']:
-raise Exception("Transcript %s has the end of CDS %s overlapping with the UTR start" % (transcript_id, cds_id))
+raise Exception(f"Transcript {transcript_id} has the end of CDS {cds_id} overlapping with the UTR start")
 else:
 translation['end'] = derived_translation_end
 if found_cds or derived_translation_start is not None or derived_translation_end is not None:
 transcript['Translation'] = translation
 for gene in gene_dict.values():
 if gene is None:
 # This can happen when loading a JSON file from Ensembl
 continue
 if 'confidence' in gene and gene['confidence'].lower() != 'high':
-print("Gene %s has confidence %s (not high), discarding" % (gene['id'], gene['confidence']), file=sys.stderr)
+print("Gene {} has confidence {} (not high), discarding".format(gene['id'], gene['confidence']), file=sys.stderr)
 continue
 gene_id = gene['id']
 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
 (gene_id, gene.get('display_name'), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], gene.get('biotype'), json.dumps(gene)))
 for transcript in gene["Transcript"]:
 transcript_id = transcript['id']
 transcript_symbol = transcript.get('display_name')
 protein_id = transcript.get('Translation', {}).get('id')
 biotype = transcript.get('biotype')
-is_canonical = transcript.get('is_canonical', False)
+is_canonical = asbool(transcript.get('is_canonical', False))
 to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id)
 try:
 cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)',
 to_insert)
 except Exception as e:
-raise Exception("Error while inserting %s into transcript table: %s" % (str(to_insert), e))
+raise Exception("Error while inserting {} into transcript table: {}".format(str(to_insert), e))
 conn.commit()
 def remove_id_version(s, force=False):
 if transcript:
 force_remove_id_version = True
 force_remove_id_version_file_list.append(fasta_arg)
 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr)
 if not transcript:
-print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
+print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr)
 continue
 if options.filter != 'canonical':
 break
 found_gene_transcript = True
 for entry in FASTAReader_gen(fasta_arg):
 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version)
 transcript = fetch_transcript_and_gene(conn, transcript_id)
 if not transcript:
-print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
+print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr)
 continue
 if options.filter == 'canonical':
 # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict
 if transcript_id not in selected_transcript_ids:
 continue
 elif options.filter == 'coding':
 if len(entry.sequence) % 3 != 0:
-print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3, removing from FASTA output" % (transcript_id, fasta_arg), file=sys.stderr)
+print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' has a coding sequence length which is not multiple of 3, removing from FASTA output", file=sys.stderr)
 continue
 transcript_biotype = transcript['biotype']  # This is the biotype of the transcript or, if that is NULL, the one of the gene
 if transcript_biotype and transcript_biotype != 'protein_coding':
-print("Transcript %s has biotype %s (not protein-coding), removing from FASTA output" % (transcript_id, transcript_biotype), file=sys.stderr)
+print(f"Transcript {transcript_id} has biotype {transcript_biotype} (not protein-coding), removing from FASTA output", file=sys.stderr)
 continue
 if options.headers == "TranscriptId_species":
 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
 # Remove any underscore in the species
-entry.header = ">%s_%s" % (transcript_id, transcript['species'].replace('_', ''))
+entry.header = ">{}_{}".format(transcript_id, transcript['species'].replace('_', ''))
 elif options.headers == "TranscriptID-GeneSymbol_species":
 # Remove any underscore in the species
-entry.header = ">%s-%s_%s" % (transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', ''))
+entry.header = ">{}-{}_{}".format(transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', ''))
 elif options.headers == "TranscriptID-TranscriptSymbol_species":
 # Remove any underscore in the species
-entry.header = ">%s-%s_%s" % (transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', ''))
+entry.header = ">{}-{}_{}".format(transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', ''))
 if transcript['seq_region_name'].lower() in regions:
 entry.print(filtered_fasta_file)
 else:
 entry.print(output_fasta_file)

Mercurial > repos > earlhaminst > gstf_preparation

comparison gstf_preparation.py @ 13:51a7a2a82902 draft