Mercurial > repos > earlhaminst > gstf_preparation
changeset 13:51a7a2a82902 draft
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
author | earlhaminst |
---|---|
date | Tue, 06 Oct 2020 17:10:37 +0000 |
parents | 99bae410128c |
children | 598e9172b8e7 |
files | gstf_preparation.py gstf_preparation.xml |
diffstat | 2 files changed, 29 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/gstf_preparation.py Mon Oct 05 13:33:59 2020 +0000 +++ b/gstf_preparation.py Tue Oct 06 17:10:37 2020 +0000 @@ -1,5 +1,3 @@ -from __future__ import print_function - import json import optparse import os @@ -10,7 +8,20 @@ gene_count = 0 -class Sequence(object): +def asbool(val): + if isinstance(val, str): + val_lower = val.strip().lower() + if val_lower in ('true', '1'): + return True + elif val_lower in ('false', '0'): + return False + else: + raise ValueError(f"Cannot convert {val} to bool") + else: + return bool(val) + + +class Sequence: def __init__(self, header, sequence_parts): self.header = header self.sequence_parts = sequence_parts @@ -204,7 +215,7 @@ derived_translation_end = None if transcript_id in cds_parent_dict: cds_list = cds_parent_dict[transcript_id] - cds_ids = set(_['id'] for _ in cds_list) + cds_ids = {_['id'] for _ in cds_list} if len(cds_ids) > 1: raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id) cds_id = cds_ids.pop() @@ -231,13 +242,13 @@ if derived_translation_start is not None: if found_cds: if derived_translation_start > translation['start']: - raise Exception("Transcript %s has the start of CDS %s overlapping with the UTR end" % (transcript_id, cds_id)) + raise Exception(f"Transcript {transcript_id} has the start of CDS {cds_id} overlapping with the UTR end") else: translation['start'] = derived_translation_start if derived_translation_end is not None: if found_cds: if derived_translation_end < translation['end']: - raise Exception("Transcript %s has the end of CDS %s overlapping with the UTR start" % (transcript_id, cds_id)) + raise Exception(f"Transcript {transcript_id} has the end of CDS {cds_id} overlapping with the UTR start") else: translation['end'] = derived_translation_end if found_cds or derived_translation_start is not None or derived_translation_end is not None: @@ -259,7 +270,7 @@ # This can happen when loading a JSON file from Ensembl continue if 'confidence' in gene and gene['confidence'].lower() != 'high': - print("Gene %s has confidence %s (not high), discarding" % (gene['id'], gene['confidence']), file=sys.stderr) + print("Gene {} has confidence {} (not high), discarding".format(gene['id'], gene['confidence']), file=sys.stderr) continue gene_id = gene['id'] cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', @@ -271,13 +282,13 @@ transcript_symbol = transcript.get('display_name') protein_id = transcript.get('Translation', {}).get('id') biotype = transcript.get('biotype') - is_canonical = transcript.get('is_canonical', False) + is_canonical = asbool(transcript.get('is_canonical', False)) to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) try: cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)', to_insert) except Exception as e: - raise Exception("Error while inserting %s into transcript table: %s" % (str(to_insert), e)) + raise Exception("Error while inserting {} into transcript table: {}".format(str(to_insert), e)) conn.commit() @@ -397,7 +408,7 @@ force_remove_id_version_file_list.append(fasta_arg) print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr) if not transcript: - print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) + print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr) continue if options.filter != 'canonical': break @@ -434,7 +445,7 @@ transcript = fetch_transcript_and_gene(conn, transcript_id) if not transcript: - print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) + print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr) continue if options.filter == 'canonical': @@ -443,23 +454,23 @@ continue elif options.filter == 'coding': if len(entry.sequence) % 3 != 0: - print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3, removing from FASTA output" % (transcript_id, fasta_arg), file=sys.stderr) + print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' has a coding sequence length which is not multiple of 3, removing from FASTA output", file=sys.stderr) continue transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene if transcript_biotype and transcript_biotype != 'protein_coding': - print("Transcript %s has biotype %s (not protein-coding), removing from FASTA output" % (transcript_id, transcript_biotype), file=sys.stderr) + print(f"Transcript {transcript_id} has biotype {transcript_biotype} (not protein-coding), removing from FASTA output", file=sys.stderr) continue if options.headers == "TranscriptId_species": # Change the FASTA header to '>TranscriptId_species', as required by TreeBest # Remove any underscore in the species - entry.header = ">%s_%s" % (transcript_id, transcript['species'].replace('_', '')) + entry.header = ">{}_{}".format(transcript_id, transcript['species'].replace('_', '')) elif options.headers == "TranscriptID-GeneSymbol_species": # Remove any underscore in the species - entry.header = ">%s-%s_%s" % (transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', '')) + entry.header = ">{}-{}_{}".format(transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', '')) elif options.headers == "TranscriptID-TranscriptSymbol_species": # Remove any underscore in the species - entry.header = ">%s-%s_%s" % (transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', '')) + entry.header = ">{}-{}_{}".format(transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', '')) if transcript['seq_region_name'].lower() in regions: entry.print(filtered_fasta_file)
--- a/gstf_preparation.xml Mon Oct 05 13:33:59 2020 +0000 +++ b/gstf_preparation.xml Tue Oct 06 17:10:37 2020 +0000 @@ -47,8 +47,8 @@ <param name="headers" type="select" display="radio" label="Change the header line of the FASTA sequences to the following format" help="As required by TreeBest, part of the GeneSeqToFamily workflow, only TranscriptId_species is acceptable format by Aequatus visualisation"> <option value="TranscriptId_species" selected="true">TranscriptId_species</option> - <option value="TranscriptID-GeneSymbol_species">GeneSymbol-TranscriptID_species</option> - <option value="TranscriptID-TranscriptSymbol_species">TranscriptSymbol-TranscriptID_species</option> + <option value="TranscriptID-GeneSymbol_species">TranscriptID-GeneSymbol_species</option> + <option value="TranscriptID-TranscriptSymbol_species">TranscriptID-TranscriptSymbol_species</option> <option value="">Don't change</option> </param> <param name="regions" type="text" optional="true" label="Comma-separated list of region IDs (e.g. chromosomes or scaffolds) for which FASTA sequences should be filtered out" help="Region IDs are in the `seqid` column for GFF3 and in the `seq_region_name` field in JSON. This is typically used to filter out chromosomes with a non-standard genetic code, like mitochondria, to be analysed separately" />