Mercurial > repos > cpt > cpt_gbk_to_gff
changeset 1:bb6332a85aa6 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:43:04 +0000 |
parents | a68f32350196 |
children | a921d6148d88 |
files | cpt-macros.xml cpt_gbkToGff3.xml cpt_gbk_to_gff/cpt-macros.xml cpt_gbk_to_gff/cpt_gbkToGff3.xml cpt_gbk_to_gff/gbk_to_gff3.py cpt_gbk_to_gff/macros.xml gbk_to_gff3.py macros.xml |
diffstat | 8 files changed, 712 insertions(+), 543 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt-macros.xml Mon Jun 05 02:43:04 2023 +0000
@@ -0,0 +1,130 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Shared CPT macros: a requirements macro and several citation macros. -->
+<macros>
+    <!--
+        Conda requirements plus a version_command; callers may add further
+        requirement elements through the yield.
+        NOTE(review): python 2.7 and biopython 1.65 are pinned next to
+        cpt_gffparser 1.2.2; confirm this combination still resolves.
+    -->
+    <xml name="gff_requirements">
+        <requirements>
+            <requirement type="package" version="2.7">python</requirement>
+            <requirement type="package" version="1.65">biopython</requirement>
+            <requirement type="package" version="2.12.1">requests</requirement>
+            <requirement type="package" version="1.2.2">cpt_gffparser</requirement>
+            <yield/>
+        </requirements>
+        <version_command>
+            <![CDATA[
+                cd '$__tool_directory__' && git rev-parse HEAD
+            ]]>
+        </version_command>
+    </xml>
+    <!-- DOI plus BibTeX entry for Mijalis and Rasche, without a citations wrapper. -->
+    <xml name="citation/mijalisrasche">
+        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+        <citation type="bibtex">@unpublished{galaxyTools,
+            author = {E. Mijalis, H. Rasche},
+            title = {CPT Galaxy Tools},
+            year = {2013-2017},
+            note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+        </citation>
+    </xml>
+    <!-- Complete citations element crediting Mijalis and Rasche; extendable through the yield. -->
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+                @unpublished{galaxyTools,
+                author = {E. Mijalis, H. Rasche},
+                title = {CPT Galaxy Tools},
+                year = {2013-2017},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+                }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Complete citations element crediting C. Ross; extendable through the yield. -->
+    <xml name="citations-crr">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+                @unpublished{galaxyTools,
+                author = {C. Ross},
+                title = {CPT Galaxy Tools},
+                year = {2020-},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+                }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Complete citations element with two BibTeX entries: Mijalis and Rasche, then A. Criscione. -->
+    <xml name="citations-2020">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+                @unpublished{galaxyTools,
+                author = {E. Mijalis, H. Rasche},
+                title = {CPT Galaxy Tools},
+                year = {2013-2017},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+                }
+            </citation>
+            <citation type="bibtex">
+                @unpublished{galaxyTools,
+                author = {A. Criscione},
+                title = {CPT Galaxy Tools},
+                year = {2019-2021},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+                }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Complete citations element crediting A. Criscione only; extendable through the yield. -->
+    <xml name="citations-2020-AJC-solo">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+                @unpublished{galaxyTools,
+                author = {A. Criscione},
+                title = {CPT Galaxy Tools},
+                year = {2019-2021},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+                }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Complete citations element crediting C. Maughmer; extendable through the yield. -->
+    <xml name="citations-clm">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+                @unpublished{galaxyTools,
+                author = {C. Maughmer},
+                title = {CPT Galaxy Tools},
+                year = {2017-2020},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+                }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <!-- Single BibTeX entry for C. Maughmer without a citations wrapper or DOI. -->
+    <xml name="sl-citations-clm">
+        <citation type="bibtex">
+            @unpublished{galaxyTools,
+            author = {C. Maughmer},
+            title = {CPT Galaxy Tools},
+            year = {2017-2020},
+            note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+        </citation>
+        <yield/>
+    </xml>
+</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbkToGff3.xml Mon Jun 05 02:43:04 2023 +0000
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Galaxy tool wrapper that runs gbk_to_gff3.py on a GenBank dataset. -->
+<tool id="edu.tamu.cpt.gff3.customGbkToGff" name="(CPT) Genbank to GFF3: " version="20.1.0.0">
+    <description> CPT made Biopython-based solution</description>
+    <macros>
+        <import>macros.xml</import>
+        <import>cpt-macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <!--
+        The interpreter is named explicitly so the run does not depend on the
+        script's executable bit. The two boolean flags stay unquoted: their
+        falsevalue is empty, and a quoted empty string would reach argparse as
+        a stray positional argument. The free-text qualifier is single-quoted.
+    -->
+    <command detect_errors="aggressive"><![CDATA[
+python '$__tool_directory__/gbk_to_gff3.py'
+'$gbkIn'
+$makeMRNA
+$makeGene
+--identifier '$qualID'
+--fastaFile '$fastaOut'
+> '$default']]></command>
+    <inputs>
+        <param name="gbkIn" type="data" format="genbank" label="GenBank file"/>
+        <param name="makeGene" type="boolean" checked="true" truevalue="--makeGene" falsevalue="" label="Automatically generate any missing Gene features if CDS/RBS has none"/>
+        <param name="makeMRNA" type="boolean" checked="true" truevalue="--makeMRNA" falsevalue="" label="Automatically generate missing mRNA features for genes"/>
+        <param name="qualID" type="text" value="locus_tag" label="Qualifier to derive GFF ID from"/>
+    </inputs>
+    <outputs>
+        <data name="default" format="gff3" hidden="false"/>
+        <data name="fastaOut" format="fasta" hidden="false"/>
+    </outputs>
+    <!-- No functional tests are defined for this tool yet. -->
+    <tests>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+A Biopython-based script to convert Genbank files to GFF3. Should resolve frame shift errors and other problems caused by the old Bioperl solution.
+
+Will also attempt to automatically parent RBS, CDS, and Exon features without a locus tag to an appropriate gene feature.
+]]></help>
+    <citations>
+        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+        <citation type="bibtex">
+            @unpublished{galaxyTools,
+            author = {A. Criscione},
+            title = {CPT Galaxy Tools},
+            year = {2019-2021},
+            note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+        </citation>
+    </citations>
+</tool>
--- a/cpt_gbk_to_gff/cpt-macros.xml Fri Jun 17 12:46:43 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="gff_requirements"> - <requirements> - <requirement type="package" version="2.7">python</requirement> - <requirement type="package" version="1.65">biopython</requirement> - <requirement type="package" version="2.12.1">requests</requirement> - <yield/> - </requirements> - <version_command> - <![CDATA[ - cd $__tool_directory__ && git rev-parse HEAD - ]]> - </version_command> - </xml> - <xml name="citation/mijalisrasche"> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex">@unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - </xml> - <xml name="citations"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-crr"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Ross}, - title = {CPT Galaxy Tools}, - year = {2020-}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. 
Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020-AJC-solo"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-clm"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="sl-citations-clm"> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </xml> -</macros>
--- a/cpt_gbk_to_gff/cpt_gbkToGff3.xml Fri Jun 17 12:46:43 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -<?xml version="1.0"?> -<tool id="edu.tamu.cpt.gff3.customGbkToGff" name="(CPT) Genbank to GFF3: " version="20.1.0.0"> - <description> CPT made Biobython-based solution</description> - <macros> - <import>macros.xml</import> - <import>cpt-macros.xml</import> - </macros> - <expand macro="requirements"/> - <command detect_errors="aggressive"><![CDATA[ -$__tool_directory__/gbk_to_gff3.py -$gbkIn -$makeMRNA -$makeGene ---identifier "$qualID" ---fastaFile $fastaOut -> $default]]></command> - <inputs> - <param label="GenBank file" name="gbkIn" type="data" format="genbank"/> - <param checked="true" label="Automatically generate any missing Gene features if CDS/RBS has none" name="makeGene" - type="boolean" truevalue="--makeGene" falsevalue=""/> - <param checked="true" label="Automatically generate missing mRNA features for genes" name="makeMRNA" - type="boolean" truevalue="--makeMRNA" falsevalue=""/> - <param label="Qualifier to derive GFF ID from" name="qualID" type="text" value="locus_tag"/> - </inputs> - <outputs> - <data format="gff3" hidden="false" name="default"/> - <data format="fasta" hidden="false" name="fastaOut"/> - </outputs> - <tests> - </tests> - <help><![CDATA[ -**What it does** - -A Biopython-based script to convert Genbank files to GFF3. Should resolve frame shift errors and other problems caused by the old Bioperl solution. - -Will also attempt to automatically parent RBS, CDS, and Exon features without a locus tag to an appropriate gene feature. -]]></help> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - </citations> -</tool>
--- a/cpt_gbk_to_gff/gbk_to_gff3.py Fri Jun 17 12:46:43 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,274 +0,0 @@ -#!/usr/bin/env python - -import argparse -import sys - -from Bio import SeqIO -from Bio.SeqRecord import SeqRecord -from Bio.SeqFeature import FeatureLocation -from CPT_GFFParser import gffSeqFeature, gffWrite - -bottomFeatTypes = ["exon", "RBS", "CDS"] - -def makeGffFeat(inFeat, num, recName, identifier): - if inFeat.type == "RBS" or (inFeat.type == "regulatory" and "regulatory_class" in inFeat.qualifiers.keys() and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site"): - inFeat.type = "Shine_Dalgarno_sequence" - if "codon_start" in inFeat.qualifiers.keys(): - shift = int(inFeat.qualifiers["codon_start"][0]) - 1 - else: - shift = "." - if identifier in inFeat.qualifiers.keys(): - name = inFeat.qualifiers[identifier][0] + "." + inFeat.type - if num > 0: - name += "." + str(num) - else: - name = recName + "." + inFeat.type + "." + str(num) - - outFeat = gffSeqFeature(inFeat.location, inFeat.type, '', inFeat.strand, name, inFeat.qualifiers, None, None, None, shift, 0, "GbkToGff") - outFeat.qualifiers["ID"] = [name] - return outFeat - -def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile): - - ofh = sys.stdout - if outFile: - ofh = outFile - - outRec = [] - failed = 0 - for rec in SeqIO.parse(inFile, "genbank"): - recID = rec.name - - if len(str(rec.seq)) > 0: - seqs_pending_writes = True - outSeq = str(rec.seq) - seqLen = len(outSeq) - - locBucket = {} - outFeats = [] - topTypeDict = {} - seekingParent = [] - geneNum = 0 - autoGeneNum = 0 - for feat in rec.features: - if identifier not in feat.qualifiers.keys(): #Allow metadata features and other features with no ID (Output warning?) 
- AJC - if feat.type in bottomFeatTypes: - seekingParent.append([feat, [], []]) # [Feature, all parent candidates, strongest parent candidates] - continue - elif feat.type not in topTypeDict.keys(): - topTypeDict[feat.type] = 1 - else: - topTypeDict[feat.type] += 1 - outFeats.append(makeGffFeat(feat, topTypeDict[feat.type], recID, identifier)) - continue - elif feat.qualifiers[identifier][0] not in locBucket.keys(): - locBucket[feat.qualifiers[identifier][0]] = [] - locBucket[feat.qualifiers[identifier][0]].append(feat) - - for locus in locBucket.keys(): - minLoc = locBucket[locus][0].location.start - maxLoc = locBucket[locus][0].location.end - for feat in locBucket[locus]: - minLoc = min(minLoc, feat.location.start) - maxLoc = max(maxLoc, feat.location.end) - for x in seekingParent: - if x[0].location.start >= minLoc and x[0].location.end <= maxLoc: - x[1].append(locus) - if x[0].location.start == minLoc or x[0].location.end == maxLoc: - x[2].append(locus) - - for x in seekingParent: #Reformat to [Feature, Locus, Unused/Free] - if len(x[2]) == 1: - finList = "" - if len(x[1]) > 1: - for loc in x[1]: - if loc != x[2][0]: - finList += loc + ", " - finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were " + finList[0:-2] + "." - else: - finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived." - if "Notes" not in x[0].qualifiers.keys(): - x[0].qualifiers["Notes"] = [] - x[0].qualifiers["Notes"].append(finList) - x[1] = x[2][0] - elif len(x[2]) > 1: - candidate = x[2][0] #Arbitrarily choose first one - finList = "" - strongList = "" - for loc in x[2]: - if loc != candidate: - finList += loc + ", " - strongList += loc + ", " - for loc in x[1]: - if loc not in x[2]: - finList += loc + ", " - finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other candidate(s) were " + finList[0:-2] + " (Equally strong candidate(s): " + strongList[0:-2] + ")." 
- if "Notes" not in x[0].qualifiers.keys(): - x[0].qualifiers["Notes"] = [] - x[0].qualifiers["Notes"].append(finList) - x[1] = candidate - elif len(x[1]) == 1: - x[1] = x[1][0] - if "Notes" not in x[0].qualifiers.keys(): - x[0].qualifiers["Notes"] = [] - finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived." - x[0].qualifiers["Notes"].append(finList) - elif len(x[1]) > 1: - candidate = x[1][0] #Arbitrarily choose first one - finList = "" - for loc in x[1]: - if loc != candidate: - finList += loc + ", " - finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other candidates were " + finList[0:-2] + "." - if "Notes" not in x[0].qualifiers.keys(): - x[0].qualifiers["Notes"] = [] - x[0].qualifiers["Notes"].append(finList) - x[1] = candidate - else: - if makeGene: - sys.stderr.write("Warning: Unable to find potential parent for feature with no " + identifier + " of type " + str(x[0].type) + " at location [" + str(x[0].location.start + 1) + ", " + str(x[0].location.end) + "], creating standalone gene.\n") - autoGeneNum += 1 - x[0].source = "GbkToGff" - x[0].score = 0 - x[0].shift = 0 - if "ID" not in x[0].qualifiers.keys(): - x[0].qualifiers["ID"] = [recID + ".standalone_" + x[0].type + "." + str(autoGeneNum)] - tempName = recID + ".derived_Gene." + str(autoGeneNum) - tempQuals = {"ID" : [tempName], "Notes" : ["Gene feature automatically generated by Gbk to GFF conversion"]} - tempGene = gffSeqFeature(FeatureLocation(x[0].location.start, x[0].location.end, x[0].location.strand), 'gene', '', x[0].strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff") - if makeMRNA: - tempName = recID + ".derived_mRNA." 
+ str(autoGeneNum) - tempQuals = {"ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]} - tempGene.sub_features.append(gffSeqFeature(FeatureLocation(x[0].location.start, x[0].location.end, x[0].location.strand), 'mRNA', '', x[0].strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")) - tempGene.sub_features[-1].sub_features.append(x[0]) - else: - tempGene.sub_features.append(x[0]) - - - outFeats.append(tempGene) - else: - sys.stderr.write("Warning: Unable to find potential parent for feature with no " + identifier + " of type " + str(x[0].type) + " at location [" + str(x[0].location.start + 1) + ", " + str(x[0].location.end) + "].\n") - if x[0].type not in topTypeDict.keys(): - topTypeDict[x[0].type] = 1 - else: - topTypeDict[x[0].type] += 1 - outFeats.append(makeGffFeat(x[0], topTypeDict[x[0].type], recID, identifier)) - - for locus in locBucket.keys(): - if len(locBucket[locus]) == 1: # No heirarchy to be made - outFeats.append(makeGffFeat(locBucket[locus][0], 0, recID, identifier)) - continue - topFeat = None - midFeat = None - bottomFeats = [] - typeDict = {} - minLoc = locBucket[locus][0].location.start - maxLoc = locBucket[locus][0].location.end - geneNum += 1 - for feat in locBucket[locus]: - # If we want to make our own top-level feat? 
- minLoc = min(minLoc, feat.location.start) - maxLoc = max(maxLoc, feat.location.end) - - # Gene->mRNA->CDS included as example, to add other feature-heirarchys in the appropriate slot - if feat.type in ['gene']: - if not topFeat: - topFeat = feat - # Else handle multiple top features - elif feat.type in ['mRNA', 'tRNA', 'rRNA']: - if not midFeat: - midFeat = feat - # Else handle multiple mid feats (May need another elif type-in-list statement if we actually expect a list of mid feats) - else: - if feat.type not in typeDict.keys(): - typeDict[feat.type] = 1 - else: - typeDict[feat.type] += 1 - bottomFeats.append(feat) - - for x in seekingParent: - if type(x[1]) != "list" and locus == x[1]: - x[0].qualifiers[identifier] = [locus] - bottomFeats.append(x[0]) - if x[0].type not in typeDict.keys(): - typeDict[x[0].type] = 1 - else: - typeDict[x[0].type] += 1 - - - - - - #if not topFeat: # Make our own top-level feature based off minLoc, maxLoc bounds - - for x in typeDict.keys(): # If only 1, set it to 0 so we don't append a number to the name - if typeDict[x] == 1: # Else, set to 1 so that we count up as we encounter the features - typeDict[x] = 0 - else: - typeDict[x] = 1 - - if not topFeat: - if makeGene: - if midFeat: - possibleStrand = midFeat.strand - else: - possibleStrand = bottomFeats[0].strand - tempName = recID + ".gene." 
+ str(geneNum) - tempQuals = {identifier : [locus], "ID" : [tempName], "Notes" : ["Gene feature automatically generated by Gbk to GFF conversion"]} - topFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, possibleStrand), 'gene', '', possibleStrand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff") - else: - sys.stderr.write("Unable to create a feature heirarchy at location [%d, %d] with features: \n" % (minLoc, maxLoc)) - for x in locBucket[locus]: - sys.stderr.write(str(x)) - sys.stderr.write('\n') - failed = 1 - continue - - outFeats.append(makeGffFeat(topFeat, 0, recID, identifier)) - if not midFeat and topFeat.type == "gene" and makeMRNA: - if identifier in topFeat.qualifiers.keys(): - tempName = topFeat.qualifiers[identifier][0] + ".mRNA" - tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]} - else: - tempName = outFeats[-1].ID + ".mRNA" - tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]} - midFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, topFeat.strand), 'mRNA', '', topFeat.strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff") - - if midFeat: # Again, need a new if statement if we want to handle multiple mid-tier features - outFeats[-1].sub_features.append(makeGffFeat(midFeat, 0, recID, identifier)) - outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id] - for x in bottomFeats: - typeDict[x.type] += 1 - outFeats[-1].sub_features[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier)) - outFeats[-1].sub_features[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].sub_features[-1].id] - else: # No midFeat, append bottom feats directly to top feats - for x in bottomFeats: - typeDict[x.type] += 1 - outFeats[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier)) - 
outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id] - - outRec.append(SeqRecord(rec.seq, recID, rec.name, rec.description, rec.dbxrefs, sorted(outFeats, key=lambda x: x.location.start), rec.annotations, rec.letter_annotations)) - SeqIO.write([outRec[-1]], fastaFile, "fasta") - gffWrite(outRec, ofh) - exit(failed) # 0 if all features handled, 1 if unable to handle some - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( description='Biopython solution to Gbk to GFF conversion') - - parser.add_argument('inFile', type=argparse.FileType("r"), help='Path to an input GBK file' ) - parser.add_argument('--makeMRNA', action="store_true", required=False, help="Automatically create mRNA features") - parser.add_argument('--makeGene', action="store_true", required=False, help="Automatically create missing Gene features") - parser.add_argument('--identifier', type=str, default="locus_tag", required=False, help="Qualifier to derive ID property from") - parser.add_argument('--fastaFile', type=argparse.FileType("w"), help='Fasta output for sequences' ) - parser.add_argument('--outFile', type=argparse.FileType("w"), help='GFF feature output' ) - args = parser.parse_args() - main(**vars(args)) - - - - - - - -
--- a/cpt_gbk_to_gff/macros.xml Fri Jun 17 12:46:43 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,105 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="requirements"> - <requirements> - <requirement type="package" version="3.8.13">python</requirement> - <requirement type="package" version="1.79">biopython</requirement> - <requirement type="package" version="1.2.2">cpt_gffparser</requirement> - <yield/> - </requirements> - </xml> - <xml name="ldap_ref" - token_name="dn_ref" - token_label="Pick a DN" - token_fromfile="ldap_people.loc"> - <repeat name="repeat_@NAME@" title="@LABEL@"> - <param name="@NAME@" label="Select a @LABEL@" type="select"> - <options from_file="@FROMFILE@"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </repeat> - </xml> - <xml name="ldap_ref_single" - token_name="dn_ref" - token_label="Pick a DN" - token_fromfile="ldap_people.loc"> - <param name="@NAME@" label="Select a @LABEL@" type="select"> - <options from_file="@FROMFILE@"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </xml> - <xml name="gbk_feature_type" - token_label="Feature type to remove" - token_multiple="True" - token_optional="False" - token_name="positional_2"> - <param label="@LABEL@" optional="@TOKEN_OPTIONAL" multiple="@MULTIPLE@" name="feature_type" type="select"> - <option value="-10_signal">-10_signal</option> - <option value="-35_signal">-35_signal</option> - <option value="3'UTR">3'UTR</option> - <option value="5'UTR">5'UTR</option> - <option value="CAAT_signal">CAAT_signal</option> - <option selected="true" value="CDS">CDS</option> - <option value="C_region">C_region</option> - <option value="D-loop">D-loop</option> - <option value="D_segment">D_segment</option> - <option value="GC_signal">GC_signal</option> - <option value="J_segment">J_segment</option> - <option value="LTR">LTR</option> - <option value="N_region">N_region</option> - <option 
value="RBS">RBS</option> - <option value="STS">STS</option> - <option value="S_region">S_region</option> - <option value="TATA_signal">TATA_signal</option> - <option value="V_region">V_region</option> - <option value="V_segment">V_segment</option> - <option value="all">all</option> - <option value="assembly_gap">assembly_gap</option> - <option value="attenuator">attenuator</option> - <option value="enhancer">enhancer</option> - <option value="exon">exon</option> - <option value="gap">gap</option> - <option value="gene">gene</option> - <option value="iDNA">iDNA</option> - <option value="intron">intron</option> - <option value="mRNA">mRNA</option> - <option value="mat_peptide">mat_peptide</option> - <option value="misc_RNA">misc_RNA</option> - <option value="misc_binding">misc_binding</option> - <option value="misc_difference">misc_difference</option> - <option value="misc_feature">misc_feature</option> - <option value="misc_recomb">misc_recomb</option> - <option value="misc_signal">misc_signal</option> - <option value="misc_structure">misc_structure</option> - <option value="mobile_element">mobile_element</option> - <option value="modified_base">modified_base</option> - <option value="ncRNA">ncRNA</option> - <option value="old_sequence">old_sequence</option> - <option value="operon">operon</option> - <option value="oriT">oriT</option> - <option value="polyA_signal">polyA_signal</option> - <option value="polyA_site">polyA_site</option> - <option value="precursor_RNA">precursor_RNA</option> - <option value="prim_transcript">prim_transcript</option> - <option value="primer_bind">primer_bind</option> - <option value="promoter">promoter</option> - <option value="protein_bind">protein_bind</option> - <option value="rRNA">rRNA</option> - <option value="rep_origin">rep_origin</option> - <option value="repeat_region">repeat_region</option> - <option value="sig_peptide">sig_peptide</option> - <option value="source">source</option> - <option 
value="stem_loop">stem_loop</option> - <option value="tRNA">tRNA</option> - <option value="terminator">terminator</option> - <option value="tmRNA">tmRNA</option> - <option value="transit_peptide">transit_peptide</option> - <option value="unsure">unsure</option> - <option value="variation">variation</option> - </param> - </xml> -</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gbk_to_gff3.py Mon Jun 05 02:43:04 2023 +0000 @@ -0,0 +1,477 @@ +#!/usr/bin/env python + +import argparse +import sys + +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import FeatureLocation +from CPT_GFFParser import gffSeqFeature, gffWrite + +bottomFeatTypes = ["exon", "RBS", "CDS"] + + +def makeGffFeat(inFeat, num, recName, identifier): + if inFeat.type == "RBS" or ( + inFeat.type == "regulatory" + and "regulatory_class" in inFeat.qualifiers.keys() + and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site" + ): + inFeat.type = "Shine_Dalgarno_sequence" + if "codon_start" in inFeat.qualifiers.keys(): + shift = int(inFeat.qualifiers["codon_start"][0]) - 1 + else: + shift = "." + if identifier in inFeat.qualifiers.keys(): + name = inFeat.qualifiers[identifier][0] + "." + inFeat.type + if num > 0: + name += "." + str(num) + else: + name = recName + "." + inFeat.type + "." + str(num) + + outFeat = gffSeqFeature( + inFeat.location, + inFeat.type, + "", + inFeat.strand, + name, + inFeat.qualifiers, + None, + None, + None, + shift, + 0, + "GbkToGff", + ) + outFeat.qualifiers["ID"] = [name] + return outFeat + + +def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile): + + ofh = sys.stdout + if outFile: + ofh = outFile + + outRec = [] + failed = 0 + for rec in SeqIO.parse(inFile, "genbank"): + recID = rec.name + + if len(str(rec.seq)) > 0: + seqs_pending_writes = True + outSeq = str(rec.seq) + seqLen = len(outSeq) + + locBucket = {} + outFeats = [] + topTypeDict = {} + seekingParent = [] + geneNum = 0 + autoGeneNum = 0 + for feat in rec.features: + if ( + identifier not in feat.qualifiers.keys() + ): # Allow metadata features and other features with no ID (Output warning?) 
- AJC + if feat.type in bottomFeatTypes: + seekingParent.append( + [feat, [], []] + ) # [Feature, all parent candidates, strongest parent candidates] + continue + elif feat.type not in topTypeDict.keys(): + topTypeDict[feat.type] = 1 + else: + topTypeDict[feat.type] += 1 + outFeats.append( + makeGffFeat(feat, topTypeDict[feat.type], recID, identifier) + ) + continue + elif feat.qualifiers[identifier][0] not in locBucket.keys(): + locBucket[feat.qualifiers[identifier][0]] = [] + locBucket[feat.qualifiers[identifier][0]].append(feat) + + for locus in locBucket.keys(): + minLoc = locBucket[locus][0].location.start + maxLoc = locBucket[locus][0].location.end + for feat in locBucket[locus]: + minLoc = min(minLoc, feat.location.start) + maxLoc = max(maxLoc, feat.location.end) + for x in seekingParent: + if x[0].location.start >= minLoc and x[0].location.end <= maxLoc: + x[1].append(locus) + if x[0].location.start == minLoc or x[0].location.end == maxLoc: + x[2].append(locus) + + for x in seekingParent: # Reformat to [Feature, Locus, Unused/Free] + if len(x[2]) == 1: + finList = "" + if len(x[1]) > 1: + for loc in x[1]: + if loc != x[2][0]: + finList += loc + ", " + finList = ( + str(x[0].type) + + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were " + + finList[0:-2] + + "." + ) + else: + finList = ( + str(x[0].type) + + " had no locus tag set in .gbk file, automatically derived." + ) + if "Notes" not in x[0].qualifiers.keys(): + x[0].qualifiers["Notes"] = [] + x[0].qualifiers["Notes"].append(finList) + x[1] = x[2][0] + elif len(x[2]) > 1: + candidate = x[2][0] # Arbitrarily choose first one + finList = "" + strongList = "" + for loc in x[2]: + if loc != candidate: + finList += loc + ", " + strongList += loc + ", " + for loc in x[1]: + if loc not in x[2]: + finList += loc + ", " + finList = ( + str(x[0].type) + + " had no locus tag set in .gbk file, automatically derived. 
Other candidate(s) were " + + finList[0:-2] + + " (Equally strong candidate(s): " + + strongList[0:-2] + + ")." + ) + if "Notes" not in x[0].qualifiers.keys(): + x[0].qualifiers["Notes"] = [] + x[0].qualifiers["Notes"].append(finList) + x[1] = candidate + elif len(x[1]) == 1: + x[1] = x[1][0] + if "Notes" not in x[0].qualifiers.keys(): + x[0].qualifiers["Notes"] = [] + finList = ( + str(x[0].type) + + " had no locus tag set in .gbk file, automatically derived." + ) + x[0].qualifiers["Notes"].append(finList) + elif len(x[1]) > 1: + candidate = x[1][0] # Arbitrarily choose first one + finList = "" + for loc in x[1]: + if loc != candidate: + finList += loc + ", " + finList = ( + str(x[0].type) + + " had no locus tag set in .gbk file, automatically derived. Other candidates were " + + finList[0:-2] + + "." + ) + if "Notes" not in x[0].qualifiers.keys(): + x[0].qualifiers["Notes"] = [] + x[0].qualifiers["Notes"].append(finList) + x[1] = candidate + else: + if makeGene: + sys.stderr.write( + "Warning: Unable to find potential parent for feature with no " + + identifier + + " of type " + + str(x[0].type) + + " at location [" + + str(x[0].location.start + 1) + + ", " + + str(x[0].location.end) + + "], creating standalone gene.\n" + ) + autoGeneNum += 1 + x[0].source = "GbkToGff" + x[0].score = 0 + x[0].shift = 0 + if "ID" not in x[0].qualifiers.keys(): + x[0].qualifiers["ID"] = [ + recID + ".standalone_" + x[0].type + "." + str(autoGeneNum) + ] + tempName = recID + ".derived_Gene." + str(autoGeneNum) + tempQuals = { + "ID": [tempName], + "Notes": [ + "Gene feature automatically generated by Gbk to GFF conversion" + ], + } + tempGene = gffSeqFeature( + FeatureLocation( + x[0].location.start, x[0].location.end, x[0].location.strand + ), + "gene", + "", + x[0].strand, + tempName, + tempQuals, + None, + None, + None, + ".", + 0, + "GbkToGff", + ) + if makeMRNA: + tempName = recID + ".derived_mRNA." 
+ str(autoGeneNum) + tempQuals = { + "ID": [tempName], + "Notes": [ + "mRNA feature automatically generated by Gbk to GFF conversion" + ], + } + tempGene.sub_features.append( + gffSeqFeature( + FeatureLocation( + x[0].location.start, + x[0].location.end, + x[0].location.strand, + ), + "mRNA", + "", + x[0].strand, + tempName, + tempQuals, + None, + None, + None, + ".", + 0, + "GbkToGff", + ) + ) + tempGene.sub_features[-1].sub_features.append(x[0]) + else: + tempGene.sub_features.append(x[0]) + + outFeats.append(tempGene) + else: + sys.stderr.write( + "Warning: Unable to find potential parent for feature with no " + + identifier + + " of type " + + str(x[0].type) + + " at location [" + + str(x[0].location.start + 1) + + ", " + + str(x[0].location.end) + + "].\n" + ) + if x[0].type not in topTypeDict.keys(): + topTypeDict[x[0].type] = 1 + else: + topTypeDict[x[0].type] += 1 + outFeats.append( + makeGffFeat(x[0], topTypeDict[x[0].type], recID, identifier) + ) + + for locus in locBucket.keys(): + if len(locBucket[locus]) == 1: # No heirarchy to be made + outFeats.append(makeGffFeat(locBucket[locus][0], 0, recID, identifier)) + continue + topFeat = None + midFeat = None + bottomFeats = [] + typeDict = {} + minLoc = locBucket[locus][0].location.start + maxLoc = locBucket[locus][0].location.end + geneNum += 1 + for feat in locBucket[locus]: + # If we want to make our own top-level feat? 
+ minLoc = min(minLoc, feat.location.start) + maxLoc = max(maxLoc, feat.location.end) + + # Gene->mRNA->CDS included as example, to add other feature-heirarchys in the appropriate slot + if feat.type in ["gene"]: + if not topFeat: + topFeat = feat + # Else handle multiple top features + elif feat.type in ["mRNA", "tRNA", "rRNA"]: + if not midFeat: + midFeat = feat + # Else handle multiple mid feats (May need another elif type-in-list statement if we actually expect a list of mid feats) + else: + if feat.type not in typeDict.keys(): + typeDict[feat.type] = 1 + else: + typeDict[feat.type] += 1 + bottomFeats.append(feat) + + for x in seekingParent: + if type(x[1]) != "list" and locus == x[1]: + x[0].qualifiers[identifier] = [locus] + bottomFeats.append(x[0]) + if x[0].type not in typeDict.keys(): + typeDict[x[0].type] = 1 + else: + typeDict[x[0].type] += 1 + + # if not topFeat: # Make our own top-level feature based off minLoc, maxLoc bounds + + for ( + x + ) in ( + typeDict.keys() + ): # If only 1, set it to 0 so we don't append a number to the name + if ( + typeDict[x] == 1 + ): # Else, set to 1 so that we count up as we encounter the features + typeDict[x] = 0 + else: + typeDict[x] = 1 + + if not topFeat: + if makeGene: + if midFeat: + possibleStrand = midFeat.strand + else: + possibleStrand = bottomFeats[0].strand + tempName = recID + ".gene." 
+ str(geneNum) + tempQuals = { + identifier: [locus], + "ID": [tempName], + "Notes": [ + "Gene feature automatically generated by Gbk to GFF conversion" + ], + } + topFeat = gffSeqFeature( + FeatureLocation(minLoc, maxLoc, possibleStrand), + "gene", + "", + possibleStrand, + tempName, + tempQuals, + None, + None, + None, + ".", + 0, + "GbkToGff", + ) + else: + sys.stderr.write( + "Unable to create a feature heirarchy at location [%d, %d] with features: \n" + % (minLoc, maxLoc) + ) + for x in locBucket[locus]: + sys.stderr.write(str(x)) + sys.stderr.write("\n") + failed = 1 + continue + + outFeats.append(makeGffFeat(topFeat, 0, recID, identifier)) + if not midFeat and topFeat.type == "gene" and makeMRNA: + if identifier in topFeat.qualifiers.keys(): + tempName = topFeat.qualifiers[identifier][0] + ".mRNA" + tempQuals = { + identifier: topFeat.qualifiers[identifier], + "ID": [tempName], + "Notes": [ + "mRNA feature automatically generated by Gbk to GFF conversion" + ], + } + else: + tempName = outFeats[-1].ID + ".mRNA" + tempQuals = { + identifier: topFeat.qualifiers[identifier], + "ID": [tempName], + "Notes": [ + "mRNA feature automatically generated by Gbk to GFF conversion" + ], + } + midFeat = gffSeqFeature( + FeatureLocation(minLoc, maxLoc, topFeat.strand), + "mRNA", + "", + topFeat.strand, + tempName, + tempQuals, + None, + None, + None, + ".", + 0, + "GbkToGff", + ) + + if ( + midFeat + ): # Again, need a new if statement if we want to handle multiple mid-tier features + outFeats[-1].sub_features.append( + makeGffFeat(midFeat, 0, recID, identifier) + ) + outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id] + for x in bottomFeats: + typeDict[x.type] += 1 + outFeats[-1].sub_features[-1].sub_features.append( + makeGffFeat(x, typeDict[x.type], recID, identifier) + ) + outFeats[-1].sub_features[-1].sub_features[-1].qualifiers[ + "Parent" + ] = [outFeats[-1].sub_features[-1].id] + else: # No midFeat, append bottom feats directly to top feats + for 
x in bottomFeats: + typeDict[x.type] += 1 + outFeats[-1].sub_features.append( + makeGffFeat(x, typeDict[x.type], recID, identifier) + ) + outFeats[-1].sub_features[-1].qualifiers["Parent"] = [ + outFeats[-1].id + ] + + outRec.append( + SeqRecord( + rec.seq, + recID, + rec.name, + rec.description, + rec.dbxrefs, + sorted(outFeats, key=lambda x: x.location.start), + rec.annotations, + rec.letter_annotations, + ) + ) + SeqIO.write([outRec[-1]], fastaFile, "fasta") + gffWrite(outRec, ofh) + exit(failed) # 0 if all features handled, 1 if unable to handle some + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Biopython solution to Gbk to GFF conversion" + ) + + parser.add_argument( + "inFile", type=argparse.FileType("r"), help="Path to an input GBK file" + ) + parser.add_argument( + "--makeMRNA", + action="store_true", + required=False, + help="Automatically create mRNA features", + ) + parser.add_argument( + "--makeGene", + action="store_true", + required=False, + help="Automatically create missing Gene features", + ) + parser.add_argument( + "--identifier", + type=str, + default="locus_tag", + required=False, + help="Qualifier to derive ID property from", + ) + parser.add_argument( + "--fastaFile", type=argparse.FileType("w"), help="Fasta output for sequences" + ) + parser.add_argument( + "--outFile", type=argparse.FileType("w"), help="GFF feature output" + ) + args = parser.parse_args() + main(**vars(args))
<!-- macros.xml: shared Galaxy macros and Cheetah tokens (requirements, citations, XMFA/GFF3/FASTA inputs). -->
<macros>
    <!-- Package requirements; the yield slot lets an expanding tool add further requirement elements. -->
    <xml name="requirements">
        <requirements>
            <!-- NOTE(review): no version is pinned for progressivemauve; consider pinning one for reproducible installs (version to be confirmed). -->
            <requirement type="package">progressivemauve</requirement>
            <!--<requirement type="package" version="2.7">python</requirement>-->
            <requirement type="package" version="0.6.4">bcbiogff</requirement>
            <yield/>
        </requirements>
    </xml>

    <token name="@WRAPPER_VERSION@">2.4.0</token>

    <!-- Citation macros, one DOI each. -->
    <xml name="citation/progressive_mauve">
        <citation type="doi">10.1371/journal.pone.0011147</citation>
    </xml>
    <xml name="citation/gepard">
        <citation type="doi">10.1093/bioinformatics/btm039</citation>
    </xml>

    <!-- XMFA alignment input: quoted command-line token plus the matching data parameter. -->
    <token name="@XMFA_INPUT@">
        '$xmfa'
    </token>
    <!-- token_formats supplies the default value substituted for @FORMATS@ below. -->
    <xml name="xmfa_input" token_formats="xmfa">
        <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA"/>
    </xml>

    <!-- FASTA sequences that accompany the XMFA alignment. -->
    <token name="@XMFA_FA_INPUT@">
        '$sequences'
    </token>
    <xml name="xmfa_fa_input">
        <param type="data" format="fasta" name="sequences" label="Sequences in alignment" help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/>
    </xml>

    <!-- Reference genome chooser: a history dataset (default) or an entry of the all_fasta data table. -->
    <xml name="genome_selector">
        <conditional name="reference_genome">
            <param name="reference_genome_source" type="select" label="Reference Genome">
                <option value="history" selected="True">From History</option>
                <option value="cached">Locally Cached</option>
            </param>
            <when value="cached">
                <param name="fasta_indexes" type="select" label="Source FASTA Sequence">
                    <options from_data_table="all_fasta"/>
                </param>
            </when>
            <when value="history">
                <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
            </when>
        </conditional>
    </xml>

    <xml name="gff3_input">
        <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
    </xml>

    <!-- Convenience macro: GFF3 annotations together with the reference genome chooser. -->
    <xml name="input/gff3+fasta">
        <expand macro="gff3_input"/>
        <expand macro="genome_selector"/>
    </xml>

    <token name="@INPUT_GFF@">
        '$gff3_data'
    </token>

    <!-- Path of the selected reference FASTA: the data table path for a cached genome, or
         genomeref.fa (the symlink created by @GENOME_SELECTOR_PRE@) for a history dataset. -->
    <token name="@INPUT_FASTA@">
        #if str($reference_genome.reference_genome_source) == 'cached':
            '${reference_genome.fasta_indexes.fields.path}'
        #else if str($reference_genome.reference_genome_source) == 'history':
            genomeref.fa
        #end if
    </token>

    <!-- Links the history dataset to genomeref.fa in the working directory.
         The selector is compared through str(), matching @INPUT_FASTA@ and @GENOME_SELECTOR@,
         so all three tokens test the parameter value the same way. -->
    <token name="@GENOME_SELECTOR_PRE@">
        #if str($reference_genome.reference_genome_source) == 'history':
            ln -s '$reference_genome.genome_fasta' genomeref.fa;
        #end if
    </token>

    <!-- NOTE(review): same body as @INPUT_FASTA@; both names are kept so existing tool wrappers keep working. -->
    <token name="@GENOME_SELECTOR@">
        #if str($reference_genome.reference_genome_source) == 'cached':
            '${reference_genome.fasta_indexes.fields.path}'
        #else if str($reference_genome.reference_genome_source) == 'history':
            genomeref.fa
        #end if
    </token>
</macros>