Previous changeset 0:a68f32350196 (2022-06-17) Next changeset 2:a921d6148d88 (2024-01-05) |
Commit message:
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c |
added:
cpt-macros.xml cpt_gbkToGff3.xml gbk_to_gff3.py macros.xml |
removed:
cpt_gbk_to_gff/cpt-macros.xml cpt_gbk_to_gff/cpt_gbkToGff3.xml cpt_gbk_to_gff/gbk_to_gff3.py cpt_gbk_to_gff/macros.xml |
b |
diff -r a68f32350196 -r bb6332a85aa6 cpt-macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt-macros.xml Mon Jun 05 02:43:04 2023 +0000 |
[ |
@@ -0,0 +1,115 @@ +<macros> + <xml name="gff_requirements"> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="2.12.1">requests</requirement> + <requirement type="package" version="1.2.2">cpt_gffparser</requirement> + <yield/> + </requirements> + <version_command> + <![CDATA[ + cd '$__tool_directory__' && git rev-parse HEAD + ]]> + </version_command> + </xml> + <xml name="citation/mijalisrasche"> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex">@unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-crr"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020-AJC-solo"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-clm"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="sl-citations-clm"> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </xml> +</macros> |
b |
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbkToGff3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gbkToGff3.xml Mon Jun 05 02:43:04 2023 +0000 |
[ |
@@ -0,0 +1,46 @@ +<tool id="edu.tamu.cpt.gff3.customGbkToGff" name="(CPT) Genbank to GFF3: " version="20.1.0.0"> + <description> CPT made Biobython-based solution</description> + <macros> + <import>macros.xml</import> + <import>cpt-macros.xml</import> + </macros> + <expand macro="requirements"/> + <command detect_errors="aggressive"><![CDATA[ +'$__tool_directory__/gbk_to_gff3.py' +'$gbkIn' +'$makeMRNA' +'$makeGene' +--identifier "$qualID" +--fastaFile '$fastaOut' +> '$default']]></command> + <inputs> + <param label="GenBank file" name="gbkIn" type="data" format="genbank"/> + <param checked="true" label="Automatically generate any missing Gene features if CDS/RBS has none" name="makeGene" type="boolean" truevalue="--makeGene" falsevalue=""/> + <param checked="true" label="Automatically generate missing mRNA features for genes" name="makeMRNA" type="boolean" truevalue="--makeMRNA" falsevalue=""/> + <param label="Qualifier to derive GFF ID from" name="qualID" type="text" value="locus_tag"/> + </inputs> + <outputs> + <data format="gff3" hidden="false" name="default"/> + <data format="fasta" hidden="false" name="fastaOut"/> + </outputs> + <tests> + </tests> + <help><![CDATA[ +**What it does** + +A Biopython-based script to convert Genbank files to GFF3. Should resolve frame shift errors and other problems caused by the old Bioperl solution. + +Will also attempt to automatically parent RBS, CDS, and Exon features without a locus tag to an appropriate gene feature. +]]></help> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </citations> +</tool> |
b |
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbk_to_gff/cpt-macros.xml --- a/cpt_gbk_to_gff/cpt-macros.xml Fri Jun 17 12:46:43 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,115 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="gff_requirements"> - <requirements> - <requirement type="package" version="2.7">python</requirement> - <requirement type="package" version="1.65">biopython</requirement> - <requirement type="package" version="2.12.1">requests</requirement> - <yield/> - </requirements> - <version_command> - <![CDATA[ - cd $__tool_directory__ && git rev-parse HEAD - ]]> - </version_command> - </xml> - <xml name="citation/mijalisrasche"> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex">@unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - </xml> - <xml name="citations"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-crr"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Ross}, - title = {CPT Galaxy Tools}, - year = {2020-}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020-AJC-solo"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-clm"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="sl-citations-clm"> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </xml> -</macros> |
b |
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbk_to_gff/cpt_gbkToGff3.xml --- a/cpt_gbk_to_gff/cpt_gbkToGff3.xml Fri Jun 17 12:46:43 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,49 +0,0 @@ -<?xml version="1.0"?> -<tool id="edu.tamu.cpt.gff3.customGbkToGff" name="(CPT) Genbank to GFF3: " version="20.1.0.0"> - <description> CPT made Biobython-based solution</description> - <macros> - <import>macros.xml</import> - <import>cpt-macros.xml</import> - </macros> - <expand macro="requirements"/> - <command detect_errors="aggressive"><![CDATA[ -$__tool_directory__/gbk_to_gff3.py -$gbkIn -$makeMRNA -$makeGene ---identifier "$qualID" ---fastaFile $fastaOut -> $default]]></command> - <inputs> - <param label="GenBank file" name="gbkIn" type="data" format="genbank"/> - <param checked="true" label="Automatically generate any missing Gene features if CDS/RBS has none" name="makeGene" - type="boolean" truevalue="--makeGene" falsevalue=""/> - <param checked="true" label="Automatically generate missing mRNA features for genes" name="makeMRNA" - type="boolean" truevalue="--makeMRNA" falsevalue=""/> - <param label="Qualifier to derive GFF ID from" name="qualID" type="text" value="locus_tag"/> - </inputs> - <outputs> - <data format="gff3" hidden="false" name="default"/> - <data format="fasta" hidden="false" name="fastaOut"/> - </outputs> - <tests> - </tests> - <help><![CDATA[ -**What it does** - -A Biopython-based script to convert Genbank files to GFF3. Should resolve frame shift errors and other problems caused by the old Bioperl solution. - -Will also attempt to automatically parent RBS, CDS, and Exon features without a locus tag to an appropriate gene feature. -]]></help> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - </citations> -</tool> |
b |
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbk_to_gff/gbk_to_gff3.py --- a/cpt_gbk_to_gff/gbk_to_gff3.py Fri Jun 17 12:46:43 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,274 +0,0 @@\n-#!/usr/bin/env python\n-\n-import argparse\n-import sys\n-\n-from Bio import SeqIO\n-from Bio.SeqRecord import SeqRecord\n-from Bio.SeqFeature import FeatureLocation\n-from CPT_GFFParser import gffSeqFeature, gffWrite\n-\n-bottomFeatTypes = ["exon", "RBS", "CDS"]\n-\n-def makeGffFeat(inFeat, num, recName, identifier):\n- if inFeat.type == "RBS" or (inFeat.type == "regulatory" and "regulatory_class" in inFeat.qualifiers.keys() and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site"):\n- inFeat.type = "Shine_Dalgarno_sequence"\n- if "codon_start" in inFeat.qualifiers.keys():\n- shift = int(inFeat.qualifiers["codon_start"][0]) - 1\n- else:\n- shift = "."\n- if identifier in inFeat.qualifiers.keys():\n- name = inFeat.qualifiers[identifier][0] + "." + inFeat.type \n- if num > 0:\n- name += "." + str(num)\n- else:\n- name = recName + "." + inFeat.type + "." + str(num)\n- \n- outFeat = gffSeqFeature(inFeat.location, inFeat.type, \'\', inFeat.strand, name, inFeat.qualifiers, None, None, None, shift, 0, "GbkToGff")\n- outFeat.qualifiers["ID"] = [name] \n- return outFeat\n-\n-def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile):\n-\n- ofh = sys.stdout\n- if outFile:\n- ofh = outFile\n-\n- outRec = []\n- failed = 0\n- for rec in SeqIO.parse(inFile, "genbank"):\n- recID = rec.name\n-\n- if len(str(rec.seq)) > 0:\n- seqs_pending_writes = True\n- outSeq = str(rec.seq)\n- seqLen = len(outSeq)\n-\n- locBucket = {}\n- outFeats = []\n- topTypeDict = {}\n- seekingParent = []\n- geneNum = 0\n- autoGeneNum = 0\n- for feat in rec.features:\n- if identifier not in feat.qualifiers.keys(): #Allow metadata features and other features with no ID (Output warning?) - AJC\n- if feat.type in bottomFeatTypes:\n- seekingParent.append([feat, [], []]) # [Feature, all parent candidates, strongest parent candidates]\n- continue\n- elif feat.type not in topTypeDict.keys():\n- topTypeDict[feat.type] = 1\n- else:\n- topTypeDict[feat.type] += 1\n- outFeats.append(makeGffFeat(feat, topTypeDict[feat.type], recID, identifier))\n- continue\n- elif feat.qualifiers[identifier][0] not in locBucket.keys():\n- locBucket[feat.qualifiers[identifier][0]] = []\n- locBucket[feat.qualifiers[identifier][0]].append(feat)\n-\n- for locus in locBucket.keys():\n- minLoc = locBucket[locus][0].location.start\n- maxLoc = locBucket[locus][0].location.end\n- for feat in locBucket[locus]:\n- minLoc = min(minLoc, feat.location.start)\n- maxLoc = max(maxLoc, feat.location.end)\n- for x in seekingParent:\n- if x[0].location.start >= minLoc and x[0].location.end <= maxLoc:\n- x[1].append(locus)\n- if x[0].location.start == minLoc or x[0].location.end == maxLoc:\n- x[2].append(locus)\n-\n- for x in seekingParent: #Reformat to [Feature, Locus, Unused/Free]\n- if len(x[2]) == 1:\n- finList = ""\n- if len(x[1]) > 1:\n- for loc in x[1]:\n- if loc != x[2][0]:\n- finList += loc + ", "\n- finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were " + finList[0:-2] + "."\n- else:\n- finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived."\n- if "Notes" not in x[0].qualifiers.keys():\n- x[0].qualifiers["Notes"] = []\n- x[0].qualifiers["Notes"].append(finList)\n- x[1] = x[2][0]\n- elif len(x[2]) > 1:\n- candidate = x[2][0] #Arbitrarily choose first one\n- finList = ""\n- strongList = ""\n- f'..b'peDict[x] = 1\n- \n- if not topFeat:\n- if makeGene:\n- if midFeat:\n- possibleStrand = midFeat.strand\n- else:\n- possibleStrand = bottomFeats[0].strand\n- tempName = recID + ".gene." + str(geneNum)\n- tempQuals = {identifier : [locus], "ID" : [tempName], "Notes" : ["Gene feature automatically generated by Gbk to GFF conversion"]}\n- topFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, possibleStrand), \'gene\', \'\', possibleStrand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")\n- else:\n- sys.stderr.write("Unable to create a feature heirarchy at location [%d, %d] with features: \\n" % (minLoc, maxLoc))\n- for x in locBucket[locus]:\n- sys.stderr.write(str(x))\n- sys.stderr.write(\'\\n\')\n- failed = 1\n- continue\n-\n- outFeats.append(makeGffFeat(topFeat, 0, recID, identifier))\n- if not midFeat and topFeat.type == "gene" and makeMRNA:\n- if identifier in topFeat.qualifiers.keys():\n- tempName = topFeat.qualifiers[identifier][0] + ".mRNA"\n- tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]}\n- else:\n- tempName = outFeats[-1].ID + ".mRNA"\n- tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]}\n- midFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, topFeat.strand), \'mRNA\', \'\', topFeat.strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")\n- \n- if midFeat: # Again, need a new if statement if we want to handle multiple mid-tier features\n- outFeats[-1].sub_features.append(makeGffFeat(midFeat, 0, recID, identifier))\n- outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]\n- for x in bottomFeats:\n- typeDict[x.type] += 1\n- outFeats[-1].sub_features[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier))\n- outFeats[-1].sub_features[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].sub_features[-1].id]\n- else: # No midFeat, append bottom feats directly to top feats \n- for x in bottomFeats:\n- typeDict[x.type] += 1\n- outFeats[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier))\n- outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]\n- \n- outRec.append(SeqRecord(rec.seq, recID, rec.name, rec.description, rec.dbxrefs, sorted(outFeats, key=lambda x: x.location.start), rec.annotations, rec.letter_annotations))\n- SeqIO.write([outRec[-1]], fastaFile, "fasta")\n- gffWrite(outRec, ofh) \n- exit(failed) # 0 if all features handled, 1 if unable to handle some\n-\n-\n-if __name__ == \'__main__\':\n- parser = argparse.ArgumentParser( description=\'Biopython solution to Gbk to GFF conversion\')\n-\n- parser.add_argument(\'inFile\', type=argparse.FileType("r"), help=\'Path to an input GBK file\' )\n- parser.add_argument(\'--makeMRNA\', action="store_true", required=False, help="Automatically create mRNA features")\n- parser.add_argument(\'--makeGene\', action="store_true", required=False, help="Automatically create missing Gene features")\n- parser.add_argument(\'--identifier\', type=str, default="locus_tag", required=False, help="Qualifier to derive ID property from")\n- parser.add_argument(\'--fastaFile\', type=argparse.FileType("w"), help=\'Fasta output for sequences\' )\n- parser.add_argument(\'--outFile\', type=argparse.FileType("w"), help=\'GFF feature output\' )\n- args = parser.parse_args()\n- main(**vars(args))\n-\n-\n-\n-\n-\n-\n-\n-\n' |
b |
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbk_to_gff/macros.xml --- a/cpt_gbk_to_gff/macros.xml Fri Jun 17 12:46:43 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,105 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="requirements"> - <requirements> - <requirement type="package" version="3.8.13">python</requirement> - <requirement type="package" version="1.79">biopython</requirement> - <requirement type="package" version="1.2.2">cpt_gffparser</requirement> - <yield/> - </requirements> - </xml> - <xml name="ldap_ref" - token_name="dn_ref" - token_label="Pick a DN" - token_fromfile="ldap_people.loc"> - <repeat name="repeat_@NAME@" title="@LABEL@"> - <param name="@NAME@" label="Select a @LABEL@" type="select"> - <options from_file="@FROMFILE@"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </repeat> - </xml> - <xml name="ldap_ref_single" - token_name="dn_ref" - token_label="Pick a DN" - token_fromfile="ldap_people.loc"> - <param name="@NAME@" label="Select a @LABEL@" type="select"> - <options from_file="@FROMFILE@"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </xml> - <xml name="gbk_feature_type" - token_label="Feature type to remove" - token_multiple="True" - token_optional="False" - token_name="positional_2"> - <param label="@LABEL@" optional="@TOKEN_OPTIONAL" multiple="@MULTIPLE@" name="feature_type" type="select"> - <option value="-10_signal">-10_signal</option> - <option value="-35_signal">-35_signal</option> - <option value="3'UTR">3'UTR</option> - <option value="5'UTR">5'UTR</option> - <option value="CAAT_signal">CAAT_signal</option> - <option selected="true" value="CDS">CDS</option> - <option value="C_region">C_region</option> - <option value="D-loop">D-loop</option> - <option value="D_segment">D_segment</option> - <option value="GC_signal">GC_signal</option> - <option value="J_segment">J_segment</option> - <option value="LTR">LTR</option> - <option value="N_region">N_region</option> - <option value="RBS">RBS</option> - <option value="STS">STS</option> - <option value="S_region">S_region</option> - <option value="TATA_signal">TATA_signal</option> - <option value="V_region">V_region</option> - <option value="V_segment">V_segment</option> - <option value="all">all</option> - <option value="assembly_gap">assembly_gap</option> - <option value="attenuator">attenuator</option> - <option value="enhancer">enhancer</option> - <option value="exon">exon</option> - <option value="gap">gap</option> - <option value="gene">gene</option> - <option value="iDNA">iDNA</option> - <option value="intron">intron</option> - <option value="mRNA">mRNA</option> - <option value="mat_peptide">mat_peptide</option> - <option value="misc_RNA">misc_RNA</option> - <option value="misc_binding">misc_binding</option> - <option value="misc_difference">misc_difference</option> - <option value="misc_feature">misc_feature</option> - <option value="misc_recomb">misc_recomb</option> - <option value="misc_signal">misc_signal</option> - <option value="misc_structure">misc_structure</option> - <option value="mobile_element">mobile_element</option> - <option value="modified_base">modified_base</option> - <option value="ncRNA">ncRNA</option> - <option value="old_sequence">old_sequence</option> - <option value="operon">operon</option> - <option value="oriT">oriT</option> - <option value="polyA_signal">polyA_signal</option> - <option value="polyA_site">polyA_site</option> - <option value="precursor_RNA">precursor_RNA</option> - <option value="prim_transcript">prim_transcript</option> - <option value="primer_bind">primer_bind</option> - <option value="promoter">promoter</option> - <option value="protein_bind">protein_bind</option> - <option value="rRNA">rRNA</option> - <option value="rep_origin">rep_origin</option> - <option value="repeat_region">repeat_region</option> - <option value="sig_peptide">sig_peptide</option> - <option value="source">source</option> - <option value="stem_loop">stem_loop</option> - <option value="tRNA">tRNA</option> - <option value="terminator">terminator</option> - <option value="tmRNA">tmRNA</option> - <option value="transit_peptide">transit_peptide</option> - <option value="unsure">unsure</option> - <option value="variation">variation</option> - </param> - </xml> -</macros> |
b |
diff -r a68f32350196 -r bb6332a85aa6 gbk_to_gff3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gbk_to_gff3.py Mon Jun 05 02:43:04 2023 +0000 |
[ |
b'@@ -0,0 +1,477 @@\n+#!/usr/bin/env python\n+\n+import argparse\n+import sys\n+\n+from Bio import SeqIO\n+from Bio.SeqRecord import SeqRecord\n+from Bio.SeqFeature import FeatureLocation\n+from CPT_GFFParser import gffSeqFeature, gffWrite\n+\n+bottomFeatTypes = ["exon", "RBS", "CDS"]\n+\n+\n+def makeGffFeat(inFeat, num, recName, identifier):\n+ if inFeat.type == "RBS" or (\n+ inFeat.type == "regulatory"\n+ and "regulatory_class" in inFeat.qualifiers.keys()\n+ and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site"\n+ ):\n+ inFeat.type = "Shine_Dalgarno_sequence"\n+ if "codon_start" in inFeat.qualifiers.keys():\n+ shift = int(inFeat.qualifiers["codon_start"][0]) - 1\n+ else:\n+ shift = "."\n+ if identifier in inFeat.qualifiers.keys():\n+ name = inFeat.qualifiers[identifier][0] + "." + inFeat.type\n+ if num > 0:\n+ name += "." + str(num)\n+ else:\n+ name = recName + "." + inFeat.type + "." + str(num)\n+\n+ outFeat = gffSeqFeature(\n+ inFeat.location,\n+ inFeat.type,\n+ "",\n+ inFeat.strand,\n+ name,\n+ inFeat.qualifiers,\n+ None,\n+ None,\n+ None,\n+ shift,\n+ 0,\n+ "GbkToGff",\n+ )\n+ outFeat.qualifiers["ID"] = [name]\n+ return outFeat\n+\n+\n+def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile):\n+\n+ ofh = sys.stdout\n+ if outFile:\n+ ofh = outFile\n+\n+ outRec = []\n+ failed = 0\n+ for rec in SeqIO.parse(inFile, "genbank"):\n+ recID = rec.name\n+\n+ if len(str(rec.seq)) > 0:\n+ seqs_pending_writes = True\n+ outSeq = str(rec.seq)\n+ seqLen = len(outSeq)\n+\n+ locBucket = {}\n+ outFeats = []\n+ topTypeDict = {}\n+ seekingParent = []\n+ geneNum = 0\n+ autoGeneNum = 0\n+ for feat in rec.features:\n+ if (\n+ identifier not in feat.qualifiers.keys()\n+ ): # Allow metadata features and other features with no ID (Output warning?) - AJC\n+ if feat.type in bottomFeatTypes:\n+ seekingParent.append(\n+ [feat, [], []]\n+ ) # [Feature, all parent candidates, strongest parent candidates]\n+ continue\n+ elif feat.type not in topTypeDict.keys():\n+ topTypeDict[feat.type] = 1\n+ else:\n+ topTypeDict[feat.type] += 1\n+ outFeats.append(\n+ makeGffFeat(feat, topTypeDict[feat.type], recID, identifier)\n+ )\n+ continue\n+ elif feat.qualifiers[identifier][0] not in locBucket.keys():\n+ locBucket[feat.qualifiers[identifier][0]] = []\n+ locBucket[feat.qualifiers[identifier][0]].append(feat)\n+\n+ for locus in locBucket.keys():\n+ minLoc = locBucket[locus][0].location.start\n+ maxLoc = locBucket[locus][0].location.end\n+ for feat in locBucket[locus]:\n+ minLoc = min(minLoc, feat.location.start)\n+ maxLoc = max(maxLoc, feat.location.end)\n+ for x in seekingParent:\n+ if x[0].location.start >= minLoc and x[0].location.end <= maxLoc:\n+ x[1].append(locus)\n+ if x[0].location.start == minLoc or x[0].location.end == maxLoc:\n+ x[2].append(locus)\n+\n+ for x in seekingParent: # Reformat to [Feature, Locus, Unused/Free]\n+ if len(x[2]) == 1:\n+ finList = ""\n+ if len(x[1]) > 1:\n+ for loc in x[1]:\n+ if loc != x[2][0]:\n+ finList += loc + ", "\n+ finList = (\n+ str(x[0].type)\n+ + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were "\n+ + finList[0:-2]\n+ '..b' tempQuals = {\n+ identifier: topFeat.qualifiers[identifier],\n+ "ID": [tempName],\n+ "Notes": [\n+ "mRNA feature automatically generated by Gbk to GFF conversion"\n+ ],\n+ }\n+ else:\n+ tempName = outFeats[-1].ID + ".mRNA"\n+ tempQuals = {\n+ identifier: topFeat.qualifiers[identifier],\n+ "ID": [tempName],\n+ "Notes": [\n+ "mRNA feature automatically generated by Gbk to GFF conversion"\n+ ],\n+ }\n+ midFeat = gffSeqFeature(\n+ FeatureLocation(minLoc, maxLoc, topFeat.strand),\n+ "mRNA",\n+ "",\n+ topFeat.strand,\n+ tempName,\n+ tempQuals,\n+ None,\n+ None,\n+ None,\n+ ".",\n+ 0,\n+ "GbkToGff",\n+ )\n+\n+ if (\n+ midFeat\n+ ): # Again, need a new if statement if we want to handle multiple mid-tier features\n+ outFeats[-1].sub_features.append(\n+ makeGffFeat(midFeat, 0, recID, identifier)\n+ )\n+ outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]\n+ for x in bottomFeats:\n+ typeDict[x.type] += 1\n+ outFeats[-1].sub_features[-1].sub_features.append(\n+ makeGffFeat(x, typeDict[x.type], recID, identifier)\n+ )\n+ outFeats[-1].sub_features[-1].sub_features[-1].qualifiers[\n+ "Parent"\n+ ] = [outFeats[-1].sub_features[-1].id]\n+ else: # No midFeat, append bottom feats directly to top feats\n+ for x in bottomFeats:\n+ typeDict[x.type] += 1\n+ outFeats[-1].sub_features.append(\n+ makeGffFeat(x, typeDict[x.type], recID, identifier)\n+ )\n+ outFeats[-1].sub_features[-1].qualifiers["Parent"] = [\n+ outFeats[-1].id\n+ ]\n+\n+ outRec.append(\n+ SeqRecord(\n+ rec.seq,\n+ recID,\n+ rec.name,\n+ rec.description,\n+ rec.dbxrefs,\n+ sorted(outFeats, key=lambda x: x.location.start),\n+ rec.annotations,\n+ rec.letter_annotations,\n+ )\n+ )\n+ SeqIO.write([outRec[-1]], fastaFile, "fasta")\n+ gffWrite(outRec, ofh)\n+ exit(failed) # 0 if all features handled, 1 if unable to handle some\n+\n+\n+if __name__ == "__main__":\n+ parser = argparse.ArgumentParser(\n+ description="Biopython solution to Gbk to GFF conversion"\n+ )\n+\n+ parser.add_argument(\n+ "inFile", type=argparse.FileType("r"), help="Path to an input GBK file"\n+ )\n+ parser.add_argument(\n+ "--makeMRNA",\n+ action="store_true",\n+ required=False,\n+ help="Automatically create mRNA features",\n+ )\n+ parser.add_argument(\n+ "--makeGene",\n+ action="store_true",\n+ required=False,\n+ help="Automatically create missing Gene features",\n+ )\n+ parser.add_argument(\n+ "--identifier",\n+ type=str,\n+ default="locus_tag",\n+ required=False,\n+ help="Qualifier to derive ID property from",\n+ )\n+ parser.add_argument(\n+ "--fastaFile", type=argparse.FileType("w"), help="Fasta output for sequences"\n+ )\n+ parser.add_argument(\n+ "--outFile", type=argparse.FileType("w"), help="GFF feature output"\n+ )\n+ args = parser.parse_args()\n+ main(**vars(args))\n' |
b |
diff -r a68f32350196 -r bb6332a85aa6 macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Mon Jun 05 02:43:04 2023 +0000 |
b |
@@ -0,0 +1,74 @@ +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package">progressivemauve</requirement> + <!--<requirement type="package" version="2.7">python</requirement>--> + <requirement type="package" version="0.6.4">bcbiogff</requirement> + <yield/> + </requirements> + </xml> + <token name="@WRAPPER_VERSION@">2.4.0</token> + <xml name="citation/progressive_mauve"> + <citation type="doi">10.1371/journal.pone.0011147</citation> + </xml> + <xml name="citation/gepard"> + <citation type="doi">10.1093/bioinformatics/btm039</citation> + </xml> + <token name="@XMFA_INPUT@"> + '$xmfa' + </token> + <xml name="xmfa_input" token_formats="xmfa"> + <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA"/> + </xml> + <token name="@XMFA_FA_INPUT@"> + '$sequences' + </token> + <xml name="xmfa_fa_input"> + <param type="data" format="fasta" name="sequences" label="Sequences in alignment" help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/> + </xml> + <xml name="genome_selector"> + <conditional name="reference_genome"> + <param name="reference_genome_source" type="select" label="Reference Genome"> + <option value="history" selected="True">From History</option> + <option value="cached">Locally Cached</option> + </param> + <when value="cached"> + <param name="fasta_indexes" type="select" label="Source FASTA Sequence"> + <options from_data_table="all_fasta"/> + </param> + </when> + <when value="history"> + <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> + </when> + </conditional> + </xml> + <xml name="gff3_input"> + <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> + </xml> + <xml name="input/gff3+fasta"> + <expand macro="gff3_input"/> + <expand macro="genome_selector"/> + </xml> + <token name="@INPUT_GFF@"> + '$gff3_data' + </token> + <token name="@INPUT_FASTA@"> + #if str($reference_genome.reference_genome_source) == 'cached': + '${reference_genome.fasta_indexes.fields.path}' + #else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa + #end if + </token> + <token name="@GENOME_SELECTOR_PRE@"> + #if $reference_genome.reference_genome_source == 'history': + ln -s '$reference_genome.genome_fasta' genomeref.fa; + #end if + </token> + <token name="@GENOME_SELECTOR@"> + #if str($reference_genome.reference_genome_source) == 'cached': + '${reference_genome.fasta_indexes.fields.path}' + #else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa + #end if + </token> +</macros> |