Repository 'project_rm'
hg clone https://toolshed.g2.bx.psu.edu/repos/gianmarco_piccinno/project_rm

Changeset 37:9225a7649585 (2019-02-11)
Previous changeset 36:b52b91e34d15 (2018-12-12) Next changeset 38:18e79593c628 (2019-02-11)
Commit message:
Uploaded
added:
project_rm/codon_usage.py
project_rm/codon_usage.xml
removed:
codon_usage.py
codon_usage.xml
b
diff -r b52b91e34d15 -r 9225a7649585 codon_usage.py
--- a/codon_usage.py Wed Dec 12 09:10:57 2018 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,148 +0,0 @@
-#!/usr/bin/env python
-
-import Bio as Bio
-from Bio import SeqIO
-from Bio.Data import CodonTable
-import re
-from pprint import pprint
-import argparse as ap
-import sys
-import os
-import pandas as pd
-
-
-def read_input(data = "example.fna", type_ = "fasta"):
-
-    """
-    Accepted formats:
-        - fasta (multifasta)
-        - gff
-        - gbk
-
-    """
-
-
-    seqs = ""
-
-    if type_ == "fasta":
-        with open(data, "rU") as handle:
-            for record in SeqIO.parse(handle, type_):
-                seqs = seqs + str(record.seq)
-
-
-    #elif type_ == "gff":
-    #    with open(data, "rU") as handle:
-    #        for record in GFF.parse(handle):
-    #            seqs = seqs + str(record.seq)
-
-    elif type_ == "gbk":
-        with open(data, "rU") as input_handle:
-                for record in SeqIO.parse(input_handle, "genbank"):
-                    seqs = seqs + str(record.seq)
-
-
-    return seqs
-
-def codon_usage(seqs, codonTable):
-
-    codon_usage = {}
-    tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
-
-    b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
-
-
-    for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
-        b_cod_table[cod] = "_Stop"
-
-    for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
-            b_cod_table[cod + " Start"] = b_cod_table[cod]
-            b_cod_table.pop(cod)
-
-    aas = set(b_cod_table.values())
-
-
-    for aa in aas:
-        codon_usage[aa] = {}
-        for codon in b_cod_table.keys():
-            if b_cod_table[codon] == aa:
-                codon_usage[aa][codon] = tmp.count(codon.split(" ")[0])
-
-
-    tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
-
-    codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
-    codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
-    codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
-
-    return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
-
-if __name__ == '__main__':
-
-    parser = ap.ArgumentParser(description=
-    'This script takes as input gff, gbk and single or multifasta files and \n'
-    'compute the codon usage for a specified codon table.\n'
-    'Usage:\n'
-    'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n'
-    'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n'
-    'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n',
-    formatter_class=ap.RawTextHelpFormatter)
-
-    parser.add_argument('-i','--input', help='The path to the input file',required=True)
-    parser.add_argument('-t','--type', help=
-    'The format of the file [genebank, fasta, gff ...]', required=True)
-    parser.add_argument('-c','--codonTable', help=
-    'The codon table to be used [Standard, Bacterial, Archaeal ...]\n'
-    'Alternative Flatworm Mitochondrial,\\n'
-    'Alternative Yeast Nuclear,\n'
-    'Archaeal,\n'
-    'Ascidian Mitochondrial,\n'
-    'Bacterial,\n'
-    'Blastocrithidia Nuclear,\n'
-    'Blepharisma Macronuclear,\n'
-    'Candidate Division SR1,\n'
-    'Chlorophycean Mitochondrial,\n'
-    'Ciliate Nuclear,\n'
-    'Coelenterate Mitochondrial,\n'
-    'Condylostoma Nuclear,\n'
-    'Dasycladacean Nuclear,\n'
-    'Echinoderm Mitochondrial,\n'
-    'Euplotid Nuclear,\n'
-    'Flatworm Mitochondrial,\n'
-    'Gracilibacteria,\n'
-    'Hexamita Nuclear,\n'
-    'Invertebrate Mitochondrial,\n'
-    'Karyorelict Nuclear,\n'
-    'Mesodinium Nuclear,\n'
-    'Mold Mitochondrial,\n'
-    'Mycoplasma,\n'
-    'Pachysolen tannophilus Nuclear,\n'
-    'Peritrich Nuclear,\n'
-    'Plant Plastid,\n'
-    'Protozoan Mitochondrial,\n'
-    'Pterobranchia Mitochondrial,\n'
-    'SGC0,\n'
-    'SGC1,\n'
-    'SGC2,\n'
-    'SGC3,\n'
-    'SGC4,\n'
-    'SGC5,\n'
-    'SGC8,\n'
-    'SGC9,\n'
-    'Scenedesmus obliquus Mitochondrial,\n'
-    'Spiroplasma,\n'
-    'Standard,\n'
-    'Thraustochytrium Mitochondrial,\n'
-    'Trematode Mitochondrial,\n'
-    'Vertebrate Mitochondrial,\n'
-    'Yeast Mitochondrial\n', required=True)
-
-    parser.add_argument('-o','--output', help='Description for bar argument', required=True)
-    args = vars(parser.parse_args())
-
-    seqs = read_input(data=args['input'], type_=args['type'])
-    out = codon_usage(seqs, args['codonTable'])
-
-    with open(args['output'], "w") as outf:
-        out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"])
-
-    
\ No newline at end of file
b
diff -r b52b91e34d15 -r 9225a7649585 codon_usage.xml
--- a/codon_usage.xml Wed Dec 12 09:10:57 2018 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,40 +0,0 @@
-<?xml version="1.0"?>
-<tool name="Codon Usage" id="codon_usage" version="0.2.3">
-  <description>for each sequence in a file</description>
-  <requirements>
-    <requirement type="package" version="2.7.15">python</requirement>
-    <requirement type="package" version="1.72">biopython</requirement>
-    <requirement type="package" version="0.23.4">pandas</requirement>
-    <requirement type="package" version="1.15.3">numpy</requirement>
-  </requirements>
-
-  <command>
-<![CDATA[
-  python '$__tool_directory__/codon_usage.py' -i $input -t $input_type -o $output -c $codon_table
-]]>
-  </command>
-  <inputs>
-    <param name="input" format="fasta" type="data" label="Source file"/>
-
-    <param name="input_type" type="select" format="text">
- <label>Indicate the input file format</label>
- <option value="fasta">Fasta</option>
- <option value="gbk">gbk</option>
- </param>
-
-    <param name="codon_table" type="select" format="text">
- <label>Choose the proper codon table for your organism)</label>
-        <option value="Archaeal"> Archaeal</option>
-        <option value="Bacterial">Bacterial</option>
-        <option value="Standard">Standard</option>
- </param>
-  </inputs>
-
-  <outputs>
-    <data format="tabular" name="output" />
-  </outputs>
-
-  <help>
-This tool compute codon usage of an annotated genome [preferably Prokaryotes].
-  </help>
-</tool>
b
diff -r b52b91e34d15 -r 9225a7649585 project_rm/codon_usage.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/project_rm/codon_usage.py Mon Feb 11 04:09:02 2019 -0500
[
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+
+import Bio as Bio
+from Bio import SeqIO
+from Bio.Data import CodonTable
+import re
+from pprint import pprint
+import argparse as ap
+import sys
+import os
+import pandas as pd
+
+
+def read_input(data = "example.fna", type_ = "fasta"):
+
+    """
+    Accepted formats:
+        - fasta (multifasta)
+        - gff
+        - gbk
+
+    """
+
+    seqs = ""
+
+    if type_ == "fasta":
+        with open(data, "rU") as handle:
+            for record in SeqIO.parse(handle, type_):
+                seqs = seqs + str(record.seq)
+
+    elif type_ == "gbk":
+        with open(data, "rU") as input_handle:
+            types = []
+            for record in SeqIO.parse(input_handle, "genbank"):
+                for feature in record.features:
+                    types.append(feature.type)
+                    if feature.type == "CDS":
+                        if feature.location.strand == +1:
+                            seq = record.seq[feature.location.start:feature.location.end]
+                            seqs = seqs + str(seq)
+                        elif feature.location.strand == -1:
+                            seq = record.seq[feature.location.start:feature.location.end].reverse_complement
+                            seqs = seqs + str(seq)
+    return seqs
+
+def codon_usage(seqs, codonTable):
+
+    codon_usage = {}
+    tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
+
+    b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
+
+
+    for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
+        b_cod_table[cod] = "_Stop"
+
+    for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
+            b_cod_table[cod + " Start"] = b_cod_table[cod]
+            b_cod_table.pop(cod)
+
+    aas = set(b_cod_table.values())
+
+
+    for aa in aas:
+        codon_usage[aa] = {}
+        for codon in b_cod_table.keys():
+            if b_cod_table[codon] == aa:
+                codon_usage[aa][codon] = tmp.count(codon.split(" ")[0])
+
+
+    tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
+
+    codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
+    codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
+    codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
+
+    return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
+
+if __name__ == '__main__':
+
+    parser = ap.ArgumentParser(description=
+    'This script takes as input gff, gbk and single or multifasta files and \n'
+    'compute the codon usage for a specified codon table.\n'
+    'Usage:\n'
+    'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n'
+    'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n'
+    'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n',
+    formatter_class=ap.RawTextHelpFormatter)
+
+    parser.add_argument('-i','--input', help='The path to the input file',required=True)
+    parser.add_argument('-t','--type', help=
+    'The format of the file [genebank, fasta, gff ...]', required=True)
+    parser.add_argument('-c','--codonTable', help=
+    'The codon table to be used [Standard, Bacterial, Archaeal ...]\n'
+    'Alternative Flatworm Mitochondrial,\\n'
+    'Alternative Yeast Nuclear,\n'
+    'Archaeal,\n'
+    'Ascidian Mitochondrial,\n'
+    'Bacterial,\n'
+    'Blastocrithidia Nuclear,\n'
+    'Blepharisma Macronuclear,\n'
+    'Candidate Division SR1,\n'
+    'Chlorophycean Mitochondrial,\n'
+    'Ciliate Nuclear,\n'
+    'Coelenterate Mitochondrial,\n'
+    'Condylostoma Nuclear,\n'
+    'Dasycladacean Nuclear,\n'
+    'Echinoderm Mitochondrial,\n'
+    'Euplotid Nuclear,\n'
+    'Flatworm Mitochondrial,\n'
+    'Gracilibacteria,\n'
+    'Hexamita Nuclear,\n'
+    'Invertebrate Mitochondrial,\n'
+    'Karyorelict Nuclear,\n'
+    'Mesodinium Nuclear,\n'
+    'Mold Mitochondrial,\n'
+    'Mycoplasma,\n'
+    'Pachysolen tannophilus Nuclear,\n'
+    'Peritrich Nuclear,\n'
+    'Plant Plastid,\n'
+    'Protozoan Mitochondrial,\n'
+    'Pterobranchia Mitochondrial,\n'
+    'SGC0,\n'
+    'SGC1,\n'
+    'SGC2,\n'
+    'SGC3,\n'
+    'SGC4,\n'
+    'SGC5,\n'
+    'SGC8,\n'
+    'SGC9,\n'
+    'Scenedesmus obliquus Mitochondrial,\n'
+    'Spiroplasma,\n'
+    'Standard,\n'
+    'Thraustochytrium Mitochondrial,\n'
+    'Trematode Mitochondrial,\n'
+    'Vertebrate Mitochondrial,\n'
+    'Yeast Mitochondrial\n', required=True)
+
+    parser.add_argument('-o','--output', help='Description for bar argument', required=True)
+    args = vars(parser.parse_args())
+
+    seqs = read_input(data=args['input'], type_=args['type'])
+    out = codon_usage(seqs, args['codonTable'])
+
+    with open(args['output'], "w") as outf:
+        out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"])
+
+    
b
diff -r b52b91e34d15 -r 9225a7649585 project_rm/codon_usage.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/project_rm/codon_usage.xml Mon Feb 11 04:09:02 2019 -0500
[
@@ -0,0 +1,40 @@
+<?xml version="1.0"?>
+<!-- Galaxy wrapper for codon_usage.py: runs the script on one dataset and
+     returns the codon usage as a tabular dataset. -->
+<tool name="Codon Usage" id="codon_usage" version="0.2.3">
+  <description>for each sequence in a file</description>
+  <requirements>
+    <requirement type="package" version="2.7.15">python</requirement>
+    <requirement type="package" version="1.72">biopython</requirement>
+    <requirement type="package" version="0.23.4">pandas</requirement>
+    <requirement type="package" version="1.15.3">numpy</requirement>
+  </requirements>
+
+  <command>
+<![CDATA[
+  python '$__tool_directory__/codon_usage.py' -i '$input' -t '$input_type' -o '$output' -c '$codon_table'
+]]>
+  </command>
+  <inputs>
+    <!-- Both formats offered by input_type must be selectable here. -->
+    <param name="input" format="fasta,genbank" type="data" label="Source file"/>
+
+    <!-- The option values are the type names understood by codon_usage.py. -->
+    <param name="input_type" type="select" label="Indicate the input file format">
+      <option value="fasta">Fasta</option>
+      <option value="gbk">gbk</option>
+    </param>
+
+    <!-- The option values are passed to codon_usage.py as the codon table name. -->
+    <param name="codon_table" type="select" label="Choose the proper codon table for your organism">
+      <option value="Archaeal">Archaeal</option>
+      <option value="Bacterial">Bacterial</option>
+      <option value="Standard">Standard</option>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data format="tabular" name="output" />
+  </outputs>
+
+  <help>
+This tool computes the codon usage of an annotated genome [preferably Prokaryotes].
+  </help>
+</tool>