Repository 'project_rm'
hg clone https://toolshed.g2.bx.psu.edu/repos/gianmarco_piccinno/project_rm

Changeset 0:92d2b0a37086 (2018-12-09)
Next changeset 1:987ac30b5bb0 (2018-12-09)
Commit message:
Uploaded-dev
added:
Project_RM/codon_usage2.py
Project_RM/codon_usage_complete.xml
b
diff -r 000000000000 -r 92d2b0a37086 Project_RM/codon_usage2.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Project_RM/codon_usage2.py Sun Dec 09 06:00:26 2018 -0500
[
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+
+import Bio as Bio
+from Bio import SeqIO
+from Bio.Data import CodonTable
+import re
+from pprint import pprint
+import argparse as ap
+import sys
+import os
+import pandas as pd
+#from BCBio import GFF
+
+
+def read_input(data = "example.fna", type_ = "fasta"):
+
+    """
+    Read every record of a sequence file and return them as one string.
+
+    Accepted formats (value of ``type_``):
+        - fasta (single or multifasta)
+        - gbk (GenBank; "genbank" and "genebank" are accepted as aliases)
+
+    GFF is not supported: the GFF parser import (BCBio) is commented out in
+    this script, so there is nothing to parse such files with.
+
+    Parameters:
+        data:  path to the input file.
+        type_: name of the input format, one of the values listed above.
+
+    Returns:
+        One string holding the sequences of all records, concatenated in
+        file order.
+
+    Raises:
+        ValueError: if ``type_`` is not one of the accepted formats.
+    """
+
+    # Names accepted from the caller -> format names understood by SeqIO.
+    formats = {
+        "fasta": "fasta",
+        "gbk": "genbank",
+        "genbank": "genbank",
+        "genebank": "genbank",
+    }
+
+    if type_ not in formats:
+        # Fail loudly: returning "" here would silently yield an all-zero
+        # codon table further down the pipeline.
+        raise ValueError(
+            "Unsupported input type %r; expected one of: %s"
+            % (type_, ", ".join(sorted(formats))))
+
+    # Plain "r" instead of "rU": the "U" flag was removed in Python 3.11 and
+    # text mode already translates newlines on Python 3.
+    with open(data, "r") as handle:
+        # join() avoids re-copying the growing string for every record.
+        return "".join(
+            str(record.seq) for record in SeqIO.parse(handle, formats[type_]))
+
+def codon_usage(seqs, codonTable):
+
+    """
+    Count codon occurrences in ``seqs`` and group them by amino acid.
+
+    ``seqs`` is cut into consecutive, non-overlapping triplets starting at
+    its first character; each triplet is looked up in the chosen table.
+    Stop codons are grouped under the pseudo amino acid "_Stop", and codons
+    listed as start codons are reported with the label "<codon> Start".
+
+    Parameters:
+        seqs:       nucleotide string to analyse.
+        codonTable: key of ``CodonTable.unambiguous_dna_by_name``
+                    (e.g. "Standard", "Bacterial").
+
+    Returns:
+        A dict with three views of the same counts:
+        "Dictionary": {amino acid: {codon label: count}},
+        "Tuples":     {(amino acid, codon label): count},
+        "Table":      DataFrame indexed by (AA, Codon) with the columns
+                      "Count" and "Proportion" (share of the codon within
+                      its amino acid, rounded to 2 decimals).
+
+    Raises:
+        KeyError: if ``codonTable`` is not a known table name.
+    """
+
+    # Local import so this function does not depend on the file header.
+    from collections import Counter
+
+    table = CodonTable.unambiguous_dna_by_name[codonTable]
+
+    # Work on a copy: forward_table belongs to the shared table object, and
+    # editing it in place made a second call fail on the pop() below.
+    b_cod_table = dict(table.forward_table)
+
+    for cod in table.stop_codons:
+        b_cod_table[cod] = "_Stop"
+
+    for cod in table.start_codons:
+        b_cod_table[cod + " Start"] = b_cod_table.pop(cod)
+
+    # One pass over the sequence instead of one list.count() per codon.
+    counts = Counter(re.findall(r'\w{3}', seqs))
+
+    usage = {}
+    for codon, aa in b_cod_table.items():
+        # Strip the " Start" marker to get back the bare triplet.
+        usage.setdefault(aa, {})[codon] = counts[codon.split(" ")[0]]
+
+    tups = {(aa, codon): n
+            for aa, codons in usage.items()
+            for codon, n in codons.items()}
+
+    # Sort so the table order does not depend on dict iteration order.
+    codon_usage_ = pd.DataFrame(pd.Series(tups).sort_index(), columns = ["Count"])
+    codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
+    # Transform the "Count" Series so the result is a Series, not a frame.
+    codon_usage_['Proportion'] = codon_usage_["Count"].groupby(level=0).transform(
+        lambda x: (x / x.sum()).round(2))
+
+    return {"Dictionary": usage, "Tuples": tups, "Table": codon_usage_}
+
+if __name__ == '__main__':
+
+    # Command-line entry point: parse the arguments, read the sequences,
+    # compute the codon usage and write the table as tab-separated text.
+    parser = ap.ArgumentParser(description=
+    'This script takes as input gbk and single or multifasta files and \n'
+    'computes the codon usage for a specified codon table.\n'
+    'Usage:\n'
+    'python codon_usage2.py -i example.gbk -t gbk -o gbk_example -c Bacterial\n'
+    'python codon_usage2.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n',
+    formatter_class=ap.RawTextHelpFormatter)
+
+    parser.add_argument('-i','--input', help='The path to the input file',required=True)
+    # Only the values handled by read_input are advertised here.
+    parser.add_argument('-t','--type', help=
+    'The format of the file [fasta, gbk]', required=True)
+    parser.add_argument('-c','--codonTable', help=
+    'The codon table to be used [Standard, Bacterial, Archaeal ...]\n'
+    'Alternative Flatworm Mitochondrial,\n'
+    'Alternative Yeast Nuclear,\n'
+    'Archaeal,\n'
+    'Ascidian Mitochondrial,\n'
+    'Bacterial,\n'
+    'Blastocrithidia Nuclear,\n'
+    'Blepharisma Macronuclear,\n'
+    'Candidate Division SR1,\n'
+    'Chlorophycean Mitochondrial,\n'
+    'Ciliate Nuclear,\n'
+    'Coelenterate Mitochondrial,\n'
+    'Condylostoma Nuclear,\n'
+    'Dasycladacean Nuclear,\n'
+    'Echinoderm Mitochondrial,\n'
+    'Euplotid Nuclear,\n'
+    'Flatworm Mitochondrial,\n'
+    'Gracilibacteria,\n'
+    'Hexamita Nuclear,\n'
+    'Invertebrate Mitochondrial,\n'
+    'Karyorelict Nuclear,\n'
+    'Mesodinium Nuclear,\n'
+    'Mold Mitochondrial,\n'
+    'Mycoplasma,\n'
+    'Pachysolen tannophilus Nuclear,\n'
+    'Peritrich Nuclear,\n'
+    'Plant Plastid,\n'
+    'Protozoan Mitochondrial,\n'
+    'Pterobranchia Mitochondrial,\n'
+    'SGC0,\n'
+    'SGC1,\n'
+    'SGC2,\n'
+    'SGC3,\n'
+    'SGC4,\n'
+    'SGC5,\n'
+    'SGC8,\n'
+    'SGC9,\n'
+    'Scenedesmus obliquus Mitochondrial,\n'
+    'Spiroplasma,\n'
+    'Standard,\n'
+    'Thraustochytrium Mitochondrial,\n'
+    'Trematode Mitochondrial,\n'
+    'Vertebrate Mitochondrial,\n'
+    'Yeast Mitochondrial\n', required=True)
+
+    parser.add_argument('-o','--output', help=
+    'The path to the output file (tab-separated codon usage table)', required=True)
+    args = vars(parser.parse_args())
+
+    seqs = read_input(data=args['input'], type_=args['type'])
+    out = codon_usage(seqs, args['codonTable'])
+
+    # The (AA, Codon) index levels become the first two columns of the file.
+    with open(args['output'], "w") as outf:
+        out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"])
+    
\ No newline at end of file
b
diff -r 000000000000 -r 92d2b0a37086 Project_RM/codon_usage_complete.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Project_RM/codon_usage_complete.xml Sun Dec 09 06:00:26 2018 -0500
[
@@ -0,0 +1,39 @@
+<tool id="codon_usage" name="Codon Usage" version="0.1.0">
+  <description>for each sequence in a file</description>
+  <requirements>
+    <requirement type="python module">biopython</requirement>
+    <requirement type="python module">pandas</requirement>
+  </requirements>
+
+  <!-- Values are quoted because codon table names contain spaces. The
+       script takes no positional argument, only the four options below. -->
+  <command interpreter="python">codon_usage2.py -i '$input' -t '$input_type' -o '$output' -c '$codon_table'</command>
+  <inputs>
+    <param name="input" format="fasta,genbank" type="data" label="Source file"/>
+
+    <!-- Only the formats the script can actually parse are offered. -->
+    <param name="input_type" type="select" label="Indicate the input file format">
+      <option value="fasta" selected="true">Fasta</option>
+      <option value="gbk">gbk</option>
+    </param>
+
+    <!-- Names as listed in the script's -c/--codonTable help. -->
+    <param name="codon_table" type="select" label="Codon table">
+      <option value="Standard" selected="true">Standard</option>
+      <option value="Alternative Flatworm Mitochondrial">Alternative Flatworm Mitochondrial</option>
+      <option value="Alternative Yeast Nuclear">Alternative Yeast Nuclear</option>
+      <option value="Archaeal">Archaeal</option>
+      <option value="Ascidian Mitochondrial">Ascidian Mitochondrial</option>
+      <option value="Bacterial">Bacterial</option>
+      <option value="Blastocrithidia Nuclear">Blastocrithidia Nuclear</option>
+      <option value="Blepharisma Macronuclear">Blepharisma Macronuclear</option>
+      <option value="Candidate Division SR1">Candidate Division SR1</option>
+      <option value="Chlorophycean Mitochondrial">Chlorophycean Mitochondrial</option>
+      <option value="Ciliate Nuclear">Ciliate Nuclear</option>
+      <option value="Coelenterate Mitochondrial">Coelenterate Mitochondrial</option>
+      <option value="Condylostoma Nuclear">Condylostoma Nuclear</option>
+      <option value="Dasycladacean Nuclear">Dasycladacean Nuclear</option>
+      <option value="Echinoderm Mitochondrial">Echinoderm Mitochondrial</option>
+      <option value="Euplotid Nuclear">Euplotid Nuclear</option>
+      <option value="Flatworm Mitochondrial">Flatworm Mitochondrial</option>
+      <option value="Gracilibacteria">Gracilibacteria</option>
+      <option value="Hexamita Nuclear">Hexamita Nuclear</option>
+      <option value="Invertebrate Mitochondrial">Invertebrate Mitochondrial</option>
+      <option value="Karyorelict Nuclear">Karyorelict Nuclear</option>
+      <option value="Mesodinium Nuclear">Mesodinium Nuclear</option>
+      <option value="Mold Mitochondrial">Mold Mitochondrial</option>
+      <option value="Mycoplasma">Mycoplasma</option>
+      <option value="Pachysolen tannophilus Nuclear">Pachysolen tannophilus Nuclear</option>
+      <option value="Peritrich Nuclear">Peritrich Nuclear</option>
+      <option value="Plant Plastid">Plant Plastid</option>
+      <option value="Protozoan Mitochondrial">Protozoan Mitochondrial</option>
+      <option value="Pterobranchia Mitochondrial">Pterobranchia Mitochondrial</option>
+      <option value="SGC0">SGC0</option>
+      <option value="SGC1">SGC1</option>
+      <option value="SGC2">SGC2</option>
+      <option value="SGC3">SGC3</option>
+      <option value="SGC4">SGC4</option>
+      <option value="SGC5">SGC5</option>
+      <option value="SGC8">SGC8</option>
+      <option value="SGC9">SGC9</option>
+      <option value="Scenedesmus obliquus Mitochondrial">Scenedesmus obliquus Mitochondrial</option>
+      <option value="Spiroplasma">Spiroplasma</option>
+      <option value="Thraustochytrium Mitochondrial">Thraustochytrium Mitochondrial</option>
+      <option value="Trematode Mitochondrial">Trematode Mitochondrial</option>
+      <option value="Vertebrate Mitochondrial">Vertebrate Mitochondrial</option>
+      <option value="Yeast Mitochondrial">Yeast Mitochondrial</option>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data format="tabular" name="output" />
+  </outputs>
+
+  <tests>
+    <!-- NOTE(review): the fixture file names look inherited from a GC-content
+         template; confirm they exist in test-data and match this tool. -->
+    <test>
+      <param name="input" value="fa_gc_content_input.fa"/>
+      <param name="input_type" value="fasta"/>
+      <param name="codon_table" value="Standard"/>
+      <output name="output" file="fa_gc_content_output.txt"/>
+    </test>
+  </tests>
+
+  <help>
+This tool computes the codon usage of an annotated genome [preferably Prokaryotes].
+  </help>
+</tool>