Mercurial > repos > gianmarco_piccinno > project_rm

Binary file ._.shed.yml has changed
Binary file ._codon_usage.py has changed
Binary file ._codon_usage.xml has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Wed Dec 12 08:12:25 2018 -0500
@@ -0,0 +1,11 @@
+name: codon_usage
+owner: fabio
+categories:
+  - Text Manipulation
+description: RM-tool
+long_description: |
+  RM-tool - dev
+type: unrestricted
+auto_tool_repositories:
+  name_template: "{{ tool_id }}"
+  descriptor_template: "Wrapper for Codon Usage application: {{ tool_name }}."
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/codon_usage.py	Wed Dec 12 08:12:25 2018 -0500
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+
+import Bio as Bio
+from Bio import SeqIO
+from Bio.Data import CodonTable
+import re
+from pprint import pprint
+import argparse as ap
+import sys
+import os
+import pandas as pd
+#from BCBio import GFF
+
+
+def read_input(data = "example.fna", type_ = "fasta"):
+
+    """
+    Accepted formats:
+        - fasta (multifasta)
+        - gff
+        - gbk
+
+    """
+
+
+    seqs = ""
+
+    if type_ == "fasta":
+        with open(data, "rU") as handle:
+            for record in SeqIO.parse(handle, type_):
+                seqs = seqs + str(record.seq)
+
+
+    #elif type_ == "gff":
+    #    with open(data, "rU") as handle:
+    #        for record in GFF.parse(handle):
+    #            seqs = seqs + str(record.seq)
+
+    elif type_ == "gbk":
+        with open(data, "rU") as input_handle:
+                for record in SeqIO.parse(input_handle, "genbank"):
+                    seqs = seqs + str(record.seq)
+
+
+    return seqs
+
+def codon_usage(seqs, codonTable):
+
+    codon_usage = {}
+    tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
+
+    b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
+
+
+    for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
+        b_cod_table[cod] = "_Stop"
+
+    for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
+            b_cod_table[cod + " Start"] = b_cod_table[cod]
+            b_cod_table.pop(cod)
+
+    aas = set(b_cod_table.values())
+
+
+    for aa in aas:
+        codon_usage[aa] = {}
+        for codon in b_cod_table.keys():
+            if b_cod_table[codon] == aa:
+                codon_usage[aa][codon] = tmp.count(codon.split(" ")[0])
+
+
+    tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
+
+    codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
+    codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
+    codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
+
+    return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
+
+if __name__ == '__main__':
+
+    parser = ap.ArgumentParser(description=
+    'This script takes as input gff, gbk and single or multifasta files and \n'
+    'compute the codon usage for a specified codon table.\n'
+    'Usage:\n'
+    'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n'
+    'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n'
+    'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n',
+    formatter_class=ap.RawTextHelpFormatter)
+
+    parser.add_argument('-i','--input', help='The path to the input file',required=True)
+    parser.add_argument('-t','--type', help=
+    'The format of the file [genebank, fasta, gff ...]', required=True)
+    parser.add_argument('-c','--codonTable', help=
+    'The codon table to be used [Standard, Bacterial, Archaeal ...]\n'
+    'Alternative Flatworm Mitochondrial,\\n'
+    'Alternative Yeast Nuclear,\n'
+    'Archaeal,\n'
+    'Ascidian Mitochondrial,\n'
+    'Bacterial,\n'
+    'Blastocrithidia Nuclear,\n'
+    'Blepharisma Macronuclear,\n'
+    'Candidate Division SR1,\n'
+    'Chlorophycean Mitochondrial,\n'
+    'Ciliate Nuclear,\n'
+    'Coelenterate Mitochondrial,\n'
+    'Condylostoma Nuclear,\n'
+    'Dasycladacean Nuclear,\n'
+    'Echinoderm Mitochondrial,\n'
+    'Euplotid Nuclear,\n'
+    'Flatworm Mitochondrial,\n'
+    'Gracilibacteria,\n'
+    'Hexamita Nuclear,\n'
+    'Invertebrate Mitochondrial,\n'
+    'Karyorelict Nuclear,\n'
+    'Mesodinium Nuclear,\n'
+    'Mold Mitochondrial,\n'
+    'Mycoplasma,\n'
+    'Pachysolen tannophilus Nuclear,\n'
+    'Peritrich Nuclear,\n'
+    'Plant Plastid,\n'
+    'Protozoan Mitochondrial,\n'
+    'Pterobranchia Mitochondrial,\n'
+    'SGC0,\n'
+    'SGC1,\n'
+    'SGC2,\n'
+    'SGC3,\n'
+    'SGC4,\n'
+    'SGC5,\n'
+    'SGC8,\n'
+    'SGC9,\n'
+    'Scenedesmus obliquus Mitochondrial,\n'
+    'Spiroplasma,\n'
+    'Standard,\n'
+    'Thraustochytrium Mitochondrial,\n'
+    'Trematode Mitochondrial,\n'
+    'Vertebrate Mitochondrial,\n'
+    'Yeast Mitochondrial\n', required=True)
+
+    parser.add_argument('-o','--output', help='Description for bar argument', required=True)
+    args = vars(parser.parse_args())
+
+    seqs = read_input(data=args['input'], type_=args['type'])
+    out = codon_usage(seqs, args['codonTable'])
+
+    with open(args['output'], "w") as outf:
+        out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"])
+
+
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/codon_usage.xml	Wed Dec 12 08:12:25 2018 -0500
@@ -0,0 +1,39 @@
+<?xml version="1.0"?>
+<tool name="Codon Usage" id="codon_usage" version="0.2.0">
+  <description>for each sequence in a file</description>
+  <requirements>
+    <requirement type="package" version="3.6.0">python</requirement>
+    <requirement type="package" version="1.72">biopython</requirement>
+    <requirement type="package" version="0.23.4">pandas</requirement>
+  </requirements>
+
+  <command>
+<![CDATA[
+  python '$__tool_directory__/codon_usage.py' -i $input -t $input_type -o $output -c $codon_table
+]]>
+  </command>
+  <inputs>
+    <param name="input" format="fasta" type="data" label="Source file"/>
+
+    <param name="input_type" type="select" format="text">
+			<label>Indicate the input file format</label>
+			<option value="fasta">Fasta</option>
+			<option value="gbk">gbk</option>
+		</param>
+
+    <param name="codon_table" type="select" format="text">
+			<label>Choose the proper codon table for your organism)</label>
+        <option value="Archaeal"> Archaeal</option>
+        <option value="Bacterial">Bacterial</option>
+        <option value="Standard">Standard</option>
+		</param>
+  </inputs>
+
+  <outputs>
+    <data format="tabular" name="output" />
+  </outputs>
+
+  <help>
+This tool compute codon usage of an annotated genome [preferably Prokaryotes].
+  </help>
+</tool>