# HG changeset patch
# User gianmarco_piccinno
# Date 1549889529 18000
# Node ID 9f69df09adf02bb79c630d63a6ea55a12dcedb19
# Parent 18e79593c6282c783574f9187ef95f9dc9f32124
Uploaded
diff -r 18e79593c628 -r 9f69df09adf0 project_rm/codon_usage.py
--- a/project_rm/codon_usage.py Mon Feb 11 04:30:17 2019 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,148 +0,0 @@
-#!/usr/bin/env python
-
-import Bio as Bio
-from Bio import SeqIO
-from Bio.Data import CodonTable
-import re
-from pprint import pprint
-import argparse as ap
-import sys
-import os
-import pandas as pd
-
-
-def read_input(data = "example.fna", type_ = "fasta"):
-
- """
- Accepted formats:
- - fasta (multifasta)
- - gff
- - gbk
-
- """
-
- seqs = ""
-
- if type_ == "fasta":
- with open(data, "rU") as handle:
- for record in SeqIO.parse(handle, type_):
- seqs = seqs + str(record.seq)
-
- elif type_ == "gbk":
- with open(data, "rU") as input_handle:
- types = []
- for record in SeqIO.parse(input_handle, "genbank"):
- for feature in record.features:
- types.append(feature.type)
- if feature.type == "CDS":
- if feature.location.strand == +1:
- seq = record.seq[feature.location.start:feature.location.end]
- seqs = seqs + str(seq)
- elif feature.location.strand == -1:
- seq = record.seq[feature.location.start:feature.location.end].reverse_complement
- seqs = seqs + str(seq)
- return seqs
-
-def codon_usage(seqs, codonTable):
-
- codon_usage = {}
- tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
-
- b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
-
-
- for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
- b_cod_table[cod] = "_Stop"
-
- for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
- b_cod_table[cod + " Start"] = b_cod_table[cod]
- b_cod_table.pop(cod)
-
- aas = set(b_cod_table.values())
-
-
- for aa in aas:
- codon_usage[aa] = {}
- for codon in b_cod_table.keys():
- if b_cod_table[codon] == aa:
- codon_usage[aa][codon] = tmp.count(codon.split(" ")[0])
-
-
- tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
-
- codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
- codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
- codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
-
- return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
-
-if __name__ == '__main__':
-
- parser = ap.ArgumentParser(description=
- 'This script takes as input gff, gbk and single or multifasta files and \n'
- 'compute the codon usage for a specified codon table.\n'
- 'Usage:\n'
- 'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n'
- 'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n'
- 'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n',
- formatter_class=ap.RawTextHelpFormatter)
-
- parser.add_argument('-i','--input', help='The path to the input file',required=True)
- parser.add_argument('-t','--type', help=
- 'The format of the file [genebank, fasta, gff ...]', required=True)
- parser.add_argument('-c','--codonTable', help=
- 'The codon table to be used [Standard, Bacterial, Archaeal ...]\n'
- 'Alternative Flatworm Mitochondrial,\\n'
- 'Alternative Yeast Nuclear,\n'
- 'Archaeal,\n'
- 'Ascidian Mitochondrial,\n'
- 'Bacterial,\n'
- 'Blastocrithidia Nuclear,\n'
- 'Blepharisma Macronuclear,\n'
- 'Candidate Division SR1,\n'
- 'Chlorophycean Mitochondrial,\n'
- 'Ciliate Nuclear,\n'
- 'Coelenterate Mitochondrial,\n'
- 'Condylostoma Nuclear,\n'
- 'Dasycladacean Nuclear,\n'
- 'Echinoderm Mitochondrial,\n'
- 'Euplotid Nuclear,\n'
- 'Flatworm Mitochondrial,\n'
- 'Gracilibacteria,\n'
- 'Hexamita Nuclear,\n'
- 'Invertebrate Mitochondrial,\n'
- 'Karyorelict Nuclear,\n'
- 'Mesodinium Nuclear,\n'
- 'Mold Mitochondrial,\n'
- 'Mycoplasma,\n'
- 'Pachysolen tannophilus Nuclear,\n'
- 'Peritrich Nuclear,\n'
- 'Plant Plastid,\n'
- 'Protozoan Mitochondrial,\n'
- 'Pterobranchia Mitochondrial,\n'
- 'SGC0,\n'
- 'SGC1,\n'
- 'SGC2,\n'
- 'SGC3,\n'
- 'SGC4,\n'
- 'SGC5,\n'
- 'SGC8,\n'
- 'SGC9,\n'
- 'Scenedesmus obliquus Mitochondrial,\n'
- 'Spiroplasma,\n'
- 'Standard,\n'
- 'Thraustochytrium Mitochondrial,\n'
- 'Trematode Mitochondrial,\n'
- 'Vertebrate Mitochondrial,\n'
- 'Yeast Mitochondrial\n', required=True)
-
- parser.add_argument('-o','--output', help='Description for bar argument', required=True)
- args = vars(parser.parse_args())
-
- seqs = read_input(data=args['input'], type_=args['type'])
- out = codon_usage(seqs, args['codonTable'])
-
- with open(args['output'], "w") as outf:
- out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"])
-
-
diff -r 18e79593c628 -r 9f69df09adf0 project_rm/codon_usage.xml
--- a/project_rm/codon_usage.xml Mon Feb 11 04:30:17 2019 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,40 +0,0 @@
-
-
- for each sequence in a file
-
- python
- biopython
- pandas
- numpy
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-This tool compute codon usage of an annotated genome [preferably Prokaryotes].
-
-
diff -r 18e79593c628 -r 9f69df09adf0 project_rm/project_rm/codon_usage.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/project_rm/project_rm/codon_usage.py Mon Feb 11 07:52:09 2019 -0500
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+
+import Bio as Bio
+from Bio import SeqIO
+from Bio.Data import CodonTable
+import re
+from pprint import pprint
+import argparse as ap
+import sys
+import os
+import pandas as pd
+
+
+def read_input(data = "example.fna", type_ = "fasta"):
+
+ """
+ Accepted formats:
+ - fasta (multifasta)
+ - gff
+ - gbk
+
+ """
+
+ seqs = ""
+
+ if type_ == "fasta":
+ with open(data, "rU") as handle:
+ for record in SeqIO.parse(handle, type_):
+ seqs = seqs + str(record.seq)
+
+ elif type_ == "gbk":
+ with open(data, "rU") as input_handle:
+ types = []
+ for record in SeqIO.parse(input_handle, "genbank"):
+ for feature in record.features:
+ types.append(feature.type)
+ if feature.type == "CDS":
+ if feature.location.strand == +1:
+ seq = record.seq[feature.location.start:feature.location.end]
+ seqs = seqs + str(seq)
+ elif feature.location.strand == -1:
+ seq = record.seq[feature.location.start:feature.location.end].reverse_complement
+ seqs = seqs + str(seq)
+ return seqs
+
+def codon_usage(seqs, codonTable):
+
+ codon_usage = {}
+ tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""]
+
+ b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table
+
+
+ for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons:
+ b_cod_table[cod] = "_Stop"
+
+ for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:
+ b_cod_table[cod + " Start"] = b_cod_table[cod]
+ b_cod_table.pop(cod)
+
+ aas = set(b_cod_table.values())
+
+
+ for aa in aas:
+ codon_usage[aa] = {}
+ for codon in b_cod_table.keys():
+ if b_cod_table[codon] == aa:
+ codon_usage[aa][codon] = tmp.count(codon.split(" ")[0])
+
+
+ tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}
+
+ codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])
+ codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])
+ codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))
+
+ return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}
+
+if __name__ == '__main__':
+
+ parser = ap.ArgumentParser(description=
+ 'This script takes as input gff, gbk and single or multifasta files and \n'
+ 'compute the codon usage for a specified codon table.\n'
+ 'Usage:\n'
+ 'python codon_usage.py -i example.gbk -t genebank -o gbk_example -c Bacterial\n'
+ 'python codon_usage.py -i example.ffn -t fasta -o fasta_example -c Bacterial\n'
+ 'python codon_usage.py -i example.gff -t gff -o gff_example -c Bacterial\n',
+ formatter_class=ap.RawTextHelpFormatter)
+
+ parser.add_argument('-i','--input', help='The path to the input file',required=True)
+ parser.add_argument('-t','--type', help=
+ 'The format of the file [genebank, fasta, gff ...]', required=True)
+ parser.add_argument('-c','--codonTable', help=
+ 'The codon table to be used [Standard, Bacterial, Archaeal ...]\n'
+ 'Alternative Flatworm Mitochondrial,\\n'
+ 'Alternative Yeast Nuclear,\n'
+ 'Archaeal,\n'
+ 'Ascidian Mitochondrial,\n'
+ 'Bacterial,\n'
+ 'Blastocrithidia Nuclear,\n'
+ 'Blepharisma Macronuclear,\n'
+ 'Candidate Division SR1,\n'
+ 'Chlorophycean Mitochondrial,\n'
+ 'Ciliate Nuclear,\n'
+ 'Coelenterate Mitochondrial,\n'
+ 'Condylostoma Nuclear,\n'
+ 'Dasycladacean Nuclear,\n'
+ 'Echinoderm Mitochondrial,\n'
+ 'Euplotid Nuclear,\n'
+ 'Flatworm Mitochondrial,\n'
+ 'Gracilibacteria,\n'
+ 'Hexamita Nuclear,\n'
+ 'Invertebrate Mitochondrial,\n'
+ 'Karyorelict Nuclear,\n'
+ 'Mesodinium Nuclear,\n'
+ 'Mold Mitochondrial,\n'
+ 'Mycoplasma,\n'
+ 'Pachysolen tannophilus Nuclear,\n'
+ 'Peritrich Nuclear,\n'
+ 'Plant Plastid,\n'
+ 'Protozoan Mitochondrial,\n'
+ 'Pterobranchia Mitochondrial,\n'
+ 'SGC0,\n'
+ 'SGC1,\n'
+ 'SGC2,\n'
+ 'SGC3,\n'
+ 'SGC4,\n'
+ 'SGC5,\n'
+ 'SGC8,\n'
+ 'SGC9,\n'
+ 'Scenedesmus obliquus Mitochondrial,\n'
+ 'Spiroplasma,\n'
+ 'Standard,\n'
+ 'Thraustochytrium Mitochondrial,\n'
+ 'Trematode Mitochondrial,\n'
+ 'Vertebrate Mitochondrial,\n'
+ 'Yeast Mitochondrial\n', required=True)
+
+ parser.add_argument('-o','--output', help='Description for bar argument', required=True)
+ args = vars(parser.parse_args())
+
+ seqs = read_input(data=args['input'], type_=args['type'])
+ out = codon_usage(seqs, args['codonTable'])
+
+ with open(args['output'], "w") as outf:
+ out["Table"].to_csv(outf, sep="\t", index_label=["AA", "Codon"])
+
+
diff -r 18e79593c628 -r 9f69df09adf0 project_rm/project_rm/codon_usage.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/project_rm/project_rm/codon_usage.xml Mon Feb 11 07:52:09 2019 -0500
@@ -0,0 +1,80 @@
+
+
+ for each sequence in a file
+
+ python
+ biopython
+ pandas
+ numpy
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool computes codon usage of an annotated genome [preferably Prokaryotes].
+
+