Mercurial > repos > gianmarco_piccinno > project_rm
changeset 12:8cc325f758b0 draft
Uploaded
| author | gianmarco_piccinno | 
|---|---|
| date | Tue, 11 Dec 2018 10:38:18 -0500 | 
| parents | 55d338c493f4 | 
| children | b407081a010f | 
| files | Project_RM/Project_RM/codon_usage.py Project_RM/Project_RM/codon_usage.xml Project_RM/codon_usage.py Project_RM/codon_usage.xml | 
| diffstat | 4 files changed, 120 insertions(+), 119 deletions(-) [+] | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Project_RM/Project_RM/codon_usage.py Tue Dec 11 10:38:18 2018 -0500 @@ -0,0 +1,75 @@ +#!/home/gianmarco/galaxy-python/python + +import Bio +from Bio import SeqIO +from Bio.Data import CodonTable +import re +import sys +import os +import pandas as pd + +def read_input(data = "example.fna"): + + seqs = "" + with open(data, "rU") as handle: + for record in SeqIO.parse(handle, "fasta"): + seqs = seqs + str(record.seq) + + return seqs + +def codon_usage(seqs, codonTable): + + codon_usage = {} + tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] + + b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table + + + for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: + b_cod_table[cod] = "_Stop" + + for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: + b_cod_table[cod + " Start"] = b_cod_table[cod] + b_cod_table.pop(cod) + + aas = set(b_cod_table.values()) + + + for aa in aas: + codon_usage[aa] = {} + for codon in b_cod_table.keys(): + if b_cod_table[codon] == aa: + codon_usage[aa][codon] = tmp.count(codon.split(" ")[0]) + + + tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} + + #aas_ = set(tups.keys()) + + #stops_ = {el for el in aas_ if el[0] == "Stop"} + #aas_ = list(aas_.difference(stops_)) + #stops_ = list(stops_) + #aas_.sort() + #stops_.sort() + + codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) + codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) + #codon_usage_.index.reindex(pd.MultiIndex.from_tuples([aas_, stops_], names=('AA', 'Codon')), level=[0,1]) + + + codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) + + return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} + + + +if __name__ == '__main__': + + + seqs = read_input(data=sys.argv[1]) + out = codon_usage(seqs,"Bacterial") + + + with open(sys.argv[2], "w") as outf: + out["Table"].to_csv(outf, sep="\t") + #sys.stdout.write(out['Table']) \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Project_RM/Project_RM/codon_usage.xml Tue Dec 11 10:38:18 2018 -0500 @@ -0,0 +1,45 @@ +<tool id="codon_usage" name="Codon Usage" version="0.1.0"> + <description>for each sequence in a file</description> + <requirements> + <requirement type=“package” version=“2.7.10”>python</requirement> + <requirement type=“package” version=“1.72”>biopython</requirement> + <requirement type=“package” version=“0.23.4”>pandas</requirement> + </requirements> + + <command>codon_usage.py -i $input -t $input_type -o $output -c $codon_table</command> + <inputs> + <param name="input" format="fasta" type="data" label="Source file"/> + + <param name="input_type" type="select" format="text"> + <label>Indicate the input file format</label> + <option value="fasta">Fasta</option> + <option value="gbk">gbk</option> + </param> + + <param name="codon_table" type="select" format="text"> + <label>Choose the proper codon table for your organism)</label> + <option value="Archaeal"> Archaeal</option> + <option value="Bacterial">Bacterial</option> + <option value="Standard">Standard</option> + + </param> + + </inputs> + + + + <outputs> + <data format="tabular" name="output" /> + </outputs> + + <tests> + <test> + <param name="input" value="fa_gc_content_input.fa"/> + <output name="out_file1" file="fa_gc_content_output.txt"/> + </test> + </tests> + + <help> +This tool compute codon usage of an annotated genome [preferably Prokaryotes]. + </help> +</tool>
--- a/Project_RM/codon_usage.py Tue Dec 11 08:49:02 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ -#!/home/gianmarco/galaxy-python/python - -import Bio -from Bio import SeqIO -from Bio.Data import CodonTable -import re -import sys -import os -import pandas as pd - -def read_input(data = "example.fna"): - - seqs = "" - with open(data, "rU") as handle: - for record in SeqIO.parse(handle, "fasta"): - seqs = seqs + str(record.seq) - - return seqs - -def codon_usage(seqs, codonTable): - - codon_usage = {} - tmp = [x for x in re.split(r'(\w{3})', seqs) if x != ""] - - b_cod_table = CodonTable.unambiguous_dna_by_name[codonTable].forward_table - - - for cod in CodonTable.unambiguous_dna_by_name[codonTable].stop_codons: - b_cod_table[cod] = "_Stop" - - for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons: - b_cod_table[cod + " Start"] = b_cod_table[cod] - b_cod_table.pop(cod) - - aas = set(b_cod_table.values()) - - - for aa in aas: - codon_usage[aa] = {} - for codon in b_cod_table.keys(): - if b_cod_table[codon] == aa: - codon_usage[aa][codon] = tmp.count(codon.split(" ")[0]) - - - tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()} - - #aas_ = set(tups.keys()) - - #stops_ = {el for el in aas_ if el[0] == "Stop"} - #aas_ = list(aas_.difference(stops_)) - #stops_ = list(stops_) - #aas_.sort() - #stops_.sort() - - codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"]) - codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"]) - #codon_usage_.index.reindex(pd.MultiIndex.from_tuples([aas_, stops_], names=('AA', 'Codon')), level=[0,1]) - - - codon_usage_['Proportion'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2)) - - return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_} - - - -if __name__ == '__main__': - - - seqs = read_input(data=sys.argv[1]) - out = codon_usage(seqs,"Bacterial") - - - with open(sys.argv[2], "w") as outf: - out["Table"].to_csv(outf, sep="\t") - #sys.stdout.write(out['Table']) \ No newline at end of file
--- a/Project_RM/codon_usage.xml Tue Dec 11 08:49:02 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -<tool id="codon_usage" name="Codon Usage" version="0.1.0"> - <description>for each sequence in a file</description> - <requirements> - <requirement type=“package” version=“1.72”>biopython</requirement> - <requirement type=“package” version=“0.23.4”>pandas</requirement> - </requirements> - - <command interpreter="python2">codon_usage.py -i $input -t $input_type -o $output -c $codon_table</command> - <inputs> - <param name="input" format="fasta" type="data" label="Source file"/> - - <param name="input_type" type="select" format="text"> - <label>Indicate the input file format</label> - <option value="fasta">Fasta</option> - <option value="gbk">gbk</option> - </param> - - <param name="codon_table" type="select" format="text"> - <label>Choose the proper codon table for your organism)</label> - <option value="Archaeal"> Archaeal</option> - <option value="Bacterial">Bacterial</option> - <option value="Standard">Standard</option> - - </param> - - </inputs> - - - - <outputs> - <data format="tabular" name="output" /> - </outputs> - - <tests> - <test> - <param name="input" value="fa_gc_content_input.fa"/> - <output name="out_file1" file="fa_gc_content_output.txt"/> - </test> - </tests> - - <help> -This tool compute codon usage of an annotated genome [preferably Prokaryotes]. - </help> -</tool>
