Mercurial > repos > gianmarco_piccinno > cs_tool_project_rm
diff codon_switch.py @ 0:5397da1ef896 draft
Uploaded
author | gianmarco_piccinno |
---|---|
date | Tue, 21 May 2019 05:05:15 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
"""Command-line driver for the codon-switch pipeline.

The script reads a plasmid and a list of target patterns, asks the project
helpers to locate the patterns and to build candidate plasmids
(``generalization``), evaluates and ranks the candidates against the codon
usage table computed from an annotated genome, writes the best-ranked
candidates as FASTA files and finally emits a coloured text report and a
PDF report.

Example invocations:

    python codon_switch_v2.py
        -i ./pEPSA5_annotated.gb
        -l genbank
        -p ./patterns.txt
        -g S_aureus_JE2.gbf
        -q gbk -c Bacterial
        -o ./output

    python codon_switch_v2.py -i ./pEPSA5_annotated.gb -l genbank -p ./patterns.txt -g S_aureus_JE2.gbf -q genbank -c Bacterial -o ./output
"""

__author__ = "Gianmarco Piccinno"
__version__ = "1.0.0"

# NOTE(review): every pipeline helper used below (plasmid, read_patterns,
# all_patterns, punctuate_targets, generalization, ...) and ``SeqIO`` are
# expected to be provided by these wildcard imports -- confirm which module
# exports which name before replacing them with explicit imports.
from syngenic import *
from functions import *
from Bio import *
import argparse as ap

# Row length used by the reports when -m/--max_row is not given.
DEFAULT_MAX_ROW = 27

# Number of leading plasmid positions kept in demonstration mode.
DEMONSTRATION_LENGTH = 3000


def _build_parser():
    """Return the argument parser describing the command-line interface."""
    parser = ap.ArgumentParser(
        description="", formatter_class=ap.RawTextHelpFormatter)

    parser.add_argument(
        '-i', '--input_plasmid', help='Input plasmid', required=True)
    parser.add_argument(
        '-l', '--plasmid_format',
        help='Format of the plasmid: {fasta, genbank}', required=True)
    parser.add_argument(
        '-p', '--input_patterns',
        help='Input patterns separated by new_line', required=True)
    parser.add_argument(
        '-g', '--input_genome', help='Input annotated genome', required=True)
    parser.add_argument(
        '-q', '--genome_format',
        help='Format of the annotated genome: {fasta, gbk}', required=True)
    parser.add_argument(
        '-c', '--codon_table',
        help='Codon table to be used {Bacterial}', required=True)
    # type=int makes a malformed value fail here, before the expensive
    # pipeline runs, instead of at the very end of the script.
    parser.add_argument(
        '-m', '--max_row', help='Max row length when print',
        type=int, default=DEFAULT_MAX_ROW, required=False)
    parser.add_argument(
        '-d', '--demonstration', help='Use demonstration simplication',
        required=False)
    # The help text used to be a copy of the -d help. The value is the
    # number of top-ranked plasmids to keep; omitting it keeps all of them
    # (previously omitting it crashed with int(None)).
    parser.add_argument(
        '-f', '--n_plasmids',
        help='Number of top-ranked plasmids to write (default: all)',
        type=int, default=None, required=False)
    # NOTE(review): --output_folder is required but nothing in this script
    # reads it; the FASTA files are written to the current working
    # directory. Left unchanged because callers may collect outputs from
    # there -- confirm the intended destination.
    parser.add_argument(
        '-o', '--output_folder',
        help='Folder for writing the output file', required=True)
    return parser


def _write_fasta(name, sequence):
    """Write ``sequence`` to ``<name>.fa`` with ``name`` as FASTA header."""
    with open(name + ".fa", "w") as handle:
        handle.write(">" + name + "\n")
        handle.write(sequence)


def main():
    """Run the whole pipeline for the arguments given on the command line."""
    args = vars(_build_parser().parse_args())

    # Close the plasmid file as soon as it has been parsed.
    with open(args['input_plasmid'], "r") as plasmid_handle:
        pl = SeqIO.read(plasmid_handle, args['plasmid_format'])

    if args['demonstration'] == "demonstration":
        # Demonstration mode only works on the first part of the plasmid.
        pl = pl[0:DEMONSTRATION_LENGTH]
    pats = read_patterns(args['input_patterns'])

    #############################################################
    # Locate the patterns and build the candidate plasmids
    #############################################################

    print(type(pl))
    print(pl)
    print(pl.seq)
    print(pl.features)

    n_pl = plasmid(pl)
    print(n_pl)
    print(len(n_pl))
    print(n_pl.features)

    patts, n_patts = all_patterns(input_=pats)

    f_patts = n_pl.findpatterns(n_patts, patts)
    print(f_patts)
    print(pl.seq)
    print(len(pl.seq))

    n_poss = punctuate_targets(f_patts, n_pl)
    print(n_poss)

    print_seq(n_pl.seq)

    synonims_tables = synonims_(table_name=args['codon_table'])

    plasmids = generalization(n_poss, n_pl, synonims_tables)
    print(len(plasmids))

    # TODO: if len(plasmids) > 5000000, redo the generalization without
    # considering internal bases of target sites that are not in a CDS
    # (i.e. only the outer bases of the target):
    #     plasmids = generalization(n_poss, n_pl, synonims_tables,
    #                               reduced=True)

    #########################################################
    # Read the annotated genome and compute codon usage
    #########################################################

    genome = annotated_genome(read_annotated_genome(
        data=args['input_genome'], type_=args['genome_format']))

    out_genome = genome.codon_usage(args['codon_table'])
    print(out_genome.keys())
    print(out_genome["Table"])

    print(out_genome["Table"].loc["GCA"]["Proportion"])
    print(type(out_genome["Table"].loc["GCA"]["Proportion"]))

    #########################################################
    # Evaluate and rank the plasmids
    #########################################################

    # NOTE(review): f_patts receives ``patts`` (not the ``f_patts`` computed
    # above); kept exactly as in the original call -- confirm it is intended.
    useful_plasmids = evaluate_plasmids(plasmids=plasmids,
                                        original_plasmid=n_pl,
                                        codon_usage_table=out_genome["Table"],
                                        n_patts=n_patts,
                                        f_patts=patts)

    dat_plasmids = rank_plasmids(original_useful_plasmids=useful_plasmids)

    # Slicing with None keeps every ranked plasmid when -f is not given.
    def_pls = dat_plasmids.index[:args['n_plasmids']]

    for to_save in def_pls:
        _write_fasta(to_save, useful_plasmids[to_save]["sequence"])

    #########################################################
    # Reports
    #########################################################

    # Both reports take exactly the same keyword arguments.
    report_kwargs = dict(original=n_pl,
                         others=def_pls,
                         annotation_information=useful_plasmids,
                         tot=useful_plasmids,
                         ind_range=None,
                         patterns=n_poss,
                         f_patterns=f_patts,
                         patts=patts,
                         max_row=args['max_row'])

    print_color_seq(**report_kwargs)
    print_to_pdf(**report_kwargs)


if __name__ == '__main__':
    main()