Previous changeset 47:05965eee6b59 (2019-05-20) Next changeset 49:640db7b6847b (2019-05-20) |
Commit message:
Deleted selected files |
removed:
project_rm/S_aureus_JE2.gbf project_rm/cachingseq.py project_rm/codon_switch.py project_rm/codon_switch.xml project_rm/fastdivmod.py project_rm/functions.py project_rm/pEPSA5_annotated.gb project_rm/patterns.txt project_rm/run_codon_switch.sh project_rm/sre_yield.py project_rm/syngenic.py |
b |
diff -r 05965eee6b59 -r 611cac5e3066 project_rm/S_aureus_JE2.gbf --- a/project_rm/S_aureus_JE2.gbf Mon May 20 18:01:46 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
b'@@ -1,92850 +0,0 @@\n-LOCUS S. 2903350 bp DNA linear 13-NOV-2018\n-DEFINITION Streptococcus aureus strain JE2.\n-ACCESSION \n-VERSION\n-KEYWORDS .\n-SOURCE Streptococcus aureus\n- ORGANISM Streptococcus aureus\n- Unclassified.\n-COMMENT Annotated using prokka 1.12 from\n- https://github.com/tseemann/prokka.\n-FEATURES Location/Qualifiers\n- source 1..2903350\n- /organism="Streptococcus aureus"\n- /mol_type="genomic DNA"\n- /strain="JE2"\n- gene complement(40..321)\n- /locus_tag="SaJE2__00001"\n- CDS complement(40..321)\n- /locus_tag="SaJE2__00001"\n- /inference="ab initio prediction:Prodigal:2.6"\n- /codon_start=1\n- /transl_table=11\n- /product="hypothetical protein"\n- /translation="MAERISSKIRRLEKSEEQIKLESLNEVTEAIAANKDSILKAIKL\n- IKTLDDAKLLDALNGAIRGRQVIINNLQLNLIKIFIQGYYLIWLQWYFY"\n- gene complement(798..1187)\n- /locus_tag="SaJE2__00002"\n- CDS complement(798..1187)\n- /locus_tag="SaJE2__00002"\n- /EC_number="1.2.1.2"\n- /inference="ab initio prediction:Prodigal:2.6"\n- /inference="similar to AA sequence:UniProtKB:Q99RW4"\n- /codon_start=1\n- /transl_table=11\n- /product="Putative formate dehydrogenase"\n- /translation="MPASPSLERRYFYKYRRRIQRLYQALEPLGDSKPDWKIFQAIAN\n- RLGFDWNYKHPSEIMDEVARLTPLYAGVSYDRLEGFNSLQWPVQPDGTDEPILYLEGF\n- NFDNGKANYSHYHLIITLSKMKFMIFM"\n- gene complement(1165..2010)\n- /locus_tag="SaJE2__00003"\n- CDS complement(1165..2010)\n- /locus_tag="SaJE2__00003"\n- /EC_number="1.2.1.2"\n- /inference="ab initio prediction:Prodigal:2.6"\n- /inference="similar to AA sequence:UniProtKB:Q99RW4"\n- /codon_start=1\n- /transl_table=11\n- /product="Putative formate dehydrogenase"\n- /translation="MSVLIGTNTAEAHPVIASRMKRAQKLFGQKIHVFDIRKHEMAER\n- ADRFYQPKPGTDLAWLSAVTKYIIDHDLHDKAFIDEWVDDFDEYYKSLETFTMAFAEE\n- ATGIPESELIKFAEECAKAESVVICWAMGITQQDIGSDSSTAISNLLLVTGNYRRPGT\n- GAYPLRGHNNVQGCSDMGSMPDKITGYQSIEADDIRAKFEKEYGVKLNPKAGKDNHEM\n- VEGIHDGEVHSLYLYGEDTGIVDSNINFVQAAFEKLDFMVVQDEFFNIHSNIRRCCIA\n- SKSFT"\n- gene complement(1988..2218)\n- /locus_tag="SaJE2__00004"\n- CDS complement(1988..2218)\n- /locus_tag="SaJE2__00004"\n- /EC_number="1.2.1.2"\n- /inference="ab initio prediction:Prodigal:2.6"\n- /inference="similar to AA sequence:UniProtKB:Q99RW4"\n- /codon_start=1\n- /transl_table=11\n- /product="Putative formate dehydrogenase"\n- /translation="MAQMHYHSFSSKATNEESYLMQKLARQVIGTNNVDNCSRYCQAP\n- ATKGLFRTVGHGGDSGSIEDLEKSGNVCIDRY"\n- gene complement(2184..2399)\n- /locus_tag="SaJE2__00005"\n- CDS complement(2184..2399)\n- /locus_tag="SaJE2__00005"\n- /EC_number="1.2.1.2"\n- /inference="ab initio prediction:Prodigal:2.6"\n- /inference="similar to AA sequence:UniProtKB:Q99RW4"\n- /codon_start=1\n- /transl_table=11\n- /product="Putative formate dehydrogenase"\n- /translation="MILQQIKIA'..b'c cttctccacg ttctttcgcc tcttctgcta attttaatgc\n- 2900281 ttcatctaaa tcagctgttt taacatcaca gtatttcgta tcaattcgct tatcaacacg\n- 2900341 tgtttcatca acatccacgc aaattgctac cccatgattc atagtaattg ctaacggttg\n- 2900401 cgcaccaccc ataccaccta aacctgctgt cagtgtaaca gtgcctgcta aatctccatt\n- 2900461 aaagtgttga ttacctagct cggcaaatgt ctcataagta ccttgcacaa taccttgaga\n- 2900521 accaatatat atccaactac cggctgtcat ctgtccatac atgattaaac cttttttatc\n- 2900581 taattcatta aaatgatccc agtttgccca ttcaggcact aatactgaat ttgaaattaa\n- 2900641 tacacgtggc gcttcttcat gtgttttaaa tacagcaact ggctttcctg attgtactaa\n- 2900701 cattgtctca tctgattcta attctcgtaa cgttttctct attgcttcaa aagcttccca\n- 2900761 attacgtgct gcttttccaa taccaccata aacaactaaa tcttctggtc tttcagcaac\n- 2900821 ttctgggtct aaattgttgt ataacattct aagtactgct tcttgttccc aacctttaca\n- 2900881 ctcaatactc aaaccttttt ttgcttgaat ttttctcata aaattcgctc ctgttctttt\n- 2900941 aagaagttaa ttccactaaa tttaaaacgc ttacattatt atcttcaata ttcattatag\n- 2901001 tatgttaaaa tatagccaac aaatataaat aaactaatta tccatagctt gaatctataa\n- 2901061 ataaaaggag caaaacacat gaaaattatt cagttagaat acttcttggc tatcgtgaaa\n- 2901121 tataatagtt ttactaaagc tgcacaattt ttacatatta gccagccatc tttaactgct\n- 2901181 acgattaaaa aaaatggaag cagatttagg ttatgactta tttacacgtt caacaaaaga\n- 2901241 catcaagatt accgaaaaag gaatacagtt ttatcgttat gcgagcgaat tagttcaaca\n- 2901301 atatcgatcc acgatggaaa aaatgtatga tttaagcgtt acatcagaac caaggataaa\n- 2901361 aattgggact cttgaatcta cgaatcaatg gattgcgaat ttaattcgaa agcaccattc\n- 2901421 cgactaccct gaacagcaat atcgtttata tgaaatacat gataaacatc aatctataga\n- 2901481 gcaattactg aattttaata ttcatttagc tataacaaat gaaaaataac ccacgaagat\n- 2901541 ataagatcca ttcctttata tgaggaatct tacattttat tagcacccaa ggaaacattt\n- 2901601 aaaaatcaaa attgggtaga tgttgaaaat ttgccactca tattaccaaa caaaaattct\n- 2901661 caagtgcgca aacacttaga tgactatttt aatagaagaa atattcgtcc aaatgtcgtt\n- 2901721 gtagaaacag atcgattcga atcagcagtt ggatttgttc atctcggctt aggttacgct\n- 2901781 atcattccga gattttatta ccaatcattt cacacgtcta atttagaata taaaaaaaat\n- 2901841 tcgtccaaac ttaggccgaa aaatttatat caattaccat aaaaaacgca aacactccga\n- 2901901 acagtacata cattcgtaca acaatgccaa gattatttat atggactttt agaggctctt\n- 2901961 taacttaagt tattagagcc tcttatgcag ttgcgcagat catcgtataa aaattaatga\n- 2902021 cgtcatttca aaaatcgata caaaaataat ttattataaa aattctaaga aagtgaagca\n- 2902081 gatgttaaaa tctattaatc atatatgctt ttcagtcaga aatttaaacg attcaataca\n- 2902141 tttttataga gatattttac ttgggaaatt gctattgact ggtaaaaaaa ctgcttattt\n- 2902201 tgagcttgca ggcctatgga ttgctttaaa tgaagaaaaa gatataccac gtaatgaaat\n- 2902261 tcacttttca tatacacata tagctttcac tatagatgac agcgaattta aatattggca\n- 2902321 tcagaggtta aaagataata acgtgaatat tttagaagga agagttagag atattagaga\n- 2902381 tagacaatca atttacttta ccgaccctga tggtcataag ctagaattac atactggcac\n- 2902441 acttgagaac agattaaatt attataaaga ggctaaacca catatgacat tttacaaata\n- 2902501 aggtgtcatt ataaaaaggc ctcttgaact ccgttaaaat tttaattaat tattatataa\n- 2902561 taagagaact tttcaaacaa tacagttgtt atttttgcta tttcaacaaa cataaataag\n- 2902621 cagtaagatg actacaactt aagagtcttc ttactgcaat tatttttcaa atatatcaac\n- 2902681 gttaatataa cttctattaa gaaatactca cattctgccc tgcaatgcaa atctcgtcac\n- 2902741 atataaatat ttttaattat tttaaaaaat gatgcactaa attagcaacg agcttagcag\n- 2902801 ttctattgtc agcgtcatat gttggattca tctcagcaat actaactgaa gacaccttat\n- 2902861 cacttggaat aatacgtttt gctaattcaa gaacagtatg tggatacaaa cctaacactg\n- 2902921 ccggcgcact taccccaggc gcaaacgcac tatcaatgac atccatacaa atcgtaaaca\n- 2902981 taatgacatc atgttcatgt acaaaacgtt caatcatatc tttaattgtt ggtgatacgt\n- 2903041 gactcaataa ttcatctgca aagacataat caatcttttt ctctttagca taatcaaata\n- 2903101 aaactttgcg tattaccacc ttgagcaata ccaagcacta aataatctgt gttttcatct\n- 2903161 tcttctaaaa tttgtctaaa gctcgttcca gatgtagatt gttgttcagc acgtgtatca\n- 2903221 aaatgcgcat caatatttat cacaccaata gattgtgttg gatagacttt acgtgttgct\n- 2903281 aaatattgag catacgcaat atcatgtcca ccacctaata aaaatgtttg tctatgatta\n- 2903341 gcaattgact\n-//\n' |
b |
diff -r 05965eee6b59 -r 611cac5e3066 project_rm/cachingseq.py --- a/project_rm/cachingseq.py Mon May 20 18:01:46 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,55 +0,0 @@ -#!/usr/bin/env python2 -# -# Copyright 2011-2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# vim: sw=2 sts=2 et - -class CachingFuncSequence(object): - def __init__(self, func, length, inc_func=None): - """ - length: Length of this sequence. - func: function(index) - inc_func: function(index, value_of_previous) - """ - - self.func = func - self.inc_func = inc_func - self.length = length - self._cache = {} - - def __getitem__(self, i): - if i < 0: - i += self.length - if i < 0 or i >= self.length: - raise IndexError() - - v = self._cache.get(i) - if v is not None: - return v - - if self.inc_func and i-1 in self._cache: - v = self.inc_func(i, self._cache[i-1]) - else: - v = self.func(i) - - self._cache[i] = v - return v - - def __len__(self): - return self.length - - def __iter__(self): - for i in range(self.length): - yield self[i] |
b |
diff -r 05965eee6b59 -r 611cac5e3066 project_rm/codon_switch.py --- a/project_rm/codon_switch.py Mon May 20 18:01:46 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,172 +0,0 @@ -#!/usr/bin/env python - -__author__= "Gianmarco Piccinno" -__version__ = "1.0.0" - -from syngenic import * -from functions import * -from Bio import * -import argparse as ap - -if __name__ == '__main__': - - parser = ap.ArgumentParser(description="", formatter_class=ap.RawTextHelpFormatter) - - parser.add_argument( - '-i', '--input_plasmid', help='Input plasmid', required=True) - parser.add_argument( - '-l', '--plasmid_format', help='Format of the plasmid: {fasta, genbank}', required=True) - parser.add_argument( - '-p', '--input_patterns', help='Input patterns separated by new_line', required=True) - parser.add_argument( - '-g', '--input_genome', help='Input annotated genome', required=True) - parser.add_argument( - '-q', '--genome_format', help='Format of the annotated genome: {fasta, gbk}', required=True) - parser.add_argument( - '-c', '--codon_table', help='Codon table to be used {Bacterial}', required=True) - parser.add_argument( - '-m', '--max_row', help='Max row length when print', required=False) - parser.add_argument( - '-d', '--demonstration', help='Use demonstration simplication', required=False) - parser.add_argument( - '-f', '--n_plasmids', help='Use demonstration simplication', required=False) - parser.add_argument( - '-o', '--output_folder', help='Folder for writing the output file', required=True) - args = vars(parser.parse_args()) - - """ - - python codon_switch_v2.py - -i ./pEPSA5_annotated.gb - -l genbank - -p ./patterns.txt - -g S_aureus_JE2.gbf - -q gbk -c Bacterial - -o ./output - - python codon_switch_v2.py -i ./pEPSA5_annotated.gb -l genbank -p ./patterns.txt -g S_aureus_JE2.gbf -q genbank -c Bacterial -o ./output - - """ - - - pl = SeqIO.read( - open(args['input_plasmid'], "r"), args['plasmid_format']) - - if args['demonstration'] == "demonstration": - pl = pl[0:3000] - pats = read_patterns(args['input_patterns']) - - - ############################################################# - # - ############################################################# - - #pl = fake_from_real(path = "./data/pEPSA5_annotated.gb", id_ = "Trial", name = "Fake_plasmid") - print(type(pl)) - print(pl); print(pl.seq); print(pl.features) - - #for feat in pl.features: - # print(str(feat.extract(pl))) - # print(str(pl[feat.location.start:feat.location.end])) - # print("\n") - - - n_pl = plasmid(pl) - print(n_pl); print(len(n_pl)) - print(n_pl.features) - - - patts, n_patts = all_patterns(input_ = pats) - - - f_patts = n_pl.findpatterns(n_patts, patts) - print(f_patts) - print(pl.seq) - print(len(pl.seq)) - - - n_poss = punctuate_targets(f_patts, n_pl) - print(n_poss) - - print_seq(n_pl.seq) - - synonims_tables = synonims_(table_name=args['codon_table']) - - synonims_tables - - plasmids = generalization(n_poss, n_pl, synonims_tables) - - print(len(plasmids)) - - #plasmids - - #if len(plasmids) > 5000000: - #redo generalization without considering internal bases - #in target sites that are not in CDS - #this means considering only the outer bases of the target - # plasmids = generalization(n_poss, n_pl, synonims_tables, - # reduced = True) - - ######################################################### - # Read plasmid and compute codon usage - ######################################################### - - genome = annotated_genome(read_annotated_genome( - data=args['input_genome'], type_=args['genome_format'])) - - out_genome = genome.codon_usage(args['codon_table']) - print(out_genome.keys()) - print(out_genome["Table"]) - - print(out_genome["Table"].loc["GCA"]["Proportion"]) - print(type(out_genome["Table"].loc["GCA"]["Proportion"])) - - - ######################################################### - # Evaluate the plasmid - ######################################################### - - useful_plasmids = evaluate_plasmids(plasmids = plasmids, - original_plasmid = n_pl, - codon_usage_table = out_genome["Table"], - n_patts = n_patts, - f_patts = patts) - - dat_plasmids = rank_plasmids(original_useful_plasmids = useful_plasmids) - - def_pls = dat_plasmids.index[:int(args['n_plasmids'])] - - for to_save in def_pls: - #print(to_save) - #print(useful_plasmids[to_save]) - with open(to_save+".fa", "w") as handle: - handle.write(">"+to_save+"\n") - handle.write(useful_plasmids[to_save]["sequence"]) - - - - if args['max_row'] != None: - tmp_max_row = int(args['max_row']) - else: - tmp_max_row = 27 - - print_color_seq(original = n_pl, - others = def_pls, - annotation_information = useful_plasmids, - tot = useful_plasmids, - ind_range = None, - patterns = n_poss, - f_patterns = f_patts, - patts = patts, - max_row = tmp_max_row) - - - print_to_pdf(original = n_pl, - others = def_pls, - annotation_information = useful_plasmids, - tot = useful_plasmids, - ind_range = None, - patterns = n_poss, - f_patterns = f_patts, - patts = patts, - max_row = tmp_max_row) |
b |
diff -r 05965eee6b59 -r 611cac5e3066 project_rm/codon_switch.xml --- a/project_rm/codon_switch.xml Mon May 20 18:01:46 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,107 +0,0 @@ -<?xml version="1.0"?> -<tool name="Codon Switch Tool" id="codon_switch" version="0.2.6"> - <description>for each sequence in a file</description> - <requirements> - <requirement type="package" version="2.7.15">python</requirement> - <requirement type="package" version="1.72">biopython</requirement> - <requirement type="package" version="0.23.4">pandas</requirement> - <requirement type="package" version="1.15.3">numpy</requirement> - </requirements> - - <command> -<![CDATA[ - python '$__tool_directory__/codon_switch.py' -i $input -l $input_type -p $patterns -g $genome -g $genome_type -c $codon_table -f $num_plasmids -m $row_len -d $demonstration -o $output -]]> - </command> - <inputs> - <param name="input" format="genbank" type="data" label="Input plasmid"/> - - <param name="input_type" type="select" format="text"> - <label>Indicate the input file format.</label> - <option value="genbank">genbank</option> - </param> - - <param name="patterns" format="txt" type="data" label="Patterns file"/> - - <param name="genome" format="genbank" type="data" label="Input Genome"/> - - <param name="genome_type" type="select" format="text"> - <label>Indicate the input genome format.</label> - <option value="fasta">fasta</option> - <option value="genbank">genbank</option> - </param> - - <param name="demonstration" type="select" format="text"> - <label>Demonstration</label> - <option value="demonstration">yes</option> - <option value="no_demonstration">no</option> - </param> - - <param name="num_plasmids" type="select" format="text"> - <label>Indicate the number of best output plamids.</label> - <option value=3>3</option> - <option value=7>7</option> - </param> - - <param name="row_len" type="select" format="text"> - <label>Indicate the length of the row in report file.</label> - <option value=9>9</option> - <option value=27>27</option> - </param> - - <param name="codon_table" type="select" format="text"> - <label>Choose the proper codon table for your organism.</label> - <option value="Alternative Flatworm Mitochondrial">"Alternative Flatworm Mitochondrial"</option> - <option value="Alternative Yeast Nuclear">"Alternative Yeast Nuclear"</option> - <option value="Archaeal">"Archaeal"</option> - <option value="Ascidian Mitochondrial">"Ascidian Mitochondrial"</option> - <option value="Bacterial">"Bacterial"</option> - <option value="Blastocrithidia Nuclear">"Blastocrithidia Nuclear"</option> - <option value="Blepharisma Macronuclear">"Blepharisma Macronuclear"</option> - <option value="Candidate Division SR1">"Candidate Division SR1"</option> - <option value="Chlorophycean Mitochondrial">"Chlorophycean Mitochondrial"</option> - <option value="Ciliate Nuclear">"Ciliate Nuclear"</option> - <option value="Coelenterate Mitochondrial">"Coelenterate Mitochondrial"</option> - <option value="Condylostoma Nuclear">"Condylostoma Nuclear"</option> - <option value="Dasycladacean Nuclear">"Dasycladacean Nuclear"</option> - <option value="Echinoderm Mitochondrial">"Echinoderm Mitochondrial"</option> - <option value="Euplotid Nuclear">"Euplotid Nuclear"</option> - <option value="Flatworm Mitochondrial">"Flatworm Mitochondrial"</option> - <option value="Gracilibacteria">"Gracilibacteria"</option> - <option value="Hexamita Nuclear">"Hexamita Nuclear"</option> - <option value="Invertebrate Mitochondrial">"Invertebrate Mitochondrial"</option> - <option value="Karyorelict Nuclear">"Karyorelict Nuclear"</option> - <option value="Mesodinium Nuclear">"Mesodinium Nuclear"</option> - <option value="Mold Mitochondrial">"Mold Mitochondrial"</option> - <option value="Mycoplasma">"Mycoplasma"</option> - <option value="Pachysolen tannophilus Nuclear">"Pachysolen tannophilus Nuclear"</option> - <option value="Peritrich Nuclear">"Peritrich Nuclear"</option> - <option value="Plant Plastid">"Plant Plastid"</option> - <option value="Protozoan Mitochondrial">"Protozoan Mitochondrial"</option> - <option value="Pterobranchia Mitochondrial">"Pterobranchia Mitochondrial"</option> - <option value="SGC0">"SGC0"</option> - <option value="SGC1">"SGC1"</option> - <option value="SGC2">"SGC2"</option> - <option value="SGC3">"SGC3"</option> - <option value="SGC4">"SGC4"</option> - <option value="SGC5">"SGC5"</option> - <option value="SGC8">"SGC8"</option> - <option value="SGC9">"SGC9"</option> - <option value="Scenedesmus obliquus Mitochondrial">"Scenedesmus obliquus Mitochondrial"</option> - <option value="Spiroplasma">"Spiroplasma"</option> - <option value="Standard">"Standard"</option> - <option value="Thraustochytrium Mitochondrial">"Thraustochytrium Mitochondrial"</option> - <option value="Trematode Mitochondrial">"Trematode Mitochondrial"</option> - <option value="Vertebrate Mitochondrial">"Vertebrate Mitochondrial"</option> - <option value="Yeast Mitochondrial">"Yeast Mitochondrial"</option> - </param> - </inputs> - - <outputs> - <data format="tabular" name="output" /> - </outputs> - - <help> -This tool permits codon switch and transversion in targeted regions. - </help> -</tool> |
b |
diff -r 05965eee6b59 -r 611cac5e3066 project_rm/fastdivmod.py --- a/project_rm/fastdivmod.py Mon May 20 18:01:46 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,93 +0,0 @@ -#!/usr/bin/env python2 -# -# Copyright 2011-2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# vim: sw=2 sts=2 et - -from math import log, ceil -import sys - - -def find_largest_power(less_than, base): - power = int(log(less_than) / log(base)) - return base ** power - - -def divmod_iter(x, by, chunk=None): - if x < by: - return [x] - - if hasattr(x, 'bit_length'): - # crude log(2, x) - divisions = x.bit_length() // by.bit_length() - else: - divisions = log(x) / log(by) - - if divisions < 1024: - return divmod_iter_basic(x, by, chunk) - else: - return divmod_iter_chunking(x, by, chunk) - - -def divmod_iter_chunking(x, by, chunk=None): - """Generate successive (x % by); x /= by, but faster. - - If provided, |chunk| must be a power of |by| (otherwise it is determined - automatically for 1024 per inner loop, based on analysis of bench_genmod.py) - """ - - if by == 1: - assert x == 0, x - yield 0 - return - - if chunk is None: - digits_per_chunk = 1024 - chunk = by ** digits_per_chunk - else: - digits_per_chunk = int(round(log(chunk) / log(by))) - if (by ** digits_per_chunk) != chunk: - raise ValueError("Chunk=%d must be a power of by=%d" % (chunk, by)) - - assert digits_per_chunk > 0 - - while x: - x, this_chunk = divmod(x, chunk) - #this_chunk = int(this_chunk) - for _ in range(digits_per_chunk): - this_chunk, m = divmod(this_chunk, by) - yield m - - if this_chunk == 0 and x == 0: - break - - -def divmod_iter_basic(x, by, chunk=None): - """Generate successive (x % by); x /= by, the obvious way. - - Chunk is ignored. - """ - while x: - x, m = divmod(x, by) - yield m - -def powersum(x, low, high): - # http://mikestoolbox.com/powersum.html - xm1 = x - 1 - if xm1 == 0: - return high - low + 1 - a = x ** (high + 1) - b = x ** low - return (a - b) // xm1 |
b |
diff -r 05965eee6b59 -r 611cac5e3066 project_rm/functions.py --- a/project_rm/functions.py Mon May 20 18:01:46 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,960 +0,0 @@\n-import string\r\n-from syngenic import *\r\n-from Bio.Seq import Seq\r\n-from Bio.SeqFeature import SeqFeature, FeatureLocation\r\n-from pprint import pprint\r\n-\r\n-from itertools import izip\r\n-\r\n-import numpy as np\r\n-import pandas as pd\r\n-\r\n-def all_patterns(input_ = []):\r\n-\r\n- patts = []\r\n- n_patts = []\r\n-\r\n- for patt in input_:\r\n- tmp_patt = patt#Seq(patt.rstrip(), IUPAC.ambiguous_dna)\r\n- tmp_revc = tmp_patt.reverse_complement()\r\n-\r\n- patts.append(str(tmp_patt))\r\n- patts.append(str(tmp_revc))\r\n-\r\n- n_patts.append(pattern(tmp_patt).plan_ambiguity())\r\n- n_patts.append(pattern(tmp_revc).plan_ambiguity())\r\n-\r\n-\r\n- return patts, n_patts\r\n-\r\n-def fake_from_real(path = None, id_ = None, name = None):\r\n-\r\n- plasmid_seq = SeqIO.read(open(path, "r"), "genbank")\r\n-\r\n- f_p = plasmid_seq.seq[:10]\r\n- f_CDS = []\r\n- for f in plasmid_seq.features:\r\n- if f.type == "CDS":\r\n- tmp_start = len(f_p)\r\n- tmp_cds = plasmid_seq[f.location.start:f.location.start+9] + plasmid_seq[f.location.end-9:f.location.end]\r\n- tmp_end = tmp_start + len(tmp_cds)\r\n- f_p += tmp_cds\r\n- f_CDS.append(SeqFeature(FeatureLocation(tmp_start, tmp_end), type="gene", strand=f.location.strand))\r\n- #f_p += plasmid_seq.seq[tmp_end:tmp_end+5]\r\n- f_p += plasmid_seq.seq[-10:]\r\n-\r\n- for feat in f_CDS:\r\n- f_p.features.append(feat)\r\n- f_p.id = id_\r\n- f_p.name = name\r\n-\r\n- #feature_seq_0 = f_CDS[0].extract(f_p)\r\n-\r\n- return f_p\r\n-\r\n-def punctuate_targets(f_patts, n_pl):\r\n-\r\n- n_poss = {}\r\n- max_len = len(n_pl)\r\n- for key in f_patts.keys():\r\n- for el in f_patts[key]:\r\n- if not el[2] < el[1]:\r\n- tmp = range(el[1], el[2])\r\n- for i in range(len(tmp)):\r\n- if not tmp[i] in n_poss.keys():\r\n- n_poss[tmp[i]] = [key[i]]\r\n- else:\r\n- n_poss[tmp[i]].append(key[i])\r\n- else:\r\n- tmp = range(el[1], max_len) + range(0, el[2])\r\n- for i in range(len(tmp)):\r\n- if not tmp[i] in n_poss.keys():\r\n- n_poss[tmp[i]] = [key[i]]\r\n- else:\r\n- n_poss[tmp[i]].append(key[i])\r\n-\r\n- for key in n_poss.keys():\r\n- n_poss[key] = set(n_poss[key])\r\n-\r\n- #print(n_poss)\r\n-\r\n- return n_poss\r\n-\r\n-\r\n-def print_seq(n_pl, ind_range = None):\r\n-\r\n- if ind_range == None:\r\n-\r\n- data = filter(None, re.split(r\'(\\w{1})\', n_pl))\r\n- index = range(len(n_pl))\r\n-\r\n- seq = []\r\n- ind = []\r\n-\r\n- j = 0\r\n-\r\n- seq.append("")\r\n- ind.append("")\r\n-\r\n- for i in range(len(data)):\r\n-\r\n- if (i % 9 == 0) & (i > 0):\r\n- j += 1\r\n- seq.append("")\r\n- ind.append("")\r\n- print("\\n")\r\n- print(seq[j-1])\r\n- print(ind[j-1])\r\n-\r\n-\r\n- seq[j] += " "\r\n- ind[j] += " "\r\n- for n in range(len(str(index[i]))-1):\r\n- seq[j] += " "\r\n- seq[j] += data[i]\r\n- ind[j] += str(index[i])\r\n- print("\\n")\r\n- print(seq[j])\r\n- print(ind[j])\r\n- else:\r\n- data = filter(None, re.split(r\'(\\w{1})\', n_pl[ind_range[0]:ind_range[1]]))\r\n- index = range(ind_range[0], ind_range[1])\r\n-\r\n- seq = []\r\n- ind = []\r\n-\r\n- j = 0\r\n-\r\n- seq.append("")\r\n- ind.append("")\r\n-\r\n- for i in range(len(data)):\r\n-\r\n- if (i % 9 == 0) & (i > 0):\r\n- j += 1\r\n- seq.append("")\r\n- ind.append("")\r\n- print("\\n")\r\n- print(seq[j-1])\r\n- print(ind[j-1])\r\n-\r\n-\r\n- seq[j] += " "\r\n- ind[j] += " "\r\n- for n in range(len(str(index[i]))-1):\r\n- seq[j] += '..b'arget_positions,\r\n- 2:annot,\r\n- 3:direction,\r\n- 4:["Original"] + sequences["original"],\r\n- 5:new_plasmids,\r\n- 6:index}\r\n-\r\n- doc = SimpleDocTemplate("comparison_syngenic_plasmids.pdf",pagesize=letter,\r\n- rightMargin=30,leftMargin=30,\r\n- topMargin=30,bottomMargin=30)\r\n-\r\n- elements = []\r\n- #max_row = 18\r\n- blocks = {}\r\n-\r\n- if len(range(max_row, len(original.seq)+1, max_row)) % max_row == 0:\r\n- n_blocks = len(range(max_row, len(original.seq)+1, max_row))\r\n- else:\r\n- n_blocks = len(range(max_row, len(original.seq)+1, max_row)) + 1\r\n-\r\n- j = 0\r\n-\r\n- for i in range(n_blocks):\r\n- blocks[i] = []\r\n- for l in range(7):\r\n- if l in [0, 5]:\r\n- for el in data[l]:\r\n- if len(el[j:]) > max_row:\r\n- if i >= 1:\r\n- blocks[i].append([el[0]] + el[j:j+max_row])\r\n- else:\r\n- blocks[i].append(el[j:j+max_row])\r\n- else:\r\n- blocks[i].append([el[0]] + el[j:])\r\n- else:\r\n- if len(data[l][j:]) > max_row:\r\n- if i >= 1:\r\n- blocks[i].append([data[l][0]] + data[l][j:j+max_row])\r\n- else:\r\n- blocks[i].append(data[l][j:j+max_row])\r\n- else:\r\n- blocks[i].append([data[l][0]] + data[l][j:])\r\n- j += max_row\r\n- #print("\\n")\r\n- #print(blocks[i])\r\n-\r\n- elements.append(Table(blocks[i], hAlign=\'LEFT\'))#,\r\n- #style=[(\'BACKGROUND\',(0,0),(0,0),colors.palegreen),\r\n- # (\'BACKGROUND\',(1,1),(1,1),colors.palegreen),\r\n- # (\'TEXTCOLOR\',(2,2),(3,2),colors.palegreen),\r\n- # (\'BOX\',(0,0),(0,0),2,colors.red)]))\r\n- elements.append(Table([["", "", "", "", ""]]))\r\n-\r\n- doc.build(elements)\r\n-\r\n-\r\n- #new_doc = SimpleDocTemplate("further_information.pdf",pagesize=letter,\r\n- # rightMargin=30,leftMargin=30,\r\n- # topMargin=30,bottomMargin=30)\r\n- #new_elements = []\r\n-\r\n- #new_elements.append([f for f in original.features if f.type.lower() in ["gene", "cds"]])\r\n- #new_elements.append(f_patterns)\r\n-\r\n- #doc.build(new_elements)\r\n-\r\n- c = canvas.Canvas("./further_information.pdf")\r\n- c.drawString(100,750,"CDS regions:")\r\n- upper_bound = 750\r\n- for feat in original.features:\r\n- if feat.type.lower() in ["gene", "cds"]:\r\n- upper_bound -= 15\r\n- if feat.location.strand == -1:\r\n- sign = "-"\r\n- else:\r\n- sign = "+"\r\n- c.drawString(115,upper_bound, str("[") + str(feat.location.start)+ ":" + str(feat.location.end) + "]" + "(" + sign + ")")\r\n- upper_bound -= 30\r\n- c.drawString(100,upper_bound,"Patterns and the corresponding targets on the plasmid sequence:")\r\n- for f_pattern in f_patterns.keys():\r\n- upper_bound -= 15\r\n- c.drawString(115,upper_bound,f_pattern + ":")\r\n- for val in f_patterns[f_pattern]:\r\n- upper_bound -= 15\r\n- c.drawString(130,upper_bound,str(val))\r\n- upper_bound -= 5\r\n-\r\n- upper_bound -= 30\r\n- c.drawString(100,upper_bound,"Identifiers of the targets found in the plasmid sequence:")\r\n- for target in targets.keys():\r\n- upper_bound -= 15\r\n- c.drawString(115,upper_bound,target + ": " + targets[target])\r\n-\r\n- c.save()\r\n-\r\n-\r\n- return\r\n-\r\n-\r\n-def produce_random_targets(sequence):\r\n-\r\n- # Produce a target on two continous CDS\r\n- # Produce a target in a non-coding region\r\n- # Produce a target in coding region\r\n- # Produce a target on a overlapping left\r\n- # Produce a target on a overlapping right\r\n-\r\n-\r\n-\r\n- return\r\n' |
b |
diff -r 05965eee6b59 -r 611cac5e3066 project_rm/pEPSA5_annotated.gb --- a/project_rm/pEPSA5_annotated.gb Mon May 20 18:01:46 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
b'@@ -1,192 +0,0 @@\n-LOCUS Exported 6850 bp ds-DNA circular SYN 13-NOV-2018\r\n-DEFINITION synthetic circular DNA\r\n-ACCESSION .\r\n-VERSION .\r\n-KEYWORDS .\r\n-SOURCE synthetic DNA construct\r\n- ORGANISM synthetic DNA construct\r\n-REFERENCE 1 (bases 1 to 6850)\r\n- AUTHORS .\r\n- TITLE Direct Submission\r\n- JOURNAL Exported Nov 13, 2018 from SnapGene 4.2.6\r\n- http://www.snapgene.com\r\n-COMMENT LOCUS dna 6850 bp\r\n-FEATURES Location/Qualifiers\r\n- source 1..6850\r\n- /organism="synthetic DNA construct"\r\n- /mol_type="other DNA"\r\n- misc_feature 1..819\r\n- /label=p15A origin of replication region\r\n- /note="p15A origin of replication region"\r\n- misc_feature 1032..3625\r\n- /label=Staphylococcus aureus plasmid pC194\r\n- /note="Staphylococcus aureus plasmid pC194 region\r\n- (NC_002013.1)"\r\n- misc_feature 1126..1324\r\n- /label=Repeat sequence 1 of 2\r\n- /note="Repeat sequence 1 of 2"\r\n- CDS complement(1562..2212)\r\n- /label=Chloramphenicol Resistance\r\n- CDS complement(2346..2744)\r\n- /label=Potential Copy number associated protein\r\n- /note="Potential Copy number associated protein"\r\n- misc_feature 2463..2517\r\n- /label=pC194 replication origin\r\n- /note="pC194 replication origin (detailed in PMC401278)"\r\n- misc_feature 2670..3605\r\n- /label=Replication initiation protein Rep RC\r\n- misc_feature 3626..3824\r\n- /label=Repeat sequence 2 of 2\r\n- /note="Repeat sequence 2 of 2"\r\n- misc_feature 3825..5278\r\n- /label=Xylose Inducible Promoter Region\r\n- /note="Xylose Inducible Promoter Region, Xyl promoter, Xyl\r\n- repressor protein, Xyl terminator sequence and pTX5\r\n- promoter facing MCS:\r\n- http://parts.igem.org/Part:BBa_K1323014"\r\n- CDS 4000..5151\r\n- /label=XylR repressor protein\r\n- misc_feature 5191..5241\r\n- /label=Bacteriophage T5 PN25 promoter\r\n- /note="Bacteriophage T5 PN25 promoter (Deuschle et al,\r\n- PMC1167251)"\r\n- misc_feature 5206..5211\r\n- /label=-30 region\r\n- /note="-30 region"\r\n- misc_feature 5229..5234\r\n- /label=-10 region\r\n- /note="-10 region"\r\n- misc_feature 5241..5279\r\n- /label=Xyl operator containing palindromic\r\n- /note="interfering with transcription initiation"\r\n- misc_feature 5279..5317\r\n- /label=MCS\r\n- /note="MCS"\r\n- terminator 5393..5550\r\n- /label=rrnB term\r\n- terminator 5516..5559\r\n- /label=rrnB_T1 term\r\n- terminator 5691..5718\r\n- /label=rrnB_T2 term\r\n- misc_feature complement(5860..6720)\r\n- /label=Ampicillin Resistance\r\n- /note="gene of the plasmid pLEX5BA"\r\n- promoter complement(6762..6790)\r\n- /label=amp prom\r\n-ORIGIN\r\n- 1 ggcggccgca ctggcttact atgttggcac tgatgagggt gtcagtgaag tgcttcatgt\r\n- 61 ggcaggagaa aaaaggctgc accggtgcgt cagcagaata tgtgatacag gatatattcc\r\n- 121 gcttcctcgc tcactgactc gctacgctcg gtcgttcgac tgcggcgagc ggaaatggct\r\n- 181 tacgaacggg gcggagattt cctggaagat gccaggaaga tacttaacag ggaagtgaga\r\n- 241 gggccgcggc aaagccgttt ttccataggc tccgcccccc tgacaagcat cacgaaatct\r\n- 301 gacgctcaaa tcagtggtgg cgaaacccga caggactata aagataccag gcgtttcccc\r\n- 361 ctggcggct'..b'781 ataaaaccac tcctttttaa caaactttat cacaagaaat attttggcat tctacgacta\r\n- 3841 taacttaaat ttatattttt tactttataa tatataattg attatagaat aatgttgctc\r\n- 3901 atatcgtttg ccaacatcta gtactcaaat tacactatgt tacacttggt aatattaacc\r\n- 3961 gaacttcccc tgtccaaatt agataagagg taataataaa tggaaaataa ttttatagta\r\n- 4021 aatgaaaatg agaagcgtgt attaaaacaa attttcaata acagcaatat ttcacgaaca\r\n- 4081 caaatatcga agaatttaga acttaataaa gctactattt ctaacattct gaacaactta\r\n- 4141 aaacacaaga gtttagttaa tgaagtagga gaaggtaata gtactaaaag tggtggacga\r\n- 4201 aagcctattt tactcgaaat taaccaaaaa tatggctact atatttctat ggatttaaca\r\n- 4261 tatgattccg ttgaattaat gtacaactac tttgatgcta ctatattaaa gcaagattcc\r\n- 4321 tacgaattaa atgataaaaa tgtaagcagt atattacaaa ttttaaaatc taatataaac\r\n- 4381 gtctcagaaa aatatgatac gttatatggg ttacttggta tatctatatc catacacggt\r\n- 4441 atcgttgacg atgagcaaaa cataatcaat cttccttttc ataaaaatga gaaacgcaca\r\n- 4501 tttaccgatg aattaaagtc attcacaaat gttcctgtcg ttatagaaaa tgaagcaaat\r\n- 4561 ttatcagcgc tatatgaaaa aagtttatat attaattcaa acataaataa tttgattact\r\n- 4621 ttaagtattc acaagggtat aggcgctggc atcctaataa ataaaaaact ttatcgtggc\r\n- 4681 tcaaatggag aggctggaga gataggtaag acattggttt tggaatctat aaataacaat\r\n- 4741 gacaacaaat attataaaat cgaagatata tgctcccaag acgctttaat acagaaaata\r\n- 4801 aataataggt tgggcgtcac attgacgttt acagaactaa tccaatatta caacgaagga\r\n- 4861 aattcaattg ttgctcatga aattaaacaa tttattaata aaatgacagt tctgattcat\r\n- 4921 aatttgaata cacaatttaa cccagacgct atttatatta actgtccttt aattaatgaa\r\n- 4981 ttaccaaata ttttaaatga aattaaagag caattctcct gtttttctca aggcagtcca\r\n- 5041 gttcaattac atttaactac taatgtaaaa caagctactt tattgggtgg cactttagca\r\n- 5101 ataatgcaaa aaacattaaa tataaataac attcaaatga atattaaata attacagcag\r\n- 5161 tctgagttat aaaatagata tctcggaccg tcataaaaaa tttatttgct ttcaggaaaa\r\n- 5221 tttttctgta taatagattc aagttagttt gtttattaaa ttaaccaact aaaatgtaga\r\n- 5281 attcgagctc ggtacccggg gatcctctag agtcgacctg cagccaagct tgggcttttc\r\n- 5341 agcctgatac agattaaatc agaacgcaga agcggtctga taaaacagaa tttgcctggc\r\n- 5401 ggcagtagcg cggtggtccc acctgacccc atgccgaact cagaagtgaa acgccgtagc\r\n- 5461 gccgatggta gtgtggggtc tccccatgcg agagtaggga actgccaggc atcaaataaa\r\n- 5521 acgaaaggct cagtcgaaag actgggcctt tcgttttatc tgttgtttgt cggtgaacgc\r\n- 5581 tctcctgagt aggacaaatc cgccgggagc ggatttgaac gttgcgaagc aacggcccgg\r\n- 5641 agggtggcgg gcaggacgcc cgccataaac tgccaggcat caaattaagc agaaggccat\r\n- 5701 cctgacggat ggcctttttg cgtttctaca aactcttttg tttatttttc taaatacatt\r\n- 5761 caaatatgta tccgctcatc cccatcctat cgatgataag ctgtcaaaca tgagaattaa\r\n- 5821 atcaatctaa agtatatatg agtaaacttg gtctgacagt taccaatgct taatcagtga\r\n- 5881 ggcacctatc tcagcgatct gtctatttcg ttcatccata gttgcctgac tccccgtcgt\r\n- 5941 gtagataact acgatacggg agggcttacc atctggcccc agtgctgcaa tgataccgcg\r\n- 6001 agacccacgc tcaccggctc cagatttatc agcaataaac cagccagccg gaagggccga\r\n- 6061 gcgcagaagt ggtcctgcaa ctttatccgc ctccatccag tctattaatt gttgccggga\r\n- 6121 agctagagta agtagttcgc cagttaatag tttgcgcaac gttgttgcca ttgctacagg\r\n- 6181 catcgtggtg tcacgctcgt cgtttggtat ggcttcattc agctccggtt cccaacgatc\r\n- 6241 aaggcgagtt acatgatccc ccatgttgtg caaaaaagcg gttagctcct tcggtcctcc\r\n- 6301 gatcgttgtc agaagtaagt tggccgcagt gttatcactc atggttatgg cagcactgca\r\n- 6361 taattctctt actgtcatgc catccgtaag atgcttttct gtgactggtg agtactcaac\r\n- 6421 caagtcattc tgagaatagt gtatgcggcg accgagttgc tcttgcccgg cgtcaacacg\r\n- 6481 ggataatacc gcgccacata gcagaacttt aaaagtgctc atcattggaa aacgctcttc\r\n- 6541 ggggcgaaaa ctctcaagga tcttaccgct gttgagatcc agttcgatgt aacccactcg\r\n- 6601 tgcacccaac tgatcttcag catcttttac tttcaccagc gtttctgggt gagcaaaaac\r\n- 6661 aggaaggcaa aatgccgcaa aaaagggaat aagggcgaca cggaaatgtt gaatactcat\r\n- 6721 actcttcctt tttcaatatt attgaagcat ttatcagggt tattgtctca tgagcggata\r\n- 6781 catatttgaa tgtatttaga aaaataaaca aataggggtt ccgcgcacat ttccccgaaa\r\n- 6841 agtgccacct\r\n-//\r\n' |
b |
diff -r 05965eee6b59 -r 611cac5e3066 project_rm/patterns.txt --- a/project_rm/patterns.txt Mon May 20 18:01:46 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,3 +0,0 @@ -ACANNNNNNRTGG -ATCNNNNNCCT -BNNNNNNNGCGGTAVY |
b |
diff -r 05965eee6b59 -r 611cac5e3066 project_rm/run_codon_switch.sh --- a/project_rm/run_codon_switch.sh Mon May 20 18:01:46 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,2 +0,0 @@ -#python setup.py build_ext --inplace -python codon_switch.py -i pEPSA5_annotated.gb -l genbank -p patterns.txt -g S_aureus_JE2.gbf -q genbank -c Bacterial -f 3 -m 27 -d demonstration -o ./output # -d demonstration |
b |
diff -r 05965eee6b59 -r 611cac5e3066 project_rm/sre_yield.py --- a/project_rm/sre_yield.py Mon May 20 18:01:46 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,611 +0,0 @@\n-#!/usr/bin/env python2\n-#\n-# Copyright 2011-2016 Google Inc.\n-#\n-# Licensed under the Apache License, Version 2.0 (the "License");\n-# you may not use this file except in compliance with the License.\n-# You may obtain a copy of the License at\n-#\n-# http://www.apache.org/licenses/LICENSE-2.0\n-#\n-# Unless required by applicable law or agreed to in writing, software\n-# distributed under the License is distributed on an "AS IS" BASIS,\n-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n-# See the License for the specific language governing permissions and\n-# limitations under the License.\n-#\n-# vim: sw=2 sts=2 et\n-\n-"""This module can generate all strings that match a regular expression.\n-\n-The regex is parsed using the SRE module that is standard in python,\n-then the data structure is executed to form a bunch of iterators.\n-"""\n-\n-__author__ = \'alexperry@google.com (Alex Perry)\'\n-__all__ = [\'Values\', \'AllStrings\', \'AllMatches\', \'ParseError\']\n-\n-\n-import bisect\n-import math\n-import re\n-import sre_constants\n-import sre_parse\n-import string\n-import sys\n-import types\n-\n-import cachingseq\n-import fastdivmod\n-\n-try:\n- xrange = xrange\n-except NameError:\n- xrange = range\n-\n-_RE_METACHARS = r\'$^{}*+\\\\\'\n-_ESCAPED_METACHAR = r\'\\\\[\' + _RE_METACHARS + r\']\'\n-ESCAPED_METACHAR_RE = re.compile(_ESCAPED_METACHAR)\n-# ASCII by default, see https://github.com/google/sre_yield/issues/3\n-CHARSET = [chr(c) for c in range(256)]\n-\n-WORD = string.ascii_letters + string.digits + \'_\'\n-\n-try:\n- DEFAULT_RE_FLAGS = re.ASCII\n-except AttributeError:\n- DEFAULT_RE_FLAGS = 0\n-\n-STATE_START, STATE_MIDDLE, STATE_END = list(range(3))\n-\n-def Not(chars):\n- return \'\'.join(sorted(set(CHARSET) - set(chars)))\n-\n-\n-CATEGORIES = {\n- sre_constants.CATEGORY_WORD: WORD,\n- sre_constants.CATEGORY_NOT_WORD: Not(WORD),\n- sre_constants.CATEGORY_DIGIT: string.digits,\n- sre_constants.CATEGORY_NOT_DIGIT: Not(string.digits),\n- sre_constants.CATEGORY_SPACE: string.whitespace,\n- sre_constants.CATEGORY_NOT_SPACE: Not(string.whitespace),\n-}\n-\n-# This constant varies between builds of Python; this is the lower value.\n-MAX_REPEAT_COUNT = 65535\n-\n-\n-class ParseError(Exception):\n- pass\n-\n-\n-def slice_indices(slice_obj, size):\n- """slice_obj.indices() except this one supports longs."""\n- # start stop step\n- start = slice_obj.start\n- stop = slice_obj.stop\n- step = slice_obj.step\n-\n- # We don\'t always update a value for negative indices (if we wrote it here\n- # due to None).\n- if step is None:\n- step = 1\n- if start is None:\n- if step > 0:\n- start = 0\n- else:\n- start = size - 1\n- else:\n- start = _adjust_index(start, size)\n-\n- if stop is None:\n- if step > 0:\n- stop = size\n- else:\n- stop = -1\n- else:\n- stop = _adjust_index(stop, size)\n-\n- return (start, stop, step)\n-\n-\n-def _adjust_index(n, size):\n- if n < 0:\n- n += size\n-\n- if n < 0:\n- raise IndexError("Out of range")\n- if n > size:\n- n = size\n- return n\n-\n-\n-def _xrange(*args):\n- """Because xrange doesn\'t support longs :("""\n- # prefer real xrange if it works\n- try:\n- return xrange(*args)\n- except OverflowError:\n- return _bigrange(*args)\n-\n-\n-def _bigrange(*args):\n- if len(args) == 1:\n- start = 0; stop = args[0]; step = 1\n- elif len(args) == 2:\n- start, stop = args\n- step = 1\n- elif len(args) == 3:\n- start, stop, step = args\n- else:\n- raise ValueError("Too many args for _bigrange")\n-\n- i = start\n- while True:\n- yield i\n- i += step\n- if step < 0 and i <= stop:\n- break\n- if step > 0 and i >= stop:\n- break\n-\n-\n-class WrappedSequence(object):\n- """This wraps a sequence, purely as a base clase for the other uses."""\n-\n- def __init__(self, raw):\n- # Derived classes will li'..b'm/google/sre_yield/issues/3\n- if flags & re.IGNORECASE:\n- raise ParseError(\'Flag "i" not supported. https://github.com/google/sre_yield/issues/4\')\n- elif flags & re.UNICODE:\n- raise ParseError(\'Flag "u" not supported. https://github.com/google/sre_yield/issues/3\')\n- elif flags & re.LOCALE:\n- raise ParseError(\'Flag "l" not supported. https://github.com/google/sre_yield/issues/5\')\n-\n- if max_count is None:\n- self.max_count = MAX_REPEAT_COUNT\n- else:\n- self.max_count = max_count\n-\n- self.has_groupref = False\n-\n- # Configure the parser backends\n- self.backends = {\n- sre_constants.LITERAL: lambda y: [chr(y)],\n- sre_constants.RANGE: lambda l, h: [chr(c) for c in range(l, h+1)],\n- sre_constants.SUBPATTERN: self.maybe_save,\n- sre_constants.BRANCH: self.branch_values,\n- sre_constants.MIN_REPEAT: self.max_repeat_values,\n- sre_constants.MAX_REPEAT: self.max_repeat_values,\n- sre_constants.AT: self.nothing_added,\n- sre_constants.ASSERT: self.empty_list,\n- sre_constants.ASSERT_NOT: self.empty_list,\n- sre_constants.ANY:\n- lambda _: self.in_values(((sre_constants.NEGATE,),)),\n- sre_constants.IN: self.in_values,\n- sre_constants.NOT_LITERAL: self.not_literal,\n- sre_constants.CATEGORY: self.category,\n- sre_constants.GROUPREF: self.groupref,\n- }\n- self.state = STATE_START\n- # Now build a generator that knows all possible patterns\n- self.raw = self.sub_values(sre_parse.parse(pattern, flags))\n- # Configure this class instance to know about that result\n- self.length = self.raw.__len__()\n-\n- def __contains__(self, item):\n- # Since we have a regex, we can search the list really cheaply\n- return self.matcher.match(item) is not None\n-\n-\n-class RegexMembershipSequenceMatches(RegexMembershipSequence):\n- def __getitem__(self, i):\n- if isinstance(i, slice):\n- result = SlicedSequence(self, slicer=i)\n- if result.__len__() < 16:\n- # Short lists are unpacked\n- result = [item for item in result]\n- return result\n-\n- d = {}\n- s = super(RegexMembershipSequenceMatches, self).get_item(i, d)\n- return Match(s, d, self.named_group_lookup)\n-\n-\n-def AllStrings(regex, flags=0, charset=CHARSET, max_count=None):\n- """Constructs an object that will generate all matching strings."""\n- return RegexMembershipSequence(regex, flags, charset, max_count=max_count)\n-\n-Values = AllStrings\n-\n-\n-class Match(object):\n- def __init__(self, string, groups, named_groups):\n- # TODO keep group(0) only, and spans for the rest.\n- self._string = string\n- self._groups = groups\n- self._named_groups = named_groups\n- self.lastindex = len(groups) + 1\n-\n- def group(self, n=0):\n- if n == 0:\n- return self._string\n- if not isinstance(n, int):\n- n = self._named_groups[n]\n- return self._groups[n]\n-\n- def groups(self):\n- return tuple(self._groups[i] for i in range(1, self.lastindex))\n-\n- def groupdict(self):\n- d = {}\n- for k, v in self._named_groups.items():\n- d[k] = self._groups[v]\n- return d\n-\n- def span(self, n=0):\n- raise NotImplementedError()\n-\n-\n-def AllMatches(regex, flags=0, charset=CHARSET, max_count=None):\n- """Constructs an object that will generate all matching strings."""\n- return RegexMembershipSequenceMatches(regex, flags, charset, max_count=max_count)\n-\n-\n-def main(argv=None):\n- """This module can be executed on the command line for testing."""\n- if argv is None:\n- argv = sys.argv\n- for arg in argv[1:]:\n- for i in AllStrings(arg):\n- print(i)\n-\n-\n-if __name__ == \'__main__\':\n- main()\n' |
b |
diff -r 05965eee6b59 -r 611cac5e3066 project_rm/syngenic.py --- a/project_rm/syngenic.py Mon May 20 18:01:46 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,362 +0,0 @@\n-#!/usr/bin/env python\r\n-\r\n-__author__= "Gianmarco Piccinno"\r\n-__version__ = "1.0.0"\r\n-\r\n-import Bio\r\n-from Bio import SeqIO\r\n-from Bio.Seq import Seq\r\n-from Bio.Alphabet import IUPAC\r\n-from Bio.Data import IUPACData\r\n-from Bio.Data import CodonTable\r\n-import re\r\n-import sre_yield\r\n-\r\n-import re\r\n-import itertools\r\n-from functools import reduce\r\n-\r\n-import Bio\r\n-from Bio import Data\r\n-from Bio.Data import IUPACData\r\n-from Bio.Data import CodonTable\r\n-\r\n-from pprint import pprint\r\n-\r\n-import pandas as pd\r\n-\r\n-def _check_bases(seq_string):\r\n- """\r\n- Check characters in a string (PRIVATE).\r\n- Remove digits and white space present in string. Allows any valid ambiguous\r\n- IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, upper case are converted).\r\n-\r\n- Other characters (e.g. symbols) trigger a TypeError.\r\n-\r\n- Returns the string WITH A LEADING SPACE (!). This is for backwards\r\n- compatibility, and may in part be explained by the fact that\r\n- Bio.Restriction doesn\'t use zero based counting.\r\n- """\r\n- # Remove white space and make upper case:\r\n- seq_string = "".join(seq_string.split()).upper()\r\n- # Remove digits\r\n- for c in "0123456789":\r\n- seq_string = seq_string.replace(c, "")\r\n- # Check only allowed IUPAC letters\r\n- if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")):\r\n- raise TypeError("Invalid character found in %s" % repr(seq_string))\r\n- return " " + seq_string\r\n-\r\n- matching = {\'A\': \'ARWMHVDN\', \'C\': \'CYSMHBVN\', \'G\': \'GRSKBVDN\',\r\n- \'T\': \'TYWKHBDN\', \'R\': \'ABDGHKMNSRWV\', \'Y\': \'CBDHKMNSTWVY\',\r\n- \'W\': \'ABDHKMNRTWVY\', \'S\': \'CBDGHKMNSRVY\', \'M\': \'ACBDHMNSRWVY\',\r\n- \'K\': \'BDGHKNSRTWVY\', \'H\': \'ACBDHKMNSRTWVY\',\r\n- \'B\': \'CBDGHKMNSRTWVY\', \'V\': \'ACBDGHKMNSRWVY\',\r\n- \'D\': \'ABDGHKMNSRTWVY\', \'N\': \'ACBDGHKMNSRTWVY\'}\r\n-\r\n-class pattern(object):\r\n-\r\n-\r\n- def __init__(self, pattern_input):\r\n- s = str(pattern_input)\r\n- self.upper = s.isupper()\r\n- self.data = _check_bases(s)\r\n- self.pattern = s\r\n-\r\n- def plan_ambiguity(self):\r\n- val = Bio.Data.IUPACData.ambiguous_dna_values\r\n- re_pattern = ""\r\n- for el in self.pattern:\r\n- re_pattern = re_pattern + "[" + val[el] + "]"\r\n- return re_pattern\r\n-\r\n-class annotated_genome(object):\r\n-\r\n- def __init__(self, seq):\r\n- s = str(seq)\r\n- self.upper = s.isupper()\r\n- self.data = _check_bases(s)\r\n- self.seq = s\r\n-\r\n- def codon_usage(self, codonTable):\r\n-\r\n- codon_usage = {}\r\n- tmp = [x for x in re.split(r\'(\\w{3})\', self.seq) if x != ""]\r\n-\r\n- b_cod_table = CodonTable.unambiguous_dna_by_name["Bacterial"].forward_table\r\n- aas = set(b_cod_table.values())\r\n-\r\n- for aa in aas:\r\n- codon_usage[aa] = {}\r\n- for codon in b_cod_table.keys():\r\n- if b_cod_table[codon] == aa:\r\n- codon_usage[aa][codon] = tmp.count(codon)\r\n-\r\n- tups = {(outerKey, innerKey): values for outerKey, innerDict in codon_usage.iteritems() for innerKey, values in innerDict.iteritems()}\r\n-\r\n- codon_usage_ = pd.DataFrame(pd.Series(tups), columns = ["Count"])\r\n- codon_usage_.index = codon_usage_.index.set_names(["AA", "Codon"])\r\n- codon_usage_[\'Proportion\'] = codon_usage_.groupby(level=0).transform(lambda x: (x / x.sum()).round(2))\r\n-\r\n- codon_usage_.reset_index(inplace=True)\r\n- codon_usage_.index = codon_usage_["Codon"]\r\n-\r\n- return {"Dictionary": codon_usage, "Tuples": tups, "Table": codon_usage_}\r\n-\r\n-class plasmid(object):\r\n- """\r\n- This class represents a circular plasmid\r\n- """\r\n-\r\n- def __init__(self, seq = "", circular=True, features = None):\r\n-\r\n- if type(seq) in [Bio.SeqRecord.SeqRecord, plasmid, Seq]:\r\n- s = str(seq.seq)\r\n- self.features = seq.features\r\n- else:\r\n- s = str(seq)\r\n- i'..b'codons:\r\n- b_cod_table[cod] = "_Stop"\r\n-\r\n- for cod in CodonTable.unambiguous_dna_by_name[codonTable].start_codons:\r\n- #print(cod)\r\n- b_cod_table[cod] = b_cod_table[cod]\r\n-\r\n- aas = set(b_cod_table.values())\r\n-\r\n- for aa in aas:\r\n- #print(aa)\r\n- #codon_usage[aa] = {}\r\n- for codon in b_cod_table.keys():\r\n- if b_cod_table[codon] == aa:\r\n- codon_usage[codon] = tmp.count(codon.split(" ")[0])\r\n-\r\n- return codon_usage\r\n-\r\n-\r\n-def read_annotated_genome(data="example.fna", type_="fasta"):\r\n- """\r\n- Accepted formats:\r\n- - fasta (multifasta)\r\n- - gbk\r\n-\r\n- """\r\n-\r\n- seqs = ""\r\n-\r\n- if type_ == "fasta":\r\n- with open(data, "rU") as handle:\r\n- for record in SeqIO.parse(handle, type_):\r\n- seqs = seqs + str(record.seq)\r\n-\r\n- elif type_ == "genbank":\r\n- with open(data, "rU") as input_handle:\r\n- types = []\r\n- for record in SeqIO.parse(input_handle, "genbank"):\r\n- for feature in record.features:\r\n- types.append(feature.type)\r\n- if feature.type == "CDS":\r\n- if feature.location.strand == +1:\r\n- seq = record.seq[feature.location.start:feature.location.end]\r\n- seqs = seqs + str(seq)\r\n- elif feature.location.strand == -1:\r\n- seq = record.seq[feature.location.start:\r\n- feature.location.end].reverse_complement()\r\n- seqs = seqs + str(seq)\r\n- return seqs\r\n-\r\n-\r\n-def synonims_(table_name="Bacterial"):\r\n-\r\n- b_cod_table = CodonTable.unambiguous_dna_by_name[table_name].forward_table\r\n-\r\n- print(b_cod_table)\r\n-\r\n- for cod in CodonTable.unambiguous_dna_by_name[table_name].stop_codons:\r\n- b_cod_table[cod] = "_Stop"\r\n-\r\n- for cod in CodonTable.unambiguous_dna_by_name[table_name].start_codons:\r\n- b_cod_table[cod] = "_Start"\r\n-\r\n- #pprint(b_cod_table)\r\n- codons = {}\r\n-\r\n- aas = set(b_cod_table.values())\r\n-\r\n- for aa in aas:\r\n- codons[aa] = []\r\n- for codon in b_cod_table.keys():\r\n- if b_cod_table[codon] == aa:\r\n- codons[aa].append(codon)\r\n-\r\n- #break\r\n-\r\n- synonims = {}\r\n-\r\n- for el1 in codons.keys():\r\n- print(el1)\r\n- for el2 in codons[el1]:\r\n- print(el2)\r\n- synonims[el2] = codons[el1]\r\n- #synonims[el2] = []\r\n- #for el3 in codons[el1]#set.difference(set(codons[el1]), {el2}):\r\n- # print(el3)\r\n- # synonims[el2].append(el3)\r\n- #break\r\n- #break\r\n- #break\r\n-\r\n-\r\n- anti_codons = {}\r\n-\r\n- for codon in synonims.keys():\r\n- tmp_codon = Bio.Seq.Seq(codon, IUPAC.unambiguous_dna)\r\n- tmp_anticodon = str(tmp_codon.reverse_complement())\r\n-\r\n- anti_codons[tmp_anticodon] = []\r\n-\r\n- for synonim in synonims[codon]:\r\n- tmp_synonim = Bio.Seq.Seq(synonim, IUPAC.unambiguous_dna)\r\n- tmp_antisynonim = str(tmp_synonim.reverse_complement())\r\n- anti_codons[tmp_anticodon].append(tmp_antisynonim)\r\n-\r\n- check = Bio.Seq.Seq("CTT")\r\n- anti_check = check.reverse_complement()\r\n- print("\\nCheck:\\n" + str(check))\r\n- print("\\nCodons:\\n")\r\n-\r\n- for key in codons.keys():\r\n- if str(check) in codons[key]:\r\n- print(codons[key])\r\n-\r\n- #pprint(codons)\r\n- print("\\nSynonims:\\n")\r\n- pprint(synonims[str(check)])\r\n- print("\\nAnti_Codons:\\n")\r\n- pprint(anti_codons[str(anti_check)])\r\n-\r\n- #i = synonims.keys()\r\n- #right = True\r\n- #while len(i) > 0:\r\n- # tmp = i.pop()\r\n- # check = Bio.Seq.Seq(tmp)\r\n- # anti_check = check.reverse_complement()\r\n-\r\n-\r\n- return {"synonims":synonims, "anti_synonims":anti_codons}\r\n' |