Previous changeset 1:9a357864ec5c (2018-07-09) Next changeset 3:d74e8875822e (2018-07-09) |
Commit message:
Deleted selected files |
removed:
auxiliar.py auxiliar.pyc phage_promoters.py phage_promoters.xml scaler_2400.sav |
b |
diff -r 9a357864ec5c -r 9ed8de889d97 auxiliar.py --- a/auxiliar.py Mon Jul 09 09:30:40 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,144 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Sun May 27 17:37:09 2018 - -@author: Marta -""" - - - -def get_bacteria(file): - import pandas as pd - df = pd.read_excel(file,header=0,index_col=0) - bacteria = {} - for ind,row in df.iterrows(): - bac = row['Bacteria'] - bacteria[ind] = bac - return bacteria - -#retorna a familia do fago -def get_families(file): - import pandas as pd - df = pd.read_excel(file,header=0,index_col=0) - families = {} - for ind,row in df.iterrows(): - fam = row['Family'] - families[ind] = fam - return families - -def get_lifecycle(file): - import pandas as pd - df = pd.read_excel(file,header=0,index_col=0) - types = {} - for ind,row in df.iterrows(): - lc = row['lifecycle'] - types[ind] = lc - return types - -#dá os scores e as posições do motif numa sequencia, ao ler o ficheiro com a pssm - -def get_max_pssm(file_pssm): - from Bio.Alphabet import IUPAC - from Bio.motifs import matrix - m = [] - fic = open(file_pssm,'r') - rf = fic.readline() - while rf: - new_l = [] - l = rf.strip().split('\t') - for val in l: - x = float(val) - new_l.append(x) - m.append(new_l) - rf = fic.readline() - a = IUPAC.unambiguous_dna - dic = {'A':m[0],'C':m[1], 'G':m[2], 'T':m[3]} - pssm = matrix.PositionSpecificScoringMatrix(a,dic) - return pssm.max - - -def get_scores(file_pssm, seq): - from Bio.Alphabet import IUPAC - from Bio.motifs import matrix - maxi = get_max_pssm(file_pssm) - m = [] - fic = open(file_pssm,'r') - rf = fic.readline() - while rf: - new_l = [] - l = rf.strip().split('\t') - for val in l: - x = float(val) - new_l.append(x) - m.append(new_l) - rf = fic.readline() - a = IUPAC.unambiguous_dna - dic = {'A':m[0],'C':m[1], 'G':m[2], 'T':m[3]} - pssm = matrix.PositionSpecificScoringMatrix(a,dic) - scores = [] - positions = [] - a = IUPAC.unambiguous_dna - seq.alphabet = a - for pos, score in pssm.search(seq, both=False,threshold=-50): - scores.append(score/maxi) - positions.append(pos) - return scores,positions - -def get_genes(fic_name): - from Bio import SeqIO - numbers = [] - fic = open(fic_name,'r') - rf = fic.readline() - while rf: - phage = rf.strip() - numbers.append(phage) - rf = fic.readline() - fic.close() - dic = {} - for number in numbers: - rec = SeqIO.read('genomas/'+number+'.gb','gb') - comp = [] - dire = [] - for feat in rec.features: - if feat.type == 'gene': - loc = feat.location - if loc.strand == 1: dire.append(loc) - else: comp.append(loc) - dic[number] = {'comp':comp, 'dir':dire} - return dic - -def freq_base(seq): - A = seq.count('A') - C = seq.count('C') - G = seq.count('G') - T = seq.count('T') - AT = A+T - CG = C+G - return AT,CG - -def free_energy(seq): - dic1 = {'AA':-1.00, - 'TT':-1.00, - 'AT':-0.88, - 'TA':-0.58, - 'CA':-1.45, - 'AC':-1.44, - 'GG':-1.84, - 'CC':-1.84, - 'GA':-1.30, - 'AG':-1.28, - 'TC':-1.30, - 'CT':-1.28, - 'TG':-1.45, - 'GT':-1.44, - 'GC':-2.24, - 'CG':-2.17} - total = 0 - i = 0 - j = 1 - while i < len(seq)-1: - dint = seq[i]+seq[j] - total += dic1[dint] - i += 1 - j += 1 - return total |
b |
diff -r 9a357864ec5c -r 9ed8de889d97 auxiliar.pyc |
b |
Binary file auxiliar.pyc has changed |
b |
diff -r 9a357864ec5c -r 9ed8de889d97 phage_promoters.py --- a/phage_promoters.py Mon Jul 09 09:30:40 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,533 +0,0 @@\n-# -*- coding: utf-8 -*-\r\n-"""\r\n-Created on Mon Jun 11 21:08:47 2018\r\n-\r\n-@author: Marta\r\n-"""\r\n-\r\n-from Bio import SeqIO\r\n-import numpy as np\r\n-import pandas as pd\r\n-from Bio.Seq import Seq\r\n-from Bio.SeqRecord import SeqRecord\r\n-from Bio.Alphabet import IUPAC\r\n-from auxiliar import get_max_pssm, get_scores, free_energy,freq_base\r\n-\r\n-def get_testseqs65(form,fic,both=False):\r\n- ALL = []\r\n- indexes = []\r\n- a = 0\r\n- rec = SeqIO.read(fic, form)\r\n- genome = rec.seq\r\n- i = 0\r\n- j = 65\r\n- while j < len(genome):\r\n- s = genome[i:j]\r\n- ALL.append([1,i,j,s])\r\n- i += 20\r\n- j += 20\r\n- a += 1\r\n- indexes.append(rec.name+":"+str(a))\r\n- if both:\r\n- i = 0\r\n- j = 65\r\n- while j < len(genome):\r\n- s = genome[i:j].reverse_complement()\r\n- ALL.append([-1,i,j,s])\r\n- i += 20\r\n- j += 20\r\n- a += 1\r\n- indexes.append(rec.name+":"+str(a))\r\n- df = pd.DataFrame(ALL, index=indexes, columns=[\'strand\',\'iniprom\',\'endprom\',\'seq\'])\r\n- return df\r\n-\r\n-\r\n-def get_dftest(loc, test):\r\n- scores = []\r\n- posis = []\r\n- sizes = []\r\n- dic = {}\r\n- for ind,row in test.iterrows():\r\n- _,window = ind.split(\':\')\r\n- strand = row[\'strand\']\r\n- ini = row[\'iniprom\']\r\n- end = row[\'endprom\']\r\n- seq = row[\'seq\']\r\n- pos = [ini,end,strand]\r\n- dic[window] = pos\r\n- s = seq\r\n- score10_6,pos10_6 = get_scores(os.path.join(loc,\'pssm10_6.txt\'), s)\r\n- maxi10_6 = get_max_pssm(os.path.join(loc,\'pssm10_6.txt\'))\r\n- score10_8,pos10_8 = get_scores(os.path.join(loc,\'pssm10_8.txt\'), s)\r\n- maxi10_8 = get_max_pssm(os.path.join(loc,\'pssm10_8.txt\'))\r\n- scores23,pos23 = get_scores(os.path.join(loc,\'pssm_23.txt\'), s)\r\n- maxi23 = get_max_pssm(os.path.join(loc,\'pssm_23.txt\'))\r\n- scores21,pos21 = get_scores(os.path.join(loc,\'pssm_21.txt\'), s)\r\n- maxi21 = get_max_pssm(os.path.join(loc,\'pssm_21.txt\'))\r\n- scores27,pos27 = get_scores(os.path.join(loc,\'pssm_27.txt\'), s)\r\n- maxi27 = get_max_pssm(os.path.join(loc,\'pssm_27.txt\'))\r\n- scores32,pos32 = get_scores(os.path.join(loc,\'pssm_32.txt\'), s)\r\n- maxi32 = get_max_pssm(os.path.join(loc,\'pssm_32.txt\'))\r\n- score23 = max(scores23)\r\n- score21 = max(scores21)\r\n- score27 = max(scores27)\r\n- score32 = max(scores32)\r\n- maxiphage = max(score23,score21,score27,score32)\r\n- if maxiphage == score23: phage_max = score23*maxi23\r\n- elif maxiphage == score21: phage_max = score21*maxi21\r\n- elif maxiphage == score27: phage_max = score27*maxi27\r\n- elif maxiphage == score32: phage_max = score32*maxi32\r\n- score35_6,pos35_6 = get_scores(os.path.join(loc,\'pssm35_6.txt\'), s)\r\n- maxi35_6 = get_max_pssm(os.path.join(loc,\'pssm35_6.txt\'))\r\n- score35_9,pos35_9 = get_scores(os.path.join(loc,\'pssm35_9.txt\'), s)\r\n- maxi35_9 = get_max_pssm(os.path.join(loc,\'pssm35_9.txt\'))\r\n- score35_t4,pos35_t4 = get_scores(os.path.join(loc,\'pssm35_t4.txt\'), s)\r\n- maxi35_t4 = get_max_pssm(os.path.join(loc,\'pssm35_t4.txt\'))\r\n- score35_cbb,pos35_cbb = get_scores(os.path.join(loc,\'pssm35_cbb.txt\'), s)\r\n- maxi35_cbb = get_max_pssm(os.path.join(loc,\'pssm35_cbb.txt\'))\r\n- score35_lb,pos35_lb = get_scores(os.path.join(loc,\'pssm35_lb.txt\'),s)\r\n- maxi35_lb = get_max_pssm(os.path.join(loc,\'pssm35_lb.txt\'))\r\n- score35_mu, pos35_mu = get_scores(os.path.join(loc,\'pssm35_mu.txt\'),s)\r\n- maxi35_mu = get_max_pssm(os.path.join(loc,\'pssm35_mu.txt\'))\r\n- \r\n- dists6 = []\r\n- score6 = []\r\n- for p in pos10_6:\r\n- for a in range(14,22):\r\n- d = p-a-6\r\n- if d >= 0: \r\n- s10 = score10_6[p]\r\n- s35_6 = score35_6[d]\r\n- score6.append([s35_6,s10])\r\n- '..b' inij = int(groups[j][11:].split(\'..\')[0])\r\n- else:\r\n- inij = int(groups[j][1:].split(\'..\')[0])\r\n- if inij < inii:\r\n- temp = groups[i]\r\n- groups[i] = groups[j]\r\n- groups[j] = temp\r\n- new_inds = []\r\n- for g in groups:\r\n- inds = new_df.groups[g]\r\n- if len(inds) == 1: new_inds.append(inds[0])\r\n- else:\r\n- #maxi = max(g[\'scores\'])\r\n- maxi = max(new_df.get_group(g)[\'scores\'])\r\n- i = new_df.groups[g][new_df.get_group(g)[\'scores\']==maxi][0]\r\n- new_inds.append(i)\r\n- output = test.loc[new_inds,:]\r\n- #output.to_excel(\'output.xlsx\', header=True, index=True)\r\n- output.to_html(\'output.html\',index=False)\r\n- recs = []\r\n- for ind,row in output.iterrows():\r\n- s = Seq(row[\'promoter_seq\'])\r\n- posis = row[\'positions\']\r\n- typ = row[\'promoter_type\']\r\n- score = row[\'scores\']\r\n- sq = SeqRecord(seq=s, id=ind, description=typ+\' \'+posis+\' score=\'+str(score))\r\n- recs.append(sq)\r\n- SeqIO.write(recs, \'output.fasta\',\'fasta\')\r\n- \r\n- \r\n-def get_predictions(scaler_file,model_file,test,df_testinfo,threshold):\r\n- from sklearn.externals import joblib\r\n- scaler = joblib.load(scaler_file)\r\n- model = joblib.load(model_file)\r\n- feat_scaled = pd.DataFrame(scaler.transform(test.iloc[:,:7]),index =test.index, columns=test.columns[:7])\r\n- TEST_scaled = pd.concat([feat_scaled,test.iloc[:,7:]],axis=1)\r\n- #pred = model.predict(TEST_scaled)\r\n- scores = model.predict_proba(TEST_scaled)\r\n- pos_scores = np.empty((TEST_scaled.shape[0],0), float)\r\n- for x in scores: pos_scores = np.append(pos_scores,x[1])\r\n- try: positive_indexes = np.nonzero(pos_scores>=float(threshold))[0] #escolher os positivos, podia ser escolher com score > x\r\n- except ValueError: return \'The threshold value is not a float\'\r\n- else:\r\n- if len(positive_indexes) == 0: return None\r\n- else:\r\n- positive_windows = TEST_scaled.index[positive_indexes]\r\n- INFO = df_testinfo.loc[positive_windows,[\'positions\',\'promoter_seq\']]\r\n- promoter_type = []\r\n- for x in df_testinfo.loc[positive_windows,\'host\'].tolist():\r\n- if x == 0: promoter_type.append(\'phage\')\r\n- else: promoter_type.append(\'host\')\r\n- INFO[\'promoter_type\'] = promoter_type\r\n- INFO[\'scores\'] = np.around(pos_scores[positive_indexes],decimals=3)\r\n- INFO.index = positive_windows\r\n- return INFO\r\n-\r\n-if __name__== "__main__":\r\n- import sys\r\n- import os\r\n- __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))\r\n- scaler_file = os.path.join(__location__, \'scaler_2400.sav\')\r\n- model_file = os.path.join(__location__, \'model_SVM_2400.sav\')\r\n- \r\n- gen_format = sys.argv[1]\r\n- genome_file = sys.argv[2]\r\n- both = sys.argv[3]\r\n- threshold = sys.argv[4]\r\n- family = sys.argv[5]\r\n- host = sys.argv[6]\r\n- phage_type = sys.argv[7]\r\n- \'\'\'\r\n- \r\n- gen_format = \'gb\'\r\n- genome_file = \'test-data/NC_015264.gb\'\r\n- genbank_fasta = \'genbank\'\r\n- both = False\r\n- threshold = \'0.50\'\r\n- family = \'Podoviridae\'\r\n- host = \'Pseudomonas\'\r\n- phage_type = \'virulent\'\r\n- \'\'\'\r\n- test_windows = get_testseqs65(gen_format, genome_file,both)\r\n- try: score_test,dic_window = get_dftest(__location__,test_windows)\r\n- except IndexError: print(\'Error. Input sequence can only have A,C,G or T\')\r\n- else:\r\n- df_test,df_testinfo = create_dftest(score_test,dic_window,family,host,phage_type)\r\n- preds = get_predictions(scaler_file, model_file, df_test,df_testinfo,threshold)\r\n- if preds is None: print(\'There is no sequence with a score value higher or equal to the threshold \'+str(threshold))\r\n- elif type(preds) == str: print(preds)\r\n- else: output = get_finaldf(preds)\r\n- \r\n' |
b |
diff -r 9a357864ec5c -r 9ed8de889d97 phage_promoters.xml --- a/phage_promoters.xml Mon Jul 09 09:30:40 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,104 +0,0 @@ -<tool id="get_proms" name="PhagePromoters" version="0.1.0"> - <description> -Get promoters of phage genomes - </description> - <requirements> - <requirement type="package">biopython</requirement> - <requirement type="package">scikit-learn</requirement> - <requirement type="package">numpy</requirement> - <requirement type="package">pandas</requirement> - </requirements> - <command detect_errors="exit_code" interpreter="python3"><![CDATA[ - phage_promoters.py "$input_type.genome_format" "$genome" "$both" "$threshold" "$family" "$bacteria" "$lifecycle" - ]]> - </command> - <inputs> - <conditional name="input_type"> - <param type="select" name="genome_format" label='file format'> - <option value="genbank" selected="yes">genbank</option> - <option value="fasta">fasta</option> - </param> - <when value="genbank"> - <param type="data" name="genome" format="genbank" label='genome'/> - </when> - <when value="fasta"> - <param type="data" name="genome" format="fasta" label='genome'/> - </when> - </conditional> - <param type="boolean" name="both" label='Search both strands' checked="false" truevalue="-both" falsevalue="" /> - <param name="threshold" type="float" value="0.50" label="Threshold" help="Probabilty of being a promoter (float between 0 and 1)" /> - <param type="select" name="family" label='Phage family'> - <option value="Podoviridae" selected="yes">Podoviridae</option> - <option value="Siphoviridae">Siphoviridae</option> - <option value="Myoviridae">Myoviridae</option> - </param> - <param type="select" name="bacteria" label='Host bacteria Genus'> - <option value="Escherichia coli" selected="yes">Escherichia coli</option> - <option value="Salmonella">Salmonella</option> - <option value="Pseudomonas">Pseudomonas</option> - <option value="Yersinia">Yersinia</option> - <option value="Morganella">Morganella</option> - <option value="Cronobacter">Cronobacter</option> - <option value="Staphylococcus">Staphylococcus</option> - <option value="Streptococcus">Streptococcus</option> - <option value="Lactococcus">Lactococcus</option> - <option value="Streptomyces">Streptomyces</option> - <option value="Klebsiella">Klebsiella</option> - <option value="Bacillus">Bacillus</option> - <option value="Pectobacterium">Pectobacterium</option> - <option value="other">other</option> - </param> - <param type="select" name="lifecycle" label='Phage type'> - <option value="virulent" selected="yes">virulent</option> - <option value="temperate">temperate</option> - </param> - </inputs> - <outputs> - <data name="output1" format="html" from_work_dir="output.html" /> - <data name="output2" format="fasta" from_work_dir="output.fasta" /> - </outputs> - <tests> - <test> - <param name="genome_format" value="genbank"/> - <param name="genome" value="NC_015264.gb"/> - <param name="both" value="False"/> - <param name="threshold" value="0.50"/> - <param name="family" value="Podoviridae"/> - <param name="bacteria" value="Pseudomonas"/> - <param name="lifecycle" value="virulent"/> - <output name="output1" file="output.html"/> - <output name="output2" file="output.fasta"/> - </test> - </tests> - <help><![CDATA[ - -=============== -PhagePromoters -=============== - -Get promoters of phage genomes - -PhagePromoters is a python script that predicts promoter sequences in phage genomes, using a machine learning SVM model. This model was built from a train dataset with 25 features and 3200 examples (800 positives and 2400 negatives), each representing a 65 bp sequence of a phage genome. The positive cases represent the phage sequences that are already identified as promoters. - -**Inputs:** - -* genome format: fasta vs genbank; -* genome file: acepts both genbank and fasta formats; -* both strands (yes or no): allows the search in both DNA strands; -* threshold: represents the probability of the test sequence be a promoter (float between 0 and 1)" -* family: The family of the testing phage - Podoviridae, Siphoviridae or Myoviridae; -* Bacteria: The host of the phage. The train dataset include the following hosts: Bacillus, EColi, Salmonella, Pseudomonas, Yersinia, Klebsiella, Pectobacterium, Morganella, Cronobacter, Staphylococcus, Streptococcus, Streptomyces, Lactococcus. If the testing phage has a different host, select the option 'other', and it is recommended the use of a higher threshold value for more accurate results. -* phage type: The type of the phage, according to its lifecycle: virulent or temperate; - -**Outputs:** -This tool outputs two files: a FASTA file and a table in HTML, with the locations, sequence, score and type (recognized by host or phage RNAP) of the predicted promoters. - -**Requirements:** - -* Biopython -* Sklearn -* Numpy -* Pandas - - ]]></help> -</tool> |
b |
diff -r 9a357864ec5c -r 9ed8de889d97 scaler_2400.sav |
b |
Binary file scaler_2400.sav has changed |