Next changeset 1:f35b2f3ca139 (2013-10-11) |
Commit message:
Uploaded v0.0.4, first public release. |
added:
test-data/four_human_proteins.fasta test-data/four_human_proteins.predictnls.tabular tools/protein_analysis/My_NLS_list tools/protein_analysis/predictnls.py tools/protein_analysis/predictnls.txt tools/protein_analysis/predictnls.xml |
b |
diff -r 000000000000 -r 6e26c5a48e9a test-data/four_human_proteins.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.fasta Wed Feb 20 11:39:06 2013 -0500 |
b |
@@ -0,0 +1,61 @@ +>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 +MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF +SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK +REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER +VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK +CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD +CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF +HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL +>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2 +MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG +GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS +DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD +LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG +KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP +DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT +IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE +ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ +QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY +QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV +ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD +KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD +QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE +NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA +QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK +APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD +EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR +HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS +WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ +SQQSQPVELDPFGAAPFPSKQ +>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4 +MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL 
+QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL +VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE +ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL +GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG +CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC +TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL +EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE +RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ +NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS +DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE +RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL +KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF +PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV +SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV +SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG +PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR +EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG +FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA +AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV +RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN +CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME +FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN +PS +>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 +MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY +VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG +GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP +EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES +ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI +YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA |
b |
diff -r 000000000000 -r 6e26c5a48e9a test-data/four_human_proteins.predictnls.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.predictnls.tabular Wed Feb 20 11:39:06 2013 -0500 |
[ |
@@ -0,0 +1,2 @@ +#ID NLS start NLS seq NLS pattern Type ProtCount %NucProt ProtList ProtLoci +sp|P06213|INSR_HUMAN 758 SRKRRS [STQM]RKRR[STQM] Potential 1 100 mklp_human nuc |
b |
diff -r 000000000000 -r 6e26c5a48e9a tools/protein_analysis/My_NLS_list --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/protein_analysis/My_NLS_list Wed Feb 20 11:39:06 2013 -0500 |
[ |
b'@@ -0,0 +1,310 @@\n+RRMKWKK\tExperimental\t77\t100\t0\thxa5_ambme,scr_apime,hx5l_brare,hxb4_brare,hxb5_brare,hxb6_brare,hxc5_brare,hxc6_brare,hxd4_brare,hxa4_chick,hxa7_cotja,hxb4_chick,hxb5_chick,hxb6_chick,hxd4_chick,hxd8_chick,hmdf_drome,scr_drome,hxb4_fugru,hxa4_human,hxa5_human,hxa6_human,hxa7_human,hxb4_human,hxb5_human,hxb6_human,hxb7_human,hxb8_human,hxc4_human,hxc5_human,hxc6_human,hxc8_human,hxd4_human,hxd8_human,ipf1_human,hxa4_mouse,hxa5_mouse,hxa6_mouse,hxa7_mouse,hxb4_mouse,hxb5_mouse,hxb6_mouse,hxb7_mouse,hxb8_mouse,hxc4_mouse,hxc5_mouse,hxc6_mouse,hxc8_mouse,hxd4_mouse,hxd8_mouse,ipf1_mesau,ipf1_mouse,hxc5_notvi,hxc6_notvi,hxb8_pig,hxa4_rat,hxa5_rat,hxa7_rat,hxb7_rat,hxb8_rat,hxc4_rat,hxc8_rat,ipf1_rat,hxa4_sheep,hxa5_salsa,hxa5_sheep,hxa7_sheep,hxc6_sheep,hb7a_xenla,hb7b_xenla,hm8_xenla,hxa7_xenla,hxb4_xenla,hxb5_xenla,hxb6_xenla,hxc5_xenla,hxc6_xenla\tnuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc\n+RVHPYQR\tExperimental\t0\t0\t0\n+KRPACTLKPECVQQLLVCSQEAKK\tExperimental\t0\t0\t0\n+PKKKRKV\tExperimental\t3\t100\t0\ttala_povba,tala_povbk,tala_sv40\tnuc,nuc,nuc\n+GKKRSKA\tExperimental\t2\t100\t0\tppol_drome,h2b1_yeast\tnuc,nuc\n+KAKRQR\tExperimental\t3\t66.6666666666667\t33.3333333333333\trel_avire,rel_chick,rel_melga\tcyt,nuc,nuc\n+RGRRRRQR\tExperimental\t0\t0\t0\n+RKRRR\tExperimental\t20\t85\t15\tht31_arath,mb11_copci,sdc3_caeel,chd3_human,sn22_human,ve2_hpv04,ve2_hpv07,ve2_hpv40,atf3_mouse,rms5_neucr,h2b_patgr,rpb1_plafd,fre6_rat,spm1_rat,leu3_salty,prt2_scyca,tat_sivmk,tat_sivml,leu3_theaq,yox1_yeast\tnuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,mit,nuc,nuc,nuc,nuc,cyt,nuc,nuc,nuc,cyt,nuc\n+PPVKRERTS\tExperimental\t0\t0\t0\n+PYLNKRKGKP\tExperimental\t0\t0\t0\n+CYGSKNTGAKKRKIDDA\tExperimental\t0\t0\
t0\n+KKKKRKREK\tExperimental\t1\t100\t0\tlef1_mouse\tnuc\n+KKKRRSREK\tExperimental\t2\t100\t0\ttcf1_human,tcf1_mouse\tnuc,nuc\n+KRx{7,9}PQPKKKP\tExperimental\t6\t100\t0\tp53_bovin,p53_cerae,p53_human,p53_macfa,p53_macmu,p53_spebe\tnuc,nuc,nuc,nuc,nuc,nuc\n+KVTKRKHDNEGSGSKRPK\tExperimental\t1\t100\t0\tku70_human\tnuc\n+RLKKLKCSKx{19}KTKR\tExperimental\t1\t100\t0\tgal4_yeast\tnuc\n+RRERx{4}RPRKIPR\tExperimental\t0\t0\t0\n+KKKKKEEEGEGKKK\tExperimental\t0\t0\t0\n+PRPRKIPR\tExperimental\t0\t0\t0\n+PPRIYPQLPSAPT\tExperimental\t0\t0\t0\n+KDCVINKHHRNRCQYCRLQR\tExperimental\t0\t0\t0\n+KRx{9}KTKK\tExperimental\t0\t0\t0\n+APKRKSGVSKC\tExperimental\t0\t0\t0\n+RKKRRQRRR\tExperimental\t17\t100\t0\ttat_hv112,tat_hv1a2,tat_hv1b1,tat_hv1b5,tat_hv1br,tat_hv1c4,tat_hv1h2,tat_hv1jr,tat_hv1ma,tat_hv1mn,tat_hv1oy,tat_hv1pv,tat_hv1s1,tat_hv1sc,tat_hv1y2,tat_hv1z2,tat_hv1z6\tnuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc\n+RQARRNRRRRWR\tExperimental\t26\t100\t0\trev_hv112,rev_hv1a2,rev_hv1b1,rev_hv1b8,rev_hv1bn,rev_hv1br,rev_hv1c4,rev_hv1el,rev_hv1h2,rev_hv1j3,rev_hv1jr,rev_hv1lw,rev_hv1ma,rev_hv1mn,rev_hv1nd,rev_hv1oy,rev_hv1pv,rev_hv1rh,rev_hv1s1,rev_hv1s3,rev_hv1sc,rev_hv1w2,rev_hv1y2,rev_hv1z2,rev_hv1z6,rev_hv1z8\tnuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc\n+MPKTRRRPRRSQRKRPPT\tExperimental\t0\t0\t0\n+KRPMNAFIVWSRDQRRK\tExperimental\t4\t100\t0\tsry_calja,sry_gorgo,sry_human,sry_pig\tnuc,nuc,nuc,nuc\n+RPRRK\tExperimental\t16\t100\t0\tsox2_chick,sox3_chick,sry_calja,sry_caphi,sry_gorgo,sox2_human,sox3_human,sry_horse,sry_human,sox2_mouse,sox3_mouse,sx18_mouse,sry_pig,sox2_sheep,sry_sheep,sox3_xenla\tnuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc\n+KRPMNAFMVWAQAARRK\tExperimental\t1\t100\t0\tsx21_mouse\tnuc\n+PRRRK\tExperimental\t1\t100\t0\tsx21_mouse\tnuc\n+[KAR]TPIQKHWRPTVLTEGPPVKIRIETGEWE[KA]\tExperimental\t0\t0\t0\n+PPRKKRTVV\tExperimental\t0\t0\t0\n+YKRPCKRSFIRFI\tExperimental\t0\t0\t0\
n+LKDVRKRKLGPGH\tExperimental\t0\t0\t0\n+KRPRP\tExperimental\t4\t100\t0\tebn2_ebv,tala_povm3,tala_povma,tala_povmc\tnuc,nuc,nuc,nuc\n+RRSMKRK\tExperimental\t7\t100\t0\tvdr_bovin,vdr_chick,vdr_cotja,vdr_human,vdr_mous'..b's2_caeel,dbp_human,tef_human,dbp_mouse,dbp_rat,tef_rat\tnuc,nuc,nuc,nuc,nuc,nuc\n+K[RK]{2,4}[ST]H\tPotential\t29\t96.55\t3.44\tmyc_brare,myc1_cypca,myc2_cypca,myc_calja,myc_canfa,myc_carau,myc_chick,myc_felca,clk1_human,myc_human,myc_hylla,zep1_human,clk1_mouse,mcm3_mouse,myc_marmo,myc_mouse,zep1_mouse,myc_oncmy,p53_oryla,myc_pantr,myc_pig,myc_rat,myc_sheep,sye_theth,myc1_xenla,myc2_xenla,esp1_yeast,rpc2_yeast,snf6_yeast\tnuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,cyt,nuc,nuc,nuc,nuc,nuc\n+RH[RK]Hx{2,4}[RK]{2,4}[PL]R\tPotential\t3\t100\t0\thap2_klula,php2_schpo,hap2_yeast\tnuc,nuc,nuc\n+R[RK]{2,4}x{15,19}[RK]{2,4}[QLM]K\tPotential\t7\t100\t0\tandr_human,hxcc_human,andr_mouse,dbx_mouse,andr_rabit,andr_rat,rme1_yeast\tnuc,nuc,nuc,nuc,nuc,nuc,nuc\n+K{3,4}R{2,3}\tPotential\t15\t100\t0\th2b_chith,t2d1_drome,atrx_human,nnp1_human,sn22_human,ssrp_human,t2d1_human,tcf1_human,zfy_human,atrx_mouse,ssrp_mouse,tcf1_mouse,h2b_pladu,ssrp_rat,h2b_ureca\tnuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc\n+R{2,3}xK{2,3}R[ST]\tPotential\t12\t100\t0\toct1_chick,pdm1_drome,pdm2_drome,pdm2_drovi,oct1_human,oct2_human,oct1_mouse,oct2_mouse,oct2_pig,dpol_rcmvm,oct1_rat,oct1_xenla\tnuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc\tDNA_BIND\t11\t 91.66\t11\t100\thbox: 
100\t\n+RKR{3,5}[ST]\tPotential\t4\t100\t0\tmb11_copci,tat_sivmk,tat_sivml,yox1_yeast\tnuc,nuc,nuc,nuc\n+Q[RK][HRK][RK]xRR\tPotential\t17\t100\t0\tprtb_acigu,tra1_caebr,sus_drome,atrx_human,fre4_human,hsp2_hylla,atrx_mouse,hsp1_notty,hsp1_phaci,hsp1_plams,hsp1_sagim,rev_siva1,rev_sivag,rev_sivat,rev_sivs4,rev_sivsp,mcm2_yeast\tnuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc\n+RRR{3,5}T\tPotential\t5\t100\t0\tprt1_bufja,prt2_bufja,hsp1_plain,prt_perfv,hsp1_sagim\tnuc,nuc,nuc,nuc,nuc\n+D[KR]x{0,1}[QL][RK]{2,3}R\tPotential\t9\t100\t0\tqin_avis3,ftf1_drome,mam_drome,ceba_human,cg2f_human,ceba_mouse,rag1_mouse,ceba_rat,rpc8_yeast\tnuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc\n+Px[PQLVMN][KR]{2,3}xKQ\tPotential\t7\t100\t0\tmyb_bovin,myb_chick,cbf_human,myb_human,myb_mouse,h1c1_xenla,myb_xenla\tnuc,nuc,nuc,nuc,nuc,nuc,nuc\n+PKKKxRK\tPotential\t7\t100\t0\tdnb2_ade04,dnb2_ade07,dnb2_ade40,dnb2_ade41,baso_human,nil2_human,h1l_myttr\tnuc,nuc,nuc,nuc,nuc,nuc,nuc\n+KRx{10}KKKL\tExperimental\t1\t100\t0\tifi3_mouse\tnuc\n+KRQRx{20}KKSKK\tExperimental\t1\t100\t0\tcenf_human\tnuc\n+RRRx{11}KRRK\tExperimental\t1\t100\t0\tcb80_human\tnuc\n+RKRIREDRKATTAQKVQQMKQRLNENERKRKR\tExperimental\t1\t0\t100\tptn2_human\tcyt\n+KRKRRP\tExperimental\t0\t0\t0\n+PKKNRLRRP\tExperimental\t0\t0\t0\n+QRKRQK\tExperimental\t0\t0\t0\n+HRIEEKRKRTYETFKSI\tExperimental\t0\t0\t0\n+KKKYKLK\tExperimental\t0\t0\t0\n+KSKKKAQ\tExperimental\t0\t0\t0\n+LKRPRSPSS\tExperimental\t0\t0\t0\n+KRKx{22}KELQKQITK\tExperimental\t0\t0\t0\n+GKKKYKLKH\tExperimental\t0\t0\t0\n+KKKYKLK\tExperimental\t0\t0\t0\n+KSKKKAQ\tExperimental\t0\t0\t0\n+KKKRERLD\tExperimental\t0\t0\t0\n+RKKRKx{9}KAKKSK\tExperimental\t0\t0\t0\n+RRPSx{22}RRKRQ\tExperimental\t0\t0\t0\n+HKKKKIRTSPTFTTPKTLRLRRQPKYPRKSAPRRNKLDHY\tExperimental\t0\t0\t0\n+YLTQETNKVETYKEQPLKTPGKKKKGKP\tExperimental\t0\t0\t0\n+NQSSNFGPMKGGNFGGRSSGPYGGGGQYFAKPRNQGGY\tExperimental\t0\t0\t0\n+MAPSAKATAAKKAVVKGTNGKKALKVRTSATFRLPKTLKLAR\tExperimental\t0\t0\t0\n+SANKVTKNKSNSSPYLNKR
GKPGPDS\tExperimental\t0\t0\t0\n+[KR]XXKNKX{6,8}K[KR]\tPotential\t12\t100\t0\tdet1_arath,bbf2_drome,cbp_human,hmgc_human,p300_human,cbp_mouse,h1l_myttr,hmgc_mouse,tf3a_ranpi,nucl_xenla,pho4_yeast,pr43_yeast\tnuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc\n+KKx{15}KKRK\tExperimental\t1\t100\t0\tapn1_yeast\tnuc\n+RKRKK\tExperimental\t36\t100\t0\tbr11_brare,pou1_brare,sgf3_bommo,zp12_brare,zp23_brare,zp47_brare,zp50_brare,cf1a_drome,sus_drome,trx_drome,brn1_human,brn4_human,chd3_human,if16_human,oc3n_human,oct6_human,t2d1_human,brn1_mouse,brn4_mouse,elf1_mouse,oc11_mouse,oc3n_mouse,oct6_mouse,oc3n_rat,oct6_rat,sk1a_rat,sk1i_rat,dpoa_schpo,hm16_xenla,hm19_xenla,hm20_xenla,po3a_xenla,po3b_xenla,pou1_xenla,pou2_xenla,sko1_yeast\tnuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc,nuc\n\\ No newline at end of file\n' |
b |
diff -r 000000000000 -r 6e26c5a48e9a tools/protein_analysis/predictnls.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/protein_analysis/predictnls.py Wed Feb 20 11:39:06 2013 -0500 |
[ |
#!/usr/bin/env python

#Copyright 2011-2013 by Peter Cock, James Hutton Institute (formerly SCRI), UK
#
#Licenced under the GPL (GNU General Public Licence) version 3.
#
#Based on Perl script predictNLS v1.3, copyright 2001-2005 and the later
#versions up to predictnls v1.0.20 (copyright 2012), by Rajesh Nair
#(nair@rostlab.org) and Burkhard Rost (rost@rostlab.org), Rost Lab,
#Columbia University http://rostlab.org/

"""Batch mode predictNLS, for finding nuclear localization signals

This is a Python script re-implementing the predictNLS method, originally
written in Perl, described here:

Murat Cokol, Rajesh Nair, and Burkhard Rost.
Finding nuclear localization signals.
EMBO reports 1(5), 411-415, 2000

http://dx.doi.org/10.1093/embo-reports/kvd092

The original Perl script was designed to work on a single sequence at a time,
but offers quite detailed output, including HTML (webpage).

This Python version is designed to work on a single FASTA file containing
multiple sequences, and produces a single tabular output file, with one line
per NLS found (i.e. zero or more rows per query sequence).

It takes either two or three command line arguments:

predictNLS_batch input_file output_file [nls_motif_file]

The input file should be protein sequences in FASTA format, the output file
is tab separated plain text, and the NLS motif file defaults to using the
plain text My_NLS_list file located next to the script file, or in a data
subdirectory.

For convenience if using this outside Galaxy, the input filename can be '-'
to mean stdin, and likewise the output filename can be '-' to mean stdout
(in which case the progress messages are written to stderr instead, so that
they do not end up inside the table).

Tested with the My_NLS_list file included with predictnls-1.0.7.tar.gz to
predictnls-1.0.20.tar.gz inclusive (the list was extended in v1.0.7 in
August 2010, see the change log included in those tar-balls).

The Rost Lab provide source code tar balls for predictNLS on the FTP site
ftp://rostlab.org/predictnls/ but for Debian or RedHat based Linux they
recommend their package repository instead,
https://rostlab.org/owiki/index.php/Packages
"""

import os
import sys
import re

#Header line written at the top of the tabular output (nine columns).
OUTPUT_HEADER = "#ID\tNLS start\tNLS seq\tNLS pattern\tType\tProtCount\t%NucProt\tProtList\tProtLoci\n"


def stop_err(msg, return_code=1):
    """Write msg (newline terminated) to stderr, then exit with return_code."""
    sys.stderr.write(msg.rstrip() + "\n")
    sys.exit(return_code)


def default_motif_file(script_path):
    """Return the default My_NLS_list path for the script at script_path.

    Looks in a "data" subdirectory next to the script first, then falls back
    to the script's own directory. The fallback path is returned even if it
    does not exist; the caller reports the missing file.
    """
    #Use os.path.realpath(...) to handle being called via a symlink
    script_dir = os.path.dirname(os.path.realpath(script_path))
    #Try under subdirectory data:
    candidate = os.path.join(script_dir, "data", "My_NLS_list")
    if os.path.isfile(candidate):
        return candidate
    #Try in same directory as this script:
    return os.path.join(script_dir, "My_NLS_list")


def parse_args(argv):
    """Return (fasta_filename, tabular_filename, re_filename) from argv.

    argv follows the sys.argv convention (argv[0] is the script path) and
    must hold two or three further arguments; otherwise the script exits
    via stop_err.
    """
    if len(argv) == 4:
        fasta_filename, tabular_filename, re_filename = argv[1:]
    elif len(argv) == 3:
        fasta_filename, tabular_filename = argv[1:]
        re_filename = default_motif_file(argv[0])
    else:
        stop_err("Expect 2 or 3 arguments: input FASTA file, output tabular file, and NLS motif file")
    return fasta_filename, tabular_filename, re_filename


def load_re(filename):
    """Parse the 5+ column tabular NLS motif file.

    Yields one tuple per non-blank line:
    (compiled regex, evidence, protein count as int, percent nuclear as
    string, comma separated protein list, comma separated localization list).

    Exits via stop_err if a line has fewer than five columns, if the motif
    is not a valid regular expression, if the protein count is not an
    integer, or if the protein/localization lists disagree with the count.
    """
    #Python 2 needs "U" for universal newlines; on Python 3 that is the
    #default for text mode and the "U" flag is rejected by recent versions.
    if sys.version_info[0] < 3:
        mode = "rU"
    else:
        mode = "r"
    with open(filename, mode) as handle:
        for line in handle:
            line = line.rstrip("\n")
            if not line:
                continue
            parts = line.split("\t")
            if len(parts) < 5:
                stop_err("Expected at least 5 columns in line: %s" % line)
            #Column five (percentage of non-nuclear proteins) is not used.
            regex, evidence, p_count, percent_nuc = parts[0:4]
            try:
                regex = re.compile(regex)
                p_count = int(p_count)
            except (ValueError, re.error):
                #re.compile signals a bad pattern with re.error, which is
                #not a ValueError, so both must be caught here.
                stop_err("Bad data in line: %s" % line)
            if 6 <= len(parts):
                proteins = parts[5]
                if p_count != len(proteins.split(",")):
                    stop_err("Protein list does not match count in line: %s" % line)
            else:
                proteins = ""
                if p_count != 0:
                    stop_err("Missing protein list in line: %s" % line)
            if 7 <= len(parts):
                domains = parts[6]
                if p_count != len(domains.split(",")):
                    stop_err("Localization list does not match count in line: %s" % line)
            else:
                domains = ""
                if p_count != 0:
                    stop_err("Missing localization list in line: %s" % line)
            #There can be further columns (DNA binding?), but we don't use them.
            yield regex, evidence, p_count, percent_nuc, proteins, domains


def fasta_iterator(filename):
    """Simple FASTA parser yielding tuples of (name, upper case sequence).

    The name is the first word of the header line. All white space inside
    the sequence is removed. A filename of "-" means read from stdin.

    Raises ValueError for sequence text before the first header line, or
    for a header line with no identifier.
    """
    if filename == "-":
        handle = sys.stdin
    else:
        handle = open(filename)
    name = ""
    chunks = []
    try:
        for line in handle:
            if line.startswith(">"):
                if name:
                    yield name, "".join(chunks)
                #Take the first word only as the name:
                words = line[1:].split(None, 1)
                if not words:
                    raise ValueError("Bad FASTA line %r" % line)
                name = words[0]
                chunks = []
            elif name:
                #Drop any internal white space as well as the line ending
                chunks.append("".join(line.upper().split()))
            elif not line.strip():
                #Ignore blank lines before first record
                pass
            else:
                raise ValueError("Bad FASTA line %r" % line)
    finally:
        if filename != "-":
            handle.close()
    if name:
        yield name, "".join(chunks)
    #Simply falling off the end finishes the generator; an explicit
    #"raise StopIteration" here becomes a RuntimeError under PEP 479.


def main(argv=None):
    """Run predictNLS over a FASTA file and write the tabular results.

    argv defaults to sys.argv; see the module docstring for the arguments.
    """
    if argv is None:
        argv = sys.argv
    fasta_filename, tabular_filename, re_filename = parse_args(argv)

    #"-" means stdin, so there is no file on disk to check in that case.
    if fasta_filename != "-" and not os.path.isfile(fasta_filename):
        stop_err("Could not find FASTA input file: %s" % fasta_filename)

    if not os.path.isfile(re_filename):
        stop_err("Could not find NLS motif file: %s" % re_filename)

    to_stdout = (tabular_filename == "-")
    #Keep the progress messages out of the table when it goes to stdout.
    if to_stdout:
        log = sys.stderr
    else:
        log = sys.stdout

    motifs = list(load_re(re_filename))
    log.write("Looking for %i NLS motifs\n" % len(motifs))

    if to_stdout:
        out_handle = sys.stdout
    else:
        out_handle = open(tabular_filename, "w")
    count = 0
    nls = 0
    try:
        out_handle.write(OUTPUT_HEADER)
        for idn, seq in fasta_iterator(fasta_filename):
            for regex, evidence, p_count, percent_nuc_prot, proteins, domains in motifs:
                #Perl predictnls v1.0.17 (and older) take right most hit only, Bug #40
                #This has been fixed (v1.0.18 onwards, June 2011), so we return all the matches
                for match in regex.finditer(seq):
                    #Perl predictnls v1.0.17 (and older) return NLS start position with zero
                    #but changed to one based counting in v1.0.18 (June 2011) onwards, Bug #38
                    #We therefore also use one based counting, hence the start+1 here:
                    out_handle.write("%s\t%i\t%s\t%s\t%s\t%i\t%s\t%s\t%s\n" \
                                     % (idn, match.start()+1, match.group(),
                                        regex.pattern, evidence, p_count,
                                        percent_nuc_prot, proteins, domains))
                    nls += 1
            count += 1
    finally:
        if not to_stdout:
            out_handle.close()
    log.write("Found %i NLS motifs in %i sequences\n" % (nls, count))


if __name__ == "__main__":
    main()
b |
diff -r 000000000000 -r 6e26c5a48e9a tools/protein_analysis/predictnls.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/protein_analysis/predictnls.txt Wed Feb 20 11:39:06 2013 -0500 |
b |
@@ -0,0 +1,92 @@ +Python re-implementation of predictNLS with Galaxy wrapper +========================================================== + +This Galaxy tool is copyright 2011-2013 by Peter Cock, The James Hutton Institute +(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +See the licence text below. + +The tool consists of a Galaxy interface definition (predictnls.xml), and a Python +script (predictnls.py) which re-implements the command line tool predictNLS. This +should match the behaviour of predictNLS v1.0.20 (July 2011), the current latest +release from the Rost Lab, see http://rostlab.org and their paper: + +Murat Cokol, Rajesh Nair, and Burkhard Rost. +Finding nuclear localization signals. +EMBO reports 1(5), 411–415, 2000 +http://dx.doi.org/10.1093/embo-reports/kvd092 + + +Automatic Installation +====================== + +This Galaxy tool is self contained, and so should install automatically via the +Galaxy Tool Shed. See http://toolshed.g2.bx.psu.edu/view/peterjc/predictnls + + +Manual Installation +=================== + +There are just four files which should be moved under the Galaxy tools folder, +e.g. in a tools/protein_analysis folder: + +* predictnls.xml (the Galaxy tool definition) +* predictnls.py (the Python script) +* predictnls.txt (this README file) +* My_NLS_list (the default set of NLS motifs from the Rost Lab) + +You will also need to modify the tool_conf.xml file to tell Galaxy to offer the +tool. If you are using other protein analysis tools like TMHMM or SignalP, put +it next to them. Just add the line: + +<tool file="protein_analysis/predictnls.xml" /> + +If you want to run the unit tests, also add this to tool_conf.xml.sample, and +copy the test files under test-data, then run: + +./run_functional_tests.sh -id predictnls + +That's it. 
+ + +History +======= + +v0.0.4 - Initial public release + + +Developers +========== + +This script and related tools are being developed on the following hg branch: +http://bitbucket.org/peterjc/galaxy-central/src/tools + +For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use +the following command from the Galaxy root folder: + +$ tar -czf predictnls.tar.gz tools/protein_analysis/predictnls.xml tools/protein_analysis/predictnls.py tools/protein_analysis/predictnls.txt tools/protein_analysis/My_NLS_list test-data/four_human_proteins.fasta test-data/four_human_proteins.predictnls.tabular + +Check this worked: + +$ tar -tzf predictnls.tar.gz +tools/protein_analysis/predictnls.xml +tools/protein_analysis/predictnls.py +tools/protein_analysis/predictnls.txt +tools/protein_analysis/My_NLS_list +test-data/four_human_proteins.fasta +test-data/four_human_proteins.predictnls.tabular + + +Licence (GPL) +============= + +This tool is open source, licensed under the GNU GENERAL PUBLIC LICENSE +version 3 (GNU v3), see http://www.gnu.org/licenses/gpl.html + +The Python script is my reimplementation of the original Perl program from +the Rost Lab, which was released under the GPL v3. Therefore, as I consider +this to be a derivative work, this too is released under the GPL v3. + +Please note that the My_NLS_list should be an exact copy of the file of the +same name included with predictnls-1.0.7.tar.gz to predictnls-1.0.20.tar.gz +inclusive (the list was extended in v1.0.7 in August 2010, see the change log +included in those tar-balls), available from ftp://rostlab.org/predictnls/ |
b |
diff -r 000000000000 -r 6e26c5a48e9a tools/protein_analysis/predictnls.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/protein_analysis/predictnls.xml Wed Feb 20 11:39:06 2013 -0500 |
b |
@@ -0,0 +1,82 @@ +<tool id="predictnls" name="PredictNLS" version="0.0.4"> + <description>Find nuclear localization signals (NLSs) in protein sequences</description> + <command interpreter="python"> + predictnls.py $fasta_file $tabular_file + </command> + <inputs> + <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> + </inputs> + <outputs> + <data name="tabular_file" format="tabular" label="predictNLS results" /> + </outputs> + <tests> + <test> + <param name="fasta_file" value="four_human_proteins.fasta"/> + <output name="tabular_file" file="four_human_proteins.predictnls.tabular"/> + </test> + </tests> + <requirements> + <requirement type="binary">predictnls</requirement> + </requirements> + <help> + +**What it does** + +This calls a Python re-implementation of the PredictNLS tool for prediction of +nuclear localization signals (NLSs), which works by looking for matches to +a known set of patterns (described using regular expressions). + +The input is a FASTA file of protein sequences, and the output is tabular with +these columns (one row per NLS found, so zero or more rows per protein): + +====== ========================================================================== +Column Description +------ -------------------------------------------------------------------------- + 1 Sequence identifier + 2 Start of NLS (one-based) + 3 NLS sequence + 4 NLS pattern (regular expression) + 5 Type of evidence for this NLS (Experimental or Potential) + 6 Number of reference proteins with this NLS + 7 Percentage of reference proteins with this NLS which are nuclear localized + 8 Comma separated list of reference proteins + 9 Comma separated list of reference proteins' localizations +====== ========================================================================== + +If a sequence has no predicted NLS, then there is no line in the output file +for it. This is a simplification of the text rich output from the command line +tool, to give a tabular file suitable for use within Galaxy. 
+ +Information about potential DNA binding (shown in the original predictnls +tool) is not given. + +**Localizations** + +The following abbreviations are used (derived from SWISS-PROT): + +==== ======================= +Abbr Localization +---- ----------------------- +cyt Cytoplasm +pla Chloroplast +ret Endoplasmic reticulum +ext Extracellular +gol Golgi +lys Lysosomal +mit Mitochondria +nuc Nuclear +oxi Peroxisome +vac Vacuolar +rip Periplasmic +==== ======================= + +**References** + +Murat Cokol, Rajesh Nair, and Burkhard Rost. +Finding nuclear localization signals. +EMBO reports 1(5), 411–415, 2000 +http://dx.doi.org/10.1093/embo-reports/kvd092 + +http://rostlab.org + + </help> +</tool> |