| Next changeset 1:c0eb0e5792f3 (2015-05-13) |
|
Commit message:
Uploaded v0.0.1 (with embedded citation) |
|
added:
test-data/MID4_GLZRM4E04_rnd30_frclip.seq_composition.tabular test-data/MID4_GLZRM4E04_rnd30_frclip.sff test-data/ecoli.fastq test-data/ecoli.seq_composition.tabular test-data/four_human_proteins.fasta test-data/four_human_proteins.seq_composition.tabular tools/seq_composition/README.rst tools/seq_composition/seq_composition.py tools/seq_composition/seq_composition.xml tools/seq_composition/tool_dependencies.xml |
| b |
| diff -r 000000000000 -r 087a226e501e test-data/MID4_GLZRM4E04_rnd30_frclip.seq_composition.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/MID4_GLZRM4E04_rnd30_frclip.seq_composition.tabular Tue Nov 18 10:01:27 2014 -0500 |
| b |
| @@ -0,0 +1,5 @@ +Letter Count Percentage +A 1733 27.68 +C 1267 20.24 +G 1551 24.78 +T 1709 27.30 |
| b |
| diff -r 000000000000 -r 087a226e501e test-data/MID4_GLZRM4E04_rnd30_frclip.sff |
| b |
| Binary file test-data/MID4_GLZRM4E04_rnd30_frclip.sff has changed |
| b |
| diff -r 000000000000 -r 087a226e501e test-data/ecoli.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ecoli.fastq Tue Nov 18 10:01:27 2014 -0500 |
| b |
| b"@@ -0,0 +1,20164 @@\n+@frag_1\n+AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTC\n++\n+##%')+.024JMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_1_a\n+GAGACATATTGCCCGTTGCAGTCAGAATGAAAAGCT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMJ420.+)'%##\n+@frag_2\n+AGAGACATATTGCCCGTTGCAGTCAGAATGAAAAGC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMJ420.+)'%#\n+@frag_3\n+CTTTTCATTCTGACTGCAACGGGCAATATGTCTCTG\n++\n+%')+.024JMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_4\n+ACAGAGACATATTGCCCGTTGCAGTCAGAATGAAAA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMJ420.+)'\n+@frag_5\n+TTTCATTCTGACTGCAACGGGCAATATGTCTCTGTG\n++\n+)+.024JMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_6\n+ACACAGAGACATATTGCCCGTTGCAGTCAGAATGAA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMJ420.+\n+@frag_7\n+TCATTCTGACTGCAACGGGCAATATGTCTCTGTGTG\n++\n+.024JMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_8\n+CCACACAGAGACATATTGCCCGTTGCAGTCAGAATG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMJ420\n+@frag_9\n+ATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGA\n++\n+24JMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_10\n+ATCCACACAGAGACATATTGCCCGTTGCAGTCAGAA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMJ4\n+@frag_11\n+TCTGACTGCAACGGGCAATATGTCTCTGTGTGGATT\n++\n+JMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_12\n+TAATCCACACAGAGACATATTGCCCGTTGCAGTCAG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_13\n+TGACTGCAACGGGCAATATGTCTCTGTGTGGATTAA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_14\n+TTTAATCCACACAGAGACATATTGCCCGTTGCAGTC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_15\n+ACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_16\n+TTTTTAATCCACACAGAGACATATTGCCCGTTGCAG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_17\n+TGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_18\n+TTTTTTTAATCCACACAGAGACATATTGCCCGTTGC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_19\n+CAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_20\n+TCTTTTTTTAATCCACACAGAGACATATTGCCCGTT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_21\n+ACGGGCAATATGTCTCTG
TGTGGATTAAAAAAAGAG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_22\n+ACTCTTTTTTTAATCCACACAGAGACATATTGCCCG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_23\n+GGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_24\n+ACACTCTTTTTTTAATCCACACAGAGACATATTGCC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_25\n+GCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_26\n+AGACACTCTTTTTTTAATCCACACAGAGACATATTG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_27\n+AATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_28\n+TCAGACACTCTTTTTTTAATCCACACAGAGACATAT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_29\n+TATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGAT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_30\n+TATCAGACACTCTTTTTTTAATCCACACAGAGACAT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_31\n+TGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_32\n+GCTATCAGACACTCTTTTTTTAATCCACACAGAGAC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_33\n+TCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_34\n+CTGCTATCAGACACTCTTTTTTTAATCCACACAGAG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_35\n+TCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_36\n+AGCTGCTATCAGACACTCTTTTTTTAATCCACACAG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_37\n+TGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_38\n+GAAGCTGCTATCAGACACTCTTTTTTTAATCCACAC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_39\n+TGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_40\n+CAGAAGCTGCTATCAGACACTCTTTTTTTAATCCAC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_41\n+TGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_42\n+TTCAGAAGCTGCTATCAGACACTCTTTTTTTAATCC\n++\n+MMMMMMMMMMMMMMMMMMMMMM
MMMMMMMMMMMMMM\n+@frag_43\n+GATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAAC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_44\n+AGTTCAGAAGCTGCTATCAGACACTCTTTTTTTAAT\n++\n+MMMMMMMMMMMMMMMMMMM"..b"4997\n+AATTGATGATGAATCATCAGTAAAATCTATTCATTA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_4998\n+ATAATGAATAGATTTTACTGATGATTCATCATCAAT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_4999\n+TTGATGATGAATCATCAGTAAAATCTATTCATTATC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5000\n+AGATAATGAATAGATTTTACTGATGATTCATCATCA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5001\n+GATGATGAATCATCAGTAAAATCTATTCATTATCTC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMK\n+@frag_5002\n+TGAGATAATGAATAGATTTTACTGATGATTCATCAT\n++\n+KKMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5003\n+TGATGAATCATCAGTAAAATCTATTCATTATCTCAA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMKKK\n+@frag_5004\n+ATTGAGATAATGAATAGATTTTACTGATGATTCATC\n++\n+KKKKMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5005\n+ATGAATCATCAGTAAAATCTATTCATTATCTCAATA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMKKKK#\n+@frag_5006\n+CTATTGAGATAATGAATAGATTTTACTGATGATTCA\n++\n+##KKKKMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5007\n+GAATCATCAGTAAAATCTATTCATTATCTCAATAGC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMKKKK##%\n+@frag_5008\n+AGCTATTGAGATAATGAATAGATTTTACTGATGATT\n++\n+'%##KKKKMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5009\n+ATCATCAGTAAAATCTATTCATTATCTCAATAGCTT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMKKKK##%')\n+@frag_5010\n+AAAGCTATTGAGATAATGAATAGATTTTACTGATGA\n++\n++)'%##KKKKMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5011\n+CATCAGTAAAATCTATTCATTATCTCAATAGCTTTT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMKKKK##%')+.\n+@frag_5012\n+GAAAAGCTATTGAGATAATGAATAGATTTTACTGAT\n++\n+0.+)'%##KKKKMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5013\n+TCAGTAAAATCTATTCATTATCTCAATAGCTTTTCA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMKKKK##%')+.02\n+@frag_5014\n+ATGAAAAGCTATTGAGATAATGAATAGATTTTACTG\n++\n+420.+)'%##KKKKMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5015\n+AGTAAAATCTATTCATTATCTCAATAGCTTTTCATT\n++\n+MMMMMMMMMMMMMMMMMMMMMKKKK##%')+.024J\n+@frag_5016\n+
GAATGAAAAGCTATTGAGATAATGAATAGATTTTAC\n++\n+MJ420.+)'%##KKKKMMMMMMMMMMMMMMMMMMMM\n+@frag_5017\n+TAAAATCTATTCATTATCTCAATAGCTTTTCATTCT\n++\n+MMMMMMMMMMMMMMMMMMMKKKK##%')+.024JMM\n+@frag_5018\n+CAGAATGAAAAGCTATTGAGATAATGAATAGATTTT\n++\n+MMMJ420.+)'%##KKKKMMMMMMMMMMMMMMMMMM\n+@frag_5019\n+AAATCTATTCATTATCTCAATAGCTTTTCATTCTGA\n++\n+MMMMMMMMMMMMMMMMMKKKK##%')+.024JMMMM\n+@frag_5020\n+GTCAGAATGAAAAGCTATTGAGATAATGAATAGATT\n++\n+MMMMMJ420.+)'%##KKKKMMMMMMMMMMMMMMMM\n+@frag_5021\n+ATCTATTCATTATCTCAATAGCTTTTCATTCTGACT\n++\n+MMMMMMMMMMMMMMMKKKK##%')+.024JMMMMMM\n+@frag_5022\n+CAGTCAGAATGAAAAGCTATTGAGATAATGAATAGA\n++\n+MMMMMMMJ420.+)'%##KKKKMMMMMMMMMMMMMM\n+@frag_5023\n+CTATTCATTATCTCAATAGCTTTTCATTCTGACTGC\n++\n+MMMMMMMMMMMMMKKKK##%')+.024JMMMMMMMM\n+@frag_5024\n+TGCAGTCAGAATGAAAAGCTATTGAGATAATGAATA\n++\n+MMMMMMMMMJ420.+)'%##KKKKMMMMMMMMMMMM\n+@frag_5025\n+ATTCATTATCTCAATAGCTTTTCATTCTGACTGCAA\n++\n+MMMMMMMMMMMKKKK##%')+.024JMMMMMMMMMM\n+@frag_5026\n+GTTGCAGTCAGAATGAAAAGCTATTGAGATAATGAA\n++\n+MMMMMMMMMMMJ420.+)'%##KKKKMMMMMMMMMM\n+@frag_5027\n+TCATTATCTCAATAGCTTTTCATTCTGACTGCAACG\n++\n+MMMMMMMMMKKKK##%')+.024JMMMMMMMMMMMM\n+@frag_5028\n+CCGTTGCAGTCAGAATGAAAAGCTATTGAGATAATG\n++\n+MMMMMMMMMMMMMJ420.+)'%##KKKKMMMMMMMM\n+@frag_5029\n+ATTATCTCAATAGCTTTTCATTCTGACTGCAACGGG\n++\n+MMMMMMMKKKK##%')+.024JMMMMMMMMMMMMMM\n+@frag_5030\n+GCCCGTTGCAGTCAGAATGAAAAGCTATTGAGATAA\n++\n+MMMMMMMMMMMMMMMJ420.+)'%##KKKKMMMMMM\n+@frag_5031\n+TATCTCAATAGCTTTTCATTCTGACTGCAACGGGCA\n++\n+MMMMMKKKK##%')+.024JMMMMMMMMMMMMMMMM\n+@frag_5032\n+TTGCCCGTTGCAGTCAGAATGAAAAGCTATTGAGAT\n++\n+MMMMMMMMMMMMMMMMMJ420.+)'%##KKKKMMMM\n+@frag_5033\n+TCTCAATAGCTTTTCATTCTGACTGCAACGGGCAAT\n++\n+MMMKKKK##%')+.024JMMMMMMMMMMMMMMMMMM\n+@frag_5034\n+TATTGCCCGTTGCAGTCAGAATGAAAAGCTATTGAG\n++\n+MMMMMMMMMMMMMMMMMMMJ420.+)'%##KKKKMM\n+@frag_5035\n+TCAATAGCTTTTCATTCTGACTGCAACGGGCAATAT\n++\n+MKKKK##%')+.024JMMMMMMMMMMMMMMMMMMMM\n+@frag_5036\n+CATATTGCCCGTTGCAGTCAGAATGAAAAGCTATTG\n++\n+MMMMMMMMMMMMMMMMMMMMMJ420.+)'%##KKKK\n+@frag_5037\n+AATAG
CTTTTCATTCTGACTGCAACGGGCAATATGT\n++\n+KKK##%')+.024JMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5038\n+GACATATTGCCCGTTGCAGTCAGAATGAAAAGCTAT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMJ420.+)'%##KK\n+@frag_5039\n+TAGCTTTTCATTCTGACTGCAACGGGCAATATGTCT\n++\n+K##%')+.024JMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5039_a\n+AGACATATTGCCCGTTGCAGTCAGAATGAAAAGCTA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMJ420.+)'%##K\n" |
| b |
| diff -r 000000000000 -r 087a226e501e test-data/ecoli.seq_composition.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ecoli.seq_composition.tabular Tue Nov 18 10:01:27 2014 -0500 |
| b |
| @@ -0,0 +1,5 @@ +Letter Count Percentage +A 42807 23.59 +C 47930 26.41 +G 47933 26.41 +T 42806 23.59 |
| b |
| diff -r 000000000000 -r 087a226e501e test-data/four_human_proteins.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.fasta Tue Nov 18 10:01:27 2014 -0500 |
| b |
| @@ -0,0 +1,61 @@ +>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 +MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF +SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK +REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER +VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK +CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD +CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF +HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL +>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2 +MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG +GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS +DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD +LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG +KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP +DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT +IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE +ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ +QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY +QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV +ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD +KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD +QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE +NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA +QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK +APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD +EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR +HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS +WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ +SQQSQPVELDPFGAAPFPSKQ +>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4 +MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL 
+QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL +VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE +ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL +GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG +CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC +TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL +EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE +RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ +NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS +DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE +RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL +KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF +PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV +SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV +SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG +PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR +EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG +FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA +AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV +RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN +CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME +FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN +PS +>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 +MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY +VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG +GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP +EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES +ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI +YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA |
| b |
| diff -r 000000000000 -r 087a226e501e test-data/four_human_proteins.seq_composition.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.seq_composition.tabular Tue Nov 18 10:01:27 2014 -0500 |
| b |
| @@ -0,0 +1,21 @@ +Letter Count Percentage +A 191 5.79 +C 79 2.40 +D 171 5.19 +E 228 6.92 +F 166 5.03 +G 206 6.25 +H 99 3.00 +I 156 4.73 +K 164 4.97 +L 282 8.55 +M 76 2.31 +N 156 4.73 +P 210 6.37 +Q 183 5.55 +R 163 4.94 +S 254 7.70 +T 171 5.19 +V 202 6.13 +W 33 1.00 +Y 107 3.25 |
| b |
| diff -r 000000000000 -r 087a226e501e tools/seq_composition/README.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_composition/README.rst Tue Nov 18 10:01:27 2014 -0500 |
| b |
| @@ -0,0 +1,110 @@ +Galaxy tool reporting sequence composition +========================================== + +This tool is copyright 2014 by Peter Cock, The James Hutton Institute +(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +See the licence text below (MIT licence). + +This tool is a short Python script (using Biopython library functions) to +loop over given sequence files (in a range of formats including FASTA, FASTQ, +and SFF), and report the count of each letter (i.e. amino acids or bases). + +This can be useful for sanity checking assemblies (e.g. proportion of N +bases) or looking at differences in base composition. + +This tool is available from the Galaxy Tool Shed at: + +* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_composition + + +Automated Installation +====================== + +This should be straightforward using the Galaxy Tool Shed, which should be +able to automatically install the dependency on Biopython, and then install +this tool and run its unit tests. + + +Manual Installation +=================== + +There are just two files to install to use this tool from within Galaxy: + +* ``seq_composition.py`` (the Python script) +* ``seq_composition.xml`` (the Galaxy tool definition) + +The suggested location is in a dedicated ``tools/seq_composition`` folder. + +You will also need to modify the ``tool_conf.xml`` file to tell Galaxy to offer the +tool. One suggested location is in the filters section. Simply add the line:: + + <tool file="seq_composition/seq_composition.xml" /> + +You will also need to install Biopython 1.62 or later. + +If you wish to run the unit tests, also move/copy the ``test-data/`` files +under Galaxy's ``test-data/`` folder. Then:: + + ./run_tests.sh -id seq_composition + +That's it. 
+ + +History +======= + +======= ====================================================================== +Version Changes +------- ---------------------------------------------------------------------- +v0.0.1 - Initial version. + - Tool definition now embeds citation information. +======= ====================================================================== + + +Developers +========== + +This script and related tools are being developed on this GitHub repository: +https://github.com/peterjc/pico_galaxy/tree/master/tools/seq_composition + +For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use +the following command from the Galaxy root folder:: + + $ tar -czf seq_composition.tar.gz tools/seq_composition/README.rst tools/seq_composition/seq_composition.py tools/seq_composition/seq_composition.xml tools/seq_composition/tool_dependencies.xml test-data/four_human_proteins.fasta test-data/four_human_proteins.seq_composition.tabular test-data/ecoli.fastq test-data/ecoli.seq_composition.tabular test-data/MID4_GLZRM4E04_rnd30_frclip.sff test-data/MID4_GLZRM4E04_rnd30_frclip.seq_composition.tabular + + +Check this worked:: + + $ tar -tzf seq_composition.tar.gz + tools/seq_composition/README.rst + tools/seq_composition/seq_composition.py + tools/seq_composition/seq_composition.xml + tools/seq_composition/tool_dependencies.xml + test-data/four_human_proteins.fasta + test-data/four_human_proteins.seq_composition.tabular + test-data/ecoli.fastq + test-data/ecoli.seq_composition.tabular + test-data/MID4_GLZRM4E04_rnd30_frclip.sff + test-data/MID4_GLZRM4E04_rnd30_frclip.seq_composition.tabular + + +Licence (MIT) +============= + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, 
and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. |
| b |
| diff -r 000000000000 -r 087a226e501e tools/seq_composition/seq_composition.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_composition/seq_composition.py Tue Nov 18 10:01:27 2014 -0500 |
| [ |
| @@ -0,0 +1,91 @@ +#!/usr/bin/env python +"""Record sequence composition from FASTA, FASTQ or SFF files. + +This tool is a short Python script which requires Biopython 1.62 or later +for SFF file support. If you use this tool in scientific work leading to a +publication, please cite the Biopython application note: + +Cock et al 2009. Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + +This script is copyright 2014 by Peter Cock, The James Hutton Institute +(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. +See accompanying text file for licence details (MIT license). + +Use -v or --version to get the version, -h or --help for help. +""" +import os +import sys +from optparse import OptionParser + +from Bio import SeqIO + +def stop_err(msg, err=1): + sys.stderr.write(msg.rstrip() + "\n") + sys.exit(err) + +#Parse Command Line +usage = """Example usage: + +$ python seq_composition.py -o my_output.tsv -q input1.fastq -q input2.fastq + +At least one input sequence file is required (using the -f, -q, or -s options). +If the expected alphabet is given, the sequence composition is verified against +it. +""" +#TODO - Case sensitivity? +#TODO - GenBank / EMBL input? Needs the datatype defined... +#TODO - Handle all the FASTQ datatype subclasses in the XML cheetah code? 
+parser = OptionParser(usage=usage) +parser.add_option('-f', '--fasta', dest='fasta', action="append", default=[], + help='Input sequence filename in FASTA format') +parser.add_option('-q', '--fastq', '--fastqsanger', '--fastqillumina', '--fastqsolexa', + dest='fastq', action="append", default=[], + help='Input sequence filename in FASTQ format') +parser.add_option('-s', '--sff', dest='sff', action="append", default=[], + help='Input sequence filename in SFF format') +parser.add_option('-o', '--output', dest='output', + default=None, help='Output filename (tabular)', + metavar="FILE") +parser.add_option("-v", "--version", dest="version", + default=False, action="store_true", + help="Show version and quit") +options, args = parser.parse_args() + +if options.version: + print("v0.0.1") + sys.exit(0) + +if not (options.fasta or options.fastq or options.sff): + stop_err("Require an input filename") +if not options.output: + stop_err("Require an output filename") + + +file_count = 0 +seq_count = 0 +counts = dict() + +for format, filenames in [("fasta", options.fasta), + ("fastq", options.fastq), + ("sff-trim", options.sff), + ]: + for filename in filenames: + file_count += 1 + for record in SeqIO.parse(filename, format): + seq_count += 1 + for letter in record: + try: + counts[letter] += 1 + except: + counts[letter] = 1 + +total = sum(counts.values()) +sys.stderr.write("Counted %i sequence letters from %i records from %i files\n" % (total, seq_count, file_count)) + +scale = 100.0 / total +with open(options.output, "w") as handle: + handle.write("Letter\tCount\tPercentage\n") + for letter, count in sorted(counts.items()): + handle.write("%s\t%i\t%0.2f\n" % (letter, count, count * scale)) |
| b |
| diff -r 000000000000 -r 087a226e501e tools/seq_composition/seq_composition.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_composition/seq_composition.xml Tue Nov 18 10:01:27 2014 -0500 |
| b |
| @@ -0,0 +1,67 @@ +<tool id="seq_composition" name="Sequence composition" version="0.0.1"> + <description>Count bases or amino-acids</description> + <requirements> + <requirement type="package" version="1.64">biopython</requirement> + <requirement type="python-module">Bio</requirement> + </requirements> + <version_command interpreter="python">seq_composition.py --version</version_command> + <command interpreter="python"> +seq_composition.py -o "$output_file" +##For loop over inputs +#for i in $input_file +--$i.ext "${i}" +#end for + </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> + <inputs> + <param name="input_file" type="data" format="fasta,fastq,sff" multiple="true" label="Sequence file" help="FASTA, FASTQ, or SFF format." /> + </inputs> + <outputs> + <data name="output_file" format="tabular" label="Sequence composition ${on_string}"/> + </outputs> + <tests> + <test> + <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" /> + <output name="output_file" file="four_human_proteins.seq_composition.tabular" ftype="tabular" /> + </test> + <test> + <param name="input_file" value="ecoli.fastq" ftype="fastq" /> + <output name="output_file" file="ecoli.seq_composition.tabular" ftype="tabular" /> + </test> + <test> + <param name="input_file" value="ecoli.fastq" ftype="fastqsanger" /> + <output name="output_file" file="ecoli.seq_composition.tabular" ftype="tabular" /> + </test> + <test> + <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> + <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.seq_composition.tabular" ftype="tabular"/> + </test> + </tests> + <help> +**What it does** + +Takes input files of sequences (typically FASTA or FASTQ, but also +Standard Flowgram Format (SFF) is supported), counts all the letters +in each sequence, and returns a summary table of their counts and +percentages. 
+ +**Citation** + +This tool uses Biopython, so if you use this Galaxy tool in work leading to a +scientific publication please cite the following paper: + +Cock et al (2009). Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + +This tool is available to install into other Galaxy Instances via the Galaxy +Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_composition + </help> + <citations> + <citation type="doi">10.1093/bioinformatics/btp163</citation> + </citations> +</tool> |
| b |
| diff -r 000000000000 -r 087a226e501e tools/seq_composition/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_composition/tool_dependencies.xml Tue Nov 18 10:01:27 2014 -0500 |
| b |
| @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="biopython" version="1.64"> + <repository changeset_revision="5477a05cc158" name="package_biopython_1_64" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency> |