Repository 'seq_composition'
hg clone https://toolshed.g2.bx.psu.edu/repos/peterjc/seq_composition

Changeset 0:087a226e501e (2014-11-18)
Next changeset 1:c0eb0e5792f3 (2015-05-13)
Commit message:
Uploaded v0.0.1 (with embedded citation)
added:
test-data/MID4_GLZRM4E04_rnd30_frclip.seq_composition.tabular
test-data/MID4_GLZRM4E04_rnd30_frclip.sff
test-data/ecoli.fastq
test-data/ecoli.seq_composition.tabular
test-data/four_human_proteins.fasta
test-data/four_human_proteins.seq_composition.tabular
tools/seq_composition/README.rst
tools/seq_composition/seq_composition.py
tools/seq_composition/seq_composition.xml
tools/seq_composition/tool_dependencies.xml
b
diff -r 000000000000 -r 087a226e501e test-data/MID4_GLZRM4E04_rnd30_frclip.seq_composition.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/MID4_GLZRM4E04_rnd30_frclip.seq_composition.tabular Tue Nov 18 10:01:27 2014 -0500
b
@@ -0,0 +1,5 @@
+Letter Count Percentage
+A 1733 27.68
+C 1267 20.24
+G 1551 24.78
+T 1709 27.30
b
diff -r 000000000000 -r 087a226e501e test-data/MID4_GLZRM4E04_rnd30_frclip.sff
b
Binary file test-data/MID4_GLZRM4E04_rnd30_frclip.sff has changed
b
diff -r 000000000000 -r 087a226e501e test-data/ecoli.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ecoli.fastq Tue Nov 18 10:01:27 2014 -0500
b
b"@@ -0,0 +1,20164 @@\n+@frag_1\n+AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTC\n++\n+##%')+.024JMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_1_a\n+GAGACATATTGCCCGTTGCAGTCAGAATGAAAAGCT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMJ420.+)'%##\n+@frag_2\n+AGAGACATATTGCCCGTTGCAGTCAGAATGAAAAGC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMJ420.+)'%#\n+@frag_3\n+CTTTTCATTCTGACTGCAACGGGCAATATGTCTCTG\n++\n+%')+.024JMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_4\n+ACAGAGACATATTGCCCGTTGCAGTCAGAATGAAAA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMJ420.+)'\n+@frag_5\n+TTTCATTCTGACTGCAACGGGCAATATGTCTCTGTG\n++\n+)+.024JMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_6\n+ACACAGAGACATATTGCCCGTTGCAGTCAGAATGAA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMJ420.+\n+@frag_7\n+TCATTCTGACTGCAACGGGCAATATGTCTCTGTGTG\n++\n+.024JMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_8\n+CCACACAGAGACATATTGCCCGTTGCAGTCAGAATG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMJ420\n+@frag_9\n+ATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGA\n++\n+24JMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_10\n+ATCCACACAGAGACATATTGCCCGTTGCAGTCAGAA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMJ4\n+@frag_11\n+TCTGACTGCAACGGGCAATATGTCTCTGTGTGGATT\n++\n+JMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_12\n+TAATCCACACAGAGACATATTGCCCGTTGCAGTCAG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_13\n+TGACTGCAACGGGCAATATGTCTCTGTGTGGATTAA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_14\n+TTTAATCCACACAGAGACATATTGCCCGTTGCAGTC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_15\n+ACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_16\n+TTTTTAATCCACACAGAGACATATTGCCCGTTGCAG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_17\n+TGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_18\n+TTTTTTTAATCCACACAGAGACATATTGCCCGTTGC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_19\n+CAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_20\n+TCTTTTTTTAATCCACACAGAGACATATTGCCCGTT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_21\n+ACGGGCAATATGTCTCTGTG
TGGATTAAAAAAAGAG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_22\n+ACTCTTTTTTTAATCCACACAGAGACATATTGCCCG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_23\n+GGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_24\n+ACACTCTTTTTTTAATCCACACAGAGACATATTGCC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_25\n+GCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_26\n+AGACACTCTTTTTTTAATCCACACAGAGACATATTG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_27\n+AATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_28\n+TCAGACACTCTTTTTTTAATCCACACAGAGACATAT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_29\n+TATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGAT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_30\n+TATCAGACACTCTTTTTTTAATCCACACAGAGACAT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_31\n+TGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_32\n+GCTATCAGACACTCTTTTTTTAATCCACACAGAGAC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_33\n+TCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_34\n+CTGCTATCAGACACTCTTTTTTTAATCCACACAGAG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_35\n+TCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_36\n+AGCTGCTATCAGACACTCTTTTTTTAATCCACACAG\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_37\n+TGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_38\n+GAAGCTGCTATCAGACACTCTTTTTTTAATCCACAC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_39\n+TGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_40\n+CAGAAGCTGCTATCAGACACTCTTTTTTTAATCCAC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_41\n+TGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_42\n+TTCAGAAGCTGCTATCAGACACTCTTTTTTTAATCC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMM
MMMMMMMMMMMM\n+@frag_43\n+GATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAAC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_44\n+AGTTCAGAAGCTGCTATCAGACACTCTTTTTTTAAT\n++\n+MMMMMMMMMMMMMMMMMMM"..b"4997\n+AATTGATGATGAATCATCAGTAAAATCTATTCATTA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_4998\n+ATAATGAATAGATTTTACTGATGATTCATCATCAAT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_4999\n+TTGATGATGAATCATCAGTAAAATCTATTCATTATC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5000\n+AGATAATGAATAGATTTTACTGATGATTCATCATCA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5001\n+GATGATGAATCATCAGTAAAATCTATTCATTATCTC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMK\n+@frag_5002\n+TGAGATAATGAATAGATTTTACTGATGATTCATCAT\n++\n+KKMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5003\n+TGATGAATCATCAGTAAAATCTATTCATTATCTCAA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMKKK\n+@frag_5004\n+ATTGAGATAATGAATAGATTTTACTGATGATTCATC\n++\n+KKKKMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5005\n+ATGAATCATCAGTAAAATCTATTCATTATCTCAATA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMKKKK#\n+@frag_5006\n+CTATTGAGATAATGAATAGATTTTACTGATGATTCA\n++\n+##KKKKMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5007\n+GAATCATCAGTAAAATCTATTCATTATCTCAATAGC\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMKKKK##%\n+@frag_5008\n+AGCTATTGAGATAATGAATAGATTTTACTGATGATT\n++\n+'%##KKKKMMMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5009\n+ATCATCAGTAAAATCTATTCATTATCTCAATAGCTT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMMMKKKK##%')\n+@frag_5010\n+AAAGCTATTGAGATAATGAATAGATTTTACTGATGA\n++\n++)'%##KKKKMMMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5011\n+CATCAGTAAAATCTATTCATTATCTCAATAGCTTTT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMMKKKK##%')+.\n+@frag_5012\n+GAAAAGCTATTGAGATAATGAATAGATTTTACTGAT\n++\n+0.+)'%##KKKKMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5013\n+TCAGTAAAATCTATTCATTATCTCAATAGCTTTTCA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMKKKK##%')+.02\n+@frag_5014\n+ATGAAAAGCTATTGAGATAATGAATAGATTTTACTG\n++\n+420.+)'%##KKKKMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5015\n+AGTAAAATCTATTCATTATCTCAATAGCTTTTCATT\n++\n+MMMMMMMMMMMMMMMMMMMMMKKKK##%')+.024J\n+@frag_5016\n+GA
ATGAAAAGCTATTGAGATAATGAATAGATTTTAC\n++\n+MJ420.+)'%##KKKKMMMMMMMMMMMMMMMMMMMM\n+@frag_5017\n+TAAAATCTATTCATTATCTCAATAGCTTTTCATTCT\n++\n+MMMMMMMMMMMMMMMMMMMKKKK##%')+.024JMM\n+@frag_5018\n+CAGAATGAAAAGCTATTGAGATAATGAATAGATTTT\n++\n+MMMJ420.+)'%##KKKKMMMMMMMMMMMMMMMMMM\n+@frag_5019\n+AAATCTATTCATTATCTCAATAGCTTTTCATTCTGA\n++\n+MMMMMMMMMMMMMMMMMKKKK##%')+.024JMMMM\n+@frag_5020\n+GTCAGAATGAAAAGCTATTGAGATAATGAATAGATT\n++\n+MMMMMJ420.+)'%##KKKKMMMMMMMMMMMMMMMM\n+@frag_5021\n+ATCTATTCATTATCTCAATAGCTTTTCATTCTGACT\n++\n+MMMMMMMMMMMMMMMKKKK##%')+.024JMMMMMM\n+@frag_5022\n+CAGTCAGAATGAAAAGCTATTGAGATAATGAATAGA\n++\n+MMMMMMMJ420.+)'%##KKKKMMMMMMMMMMMMMM\n+@frag_5023\n+CTATTCATTATCTCAATAGCTTTTCATTCTGACTGC\n++\n+MMMMMMMMMMMMMKKKK##%')+.024JMMMMMMMM\n+@frag_5024\n+TGCAGTCAGAATGAAAAGCTATTGAGATAATGAATA\n++\n+MMMMMMMMMJ420.+)'%##KKKKMMMMMMMMMMMM\n+@frag_5025\n+ATTCATTATCTCAATAGCTTTTCATTCTGACTGCAA\n++\n+MMMMMMMMMMMKKKK##%')+.024JMMMMMMMMMM\n+@frag_5026\n+GTTGCAGTCAGAATGAAAAGCTATTGAGATAATGAA\n++\n+MMMMMMMMMMMJ420.+)'%##KKKKMMMMMMMMMM\n+@frag_5027\n+TCATTATCTCAATAGCTTTTCATTCTGACTGCAACG\n++\n+MMMMMMMMMKKKK##%')+.024JMMMMMMMMMMMM\n+@frag_5028\n+CCGTTGCAGTCAGAATGAAAAGCTATTGAGATAATG\n++\n+MMMMMMMMMMMMMJ420.+)'%##KKKKMMMMMMMM\n+@frag_5029\n+ATTATCTCAATAGCTTTTCATTCTGACTGCAACGGG\n++\n+MMMMMMMKKKK##%')+.024JMMMMMMMMMMMMMM\n+@frag_5030\n+GCCCGTTGCAGTCAGAATGAAAAGCTATTGAGATAA\n++\n+MMMMMMMMMMMMMMMJ420.+)'%##KKKKMMMMMM\n+@frag_5031\n+TATCTCAATAGCTTTTCATTCTGACTGCAACGGGCA\n++\n+MMMMMKKKK##%')+.024JMMMMMMMMMMMMMMMM\n+@frag_5032\n+TTGCCCGTTGCAGTCAGAATGAAAAGCTATTGAGAT\n++\n+MMMMMMMMMMMMMMMMMJ420.+)'%##KKKKMMMM\n+@frag_5033\n+TCTCAATAGCTTTTCATTCTGACTGCAACGGGCAAT\n++\n+MMMKKKK##%')+.024JMMMMMMMMMMMMMMMMMM\n+@frag_5034\n+TATTGCCCGTTGCAGTCAGAATGAAAAGCTATTGAG\n++\n+MMMMMMMMMMMMMMMMMMMJ420.+)'%##KKKKMM\n+@frag_5035\n+TCAATAGCTTTTCATTCTGACTGCAACGGGCAATAT\n++\n+MKKKK##%')+.024JMMMMMMMMMMMMMMMMMMMM\n+@frag_5036\n+CATATTGCCCGTTGCAGTCAGAATGAAAAGCTATTG\n++\n+MMMMMMMMMMMMMMMMMMMMMJ420.+)'%##KKKK\n+@frag_5037\n+AATAGCT
TTTCATTCTGACTGCAACGGGCAATATGT\n++\n+KKK##%')+.024JMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5038\n+GACATATTGCCCGTTGCAGTCAGAATGAAAAGCTAT\n++\n+MMMMMMMMMMMMMMMMMMMMMMMJ420.+)'%##KK\n+@frag_5039\n+TAGCTTTTCATTCTGACTGCAACGGGCAATATGTCT\n++\n+K##%')+.024JMMMMMMMMMMMMMMMMMMMMMMMM\n+@frag_5039_a\n+AGACATATTGCCCGTTGCAGTCAGAATGAAAAGCTA\n++\n+MMMMMMMMMMMMMMMMMMMMMMMMJ420.+)'%##K\n"
b
diff -r 000000000000 -r 087a226e501e test-data/ecoli.seq_composition.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ecoli.seq_composition.tabular Tue Nov 18 10:01:27 2014 -0500
b
@@ -0,0 +1,5 @@
+Letter Count Percentage
+A 42807 23.59
+C 47930 26.41
+G 47933 26.41
+T 42806 23.59
b
diff -r 000000000000 -r 087a226e501e test-data/four_human_proteins.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins.fasta Tue Nov 18 10:01:27 2014 -0500
b
@@ -0,0 +1,61 @@
+>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
+MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF
+SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK
+REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER
+VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK
+CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD
+CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF
+HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL
+>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2
+MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG
+GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS
+DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD
+LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG
+KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP
+DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT
+IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE
+ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ
+QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY
+QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV
+ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD
+KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD
+QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE
+NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA
+QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK
+APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD
+EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR
+HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS
+WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ
+SQQSQPVELDPFGAAPFPSKQ
+>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4
+MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL
+QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL
+VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE
+ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL
+GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG
+CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC
+TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL
+EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE
+RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ
+NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS
+DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE
+RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL
+KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF
+PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV
+SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV
+SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG
+PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR
+EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG
+FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA
+AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV
+RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN
+CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME
+FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN
+PS
+>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1
+MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY
+VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG
+GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP
+EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES
+ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI
+YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
b
diff -r 000000000000 -r 087a226e501e test-data/four_human_proteins.seq_composition.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins.seq_composition.tabular Tue Nov 18 10:01:27 2014 -0500
b
@@ -0,0 +1,21 @@
+Letter Count Percentage
+A 191 5.79
+C 79 2.40
+D 171 5.19
+E 228 6.92
+F 166 5.03
+G 206 6.25
+H 99 3.00
+I 156 4.73
+K 164 4.97
+L 282 8.55
+M 76 2.31
+N 156 4.73
+P 210 6.37
+Q 183 5.55
+R 163 4.94
+S 254 7.70
+T 171 5.19
+V 202 6.13
+W 33 1.00
+Y 107 3.25
b
diff -r 000000000000 -r 087a226e501e tools/seq_composition/README.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_composition/README.rst Tue Nov 18 10:01:27 2014 -0500
b
@@ -0,0 +1,110 @@
+Galaxy tool reporting sequence composition
+==========================================
+
+This tool is copyright 2014 by Peter Cock, The James Hutton Institute
+(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
+See the licence text below (MIT licence).
+
+This tool is a short Python script (using Biopython library functions) to
+loop over given sequence files (in a range of formats including FASTA, FASTQ,
+and SFF), and report the count of each letter (i.e. amino acids or bases).
+
+This can be useful for sanity checking assemblies (e.g. proportion of N
+bases) or looking at differences in base composition.
+
+This tool is available from the Galaxy Tool Shed at:
+
+* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_composition
+
+
+Automated Installation
+======================
+
+This should be straightforward using the Galaxy Tool Shed, which should be
+able to automatically install the dependency on Biopython, and then install
+this tool and run its unit tests.
+
+
+Manual Installation
+===================
+
+There are just two files to install to use this tool from within Galaxy:
+
+* ``seq_composition.py`` (the Python script)
+* ``seq_composition.xml`` (the Galaxy tool definition)
+
+The suggested location is in a dedicated ``tools/seq_composition`` folder.
+
+You will also need to modify the ``tool_conf.xml`` file to tell Galaxy to offer the
+tool. One suggested location is in the filters section. Simply add the line::
+
+    <tool file="seq_composition/seq_composition.xml" />
+
+You will also need to install Biopython 1.62 or later.
+
+If you wish to run the unit tests, also move/copy the ``test-data/`` files
+under Galaxy's ``test-data/`` folder. Then::
+
+    ./run_tests.sh -id seq_composition
+
+That's it.
+
+
+History
+=======
+
+======= ======================================================================
+Version Changes
+------- ----------------------------------------------------------------------
+v0.0.1  - Initial version.
+        - Tool definition now embeds citation information.
+======= ======================================================================
+
+
+Developers
+==========
+
+This script and related tools are being developed on this GitHub repository:
+https://github.com/peterjc/pico_galaxy/tree/master/tools/seq_composition
+
+For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
+the following command from the Galaxy root folder::
+
+    $ tar -czf seq_composition.tar.gz tools/seq_composition/README.rst tools/seq_composition/seq_composition.py tools/seq_composition/seq_composition.xml tools/seq_composition/tool_dependencies.xml test-data/four_human_proteins.fasta test-data/four_human_proteins.seq_composition.tabular test-data/ecoli.fastq test-data/ecoli.seq_composition.tabular test-data/MID4_GLZRM4E04_rnd30_frclip.sff test-data/MID4_GLZRM4E04_rnd30_frclip.seq_composition.tabular
+
+
+Check this worked::
+
+    $ tar -tzf seq_composition.tar.gz
+    tools/seq_composition/README.rst
+    tools/seq_composition/seq_composition.py
+    tools/seq_composition/seq_composition.xml
+    tools/seq_composition/tool_dependencies.xml
+    test-data/four_human_proteins.fasta
+    test-data/four_human_proteins.seq_composition.tabular
+    test-data/ecoli.fastq
+    test-data/ecoli.seq_composition.tabular
+    test-data/MID4_GLZRM4E04_rnd30_frclip.sff
+    test-data/MID4_GLZRM4E04_rnd30_frclip.seq_composition.tabular
+
+
+Licence (MIT)
+=============
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
b
diff -r 000000000000 -r 087a226e501e tools/seq_composition/seq_composition.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_composition/seq_composition.py Tue Nov 18 10:01:27 2014 -0500
[
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+"""Record sequence composition from FASTA, FASTQ or SFF files.
+
+This tool is a short Python script which requires Biopython 1.62 or later
+for SFF file support. If you use this tool in scientific work leading to a
+publication, please cite the Biopython application note:
+
+Cock et al 2009. Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+
+This script is copyright 2014 by Peter Cock, The James Hutton Institute
+(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
+See accompanying text file for licence details (MIT license).
+
+Use -v or --version to get the version, -h or --help for help.
+"""
+import os
+import sys
+from optparse import OptionParser
+
+from Bio import SeqIO
+
+def stop_err(msg, err=1):
+    """Write msg (trailing whitespace stripped) to stderr, then exit with err."""
+    sys.stderr.write(msg.rstrip() + "\n")
+    sys.exit(err)
+
+# Parse the command line; each input option may be repeated, one per file.
+usage = """Example usage:
+
+$ python seq_composition.py -o my_output.tsv -q input1.fastq -q input2.fastq
+
+At least one input sequence file is required (using the -f, -q, or -s options).
+Letter counts are pooled over all the input files and written as one tabular
+report.
+"""
+#TODO - Case sensitivity?
+#TODO - GenBank / EMBL input? Needs the datatype defined...
+#TODO - Handle all the FASTQ datatype subclasses in the XML cheetah code?
+parser = OptionParser(usage=usage)
+parser.add_option('-f', '--fasta', dest='fasta', action="append", default=[],
+                  help='Input sequence filename in FASTA format')
+# The extra long names let a Galaxy FASTQ datatype extension be used as the flag.
+parser.add_option('-q', '--fastq', '--fastqsanger', '--fastqillumina', '--fastqsolexa',
+                  dest='fastq', action="append", default=[],
+                  help='Input sequence filename in FASTQ format')
+parser.add_option('-s', '--sff', dest='sff', action="append", default=[],
+                  help='Input sequence filename in SFF format')
+parser.add_option('-o', '--output', dest='output', default=None,
+                  metavar="FILE", help='Output filename (tabular)')
+parser.add_option("-v", "--version", dest="version", default=False,
+                  action="store_true", help="Show version and quit")
+options, args = parser.parse_args()
+
+if options.version:
+    print("v0.0.1")
+    sys.exit(0)
+
+if not (options.fasta or options.fastq or options.sff):
+    stop_err("Require an input filename")
+if not options.output:
+    stop_err("Require an output filename")
+
+
+# Tally every sequence letter over all the input files into one shared table.
+file_count = 0
+seq_count = 0
+counts = dict()
+
+# "sff-trim" is Biopython's SFF format name that applies the reads' trimming.
+for fmt, filenames in [("fasta", options.fasta),
+                       ("fastq", options.fastq),
+                       ("sff-trim", options.sff),
+                       ]:
+    for filename in filenames:
+        file_count += 1
+        for record in SeqIO.parse(filename, fmt):
+            seq_count += 1
+            for letter in record:
+                counts[letter] = counts.get(letter, 0) + 1
+
+total = sum(counts.values())
+sys.stderr.write("Counted %i sequence letters from %i records from %i files\n" % (total, seq_count, file_count))
+
+# With no letters at all (e.g. empty files) skip the division; header only.
+scale = 100.0 / total if total else 0.0
+with open(options.output, "w") as handle:
+    handle.write("Letter\tCount\tPercentage\n")
+    for letter, count in sorted(counts.items()):
+        handle.write("%s\t%i\t%0.2f\n" % (letter, count, count * scale))
b
diff -r 000000000000 -r 087a226e501e tools/seq_composition/seq_composition.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_composition/seq_composition.xml Tue Nov 18 10:01:27 2014 -0500
b
@@ -0,0 +1,67 @@
+<tool id="seq_composition" name="Sequence composition" version="0.0.1">
+    <description>Count bases or amino-acids</description>
+    <requirements>
+        <requirement type="package" version="1.64">biopython</requirement>
+        <requirement type="python-module">Bio</requirement>
+    </requirements>
+    <version_command interpreter="python">seq_composition.py --version</version_command>
+    <command interpreter="python">
+seq_composition.py -o "$output_file"
+## One option per selected input dataset, named by its ext (e.g. --fasta, --fastq, --sff)
+#for i in $input_file
+--$i.ext "${i}"
+#end for
+    </command>
+    <stdio>
+        <!-- Any non-zero exit code, positive or negative, is treated as an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
+    <inputs>
+        <param name="input_file" type="data" format="fasta,fastq,sff" multiple="true" label="Sequence file" help="FASTA, FASTQ, or SFF format." />
+    </inputs>
+    <outputs>
+        <data name="output_file" format="tabular" label="Sequence composition ${on_string}"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" />
+            <output name="output_file" file="four_human_proteins.seq_composition.tabular" ftype="tabular" />
+        </test> 
+        <test>
+            <param name="input_file" value="ecoli.fastq" ftype="fastq" />
+            <output name="output_file" file="ecoli.seq_composition.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="input_file" value="ecoli.fastq" ftype="fastqsanger" />
+            <output name="output_file" file="ecoli.seq_composition.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
+            <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.seq_composition.tabular" ftype="tabular"/>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Takes input files of sequences (typically FASTA or FASTQ, but also
+Standard Flowgram Format (SFF) is supported), counts all the letters
+in each sequence, and returns a summary table of their counts and
+percentages.
+
+**Citation**
+
+This tool uses Biopython, so if you use this Galaxy tool in work leading to a
+scientific publication please cite the following paper:
+
+Cock et al (2009). Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+
+This tool is available to install into other Galaxy Instances via the Galaxy
+Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_composition
+    </help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btp163</citation>
+    </citations>
+</tool>
b
diff -r 000000000000 -r 087a226e501e tools/seq_composition/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_composition/tool_dependencies.xml Tue Nov 18 10:01:27 2014 -0500
b
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="biopython" version="1.64">
+        <repository changeset_revision="5477a05cc158" name="package_biopython_1_64" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>