# HG changeset patch
# User peterjc
# Date 1425660489 18000
# Node ID da64f6a9e32b55c30db378b694dc3fedc03eb6e8
# Parent 16ecf25d521f225f3fbc6115b3813d3ea3372004
Uploaded v0.2.0, adds desired count mode
diff -r 16ecf25d521f -r da64f6a9e32b test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff
Binary file test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff has changed
diff -r 16ecf25d521f -r da64f6a9e32b test-data/MID4_GLZRM4E04_rnd30_frclip.sample_C1.sff
Binary file test-data/MID4_GLZRM4E04_rnd30_frclip.sample_C1.sff has changed
diff -r 16ecf25d521f -r da64f6a9e32b test-data/ecoli.pair_sample_N100.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ecoli.pair_sample_N100.fastq Fri Mar 06 11:48:09 2015 -0500
@@ -0,0 +1,208 @@
+@frag_1
+AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTC
++
+##%')+.024JMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1_a
+GAGACATATTGCCCGTTGCAGTCAGAATGAAAAGCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMJ420.+)'%##
+@frag_200
+TGGTAATGGTGATGGTGGTGGTAATGGTGGTGCTAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_201
+TAGCACCACCATTACCACCACCATCACCATTACCAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_400
+TGGCCACCTGCCCCTGCCTGGCATTGCTTTCCAGAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_401
+TCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_600
+TTGGGCAAATTCCTGATCGACGAAAGTTTTCAATTG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_601
+AATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_800
+ATATCGACGGTAGATTCGAGGTAATGCCCCACTGCC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_801
+GCAGTGGGGCATTACCTCGAATCTACCGTCGATATT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1000
+TATAGACCCCGTCAACGTCCGTCCAAATCTCGCAAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1001
+TTGCGAGATTTGGACGGACGTTGACGGGGTCTATAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1200
+ATCACGGCTGGCACCAATGAGCGTACCTGGTGCTTG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1201
+AAGCACCAGGTACGCTCATTGGTGCCAGCCGTGATG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1400
+CAGTCGCTTTGTGGAACGCAGAAACTGATGCTGTAT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1401
+TACAGCATCAGTTTCTGCGTTCCACAAAGCGACTGT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1600
+ATCCCTGAGCAATGGCGACAATGTTGATATTGGCGC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1601
+CGCCAATATCAACATTGTCGCCATTGCTCAGGGATC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1800
+GACACGTAAGTCGATATGTTTATTCTTCAGCCAGCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1801
+GCTGGCTGAAGAATAAACATATCGACTTACGTGTCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2000
+AAGTCGGCATATTGATCCGCCACTGCCTGGCTGGAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2001
+TCCAGCCAGGCAGTGGCGGATCAATATGCCGACTTC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2200
+CGGAGAACTTCATCAATTCATCACCTGCATTGAGCA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2201
+GCTCAATGCAGGTGATGAATTGATGAAGTTCTCCGG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2400
+TTCAATATCCGCCAGCTCCAGTTCACGTCCCGTTTC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2401
+AAACGGGACGTGAACTGGAGCTGGCGGATATTGAAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2600
+GGATCATTACCATCCACTTCGGCAATCTTCACGCGG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2601
+CGCGTGAAGATTGCCGAAGTGGATGGTAATGATCCG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2800
+ATTGGCACTGGAAGCCGGGGCATAAACTTTAACCAT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2801
+TGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3000
+TGCTTACCCAGTTCCTGGCAAAAACGCTCCCAGCAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3001
+TGCTGGGAGCGTTTTTGCCAGGAACTGGGTAAGCAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3200
+ACGGTGCCACGTTGTCGTAATGAATGCTGCCGGAGA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3201
+CTCCGGCAGCATTCATTACGACAACGTGGCACCGTG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3400
+GTGAATGAAGCCTGCCAGATGTCGCCCGTGCGCAAT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3401
+TTGCGCACGGGCGACATCTGGCAGGCTTCATTCACG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3600
+ACGCGCTGGGCGGTTTCCGGCTTGTCACACAGAGCG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3601
+GCTCTGTGTGACAAGCCGGAAACCGCCCAGCGCGTT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3800
+GGTCGTGCGGAAAAAACAGCCCCTGATTTTTGCCCA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3801
+GGGCAAAAATCAGGGGCTGTTTTTTCCGCACGACCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4000
+CCCGTGGAACAATTCCAGACAACCGACATCGCTTTC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4001
+AAAGCGATGTCGGTTGTCTGGAATTGTTCCACGGGC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4200
+TTTTCTTGCAGTGGACTGATTTTGCCTCGTGGATAG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4201
+TATCCACGAGGCAAAATCAGTCCACTGCAAGAAAAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4400
+GCGGCAGCTGCGCAACAGCTTCAAAGTAGTAGCAAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4401
+TTGCTACTACTTTGAAGCTGTTGCGCAGCTGCCGCA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4600
+CATCGCGTTGGATAACGTCGCCTGAGTCGCTTTGGG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4601
+CCAAAGCGACTCAGGCGACGTTATCCAACGCGATGG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4800
+CCTGGATTCAACTGATCACGCAGCGCACGATAAGCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4801
+GCTTATCGTGCGCTGCGTGATCAGTTGAATCCAGGC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_5000
+AGATAATGAATAGATTTTACTGATGATTCATCATCA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_5001
+GATGATGAATCATCAGTAAAATCTATTCATTATCTC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMK
diff -r 16ecf25d521f -r da64f6a9e32b test-data/ecoli.sample_C10.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ecoli.sample_C10.fastq Fri Mar 06 11:48:09 2015 -0500
@@ -0,0 +1,40 @@
+@frag_1
+AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTC
++
+##%')+.024JMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_504
+TACGTTCGGCATCGCTGATATTGGGTAAAGCATCCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1008
+GTCGCAGGTATAGACCCCGTCAACGTCCGTCCAAAT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1512
+ATCACCTACCACCGAGATAATGGCCAGCCGTTCCGT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2016
+GAAACCTTCGCGCAGGAAGTCGGCATATTGATCCGC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2520
+TCCTTCATCACGGGCCTTCGCCACGCGCGCGGCAAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3024
+TCCAGGGTCATCGCCACTGGAATTTGCTTACCCAGT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3528
+ACCGCGCCGATTTCCGCGACCGCCTGCCGCGCCTGC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4032
+CGACCGCCGAAATCTTTAAATGCCAGCGTTGGCCCG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4536
+GGCACGGTATCGTTCACGTTGGTCGCAGCAATAAAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
diff -r 16ecf25d521f -r da64f6a9e32b test-data/get_orf_input.Suis_ORF.prot.pair_sample_C10.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_orf_input.Suis_ORF.prot.pair_sample_C10.fasta Fri Mar 06 11:48:09 2015 -0500
@@ -0,0 +1,119 @@
+>Streptococcus_suis|ORF1 length 457 aa, 1374 bp, from 1..1374 of Streptococcus_suis
+MNQEQLFWQRFIELAKVNFKPSIYDFYVADAKLLGINQQVANIFLNRPFKKDFWEKNFEE
+LMIAASFESYGEPLTIQYQFTEDEQEIRNTTNTRSSIVHQVQTLEPATPQETFKPVHSDI
+KSQYTFANFVQGDNNHWAKAAALAVSDNLGELYNPLFIFGGPGLGKTHILNAIGNKVLAD
+NPQARIKYVSSETFINEFLEHLRLNDMESFKKTYRNLDLLLIDDIQSLRNKATTQEEFFH
+TFNALHEKNKQIVLTSDRNPDHLDNLEERLVTRFKWGLTSEITPPDFETRIAILRNKCEN
+LPYNFTNETLSYLAGQFDSNVRDLEGALKDIHLIATMRQLSEISVEVAAEAIRSRKQTNP
+QNMVIPIEKIQTEVGNFYGVSLKELKGSKRVQHIVHARQVAMFLAREMTDNSLPKIGKEF
+GNRDHTTVMHAYNKIKTLLLDDENLEIEITSIKNKLR
+>Streptococcus_suis|ORF2 length 385 aa, 1158 bp, from 1507..2664 of Streptococcus_suis
+IINKGESMIQFSINKNIFLQALSITKRAISTKNAIPILSTVKITVTSEGITLTGSNGQIS
+IEHFISIQDENAGLLISSPGSILLEAGFFINVVSSMPDLVLDFNEIEQKQIVLTSGKSEI
+TLKGKEAEQYPRLQEVPTSKPLVLETKVLKQTINETAFAASTQESRPILTGVHFVLTENK
+NLKTVATDSHRMSQRKLVLDTSGDDFNVVIPSRSLREFTAVFTDDIETVEVFFSNNQILF
+RSEHISFYTRLLEGTYPDTDRLIPTEFKTTAIFDTANLRHSMERARLLSNATQNGTVKLE
+IANNVVSAHVNSPEVGRVNEELDTVEVSGEDLVISFNPTYLIEALKATTSEQVKISFISS
+VRPFTLIPNNEGEDFIQLVTPVRTN
+>Streptococcus_suis|ORF291 length 760 aa, 2283 bp, from complement(184307..186589) of Streptococcus_suis
+KRGEFMRFNQFSFIKKETSVYLQELDTLGFQLIPDASSKTNLETFVRKCHFLTANTDFAL
+SNMIAEWDTDLLTFFQSDRELTDQIFYQVAFQLLGFVPGMDYTDVMDFVEKSNFPIVYGD
+IIDNLYQLLNTRTKSGNTLIDQLVSDDLIPEDNHYHFFNGKSMATFSTKNLIREVVYVET
+PVDTAGTGQTDIVKLSILRPHFDGKIPAVITNSPYHQGVNDVASDKALHKMEGELAEKQV
+GTIQVKQASITKLDLDQRNLPVSPATEKLGHITSYSLNDYFLARGFASLHVSGVGTLGST
+GYMTSGDYQQVEGYKAVIDWLNGRTKAYTDHTRSLEVKADWANGKVATTGLSYLGTMSNA
+LATTGVDGLEVIIAEAGISSWYDYYRENGLVTSPGGYPGEDLDSLTALTYSKSLQAGDFL
+RNKAAYEKGLAAERAALDRTSGDYNQYWHDRNYLLHADRVKCEVVFTHGSQDWNVKPIHV
+WNMFHALPSHIKKHLFFHNGAHVYMNNWQSIDFRESMNALLSQKLLGYENNYQLPTVIWQ
+DNSGEQTWTTLDTFGGENETVLPLGTGSQTVANQYTQEDFERYGKSYSAFHQDLYAGKAN
+QISIELPVTEGLLLNGQVTLKLRVASSVAKGLLSAQLLDKGNKKRLAPIPAPKARLSLDN
+GRYHAQENLVELPYVEMPQRLVTKGFMNLQNRTDLMTVEEVVPGQWMNLTWKLQPTIYQL
+KKGDVLELILYTTDFECTVRDNSQWQIHLDLSQSQLILPH
+>Streptococcus_suis|ORF292 length 216 aa, 651 bp, from 185183..185833 of Streptococcus_suis
+AVGKDHLTLDPISVEQIIAVMPVLIVVTAGAVQGSTLGSQSFFVGCFIAEEVTCLQTLGV
+GQGGQAVQIFAWIATRAGHQPVFTVVVIPRGNPCFCDDDFQSVHASCCQGIGHGTEIRQS
+RRRYLTIGPIGLDLKRASVVCVGLGATVQPVNHRFIALHLLVVARCHVARRAQRANTRHM
+EAGKAASEEVVIEGVRSNVPQFFSSRADRQVPLVQV
+>Streptococcus_suis|ORF583 length 391 aa, 1176 bp, from 397805..398980 of Streptococcus_suis
+RKKMKKQFELIATAAAGLEAVVGREIRNLGYECQVENGRVRFQGDVKSIIETNIWLRSAD
+RIKIIVGQFPAKTFEELFQGVFNLDWENYLPLGCKFPISKAKCVKSKLHNEPSVQAISKK
+AVVKKLQKHFSRPEGVPLQEMGAEFKIEVSILKDVATVMIDTTGSSLFKRGYRVEKGGAP
+IKENMAAAILQLSNWYPDKPLIDPTCGSGTFCIEAAMLAKNIAPGLKRSFAFEEWPWVED
+QLVVALRKEAQASIKTDLVLDITGSDIDARMIEIAKKNAFAAGVEQDIVFKQMRVQDLRT
+DKINGVIISNPPYGERLLDDEAIVTLYREMGETFEPLKTWSKFILTSDELFETRFGQQAD
+KKRKLYNGTLKVDLYQFFGQRVKRQVQEVQG
+>Streptococcus_suis|ORF584 length 487 aa, 1464 bp, from 398981..400444 of Streptococcus_suis
+EDIVGEKNSHHLPLDEEKVLDFEVAKDLTIEEAVKKHKEIEAGVTEDDGLLDRYIKQHRA
+EIESQKFETKINHLPLVEVADEEKNQGHESAEEVEANESSLTEVSEEIAPIVEELSVTPM
+ETLEETVIASTVAMEGLSSVADDSSLELEEDETEDLDHSEGADRDQKKKFYFWSAVGLSM
+IGVMATALVWMNSVNKSNTATSSSSTSTSQTSSTASSSTDANVTAFEQLYNSFFTDSSLT
+KLKNSEFGKLAELKVLLEKLDKNSDSYTKAKEQYDHLEKAIAAIQAINGQFDKEVVVNGE
+IDTTATVKSGESLSATTTGISAVDSLLASVVNFGRSQQEVASATVASEAAVTRNQGADET
+VSTGVPATTEVASTTVSGSTTDFGIAVPAGVVLQRDRSRVPYNQAMIDDVNNEAWNFNPG
+ILENIVTISQQRGYITGNQYILEKVNIINGNGYYNMFKPDGTYLFSINCKTGYFVGNGAG
+HSDALDY
+>Streptococcus_suis|ORF873 length 343 aa, 1032 bp, from 605439..606470 of Streptococcus_suis
+TLGEETMTNVFKGRHFLAEKDFTRAELEWLIDFSAHLKDLKKRNIPHRYLEGKNIALLFE
+KTSTRTRAAFTVASIDLGAHPEYLGANDIQLGKKESTEDTAKVLGRMFDGIEFRGFSQKM
+VEELAEFSGVPVWNGLTDAWHPTQMLADYLTVKENFGKLEGLTLVYCGDGRNNVANSLLV
+TGAILGVNVHIFSPKELFPEEEVVALAEGFAKESGARVLITDNADEAVKGADVLYTDVWV
+SMGEEDKFAERVALLKPYQVNMELVKKAENENLIFLHCLPAFHDTNTVYGKDVAEKFGVE
+EMEVTDEVFRSKYARHFDQAENRMHTIKAVMAATLGDPFVPRV
+>Streptococcus_suis|ORF874 length 113 aa, 342 bp, from complement(605625..605966) of Streptococcus_suis
+VSNIVTAITTVNQSQAFQLAKVFFDSQVVRQHLSWVPCICQTIPYWHTGEFCQFFHHFLT
+ETTEFNTVEHTSQNFSSIFCRFFLTKLDVICTKIFWMGTKVNRCYCEGSTSTC
+>Streptococcus_suis|ORF1165 length 105 aa, 318 bp, from 811613..811930 of Streptococcus_suis
+AYNESVKRKECHLMKQVNMSKIINYLTILGLLILLSAFFLDNWIRDWFFPSSWGNVATML
+ILPLLGALILILSIYYKKLWTGLISIFLIISFPLIFGIGYFIFGP
+>Streptococcus_suis|ORF1166 length 125 aa, 378 bp, from 811867..812244 of Streptococcus_suis
+YLLNNLISSDIRYWLLYIWPLEGVVMNLTLLKRLNLVLYGIAIFLFVMLFLPIGQWFDIV
+NVNFKLTFFIIPFFGLASLPTAIYTKNVRQILLSVLLVALYFILFSLITALSGLFHLNFY
+SFFFK
+>Streptococcus_suis|ORF1455 length 114 aa, 345 bp, from 1026973..1027317 of Streptococcus_suis
+SCKLSLHIRWESWMGQGFYCYRFKLIHLRTNSNPFSFFRHLNSHFQHLRNEWTVMLPDSV
+LDQDISTSHCRCHHKGTRFDTILHHLMFCASQFFYTSNRNRLCTCPLNFCPHFV
+>Streptococcus_suis|ORF1456 length 116 aa, 351 bp, from complement(1027944..1028294) of Streptococcus_suis
+YGNACNSRPPTCDKSYSCWETLIYMGLNLVQFHFLISWYNGNMVISILQFFSHILFIYLA
+HHLLVTTVDWSRWLKVTGDNQRKINLLILFLAIALGYLVSTFFLELLMMGRSFANM
+>Streptococcus_suis|ORF1747 length 335 aa, 1008 bp, from complement(1225218..1226225) of Streptococcus_suis
+RMLNTDDTVTIYDVAREAGVSMATVSRVVNGNKNVKENTRKKVLEVIDRLDYRPNAVARG
+LASKKTTTVGVVIPNIANAYFATLAKGIDDIADMYKYNIVLANSDENDEKEINVVNTLFS
+KQVDGIIFMGYHLTDKIRAEFSRSRTPIVLAGTVDLEHQLPSVNIDYAAASVDAVNLLAK
+NNKKIAFVSGPLVDDINGKVRFAGYKQGLKDNGIEFNEGLVFESKYKYEEGYALAERILN
+AGATAAYVAEDEIAAGLLNGVSDMGIKVPEDFEIITSDDSLVTKFTRPNLTSINQPLYDI
+GAIAMRMLTKIMHKEELENREVVLNHGIKVRKSTK
+>Streptococcus_suis|ORF1748 length 377 aa, 1134 bp, from 1226384..1227517 of Streptococcus_suis
+TKISLFLPLHARKVSTMSKLHHVKSYLEANKMDLAIFSDPVSIYYLTGYHSDPHERHMML
+FVMPDHDSLLFLPALDVERAVATVDFPVAGYMDSENPWQIIKSKLPQKSFSAICAEFDNL
+NLTRYHGLQSIFSQPFSDITPLINTMKLIKSRDEIEKMLVAGEFADKAMQVGFNNISLDV
+TETDIIAQIEFEMKKQGISKMSFETMVLTGDNAANPHGIPSTNKIENNALLLFDLGVEAL
+GYTSDMTRTVAVGKPDQFKKDIYNLTLEAHMAAVNMIKPGVTAGEIDYAARSVIEKAGYG
+EYFNHRLGHGLGMSVHEFPSIMEGNDLVIEEGMCFSVEPGIYIPGKVGVRIEDCGYVTKN
+GFEVFTKTPKELLYFEG
+>Streptococcus_suis|ORF2037 length 234 aa, 705 bp, from complement(1422380..1423084) of Streptococcus_suis
+KSMTKTALITGVSSGIGLAQAGIFLENGWRVFGIDLASKPDLAGDFHFLQLDLTGDLSPV
+FSWCQSVDVLCNTAGILDDYRPHLDISEDELAQIFAVNFFAVTRLTRPYLQQMVDRQSGI
+IINMCSIASSLAGGGGSAYTASKHALAGFTKQLALDYAKDKVQIFGIAPGAVQTGMTQKD
+FEPGGLADWVADQTPIGRWTQPSEIAELTFMLATGKLASMQGQIITIDGGWSLK
+>Streptococcus_suis|ORF2038 length 112 aa, 339 bp, from 1422849..1423187 of Streptococcus_suis
+SSKMPAVLQRTSTDWHQEKTGDKSPVRSSCRKWKSPAKSGLLARSIPKTRQPFSKKIPAC
+ARPMPLETPVMRAVLVMDFYPVGRKDIARGRAPHGEAFTLAGHVDEEIGRRL
+>Streptococcus_suis|ORF2329 length 160 aa, 483 bp, from 1612284..1612766 of Streptococcus_suis
+LIETNWFHHLTGQEGLDVLFFHNLGFRITDQLYLEVRKFHLLQGLSQLLRRWSQESRVKG
+ARYIERNHPLDTCFLQQFNRLIHCSHLASDDDLGWCVVVGWGNNPRGNSRTDFFNQVDIC
+VENSNHLTSPCWRSQFHIFTTLSNQGNRIFKGQSSRCHQS
+>Streptococcus_suis|ORF2330 length 329 aa, 990 bp, from complement(1613050..1614039) of Streptococcus_suis
+ARKKDEGIMKTKITELLDIKYPIFQGGMAWVADGDLAGAVSNAGGLGIIGGGNAPKEVVK
+ANIDKVKSITDKPFGVNIMLLSPFADDIVDLVIEEGVKVVTTGAGNPGKYMERLHAAGIT
+VIPVVPSVALAKRMEKLGVDAVIAEGMEAGGHIGKLTTMTLVRQVVEAVSIPVIAAGGIA
+DGAGAAAAFMLGAEAVQVGTRFVVATESNAHQAYKEKVLKAKDIDTTVSASIVGHPVRAI
+KNKLSSAYAAAEKDFLAGKISADAIEELGAGALRNAVVDGDVTNGSVMAGQIAGLVSKEE
+SCEDILKDIYYGAAKVIREEASRWASVGE
+>Streptococcus_suis|ORF2619 length 107 aa, 324 bp, from 1802386..1802709 of Streptococcus_suis
+QLCVGSNPINSLFRRNFFVCCISSQSSCYVHTMWFVGIIVEIIVARYIIIAMGNFQCVCP
+CRRWSNVLNFRNDTIIQPHVFVLNIQTGVNDCNHHSATICLIFRTCF
+>Streptococcus_suis|ORF2620 length 192 aa, 579 bp, from complement(1803558..1804136) of Streptococcus_suis
+RLKIPCFQRKEVTMYDSFDKGWFVLQTYSGYENKVKENLLQRAHTYNMLENILRVEIPTQ
+TVQVEKNGEVKEVEENRFPGYVLVEMVMTDEAWFVVRNTPNVTGFVGSHGNRSKPTPLLE
+EEIRQILVSMGQTVQEFDIDVKVGDTVRIIDGAFTDYTGKITEIDNNKVKMVISMFGNDT
+IAEVNLSQIAEL
diff -r 16ecf25d521f -r da64f6a9e32b test-data/get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta Fri Mar 06 11:48:09 2015 -0500
@@ -0,0 +1,214 @@
+>Streptococcus_suis|ORF1 length 457 aa, 1374 bp, from 1..1374 of Streptococcus_suis
+MNQEQLFWQRFIELAKVNFKPSIYDFYVADAKLLGINQQVANIFLNRPFKKDFWEKNFEE
+LMIAASFESYGEPLTIQYQFTEDEQEIRNTTNTRSSIVHQVQTLEPATPQETFKPVHSDI
+KSQYTFANFVQGDNNHWAKAAALAVSDNLGELYNPLFIFGGPGLGKTHILNAIGNKVLAD
+NPQARIKYVSSETFINEFLEHLRLNDMESFKKTYRNLDLLLIDDIQSLRNKATTQEEFFH
+TFNALHEKNKQIVLTSDRNPDHLDNLEERLVTRFKWGLTSEITPPDFETRIAILRNKCEN
+LPYNFTNETLSYLAGQFDSNVRDLEGALKDIHLIATMRQLSEISVEVAAEAIRSRKQTNP
+QNMVIPIEKIQTEVGNFYGVSLKELKGSKRVQHIVHARQVAMFLAREMTDNSLPKIGKEF
+GNRDHTTVMHAYNKIKTLLLDDENLEIEITSIKNKLR
+>Streptococcus_suis|ORF2 length 385 aa, 1158 bp, from 1507..2664 of Streptococcus_suis
+IINKGESMIQFSINKNIFLQALSITKRAISTKNAIPILSTVKITVTSEGITLTGSNGQIS
+IEHFISIQDENAGLLISSPGSILLEAGFFINVVSSMPDLVLDFNEIEQKQIVLTSGKSEI
+TLKGKEAEQYPRLQEVPTSKPLVLETKVLKQTINETAFAASTQESRPILTGVHFVLTENK
+NLKTVATDSHRMSQRKLVLDTSGDDFNVVIPSRSLREFTAVFTDDIETVEVFFSNNQILF
+RSEHISFYTRLLEGTYPDTDRLIPTEFKTTAIFDTANLRHSMERARLLSNATQNGTVKLE
+IANNVVSAHVNSPEVGRVNEELDTVEVSGEDLVISFNPTYLIEALKATTSEQVKISFISS
+VRPFTLIPNNEGEDFIQLVTPVRTN
+>Streptococcus_suis|ORF201 length 360 aa, 1083 bp, from complement(128035..129117) of Streptococcus_suis
+SCHGGRRMTLFGKIKEVTELQSLPGFEGQVRNHIRQKITPHVDRIETDGLGGIFGIKDTA
+VENAPRILVVAHMDEVGFMISQIKPDGTFRVVELGGWNPLVVSSQAFTLQLQDGRTIPAI
+SGSVPPHLSRGANAPGMPAIADIIFDAGFANYDEAWAFGVRPGDVLVPKNETILTANGKN
+VISKAWDNRFGVLMVTELLESLSGHALPNQLIAGANVQEEVGLRGAHASTTKFNPDIFLA
+VDCSPAGDIYGDQGKIGDGTLLRFYDPGHIMLKNMKDFLLTTAEEAGVKFQYYCGKGGTD
+AGAAHLKNHGVPSTTIGVCARYIHSHQTLYSMDDFLEAQAFLQTIVKKLDRSTVDLIKNY
+>Streptococcus_suis|ORF202 length 106 aa, 321 bp, from 128792..129112 of Streptococcus_suis
+RVKAWLETTRGFQPPSSTTRKVPSGLIWLIIKPTSSMWATTRIRGAFSTAVSLIPKIPPS
+PSVSMRSTCGVIFWRMWLRTCPSNPGKLCNSVTSLIFPKRVILLPP
+>Streptococcus_suis|ORF401 length 120 aa, 363 bp, from 265643..266005 of Streptococcus_suis
+TTGTTSPIAPKWKASSKSLRVPTSEPTTLIPSSTVFTILRSMYSDGSPTATTYPPARTLS
+IAWLKATLETAVTTVECTPPPVISLIYPGTSSTSSPLIVTSAPTSLASSNLSLLMSTAIT
+>Streptococcus_suis|ORF402 length 201 aa, 606 bp, from 265741..266346 of Streptococcus_suis
+HSLHDTEVHVFRWKSDSYYISTSTNTVNSLVEGYFGNSCYNSRVYTATSNFFNISRNIFY
+FKSVDRHICTNFFGEFQFIIIDVYGDNMSVEDFFSVLYSKVSKSTSTIDSNPLTWFQVSF
+FNRFVASNASTSDRTCLSWIKTFWNFYCIVRCYNTLLSHTTVNRVACIFYGTAESFATGC
+TIFTHTTALEEPSNADTVTNF
+>Streptococcus_suis|ORF601 length 665 aa, 1998 bp, from 409896..411893 of Streptococcus_suis
+VMIQIGKIFAGRYRIVRQIGRGGMADVYLARDLILDGEEVAVKVLRTNYQTDQIAIQRFQ
+REARAMAELDHPNIVRISDIGEEDGQQYLAMEYVNGLDLKRYIKENAPLSNDVAVRIMGQ
+ILLAMRMAHTRGIVHRDLKPQNVLLTSNGVAKVTDFGIAVAFAETSLTQTNSMLGSVHYL
+SPEQARGSKATIQSDIYAMGIILFEMLTGRIPYDGDSAVTIALQHFQKPLPSVREENANV
+PQALENVVLKATAKKLNERYKSVAEMYADLASALSMDRQNEPRVELEGNKVDTKTLPKLS
+QANVETKVPHTNSSAQVSATDKGSGKKEVAKSGNKPVSKPRPGIRTRYKVLIGAILLTVI
+AAGLMFFNTPRTVTVPDVSGQTVEKATEMIEVAGLEVGNITEEATATVDEGLVIRTSPAA
+KTTRRQGSKIDIVVATAALASIPDVVDKESDTARQELEALGFQVTIKEEYSEKVAQGLVI
+KTDPGANSSAEKGAKITLYVSKGVAPQVVPNVVGKSQENATQILQTAGFSIGTITQEYSS
+SVTAGQVISTDPVANTELAKGSIINLVISKGKELIMPDLTSGNYTYSQARSQLQALGVNA
+ESIEKQEDRSYYSTTSDIVIGQYPAAGATIDGTVTLYVSVASTRTSSDSSAGSSTSTSTS
+TGSGQ
+>Streptococcus_suis|ORF602 length 120 aa, 363 bp, from complement(410593..410955) of Streptococcus_suis
+LLSRLCSVYVCLALVLKQAYFPTSPLLSYQTPYPSQKLEQNCLYAVLSFQHWLGRVSARF
+LYQPCSLLVQPWVHSDDPWIEPKLNLHTFLQPTYSARLIFLPLLLVQHFLRPEVRWHSLL
+>Streptococcus_suis|ORF801 length 428 aa, 1287 bp, from 561960..563246 of Streptococcus_suis
+KSSRDCESCLLLFVILKVMQADRRKTFGKMRIRINNLFFVAIAFMGIIISNSQVVLAIGK
+ASVIQYLSYLVLILCIVNDLLKNNKHIVVYKLGYLFLIIFLFTIGICQQILPITTKIYLS
+ISMMIISVLATLPISLIKDIDDFRRISNHLLFALFITSILGIMMGATMFTGAVEGIGFSQ
+GFNGGLTHKNFFGITILMGFVLTYLAYKYGSYKRTDRFILGLELFLILISNTRSVYLILL
+LFLFLVNLDKIKIEQRQWSTLKYISMLFCAIFLYYFFGFLITHSDSYAHRVNGLINFFEY
+YRNDWFHLMFGAADLAYGDLTLDYAIRVRRVLGWNGTLEMPLLSIMLKNGFIGLVGYGIV
+LYKLYRNVRILKTDNIKTIGKSVFIIVVLSATVENYIVNLSFVFMPICFCLLNSISTMES
+TINKQLQT
+>Streptococcus_suis|ORF802 length 333 aa, 1002 bp, from 563382..564383 of Streptococcus_suis
+RMEKVSIIVPIFNTEKYLRECLDSIISQSYTNLEILLIDDGSSDSSTDICLEYAEQDGRI
+KLFRLPNGGVSNARNYGIKNSTANYIMFVDSDDIVDGNIVESLYTCLKENDSDLSGGLLA
+TFDGNYQESELQKCQIDLEEIKEVRDLGNENFPNHYMSGIFNSPCCKLYKNIYINKGFDT
+EQWLGEDLLFNLNYLKNIKKVSYVNRNLYFARRGIQSTTNTFKKDVFIQLENLEEKTFDL
+FVKIFGGQYEFSVFKETLQWHIIYYSLLMFKNGDESLPKKLHIFKYLYNRHSLDTLSIKR
+TSSVFKRICKLIVANNLFKIFLNTLIREEKNND
+>Streptococcus_suis|ORF1001 length 374 aa, 1125 bp, from 694014..695138 of Streptococcus_suis
+HYLLFQGGILMKVFASPSRYIQGKHVLFQGAEAIGKLGTKPLILCDDLVYGIIGEKFLSY
+LVEEGMQVHRVAFNGEASDKEIQRVVEIGKEQASDVVIGLGGGKTIDSAKAIADLLGVPV
+VIAPTIASTDAPTSALSVIYSEEGAFERYIFYKKNPDLVLVDTAIICQAPPRLLASGIAD
+GLATWVEARAILQSNGTTMAGGGQTLAGIAIAQTCEQTLFEYGLQAMASCEAKVVTAALE
+NIVEANTLLSGLGFESAGLAAAHAIHNGFTALEGDIHHLTHGEKVAYGTLTQLFLENRPK
+EELEKYIRFYQALNLPTTLEELHLADASYEELLKVGQQATIEGETIHGMPFAISAEDVAE
+ALMAVDYYVRSLDK
+>Streptococcus_suis|ORF1002 length 366 aa, 1101 bp, from 695283..696383 of Streptococcus_suis
+RIDLKEISMAYVVAVVGATGAVGAQMIKMLEESTLPIEKVRFLASARSAGKTLQFKGQDI
+VIEETTETAFEGVDIALFSAGGSTSAKYAPYAVKAGAVVVDNTSYFRQNPDVPLVVPEVN
+AHALDAHNGIIACPNCSTIQMMVALEPVRQKWGLERIIVSTYQAVSGAGMGAILETQAQL
+RSVLNDGVEPKAVEANILPSGGDKKHYPIGFNAIPQIDLFTENDYTYEEMKMTKETKKIM
+EDDSIAVSATCVRIPVLSAHSESVYIETKEIAPIDEVKAAIASFPGAVLEDDVANQIYPQ
+AINAVGSRDTFVGRIRKDLDKENGIHMWVVSDNLLKGAAWNSVQIAETLHERGLVRPTAE
+LKFELK
+>Streptococcus_suis|ORF1201 length 144 aa, 435 bp, from 842957..843391 of Streptococcus_suis
+FQTIKEKSRLMNIKKLILTLLTLTLTIVPCACGNQSNSNDSQLSGTYSYEKGGIDGSEMG
+FEDEELTLHYELKVSGDENILNINLLSERGNNVKYLYSEKVTIDTDKQIISDSNGTELEY
+SVSGDSVTIPDLAGDSGETVTLKR
+>Streptococcus_suis|ORF1202 length 343 aa, 1032 bp, from 843537..844568 of Streptococcus_suis
+VKVMYIFETTEQNNSKANDFETKSLLYLMSFKSDSTDIDTFFVDCFNDITGASSDLLKLW
+DVQAKNISSLRPKTIGKSLITLFQNFISSVDFYEYILFIPKLKENYLMDISLTEFKIDNF
+KDIAKIQEGLEEEYKRRKKLGALNLKQLSQLNTFLEQIHFVTGDSSKAIYIKNIIQFKSN
+IRDDNFFESVFNEVRSKQTELKNINIHNISINSIEEVLKLNKHLTKRQLETLVVNRIIGV
+ELFKQRIPNDFFDVINDKSSSDRKDIIQDCNANLSRLLFDKNSNKKKFWSLLEQILILVE
+EKDDIYQILNRIKQYQIPKIINDDYTLLYLISMVKEGMEENAC
+>Streptococcus_suis|ORF1401 length 409 aa, 1230 bp, from 991071..992300 of Streptococcus_suis
+GDNMKYPTLLDRFLVYVKENTRSDENSTTTPSTQNQVEFAQNILLPEMERIGLQNVHYLP
+NGFAVGTLPANDPSLTRKIGFIAHMDTADFNAEGVNPQIIENYDGNPIALGTSGYELHPK
+DFPQLANYHGQILITTDGTTLLGSDDKSGIAEIMTAIEFLIQNPDIKHCEIRVGFGPDEE
+IGVGADKFDVKDFDVDFAYTMDGGPLGELQYETFSAAGAKIDFLGRNVHPGSAKDQMINA
+FQMAIDFHNALPETDRPEKTEGYEGFFHLMNMEGSVDTASTTYIIRDFEEEDFQARKQLM
+LDIAEKMNANFDTPRVIVNLHDQYYNMKKIIEKDMTPINIAKDVMENLGIKPLIEPVRGG
+TDGSKISFMGIPTPNIFAGGENMHGRFEFVSLETMEKAVDVILGIVAYK
+>Streptococcus_suis|ORF1402 length 144 aa, 435 bp, from 992392..992826 of Streptococcus_suis
+YNRTIKKKWSFIMTEETLAQGILIGIWGTTLLFSFIWYILVAISNYILFKKAGYAGWKSL
+IPIYNLYIQQCITFGYEKRWFILFLLIPLAGPLYGIYLVYNFGRSFGLSAVQAIFYVLLT
+PIFNLYIAFNDGSRYQGPQEFFID
+>Streptococcus_suis|ORF1601 length 141 aa, 426 bp, from complement(1127307..1127732) of Streptococcus_suis
+VHPLHGRSLLIYFDCFAYEGGGIMTIQALAMFLASLGFLYFIFRNINKNKILFEHAFMWI
+VIGFGLIVFALFDVIPIKLAYLFGFGLTSNFLLSVAIFVLLVIGFLHSMALSQQKQQIKN
+LIQEVSMAKKRISELEEHHAE
+>Streptococcus_suis|ORF1602 length 241 aa, 726 bp, from complement(1127663..1128388) of Streptococcus_suis
+REKMKVLMIIPAYNEEESILQTVQGIIDYKNSVNFQLDYVVINDGSTDSTKEILIQNKLN
+AVHLVQNLGIGGAVQTGYKYALDNDYDVAVQFDGDGQHDIRSLNGLIQPILVGQADMVIG
+SRFVGDTLSEFQTSFMRRFGIGVISNMIKLTTGNRIWDTTSGYRLGNRKVIAQFAKRYPI
+KYPEPESTVHLLKQNFQVVEAPANMFERAGGVSSITPFKSIRYMVEVCSSILIASLMKEG
+E
+>Streptococcus_suis|ORF1801 length 128 aa, 387 bp, from 1263312..1263698 of Streptococcus_suis
+RLHDSCSICFLFIHGNIAGNRPCKEIGILQNNPHVTAQAFTRIITDVFPINQYTSLLWII
+ETIEEIHNRRLTRPSMPNQSNCFSFFCSNGNIFQNWSVFFIAKVHVFKHDLPLFNFQNTI
+TVVLQLFF
+>Streptococcus_suis|ORF1802 length 578 aa, 1737 bp, from complement(1264661..1266397) of Streptococcus_suis
+RRYMFRLIFDYIKRHKWLYLLVAVTLIIYDATLLLPTQIIQRMVDILTKNELTQAILVQE
+MTLLLLVTVLNYATAFIWHLKLFQASVNFKFDMQQRAFKKLVTMRTPFYEKFRSGDVMTR
+FSTDVDGLMEMVGYGLMIVVYAGGMLAFIIPTMFFIDWKISLVALLPMLFMTLCIFFIGR
+KQDKAIDANREAVAQLNNEVLEVIEGIRVTRAYSKKANQKAQFQARTKQLAQGGDRITSL
+QSLYNPLATVCLGLSTIFVLLMGAQAVKAGQLTLGQVIALQLYVGSLLEPFWTLADFILV
+YQTGKTSFEKLQELIETGDDLEADGSKEIAELSSISFKNYSFSYPQAERASLQDINWTLK
+AGQTVGIVGKTGSGKTTLVRQFLRQYPIGQGNFFINHQSILDFKRSSIEEKIGYVPQEHI
+LFSRSVGENIALGKVASSSEEIEQAIATAAFSQDLKRMSDGLDTMIGERGVSISGGQKQR
+ISIARAFLREPDLLILDDSLSAVDARTERQIIQNIQKERAGKTNVIVTHRLSAVNHADWV
+LVLDEGRIVEEGRPADLLAQRGWYYEQYQRQQSQEGGE
+>Streptococcus_suis|ORF2001 length 415 aa, 1248 bp, from complement(1398025..1399272) of Streptococcus_suis
+EDFIMKMKTFLKCASVCAFASFLVACGNASSSDKVEIEYFSQKPEMQATLQEIIDDFEKE
+NPTIDVKFSNVPDAGTVLKTRMANNEAPDVINIYPQNADFKAYAADGRFLEIGDDAGLNH
+LKDGAVTPYLVNEKNYTLPLTANAYGIYYNKDKFKELGLEVPTTYAEFVALVDKIKADGS
+AAPFALSLNDAWSLNGYHQLAWVTVAGGFDGAEDILIRSAKGAIQDDATTKAVLERLQLL
+KDNGQKGATGALYADAVAAFAAGDALMLPQGTWAATAVNQQEPEFEYGMFTFPGDKEGGD
+YTIGAADLALSISADTEHPEESKKFLEYLSRPEVIQKYYDVDGSPTSVEGVDTEGKFEET
+AGVTQYAFTDKHVVWLQSEWESEEEFWNITVEMVKNPNSAELVKKLNAFFDPMKK
+>Streptococcus_suis|ORF2002 length 732 aa, 2199 bp, from complement(1399273..1401471) of Streptococcus_suis
+DHKEEIGEMNVIEIYNEKQIFHLKTREFSYIIQVLETGDLVHRYFGKKIEKFSDGNKITY
+LDRSFSPSPITGDRTYSLDVLPLEYSSNGLGDFRTSALDVRNEFGVTLDLKYKEYRLYKG
+KKELRGLPASFGNQEEVESLEIDLYDQLTDITVTLQYSVFEEASYLARSATIQTGKYPCK
+LEKVLSATLDFPHQDFIVHSLAGRYAYEKEWTQTPLTKGQYSIGSIRGASSHSRTPFLAL
+VSPDASEDKGDVYAAHLVYSGNFTAFVETTAMETSRLGLGLESHYFSWQLDKDDRFQTPE
+VLLSYTDKGFTGMTQNSHHFITKHLIRSSFVNKPRPILINNWEATYFEFTEEKILQLAQV
+ASRAGIELFVLDDGWFGKRNNDESSLGDWKVNLDKLPNGLNGLAERINELGMKFGLWFEP
+EMISIDSDLYREHPDWAIRTEGRLPIYSREQLVLDLTKQEVCDYIIDSVSSILESANISY
+VKWDMNRNITNIPEGLANDQRFEFHHRYMLGLYRVLDHLTKRFPDILFESCAGGGGRNDL
+GIMYYMPQAWASDDTDAIERLSIQEGTSLIYPPSSIGAHVSAVPNHQVGRITPLATRGNV
+AMMGGAFGYELDLTKLSEKELDEISQQIETYHSIRETIQFGQLYRLKKTSNTWAANYVSQ
+DKNQVVFTFVKILAKPEAPLLHVRLKGLDPDALYECPQLGETFYGDELMNIGLTMPHVQK
+DYFSVQYIFNKI
+>Streptococcus_suis|ORF2201 length 272 aa, 819 bp, from complement(1531599..1532417) of Streptococcus_suis
+DCSKIKIIDLAVGKLKLLSSKRKGAFMEIIRSKANHLVKQVKKLQQKKYRTSSYLIEGWH
+LLEEAMEAGANIEHIFVVEEYFEKVAGLANVTVVSPEIMQELADSKTPQGVVAQLALPSQ
+RLPETLDGKFLVLEDVQDPGNVGTMIRTADAAGFDGVFLSDKSADIYNMKVLRSMQGSHF
+HLPVYRMPISSILTALKSNQIQILATTLSSQSVDYKEITPHSSFALVMGNEGQGISDLVA
+DEADQLVHITMPGQAESLNVAIAAGILLFSFI
+>Streptococcus_suis|ORF2202 length 101 aa, 306 bp, from 1532445..1532750 of Streptococcus_suis
+MSCQKEKLMRKVKMIASGRVQGVGFRWSVQFLAVEIGDIYGRVWNNDDGTVTILAQSDNA
+EKLSHFIHEIRKGPSRMAKVIYLDVTLANFEDYKDFQVSYR
+>Streptococcus_suis|ORF2401 length 141 aa, 426 bp, from 1658030..1658455 of Streptococcus_suis
+ASITVPIARTVGSAFSSWISATKRTVSNNSSMFWLNLAEISTNSDSPPQAVEITPCSANS
+PMTRSGFAPGLSILLIATMIGTLAAFEWLIASIVCGMTPSSAATTRMVKSVTDAPRARIE
+VKAACPGVSKKVIFLPASSIW
+>Streptococcus_suis|ORF2402 length 266 aa, 801 bp, from 1658515..1659315 of Streptococcus_suis
+GVQQGCFTMVNVSHDSHNRWAFCHLFFIEVALFYEETLNICVIDLYLFFRFNTIINHEEF
+DSISIQRLVLSRHNSHKEEFFHNFSRFTFDSFCNFCDGHASSIFKFSWQFVELAFCDRFG
+RLVSLAFFIFLVVIPVTCSLISHLILTISISLLFPWTIFFVTIKVTFFIWSSLFLTTGIY
+SSFCNLLWYRCNKCRFHKWFAFHNRFFKLNFFWLLRLLFSFLSLTKTFFTGTSILRILFC
+FQSSSTRFEVNFRSCWFCSLSLFKAS
+>Streptococcus_suis|ORF2601 length 100 aa, 303 bp, from 1790150..1790452 of Streptococcus_suis
+LKDGYQRLVVEGFADIAETFLQTETNLMTTVIFIARHDDDRPIAFPLGSLNQVNMTLVHG
+SKGPKNNCYCLFHNLPFYCFLYFISYSFLKPKSRVFYIFL
+>Streptococcus_suis|ORF2602 length 823 aa, 2472 bp, from complement(1790482..1792953) of Streptococcus_suis
+ERGVVRMKISRGLQGVYEDAQLIAQRYSSDYLETWHLLLAFVINPDTVAGAILAEYPADV
+LDYERAVYMVMGRRYHEELESFFFLPSSKRVKELQVFAEKIAEIVKSKGLGTEHIFMGML
+LDKRSTASQILDQVGFHFEDSDDKVRFLDLRKNLEAKAGFTKEHLKAIRTMTKGGKPKQA
+TVGNMMGMTQSQSGGLEDYTRDLTALARSGQLEPVIGRDEEISRMLQILSRKTKNNPVLV
+GDAGVGKTALALGLAQRIANGEVPASLVNMRILELDLMNVIAGTRFRGDFEERMNNIIND
+IEEDGRVILFIDELHTIMGSGSGIDSILDAANILKPALSRGTLRTVGATTQDEYQKHIEK
+DAALVRRFAKVTIEEPSVADSVAILQGLKPAYEAHHKVTISDQAVVTAVAYAKRYLTSKN
+LPDSAIDLLDEASATVQNRAKGQVEEGGLTALDQALMAGKYKTVTQLLLKAQEAENQATS
+YSLEVTEEDILATLSRLSGIPVTKLSQTDAKKYLNLEQELHKRVIGQEEAISAVSRAIRR
+NQSGIRTGHRPIGSFMFLGPTGVGKTELAKALAEILFDDESALIRFDMSEYMEKFAASRL
+NGAPPGYVGYEEGGELTEKVRNKPYSVLLFDEVEKAHPDIFNVLLQVLDDGVLTDRKGRK
+VDFSNTVIIMTSNLGATALRDDKTVGFGALDLSKSQEHVEKRIFEALKKAYRPEFINRID
+EKVVFHSLTEADMQDVVKVMVKPLIAVAASKGITLKLQASALKLLAKEGYDPEMGARPLR
+RLLQTKLEDPLAEMLLRGELPAGVTLKVGVKAEQLKFDSVKAG
+>Streptococcus_suis|ORF2801 length 1006 aa, 3021 bp, from complement(1921434..1924454) of Streptococcus_suis
+TQTKEYEMIEFRKKAVQLASLMSVFFLCTYSFTDAMYIMAESLSTDGASTIRRTYIEDKK
+EDKDRLNIELVESLSSPKTIGQKITIDKQSLATQNFNEKGIVVITQKGLELKKDDLEKGW
+KLDESYNEKDLAITKSETEKRSLSNELDVLSKTVEELPVYGENYHSYRLLPTTELDYSAD
+NVSLTLSFTKVSEVIKGELVAVVDAEHIAYFKAEPSVFKEYSQVNEKPSSTEDVNVVSPS
+QDPPVSETKENVPDNPESQGSSTVPESEQAVDALVEQRGVICIKLTKSSSEQEEGIEDTE
+NEAIEGATFEVRNVESENLVYTGQTDKDGLLTISNLPLGNYAVIQKSTIDGYEISATKEV
+VELTVAQSRQTVSISNSPKNPLEGLMLNSILDSSLIPRSARVARSLLDTSLLDNPTVTGN
+ANATTTTTVFGNKTTTITREESNIKYIFKPITISIPGVYQSYSQDGVLKKKEVVVDSNTN
+TTKIIWEYTTTVGGVNSNITSIRNAFSTTTDSGLGEPKITSIMKDGVAITPNTTYYGNFD
+NFKSATDNLPVGNGTYVYTIETPVVIPSDNYSLDYRSEVTVDAPKGSKLTYNGTSVTLTQ
+KETRTLSTADTITLPAKNDGGPLGDLKVDTVNTSNTNRTIGKYRDNDDKVIEWTSSQLND
+TSTTQSFTFDVALDSSQAAHEYKVYIYEPSNGTYTETKAEKVATPGNQITVDNVPAGAVA
+LVKTVTNVKDEKVNHTISGAQLEALKGDIKIQKNWEADSDKVDVTFTVNGGSLTNRKETL
+SANNTQITIANVDKFSGMRSTATKKRIYYDVTEAVPSGYILSSAQTDWENLYYVFTNKKD
+NTTTPVFPPDTCGNYGVSSIDLVSINYVMYKSGSKIWGGFDGSMKMNLKIPAFARAGDSF
+TLELPPELKLSHVANPNVAWSTVSANGKVIAKVYHEKDNLIRFVLTTEAYSVQEYNGWFE
+IGVPTSNVIKINNRETTELYKTGVLPNLPEWYTTTTRNQTLIKRSR
+>Streptococcus_suis|ORF2802 length 252 aa, 759 bp, from complement(1925855..1926613) of Streptococcus_suis
+LEARMQQYFVNGRAPQGMFQISDKDTAKHMFSVMRLQAGDQIVLVFDDGIKRLARVVDSQ
+SQSVEIIEELTDNVELPISVTIAMGFPKGDKLEFVAQKATELGMSALWAFPADWSVVKWD
+GKKLAKKAEKLEKIAQGAAEQSKRNRIPAVRLFEKKSDFLAQLAGFDQIILAYEEAAKEG
+EQANLVKILSGLEIGQSVLVIVGPEGGVSPEEVAAFEGAGAVKTGLGPRILRAETAPLYA
+LSTISYATELLR
diff -r 16ecf25d521f -r da64f6a9e32b test-data/get_orf_input.Suis_ORF.prot.sample_C10.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_orf_input.Suis_ORF.prot.sample_C10.fasta Fri Mar 06 11:48:09 2015 -0500
@@ -0,0 +1,50 @@
+>Streptococcus_suis|ORF1 length 457 aa, 1374 bp, from 1..1374 of Streptococcus_suis
+MNQEQLFWQRFIELAKVNFKPSIYDFYVADAKLLGINQQVANIFLNRPFKKDFWEKNFEE
+LMIAASFESYGEPLTIQYQFTEDEQEIRNTTNTRSSIVHQVQTLEPATPQETFKPVHSDI
+KSQYTFANFVQGDNNHWAKAAALAVSDNLGELYNPLFIFGGPGLGKTHILNAIGNKVLAD
+NPQARIKYVSSETFINEFLEHLRLNDMESFKKTYRNLDLLLIDDIQSLRNKATTQEEFFH
+TFNALHEKNKQIVLTSDRNPDHLDNLEERLVTRFKWGLTSEITPPDFETRIAILRNKCEN
+LPYNFTNETLSYLAGQFDSNVRDLEGALKDIHLIATMRQLSEISVEVAAEAIRSRKQTNP
+QNMVIPIEKIQTEVGNFYGVSLKELKGSKRVQHIVHARQVAMFLAREMTDNSLPKIGKEF
+GNRDHTTVMHAYNKIKTLLLDDENLEIEITSIKNKLR
+>Streptococcus_suis|ORF292 length 216 aa, 651 bp, from 185183..185833 of Streptococcus_suis
+AVGKDHLTLDPISVEQIIAVMPVLIVVTAGAVQGSTLGSQSFFVGCFIAEEVTCLQTLGV
+GQGGQAVQIFAWIATRAGHQPVFTVVVIPRGNPCFCDDDFQSVHASCCQGIGHGTEIRQS
+RRRYLTIGPIGLDLKRASVVCVGLGATVQPVNHRFIALHLLVVARCHVARRAQRANTRHM
+EAGKAASEEVVIEGVRSNVPQFFSSRADRQVPLVQV
+>Streptococcus_suis|ORF583 length 391 aa, 1176 bp, from 397805..398980 of Streptococcus_suis
+RKKMKKQFELIATAAAGLEAVVGREIRNLGYECQVENGRVRFQGDVKSIIETNIWLRSAD
+RIKIIVGQFPAKTFEELFQGVFNLDWENYLPLGCKFPISKAKCVKSKLHNEPSVQAISKK
+AVVKKLQKHFSRPEGVPLQEMGAEFKIEVSILKDVATVMIDTTGSSLFKRGYRVEKGGAP
+IKENMAAAILQLSNWYPDKPLIDPTCGSGTFCIEAAMLAKNIAPGLKRSFAFEEWPWVED
+QLVVALRKEAQASIKTDLVLDITGSDIDARMIEIAKKNAFAAGVEQDIVFKQMRVQDLRT
+DKINGVIISNPPYGERLLDDEAIVTLYREMGETFEPLKTWSKFILTSDELFETRFGQQAD
+KKRKLYNGTLKVDLYQFFGQRVKRQVQEVQG
+>Streptococcus_suis|ORF874 length 113 aa, 342 bp, from complement(605625..605966) of Streptococcus_suis
+VSNIVTAITTVNQSQAFQLAKVFFDSQVVRQHLSWVPCICQTIPYWHTGEFCQFFHHFLT
+ETTEFNTVEHTSQNFSSIFCRFFLTKLDVICTKIFWMGTKVNRCYCEGSTSTC
+>Streptococcus_suis|ORF1165 length 105 aa, 318 bp, from 811613..811930 of Streptococcus_suis
+AYNESVKRKECHLMKQVNMSKIINYLTILGLLILLSAFFLDNWIRDWFFPSSWGNVATML
+ILPLLGALILILSIYYKKLWTGLISIFLIISFPLIFGIGYFIFGP
+>Streptococcus_suis|ORF1456 length 116 aa, 351 bp, from complement(1027944..1028294) of Streptococcus_suis
+YGNACNSRPPTCDKSYSCWETLIYMGLNLVQFHFLISWYNGNMVISILQFFSHILFIYLA
+HHLLVTTVDWSRWLKVTGDNQRKINLLILFLAIALGYLVSTFFLELLMMGRSFANM
+>Streptococcus_suis|ORF1747 length 335 aa, 1008 bp, from complement(1225218..1226225) of Streptococcus_suis
+RMLNTDDTVTIYDVAREAGVSMATVSRVVNGNKNVKENTRKKVLEVIDRLDYRPNAVARG
+LASKKTTTVGVVIPNIANAYFATLAKGIDDIADMYKYNIVLANSDENDEKEINVVNTLFS
+KQVDGIIFMGYHLTDKIRAEFSRSRTPIVLAGTVDLEHQLPSVNIDYAAASVDAVNLLAK
+NNKKIAFVSGPLVDDINGKVRFAGYKQGLKDNGIEFNEGLVFESKYKYEEGYALAERILN
+AGATAAYVAEDEIAAGLLNGVSDMGIKVPEDFEIITSDDSLVTKFTRPNLTSINQPLYDI
+GAIAMRMLTKIMHKEELENREVVLNHGIKVRKSTK
+>Streptococcus_suis|ORF2038 length 112 aa, 339 bp, from 1422849..1423187 of Streptococcus_suis
+SSKMPAVLQRTSTDWHQEKTGDKSPVRSSCRKWKSPAKSGLLARSIPKTRQPFSKKIPAC
+ARPMPLETPVMRAVLVMDFYPVGRKDIARGRAPHGEAFTLAGHVDEEIGRRL
+>Streptococcus_suis|ORF2329 length 160 aa, 483 bp, from 1612284..1612766 of Streptococcus_suis
+LIETNWFHHLTGQEGLDVLFFHNLGFRITDQLYLEVRKFHLLQGLSQLLRRWSQESRVKG
+ARYIERNHPLDTCFLQQFNRLIHCSHLASDDDLGWCVVVGWGNNPRGNSRTDFFNQVDIC
+VENSNHLTSPCWRSQFHIFTTLSNQGNRIFKGQSSRCHQS
+>Streptococcus_suis|ORF2620 length 192 aa, 579 bp, from complement(1803558..1804136) of Streptococcus_suis
+RLKIPCFQRKEVTMYDSFDKGWFVLQTYSGYENKVKENLLQRAHTYNMLENILRVEIPTQ
+TVQVEKNGEVKEVEENRFPGYVLVEMVMTDEAWFVVRNTPNVTGFVGSHGNRSKPTPLLE
+EEIRQILVSMGQTVQEFDIDVKVGDTVRIIDGAFTDYTGKITEIDNNKVKMVISMFGNDT
+IAEVNLSQIAEL
diff -r 16ecf25d521f -r da64f6a9e32b tools/sample_seqs/README.rst
--- a/tools/sample_seqs/README.rst Thu Mar 27 12:13:22 2014 -0400
+++ b/tools/sample_seqs/README.rst Fri Mar 06 11:48:09 2015 -0500
@@ -39,11 +39,12 @@
-You will also need to install Biopython 1.62 or later. If you want to run
-the unit tests, include this line in ``tools_conf.xml.sample`` and the sample
-FASTA files under the ``test-data`` directory. Then::
+You will also need to install Biopython 1.62 or later.
- ./run_functional_tests.sh -id sample_seqs
+If you wish to run the unit tests, also move/copy the ``test-data/`` files
+under Galaxy's ``test-data/`` folder. Then::
+
+ ./run_tests.sh -id sample_seqs
That's it.
@@ -55,6 +56,13 @@
Version Changes
------- ----------------------------------------------------------------------
v0.0.1 - Initial version.
+v0.1.1 - Using optparse to provide a proper command line API.
+v0.1.2 - Interleaved mode for working with paired records.
+ - Tool definition now embeds citation information.
+v0.2.0 - Option to give number of sequences (or pairs) desired.
+ This works by first counting all your sequences, then calculating
+ the percentage required in order to sample them uniformly (evenly).
+ This makes two passes through the input and is therefore slower.
======= ======================================================================
@@ -67,7 +75,7 @@
For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
the following command from the Galaxy root folder::
- $ tar -czf sample_seqs.tar.gz tools/sample_seqs/README.rst tools/sample_seqs/sample_seqs.py tools/sample_seqs/sample_seqs.xml tools/sample_seqs/tool_dependencies.xml test-data/ecoli.fastq test-data/ecoli.sample_N100.fastq test-data/get_orf_input.Suis_ORF.prot.fasta test-data/get_orf_input.Suis_ORF.prot.sample_N100.fasta test-data/MID4_GLZRM4E04_rnd30_frclip.sff test-data/MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff
+ $ tar -czf sample_seqs.tar.gz tools/sample_seqs/README.rst tools/sample_seqs/sample_seqs.py tools/sample_seqs/sample_seqs.xml tools/sample_seqs/tool_dependencies.xml test-data/ecoli.fastq test-data/ecoli.sample_N100.fastq test-data/ecoli.pair_sample_N100.fastq test-data/ecoli.sample_C10.fastq test-data/get_orf_input.Suis_ORF.prot.fasta test-data/get_orf_input.Suis_ORF.prot.sample_N100.fasta test-data/get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta test-data/get_orf_input.Suis_ORF.prot.sample_C10.fasta test-data/get_orf_input.Suis_ORF.prot.pair_sample_C10.fasta test-data/MID4_GLZRM4E04_rnd30_frclip.sff test-data/MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff test-data/MID4_GLZRM4E04_rnd30_frclip.sample_C1.sff
Check this worked::
@@ -78,10 +86,18 @@
tools/sample_seqs/tool_dependencies.xml
test-data/ecoli.fastq
test-data/ecoli.sample_N100.fastq
+ test-data/ecoli.pair_sample_N100.fastq
+ test-data/ecoli.sample_C10.fastq
test-data/get_orf_input.Suis_ORF.prot.fasta
test-data/get_orf_input.Suis_ORF.prot.sample_N100.fasta
+ test-data/get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta
+ test-data/get_orf_input.Suis_ORF.prot.sample_C10.fasta
+ test-data/get_orf_input.Suis_ORF.prot.pair_sample_C10.fasta
test-data/MID4_GLZRM4E04_rnd30_frclip.sff
test-data/MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff
+ test-data/MID4_GLZRM4E04_rnd30_pair_sample.sff
+ test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff
+ test-data/MID4_GLZRM4E04_rnd30_frclip.sample_C1.sff
Licence (MIT)
diff -r 16ecf25d521f -r da64f6a9e32b tools/sample_seqs/sample_seqs.py
--- a/tools/sample_seqs/sample_seqs.py Thu Mar 27 12:13:22 2014 -0400
+++ b/tools/sample_seqs/sample_seqs.py Fri Mar 06 11:48:09 2015 -0500
@@ -2,46 +2,136 @@
"""Sub-sample sequence from a FASTA, FASTQ or SFF file.
This tool is a short Python script which requires Biopython 1.62 or later
-for SFF file support. If you use this tool in scientific work leading to a
+for sequence parsing. If you use this tool in scientific work leading to a
publication, please cite the Biopython application note:
Cock et al 2009. Biopython: freely available Python tools for computational
molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
-This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute
+This script is copyright 2014-2015 by Peter Cock, The James Hutton Institute
(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
See accompanying text file for licence details (MIT license).
-This is version 0.1.0 of the script, use -v or --version to get the version.
+Use -v or --version to get the version, -h or --help for help.
"""
import os
import sys
+from optparse import OptionParser
-def stop_err(msg, err=1):
+
+def sys_exit(msg, err=1):
sys.stderr.write(msg.rstrip() + "\n")
sys.exit(err)
-if "-v" in sys.argv or "--version" in sys.argv:
- print("v0.1.0")
+#Parse Command Line
+usage = """Use as follows:
+
+$ python sample_seqs.py [options]
+
+e.g. Sample 20% of the reads:
+
+$ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq
+
+This samples uniformly though the file, rather than at random, and therefore
+should be reproducible.
+"""
+parser = OptionParser(usage=usage)
+parser.add_option('-i', '--input', dest='input',
+ default=None, help='Input sequences filename',
+ metavar="FILE")
+parser.add_option('-f', '--format', dest='format',
+ default=None,
+ help='Input sequence format (e.g. fasta, fastq, sff)')
+parser.add_option('-o', '--output', dest='output',
+ default=None, help='Output sampled sequenced filename',
+ metavar="FILE")
+parser.add_option('-p', '--percent', dest='percent',
+ default=None,
+ help='Take this percent of the reads')
+parser.add_option('-n', '--everyn', dest='everyn',
+ default=None,
+ help='Take every N-th read')
+parser.add_option('-c', '--count', dest='count',
+ default=None,
+ help='Take exactly N reads')
+parser.add_option("--interleaved", dest="interleaved",
+ default=False, action="store_true",
+ help="Input is interleaved reads, preserve the pairings")
+parser.add_option("-v", "--version", dest="version",
+ default=False, action="store_true",
+ help="Show version and quit")
+options, args = parser.parse_args()
+
+if options.version:
+ print("v0.2.0")
sys.exit(0)
-#Parse Command Line
-if len(sys.argv) < 5:
- stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...")
-seq_format, in_file, out_file, mode = sys.argv[1:5]
+in_file = options.input
+out_file = options.output
+interleaved = options.interleaved
+
+if not in_file:
+ sys_exit("Require an input filename")
if in_file != "/dev/stdin" and not os.path.isfile(in_file):
- stop_err("Missing input file %r" % in_file)
+ sys_exit("Missing input file %r" % in_file)
+if not out_file:
+ sys_exit("Require an output filename")
+if not options.format:
+ sys_exit("Require the sequence format")
+seq_format = options.format.lower()
+
+
+def count_fasta(filename):
+    """Return the number of FASTA records found in the named file."""
+    from Bio.SeqIO.FastaIO import SimpleFastaParser
+    with open(filename) as handle:
+        # Only the number of records matters, the parsed values are discarded.
+        return sum(1 for _record in SimpleFastaParser(handle))
+
+
+def count_fastq(filename):
+    """Return the number of FASTQ records found in the named file."""
+    from Bio.SeqIO.QualityIO import FastqGeneralIterator
+    with open(filename) as handle:
+        # Only the number of records matters, the parsed values are discarded.
+        return sum(1 for _record in FastqGeneralIterator(handle))
+
-if mode == "everyNth":
- if len(sys.argv) != 6:
- stop_err("If using everyNth, just need argument N (integer, at least 2)")
+def count_sff(filename):
+    """Return the number of reads in the named SFF file."""
+    from Bio import SeqIO
+    # If the SFF file has a built in index (which is normal), that index
+    # is parsed, which is quicker than scanning the whole file.
+    sff_index = SeqIO.index(filename, "sff")
+    return len(sff_index)
+
+
+def count_sequences(filename, format):
+    """Return the number of records in filename, read as the given format.
+
+    format is compared against "sff", "fasta" and the prefix "fastq";
+    any other value is reported through sys_exit.
+    """
+    # Dispatch on the argument itself, not the module-level seq_format
+    # global, so the function honours whatever format the caller passes.
+    if format == "sff":
+        return count_sff(filename)
+    elif format == "fasta":
+        return count_fasta(filename)
+    elif format.startswith("fastq"):
+        return count_fastq(filename)
+    else:
+        sys_exit("Unsupported file type %r" % format)
+
+
+# The sampling modes -p (percent), -n (every N-th) and -c (count) are
+# mutually exclusive; each message names the pair actually combined.
+if options.percent and options.everyn:
+    sys_exit("Cannot combine -p and -n options")
+elif options.everyn and options.count:
+    sys_exit("Cannot combine -n and -c options")
+elif options.percent and options.count:
+    sys_exit("Cannot combine -p and -c options")
+elif options.everyn:
try:
- N = int(sys.argv[5])
+ N = int(options.everyn)
except:
- stop_err("Bad N argument %r" % sys.argv[5])
+ sys_exit("Bad -n argument %r" % options.everyn)
if N < 2:
- stop_err("Bad N argument %r" % sys.argv[5])
+ sys_exit("Bad -n argument %r" % options.everyn)
if (N % 10) == 1:
sys.stderr.write("Sampling every %ist sequence\n" % N)
elif (N % 10) == 2:
@@ -57,15 +147,13 @@
count += 1
if count % N == 1:
yield record
-elif mode == "percentage":
- if len(sys.argv) != 6:
- stop_err("If using percentage, just need percentage argument (float, range 0 to 100)")
+elif options.percent:
try:
- percent = float(sys.argv[5]) / 100.0
+ percent = float(options.percent) / 100.0
except:
- stop_err("Bad percent argument %r" % sys.argv[5])
+ sys_exit("Bad -p percent argument %r" % options.percent)
if percent <= 0.0 or 1.0 <= percent:
- stop_err("Bad percent argument %r" % sys.argv[5])
+ sys_exit("Bad -p percent argument %r" % options.percent)
sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
def sampler(iterator):
global percent
@@ -76,8 +164,88 @@
if percent * count > taken:
taken += 1
yield record
+elif options.count:
+ try:
+ N = int(options.count)
+ except:
+ sys_exit("Bad -c count argument %r" % options.count)
+ if N < 1:
+ sys_exit("Bad -c count argument %r" % options.count)
+ total = count_sequences(in_file, seq_format)
+ print("Input file has %i sequences" % total)
+ if interleaved:
+ # Paired
+ if total % 2:
+ sys_exit("Paired mode, but input file has an odd number of sequences: %i"
+ % total)
+ elif N > total // 2:
+ sys_exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)."
+ % (N, total // 2, total))
+ total = total // 2
+ if N == 1:
+ sys.stderr.write("Sampling just first sequence pair!\n")
+ elif N == total:
+ sys.stderr.write("Taking all the sequence pairs\n")
+ else:
+ sys.stderr.write("Sampling %i sequence pairs\n" % N)
+ else:
+ # Not paired
+ if total < N:
+ sys_exit("Requested %i sequences, but file only has %i." % (N, total))
+ if N == 1:
+ sys.stderr.write("Sampling just first sequence!\n")
+ elif N == total:
+ sys.stderr.write("Taking all the sequences\n")
+ else:
+ sys.stderr.write("Sampling %i sequences\n" % N)
+ if N == total:
+ def sampler(iterator):
+ """Dummy filter to filter nothing, taking everything."""
+ global N
+ taken = 0
+ for record in iterator:
+ taken += 1
+ yield record
+ assert taken == N, "Picked %i, wanted %i" % (taken, N)
+ else:
+ def sampler(iterator):
+ # Mimic the percentage sampler, with double check on final count
+ global N, total
+ # Do we need a floating point fudge factor epsilon?
+ # i.e. What if percentage comes out slightly too low, and
+ # we could end up missing last few desired sequences?
+ percentage = float(N) / float(total)
+ #print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f"
+ # % (N, total, percentage * 100.0))
+ count = 0
+ taken = 0
+ for record in iterator:
+ count += 1
+ # Do we need the extra upper bound?
+ if percentage * count > taken and taken < N:
+ taken += 1
+ yield record
+ elif total - count + 1 <= N - taken:
+ # remaining records (including this one) <= what we still need.
+ # This is a safety check for floating point edge cases where
+ # we need to take all remaining sequences to meet target
+ taken += 1
+ yield record
+ assert taken == N, "Picked %i, wanted %i" % (taken, N)
else:
- stop_err("Unsupported mode %r" % mode)
+ sys_exit("Must use either -n, -p or -c")
+
+
+def pair(iterator):
+    """Batch an iterator of records into (first, second) tuples.
+
+    Stops cleanly when the input is exhausted on a pair boundary.
+    Raises ValueError if a final record is left without a partner.
+    """
+    iterator = iter(iterator)
+    for a in iterator:
+        try:
+            b = next(iterator)
+        except StopIteration:
+            # Must not escape the generator: under PEP 479 (Python 3.7+) it
+            # would surface as RuntimeError, and on older Pythons the
+            # unpaired record `a` was silently dropped.
+            raise ValueError("Odd number of records?")
+        if not b:
+            # Retained from the original: an empty second record ends the
+            # stream, and is only acceptable if the first is empty too.
+            assert not a, "Odd number of records?"
+            break
+        yield (a, b)
+
def raw_fasta_iterator(handle):
"""Yields raw FASTA records as multi-line strings."""
@@ -113,46 +281,46 @@
if not line:
return # StopIteration
-def fasta_filter(in_file, out_file, iterator_filter):
+def fasta_filter(in_file, out_file, iterator_filter, inter):
count = 0
#Galaxy now requires Python 2.5+ so can use with statements,
with open(in_file) as in_handle:
with open(out_file, "w") as pos_handle:
- for record in iterator_filter(raw_fasta_iterator(in_handle)):
- count += 1
- pos_handle.write(record)
+ if inter:
+ for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))):
+ count += 1
+ pos_handle.write(r1)
+ pos_handle.write(r2)
+ else:
+ for record in iterator_filter(raw_fasta_iterator(in_handle)):
+ count += 1
+ pos_handle.write(record)
return count
-try:
- from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
- def fastq_filter(in_file, out_file, iterator_filter):
- count = 0
- #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
- reader = fastqReader(open(in_file, "rU"))
- writer = fastqWriter(open(out_file, "w"))
- for record in iterator_filter(reader):
- count += 1
- writer.write(record)
- writer.close()
- reader.close()
- return count
-except ImportError:
- from Bio.SeqIO.QualityIO import FastqGeneralIterator
- def fastq_filter(in_file, out_file, iterator_filter):
- count = 0
- with open(in_file) as in_handle:
- with open(out_file, "w") as pos_handle:
+
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+def fastq_filter(in_file, out_file, iterator_filter, inter):
+ count = 0
+ with open(in_file) as in_handle:
+ with open(out_file, "w") as pos_handle:
+ if inter:
+ for r1, r2 in iterator_filter(pair(FastqGeneralIterator(in_handle))):
+ count += 1
+ pos_handle.write("@%s\n%s\n+\n%s\n" % r1)
+ pos_handle.write("@%s\n%s\n+\n%s\n" % r2)
+ else:
for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)):
count += 1
pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
- return count
+ return count
-def sff_filter(in_file, out_file, iterator_filter):
+
+def sff_filter(in_file, out_file, iterator_filter, inter):
count = 0
try:
from Bio.SeqIO.SffIO import SffIterator, SffWriter
except ImportError:
- stop_err("SFF filtering requires Biopython 1.54 or later")
+ sys_exit("SFF filtering requires Biopython 1.54 or later")
try:
from Bio.SeqIO.SffIO import ReadRocheXmlManifest
except ImportError:
@@ -167,17 +335,26 @@
with open(out_file, "wb") as out_handle:
writer = SffWriter(out_handle, xml=manifest)
in_handle.seek(0) #start again after getting manifest
- count = writer.write_file(iterator_filter(SffIterator(in_handle)))
- #count = writer.write_file(SffIterator(in_handle))
+ if inter:
+ from itertools import chain
+ count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle)))))
+ assert count % 2 == 0, "Odd number of records? %i" % count
+ count /= 2
+ else:
+ count = writer.write_file(iterator_filter(SffIterator(in_handle)))
+ #count = writer.write_file(SffIterator(in_handle))
return count
-if seq_format.lower()=="sff":
- count = sff_filter(in_file, out_file, sampler)
-elif seq_format.lower()=="fasta":
- count = fasta_filter(in_file, out_file, sampler)
-elif seq_format.lower().startswith("fastq"):
- count = fastq_filter(in_file, out_file, sampler)
+if seq_format == "sff":
+ count = sff_filter(in_file, out_file, sampler, interleaved)
+elif seq_format == "fasta":
+ count = fasta_filter(in_file, out_file, sampler, interleaved)
+elif seq_format.startswith("fastq"):
+ count = fastq_filter(in_file, out_file, sampler, interleaved)
else:
- stop_err("Unsupported file type %r" % seq_format)
+ sys_exit("Unsupported file type %r" % seq_format)
-sys.stderr.write("Sampled %i records\n" % count)
+if interleaved:
+ sys.stderr.write("Selected %i pairs\n" % count)
+else:
+ sys.stderr.write("Selected %i records\n" % count)
diff -r 16ecf25d521f -r da64f6a9e32b tools/sample_seqs/sample_seqs.xml
--- a/tools/sample_seqs/sample_seqs.xml Thu Mar 27 12:13:22 2014 -0400
+++ b/tools/sample_seqs/sample_seqs.xml Fri Mar 06 11:48:09 2015 -0500
@@ -1,18 +1,21 @@
-
+e.g. to reduce coverage
- biopython
+ biopythonBiosample_seqs.py --version
+sample_seqs.py -f "$input_file.ext" -i "$input_file" -o "$output_file"
#if str($sampling.type) == "everyNth":
-sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.every_n}"
+-n "${sampling.every_n}"
#elif str($sampling.type) == "percentage":
-sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.percent}"
-#else:
-##Should give an error about invalid sampling type:
-sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}"
+-p "${sampling.percent}"
+#else
+-c "${sampling.count}"
+#end if
+#if $interleaved
+--interleaved
#end if
@@ -24,8 +27,9 @@
-
-
+
+
+
@@ -34,7 +38,11 @@
+
+
+
+
@@ -53,6 +61,13 @@
+
+
+
+
+
+
+
@@ -65,35 +80,108 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
**What it does**
Takes an input file of sequences (typically FASTA or FASTQ, but also
Standard Flowgram Format (SFF) is supported), and returns a new sequence
-file sub-sampling from this (in the same format).
+file sub-sampling uniformly from this (in the same format, preserving the
+input order and selecting sequences evenly through the input file).
-Several sampling modes are supported, all designed to be non-random. This
-allows reproducibility, and also works on paired sequence files. Also
-note that by sampling uniformly through the file, this avoids any bias
-should reads in any part of the file are of lesser quality (e.g. one part
-of the slide).
+Several sampling modes are supported, all designed to do non-random
+uniform sampling (i.e. evenly through the input file). This allows
+reproducibility, and also works on paired sequence files (run the tool
+twice, once on each file using the same settings).
-The simplest mode is to take every N-th sequence, for example taking
+By sampling uniformly (evenly) through the file, this avoids any bias
+should reads in any part of the file be of lesser quality (e.g. for
+high throughput sequencing the reads at the start and end of the file
+can be of lower quality).
+
+The simplest mode is to take every *N*-th sequence, for example taking
every 2nd sequence would sample half the file - while taking every 5th
sequence would take 20% of the file.
+The target count method picks *N* sequences from the input file, which
+again will be distributed uniformly (evenly) through the file. This works
+by first counting the number of records, then calculating the desired
+percentage of sequences to take. Note if your input file has exactly
+*N* sequences this selects them all (effectively copying the input file).
+If your input file has fewer than *N* sequences, this is treated as an
+error.
+
+If you tick the interleaved option, the file is processed as pairs of
+records to ensure your read pairs are not separated by sampling.
+For example using 20% would take every 5th pair of records, or you
+could request 1000 read pairs.
+
+.. class:: warningmark
+
+Note interleaved/pair mode does *not* actually check your read names
+match a known pair naming scheme!
**Example Usage**
@@ -103,6 +191,14 @@
Taking every 3rd read would reduce the estimated coverage to about x66,
and would preserve the pairing as well.
+Similarly, if you had some Illumina paired end data interleaved into one
+file with an estimated x200 coverage, you would run this tool in
+interleaved mode, taking every 3rd read pair. This would again reduce
+the estimated coverage to about x66, while preserving the read pairing.
+
+Suppose you have a transcriptome assembly, and wish to look at the
+species distribution of the top BLAST hits for an initial quality check.
+Rather than using all your sequences, you could pick 1000 only for this.
**Citation**
@@ -116,4 +212,7 @@
This tool is available to install into other Galaxy Instances via the Galaxy
Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/sample_seqs
+
+ 10.1093/bioinformatics/btp163
+
diff -r 16ecf25d521f -r da64f6a9e32b tools/sample_seqs/tool_dependencies.xml
--- a/tools/sample_seqs/tool_dependencies.xml Thu Mar 27 12:13:22 2014 -0400
+++ b/tools/sample_seqs/tool_dependencies.xml Fri Mar 06 11:48:09 2015 -0500
@@ -1,6 +1,6 @@
-
-
+
+