Mercurial > repos > arkarachai-fungtammasan > microsatellite_ngs
changeset 6:dccd7a3ee717
removing unnecessary files
| author | devteam@galaxyproject.org | 
|---|---|
| date | Wed, 22 Apr 2015 12:22:05 -0400 | 
| parents | b27006b0a953 | 
| children | 3c05abb4452e | 
| files | test-data/.DS_Store test-data/GenotypeTRcorrection.py test-data/GenotypingSTR.xml test-data/PEsortedSAM2readprofile.py test-data/PEsortedSAM2readprofile.xml test-data/README.md test-data/STR-FM/.DS_Store test-data/STR-FM/.git/COMMIT_EDITMSG test-data/STR-FM/.git/FETCH_HEAD test-data/STR-FM/.git/HEAD test-data/STR-FM/.git/ORIG_HEAD test-data/STR-FM/.git/config test-data/STR-FM/.git/description test-data/STR-FM/.git/hooks/applypatch-msg.sample test-data/STR-FM/.git/hooks/commit-msg.sample test-data/STR-FM/.git/hooks/post-update.sample test-data/STR-FM/.git/hooks/pre-applypatch.sample test-data/STR-FM/.git/hooks/pre-commit.sample test-data/STR-FM/.git/hooks/pre-push.sample test-data/STR-FM/.git/hooks/pre-rebase.sample test-data/STR-FM/.git/hooks/prepare-commit-msg.sample test-data/STR-FM/.git/hooks/update.sample test-data/STR-FM/.git/index test-data/STR-FM/.git/info/exclude test-data/STR-FM/.git/logs/HEAD test-data/STR-FM/.git/logs/refs/heads/master test-data/STR-FM/.git/logs/refs/remotes/origin/HEAD test-data/STR-FM/.git/logs/refs/remotes/origin/master test-data/STR-FM/.git/objects/00/65e52bd133e27ebd75aa0d23697f7e1dc887cb test-data/STR-FM/.git/objects/00/9d1901c2a9f8ce435cda891df89b4e2d11e895 test-data/STR-FM/.git/objects/0c/532395e56679a7f181e9ff6e329bf88b55030d test-data/STR-FM/.git/objects/15/6730cc56c5c045e7f73a940d0da23aa5083add test-data/STR-FM/.git/objects/19/637cfe7d85e2a9b41e6272003dcca6e01d9b58 test-data/STR-FM/.git/objects/1c/4b24809f0d53625b803af1d880f737aef0cb5f test-data/STR-FM/.git/objects/1f/006fe62353d7163b93e72d28b5c15df1ebf982 test-data/STR-FM/.git/objects/1f/235bb203a8f4961f62c75d77cd64b50848b1a3 test-data/STR-FM/.git/objects/21/e77deba49b6ba73853e753f679e832d70cbb73 test-data/STR-FM/.git/objects/23/36b40e39da86d2c5537e56e722e3bb2ef8dae1 test-data/STR-FM/.git/objects/26/987f13c24ba62274e026714f47e223412b5714 test-data/STR-FM/.git/objects/27/6728965898e68eea856d7384f3744b9417c070 
test-data/STR-FM/.git/objects/2e/5efab77e486426b7703146615f5ac0c7ff363c test-data/STR-FM/.git/objects/38/071906fb65431e06428693afbfc71bade5ec99 test-data/STR-FM/.git/objects/3b/65f228ef1817a991dcd2a7f0bc35eeabf56cd9 test-data/STR-FM/.git/objects/3c/72d9651d5cd1ad49875fe7326a541e1916737f test-data/STR-FM/.git/objects/3e/350089364d34ede5dd5f5311551f26a11c1c39 test-data/STR-FM/.git/objects/44/5252a9df64ead9f59e0ebe4004b53091d614ad test-data/STR-FM/.git/objects/45/945a50d69e1f140444ed42393e6b2b08429a30 test-data/STR-FM/.git/objects/45/c0d4d9bacbf7a9a39119ad31423d6e55540a99 test-data/STR-FM/.git/objects/4c/f8e3de5255184c12e076995a8bce030669a5f0 test-data/STR-FM/.git/objects/4e/28ca0263259df475c9bed544b4e89709a1ad9d test-data/STR-FM/.git/objects/4f/bb7af3188879a4ac1012b6a61424b52854ccb1 test-data/STR-FM/.git/objects/50/97abc625c65fa11f10b487a7f66f8ac9367b0b test-data/STR-FM/.git/objects/52/5f703fc64ce9e2010ea74b42dcbe596aded3c0 test-data/STR-FM/.git/objects/52/ab2a0c6715c5c4536b514b47e0ff74d9eb2404 test-data/STR-FM/.git/objects/59/4efff9886e723fd607f8735f1cd2d46d8b84a1 test-data/STR-FM/.git/objects/5a/fbc673a9b7e0f3279225900b7516750c60a863 test-data/STR-FM/.git/objects/60/5088643574c402f0d98b5ad2831dfe4580bd41 test-data/STR-FM/.git/objects/62/9c7d519f699ff169c0574d47f18907fb537a40 test-data/STR-FM/.git/objects/64/004ca2ec103e185ebb71abc7ebad45ad8da295 test-data/STR-FM/.git/objects/66/0a528da974f41924da3279a17a654eacd8e19f test-data/STR-FM/.git/objects/67/eff41a50756d383233a2c06aa4876562dfe223 test-data/STR-FM/.git/objects/6c/5f01970c2bcf5cfa391c41f6e9fe79c9afa1e7 test-data/STR-FM/.git/objects/6d/9c8eedb886195285e7a3f805ff494e5fe5f374 test-data/STR-FM/.git/objects/6e/cb2682fd362b24ddbb479fce5682a4ef49234a test-data/STR-FM/.git/objects/71/4758ec266030e060e7bf7881c32167a3fbd5b5 test-data/STR-FM/.git/objects/73/cd9f1c6004a5d97926a236e0c42eebf4984dad test-data/STR-FM/.git/objects/77/324ab9d4e51c9fb4e3e88df6aae9f9b25206ed 
test-data/STR-FM/.git/objects/79/1e14959d2b0f0e2531fe801a34c06fe908bf32 test-data/STR-FM/.git/objects/7f/48062955b52948798594aac973f23c749979a8 test-data/STR-FM/.git/objects/82/ef3b9f212da4d35b867d6e2a152ac42f7e3ff0 test-data/STR-FM/.git/objects/84/c85bf7959dbcbae4ae3a4bb528f262adc1c352 test-data/STR-FM/.git/objects/86/d2c135b7d3998986c837d8b6f66973f2aa9f71 test-data/STR-FM/.git/objects/88/37d328adbc2637a71678090382aae604eef6ab test-data/STR-FM/.git/objects/8a/c88565d8e69ce00536510540ee9aa228d5d39d test-data/STR-FM/.git/objects/8a/df7c6fbfab0ca7297f7081c30ce8a0d91f8500 test-data/STR-FM/.git/objects/8c/668ef6ee5d9eacd62eed1d651c7c771d9207d5 test-data/STR-FM/.git/objects/8d/1196beb5d1e6185dfd5980832adc1b59923258 test-data/STR-FM/.git/objects/8e/fad9a0398c76c27ed5a32d2823c2291a694461 test-data/STR-FM/.git/objects/96/bef7c4ed4ab4d1f4d8d610f0e28fc2fe44d81f test-data/STR-FM/.git/objects/98/365f3b89ce6a5102ba19e722cc0fa3fadba996 test-data/STR-FM/.git/objects/98/c5f2bb6708714fe4914c2cc8503a57e6231ee0 test-data/STR-FM/.git/objects/9e/bdf5b88ba4eb8b5516c300edbcb7fc81f1882d test-data/STR-FM/.git/objects/9f/02b33db90fbe53f6b1cb8e1624946b90c91336 test-data/STR-FM/.git/objects/a3/952a3a70cde7d0f67dc810da0f0e4b15010d84 test-data/STR-FM/.git/objects/a4/88e9604e7379ec68377aad8da69d3198e8d6ea test-data/STR-FM/.git/objects/a5/03245a189f1705559c7a0542e0e60b505cb72d test-data/STR-FM/.git/objects/a5/15ea668b78dc84569274a8297e02a882b6b38b test-data/STR-FM/.git/objects/a7/be6e358d1016c259ee3a4b99f0acfd8c0096a8 test-data/STR-FM/.git/objects/ab/bd57d29285d2253408bf5dea4cffd6f9e8aca6 test-data/STR-FM/.git/objects/b0/5e976751d04a76c3b333ae9d152a71c4089b7f test-data/STR-FM/.git/objects/b1/2233971833efeaec764f1e797cf103a97166dd test-data/STR-FM/.git/objects/b3/c05bb3894d71a5c88e2c0eb062dc417cdb2cd3 test-data/STR-FM/.git/objects/b4/cf9ef6300314d493478c391b89b962cfd48f0c test-data/STR-FM/.git/objects/b7/a1e182da880e8ab9991819f5054b7c19fa718d 
test-data/STR-FM/.git/objects/ba/74660506d90c514ee1d32bf4eb5ecb0af14075 test-data/STR-FM/.git/objects/bb/60aa1c10e6b0b84b68039eff7a5f9e8ccb8dd8 test-data/STR-FM/.git/objects/c4/b1eccc8a4e7795d12c385b3b3df67a9d16751f test-data/STR-FM/.git/objects/c6/5837b5335b15de2371ea6a394b76851a1e416c test-data/STR-FM/.git/objects/c9/ba1d8a454cbc8fd3ebbf002e33ced5f51b363f test-data/STR-FM/.git/objects/ca/fed8793e168e785956ccbbe2a7852ae5d0ca77 test-data/STR-FM/.git/objects/cc/1e650dbc118a79699d69196d6a1862d0dd1275 test-data/STR-FM/.git/objects/ce/013d9a241a080e0eaeca400d1919baf1630cb1 test-data/STR-FM/.git/objects/ce/bc3ab80ab25aa2af4ae265bd89387c2225a708 test-data/STR-FM/.git/objects/cf/0fbf511a978a6292d3205bf82cd5f71f30dd70 test-data/STR-FM/.git/objects/d0/4a2cd3a9a020709cacd39e082f3159804e01b8 test-data/STR-FM/.git/objects/d0/ea4935294ab6ea1fe192fa929c308422f2699b test-data/STR-FM/.git/objects/d1/07d24e434cae7bc804d9a96bf8de7b8a094547 test-data/STR-FM/.git/objects/d1/6c4dcdc10fe565a1753d6eca3e3043f659d746 test-data/STR-FM/.git/objects/d1/b92cb33cf7d2942655e776f5499c5bbff18bde test-data/STR-FM/.git/objects/d6/51c39b1765fef9046cb5efab6918cf52f4f7a8 test-data/STR-FM/.git/objects/d6/684d880aa770620623907bddf1e6e984aab45b test-data/STR-FM/.git/objects/dc/98cf2e45c7b7b0ad9077b3392c109e13b5db1c test-data/STR-FM/.git/objects/de/31e8f100b06c28c0aeda43b0b98af94a393779 test-data/STR-FM/.git/objects/e4/0412b7fab11a34ee539a0f62dd97c2506fd453 test-data/STR-FM/.git/objects/e4/80084541ff270bec24fdedbbfc92354734a380 test-data/STR-FM/.git/objects/e8/fc4566e4342552693ed4ab5f9907db2952ac72 test-data/STR-FM/.git/objects/ea/6162c79ad57153ed2a5961f77875f075a878d3 test-data/STR-FM/.git/objects/f0/7fd90b9063f41cf2428844ca34fb9c2c0a956d test-data/STR-FM/.git/objects/f1/973b380e2d4bd1e8adbb352b53f06f576b3312 test-data/STR-FM/.git/objects/f3/2cfaaeb649eae94f69b3d2cd199e748bc2261b test-data/STR-FM/.git/objects/f3/c9246e5c4686a47e86f36f854691d5e99ef68d 
test-data/STR-FM/.git/objects/f4/6d272789e77b9cc9fc9b282e7153a890203416 test-data/STR-FM/.git/objects/f6/acfc018f5b2cd0fdf9b8b1e809af07cef5c98f test-data/STR-FM/.git/objects/f6/b8b5549bbef48e5d212b463b7c0d93617f21d9 test-data/STR-FM/.git/objects/f6/d325b93ac860b1a7228d7bb18af956b941c0a7 test-data/STR-FM/.git/objects/f6/db9f6752293744129977b97ff76beb250d5eeb test-data/STR-FM/.git/objects/fd/b47064525be57239d995de06c60f9301b759d4 test-data/STR-FM/.git/objects/ff/125a58870195e28b6927b4a492f1df3dd5c97d test-data/STR-FM/.git/packed-refs test-data/STR-FM/.git/refs/heads/master test-data/STR-FM/.git/refs/remotes/origin/HEAD test-data/STR-FM/.git/refs/remotes/origin/master test-data/STR-FM/test-data/.DS_Store test-data/changespacetounderscore_readname.py test-data/combinedprobforallelecombination.py test-data/combineprobforallelecombination.xml test-data/fetchflank.xml test-data/heteroprob.py test-data/microsatcompat.py test-data/microsatcompat.xml test-data/microsatellite.py test-data/microsatellite.xml test-data/microsatpurity.py test-data/microsatpurity.xml test-data/pair_fetch_DNA_ff.py test-data/probvalueforhetero.xml test-data/profilegenerator.py test-data/profilegenerator.xml test-data/readdepth2sequencingdepth.xml test-data/sequencingdepthconversion_G.py test-data/space2underscore_readname.xml test-data/test-data/.DS_Store test-data/test-data/C_sample_fastq test-data/test-data/C_sample_snoope test-data/test-data/PCRinclude.allrate.bymajorallele test-data/test-data/combineprob_out.txt test-data/test-data/microsatcompat_in.txt test-data/test-data/microsatcompat_out.txt test-data/test-data/microsatellite_flanking_L.fastq test-data/test-data/microsatellite_flanking_R.fastq test-data/test-data/microsatpurity_in.txt test-data/test-data/microsatpurity_out.txt test-data/test-data/nice1tab.py test-data/test-data/probvalueforhetero_in.txt test-data/test-data/probvalueforhetero_out.txt test-data/test-data/profilegenerator_in.txt test-data/test-data/profilegenerator_out.txt 
test-data/test-data/readdepth2seqdepth.out test-data/test-data/samplePESAM_2_profile_C.txt test-data/test-data/sampleTRgenotypingcorrection test-data/test-data/sampleTRprofile_C.txt test-data/test-data/samplefq.snoope test-data/test-data/samplefq.snoope.new test-data/test-data/sampleprofilegenerator_in test-data/test-data/sampleprofilegenerator_out test-data/test-data/samplesortedPESAM_C.sam test-data/test-data/shifted.2bit | 
| diffstat | 177 files changed, 0 insertions(+), 4802 deletions(-) [+] | 
line wrap: on
 line diff
--- a/test-data/GenotypeTRcorrection.py Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,250 +0,0 @@ -### import libraries ### -import sys -import collections, math -import heapq -from galaxy import eggs - - - - - -### basic function ### -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def averagelist(a,b,expectedlevelofminor): - product=[] - for i in range(len(a)): - product.append((1-expectedlevelofminor)*a[i]+expectedlevelofminor*b[i]) - - return product - -def complement_base(read): - collect='' - for i in read: - if i.upper()=='A': - collect+='T' - elif i.upper()=='T': - collect+='A' - elif i.upper()=='C': - collect+='G' - elif i.upper()=='G': - collect+='C' - return collect -def makeallpossible(read): - collect=[] - for i in range(len(read)): - tmp= read[i:]+read[:i] - collect.append(tmp) - collect.append(complement_base(tmp)) - return collect - -def motifsimplify(base): - '''str--> str - ''' - motiflength=len(base) - temp=list(set(ALLMOTIF[motiflength]).intersection(set(makeallpossible(base)))) - - return temp[0] - -def majorallele(seq): - binseq=list(set(seq)) - binseq.sort(reverse=True) # highly mutate mode - #binseq.sort() # majority mode - storeform='' - storevalue=0 - for i in binseq: - if seq.count(i)>storevalue: - storeform=i - storevalue=seq.count(i) - - return int(storeform) - -### decide global parameter ### -COORDINATECOLUMN=1 -ALLELECOLUMN=2 -MOTIFCOLUMN=3 - ##(0.01-0.5) -MINIMUMMUTABLE=1.2*(1.0/(10**8)) #http://www.ncbi.nlm.nih.gov/pubmed/22914163 Kong et al 2012 - - -## Fixed global variable -inputname=sys.argv[1] -errorprofile=sys.argv[2] -Genotypingcorrected=sys.argv[3] -EXPECTEDLEVELOFMINOR=float(sys.argv[4]) -if EXPECTEDLEVELOFMINOR >0.5: - try: - expected_contribution_of_minor_allele=int('expected_contribution_of_minor_allele') - except Exception, eee: - print eee - stop_err("Expected contribution of minor allele must be at least 0 and not more than 0.5") -ALLREPEATTYPE=[1,2,3,4] 
-ALLREPEATTYPENAME=['mono','di','tri','tetra'] -monomotif=['A','C'] -dimotif=['AC','AG','AT','CG'] -trimotif=['AAC','AAG','AAT','ACC','ACG','ACT','AGC','AGG','ATC','CCG'] -tetramotif=['AAAC','AAAG','AAAT','AACC','AACG','AACT','AAGC','AAGG','AAGT','AATC','AATG','AATT',\ -'ACAG','ACAT','ACCC','ACCG','ACCT','ACGC','ACGG','ACGT','ACTC','ACTG','AGAT','AGCC','AGCG','AGCT',\ -'AGGC','AGGG','ATCC','ATCG','ATGC','CCCG','CCGG','AGTC'] -ALLMOTIF={1:monomotif,2:dimotif,3:trimotif,4:tetramotif} -monorange=range(5,60) -dirange=range(6,60) -trirange=range(9,60) -tetrarange=range(12,80) -ALLRANGE={1:monorange,2:dirange,3:trirange,4:tetrarange} - -######################################### -######## Prob calculation sector ######## -######################################### -def multinomial_prob(majorallele,STRlength,motif,probdatabase): - '''int,int,str,dict-->int - ### get prob for each STRlength to be generated from major allele - ''' - #print (majorallele,STRlength,motif) - prob=probdatabase[len(motif)][motif][majorallele][STRlength] - return prob - -################################################ -######## error model database sector ########### -################################################ - -## structure generator -errormodeldatabase={1:{},2:{},3:{},4:{}} -sumbymajoralleledatabase={1:{},2:{},3:{},4:{}} -for repeattype in ALLREPEATTYPE: - for motif in ALLMOTIF[repeattype]: - errormodeldatabase[repeattype][motif]={} - sumbymajoralleledatabase[repeattype][motif]={} - for motifsize1 in ALLRANGE[repeattype]: - errormodeldatabase[repeattype][motif][motifsize1]={} - sumbymajoralleledatabase[repeattype][motif][motifsize1]=0 - for motifsize2 in ALLRANGE[repeattype]: - errormodeldatabase[repeattype][motif][motifsize1][motifsize2]=MINIMUMMUTABLE - -#print errormodeldatabase -## read database - - -## get read count for each major allele -fd=open(errorprofile) -lines=fd.readlines() -for line in lines: - temp=line.strip().split('\t') - t_major=int(temp[0]) - t_count=int(temp[2]) - 
motif=temp[3] - sumbymajoralleledatabase[len(motif)][motif][t_major]+=t_count -fd.close() -##print sumbymajoralleledatabase - -## get probability -fd=open(errorprofile) -lines=fd.readlines() -for line in lines: - temp=line.strip().split('\t') - t_major=int(temp[0]) - t_read=int(temp[1]) - t_count=int(temp[2]) - motif=temp[3] - if sumbymajoralleledatabase[len(motif)][motif][t_major]>0: - errormodeldatabase[len(motif)][motif][t_major][t_read]=t_count/(sumbymajoralleledatabase[len(motif)][motif][t_major]*1.0) - #errormodeldatabase[repeattype][motif][t_major][t_read]=math.log(t_count/(sumbymajorallele[t_major]*1.0)) - - #else: - # errormodeldatabase[repeattype][motif][t_major][t_read]=0 -fd.close() - -######################################### -######## input reading sector ########### -######################################### -fdout=open(Genotypingcorrected,'w') - -fd = open(inputname) - -lines=fd.xreadlines() -for line in lines: - i_read=[] - i2_read=[] - temp=line.strip().split('\t') - i_coordinate=temp[COORDINATECOLUMN-1] - i_motif=motifsimplify(temp[MOTIFCOLUMN-1]) - i_read=temp[ALLELECOLUMN-1].split(',') - i_read=map(int,i_read) - coverage=len(i_read) - -### Evaluate 1 major allele ### - i_all_allele=list(set(i_read)) - i_major_allele=majorallele(i_read) - f_majorallele=i_read.count(i_major_allele) -### Evaluate 2 major allele ### - if len(i_all_allele)>1: - i2_read=filter(lambda a: a != i_major_allele, i_read) - i_major2_allele=majorallele(i2_read) - f_majorallele2=i_read.count(i_major2_allele) - ### Evaluate 3 major allele ### - if len(i_all_allele)>2: - i3_read=filter(lambda a: a != i_major2_allele, i2_read) - i_major3_allele=majorallele(i3_read) - f_majorallele3=i_read.count(i_major3_allele) - ### No 3 major allele ### - elif len(i_all_allele)==2: - i_major3_allele=i_major2_allele - ### No 2 major allele ### - elif len(i_all_allele)==1: - #i_major2_allele=majorallele(i_read) - i_major2_allele=i_major_allele+len(i_motif) - i_major3_allele=i_major2_allele - 
#print line.strip()+'\t'+'\t'.join(['homo','only',str(i_major_allele),str(i_major_allele),'NA']) - #continue - else: - print("no allele is reading") - sys.exit() - -## scope filter - -######################################### -######## prob calculation option ######## -######################################### - homozygous_collector=0 - heterozygous_collector=0 - - - alist=[multinomial_prob(i_major_allele,x,i_motif,errormodeldatabase)for x in i_read] - blist=[multinomial_prob(i_major2_allele,x,i_motif,errormodeldatabase)for x in i_read] - clist=[multinomial_prob(i_major3_allele,x,i_motif,errormodeldatabase)for x in i_read] - - ablist=averagelist(alist,blist,EXPECTEDLEVELOFMINOR) - bclist=averagelist(blist,clist,EXPECTEDLEVELOFMINOR) - aclist=averagelist(alist,clist,EXPECTEDLEVELOFMINOR) - - #print alist,blist,clist - majora=sum([math.log(i,10) for i in alist]) - majorb=sum([math.log(i,10) for i in blist]) - majorc=sum([math.log(i,10) for i in clist]) - homozygous_collector=max(majora,majorb,majorc) - - homomajor1=max([(majora,i_major_allele),(majorb,i_major2_allele),(majorc,i_major3_allele)])[1] - homomajordict={i_major_allele:majora,i_major2_allele:majorb,i_major3_allele:majorc} - - majorab=sum([math.log(i,10) for i in ablist]) - majorbc=sum([math.log(i,10) for i in bclist]) - majorac=sum([math.log(i,10) for i in aclist]) - heterozygous_collector=max(majorab,majorbc,majorac) - bothheteromajor=max([(majorab,(i_major_allele,i_major2_allele)),(majorbc,(i_major2_allele,i_major3_allele)),(majorac,(i_major_allele,i_major3_allele))])[1] - ##heteromajor1=max(bothheteromajor) - ##heteromajor2=min(bothheteromajor) - pre_heteromajor1=bothheteromajor[0] - pre_heteromajor2=bothheteromajor[1] - heteromajor1=max((homomajordict[pre_heteromajor1],pre_heteromajor1),(homomajordict[pre_heteromajor2],pre_heteromajor2))[1] - heteromajor2=min((homomajordict[pre_heteromajor1],pre_heteromajor1),(homomajordict[pre_heteromajor2],pre_heteromajor2))[1] - - 
logratio_homo=homozygous_collector-heterozygous_collector - - if logratio_homo>0: - fdout.writelines(line.strip()+'\t'+'\t'.join(['homo',str(logratio_homo),str(homomajor1),str(heteromajor1),str(heteromajor2)])+'\n') - elif logratio_homo<0: - fdout.writelines(line.strip()+'\t'+'\t'.join(['hetero',str(logratio_homo),str(homomajor1),str(heteromajor1),str(heteromajor2)])+'\n') -fd.close() -fdout.close()
--- a/test-data/GenotypingSTR.xml Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ -<tool id="GenotypeSTR" name="Correct genotype for microsatellite errors" version="2.0.0"> - <description> during sequencing and library prep </description> - <command interpreter="python2.7">GenotypeTRcorrection.py $microsat_raw $microsat_error_profile $microsat_corrected $expectedminorallele </command> - - <inputs> - <param name="microsat_raw" type="data" label="Select microsatellite length profile that need to refine genotyping" /> - <param name="microsat_error_profile" type="data" label="Select microsatellite error profile that correspond to this dataset" /> - <param name="expectedminorallele" type="float" value="0.5" label="Expected contribution of minor allele when present (0.5 for genotyping)" /> - - </inputs> - <outputs> - <data name="microsat_corrected" format="tabular" /> - </outputs> - <tests> - <!-- Test data with valid values --> - <test> - <param name="microsat_raw" value="sampleTRprofile_C.txt"/> - <param name="microsat_error_profile" value="PCRinclude.allrate.bymajorallele"/> - <param name="expectedminorallele" value="0.5"/> - <output name="microsat_corrected" file="sampleTRgenotypingcorrection"/> - </test> - - </tests> - <help> - - -.. class:: infomark - -**What it does** - -- This tool will correct for microsatellite sequencing and library preparation errors using error rates estimated from hemizygous male X chromosome or any rates provided by user. The read profile for each locus will be processed independently. -- First, this tool will find three most common read lengths from input read length profile. If the read profile has only one length of TR, the length of one motif longer than the observed length will be used as the second most common read length. -- Second, it will calculate probability of three forms of homozygous and use the form which give the highest probability. The same goes for heterozygous. 
-- Third, this tools will calculate log based 10 of (the probability of homozygous/the probability of heterozygous). If this value is more than 0, it will predict this locus to homozygous. If this value is less than 0, it will predict this locus to heterozygous. If this value is 0, read profile at this locus will be discard. - -**Citation** - -When you use this tool, please cite **Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research** - -**Input** - -- The input files need to contain at least three columns. -- Column 1 = location of microsatellite locus. -- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format). -- Column 3 = motif of microsatellite in this locus. The input file can contain more than three column. - -**Output** - -The output will be contain original three (or more) column as the input. However, it will also have these following columns. - -- Additional column 1 = homozygous/heterozygous label. -- Additional column 2 = log based 10 of (the probability of homozygous/the probability of heterozygous) -- Additional column 3 = Allele for most probable homozygous form. -- Additional column 4 = Allele 1 for most probable heterozygous form. -- Additional column 5 = Allele 2 for most probable heterozygous form. - -**Example** - -- Suppose that we sequence one locus of microsatellite with NGS. This locus has **A** motif and the following length (bp) profile. :: - - chr1_100_106 5, 6, 6, 6, 6, 7, 7, 8, 8 A - -- We want to figure out if this locus is a homolozygous or heterozygous and the corresponding allele(s). Therefore, we use this tool to refine genotype. -- This tool will calculate the probability of homozygous A6A6, A7A7, and A8A8 to generate observed length profile. Among this A7A7 has the highest probability. 
Therefore, we use this form as the representative for homozygous. -- Then, this tool will calculate the probability of heterozygous A6A7, A7A8, and A6A8 to generate observed length profile. Among this A6A8 has the highest probability. Therefore, we use this form as the representative for heterozygous. -- The A6A7 has higher probability than A7A7. Therefore, the program will report that this locus is a heterozygous locus. :: - - chr1 5,6,6,6,6,7,7,8,8 A hetero -14.8744881854 7 6 8 - - -</help> -</tool>
--- a/test-data/PEsortedSAM2readprofile.py Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ -#!/usr/bin/env python - -import sys -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import bx.seq.twobit - -##output columns: read_name chr prefix_start prefix_end TR_start TR_end suffix_start suffix_end TR_length TR_sequence - -samf = open(sys.argv[1],'r') #assumes sam file is sorted by readname -seq_path = sys.argv[2] #Path to the reference genome in 2bit format - -##maxTRlength=int(sys.argv[4]) -##maxoriginalreadlength=int(sys.argv[5]) -maxTRlength=int(sys.argv[3]) -maxoriginalreadlength=int(sys.argv[4]) -outfile=sys.argv[5] -fout = open(outfile,'w') - -twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) ) - -skipped=0 -while True: - read = samf.readline().strip() - if not(read): #EOF reached - break - if read[0] == "@": - #print read - continue - mate = samf.readline().strip() - if not(mate): #EOF reached - break - read_elems = read.split() - mate_elems = mate.split() - read_name = read_elems[0].strip() - mate_name = mate_elems[0].strip() - while True: - if read_name == mate_name: - break - elif read_name != mate_name: - #print >>sys.stderr, "Input SAM file doesn't seem to be sorted by readname. Please sort and retry." 
- #break - skipped += 1 - read = mate - read_elems = mate_elems - mate = samf.readline().strip() - read_name = read_elems[0].strip() - mate_name = mate_elems[0].strip() - if not(mate): #EOF reached - break - mate_elems = mate.split() - #extract XT:A tag - #for e in read_elems: - # if e.startswith('XT:A'): - # read_xt = e - #for e in mate_elems: - # if e.startswith('XT:A'): - # mate_xt = e - #if 'XT:A:U' not in read_elems or 'XT:A:U' not in mate_elems: #both read and it's mate need to be mapped uniquely - # continue - read_chr = read_elems[2] - read_start = int(read_elems[3]) - read_cigar = read_elems[5] - if len(read_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M - continue - read_len = int(read_cigar.split('M')[0]) - mate_chr = mate_elems[2] - mate_start = int(mate_elems[3]) - mate_cigar = mate_elems[5] - if len(mate_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M - continue - mate_len = int(mate_cigar.split('M')[0]) - if read_chr != mate_chr: # check that they were mapped to the same chromosome - continue - if abs(read_start - mate_start) > (maxoriginalreadlength+maxTRlength): - continue - if read_start < mate_start: - pre_s = read_start-1 - pre_e = read_start-1+read_len - tr_s = read_start-1+read_len - tr_e = mate_start-1 - suf_s = mate_start-1 - suf_e = mate_start-1+mate_len - else: - pre_s = mate_start-1 - pre_e = mate_start-1+mate_len - tr_s = mate_start-1+mate_len - tr_e = read_start-1 - suf_s = read_start-1 - suf_e = read_start-1+read_len - tr_len = abs(tr_e - tr_s) - if tr_len > maxTRlength: - continue - if pre_e >= suf_s: #overlapping prefix and suffix - continue - tr_ref_seq = twobitfile[read_chr][tr_s:tr_e] - ##print >>fout, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq) - fout.writelines('\t'.join(map(str,[read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq]))+'\n') - -print "Skipped %d unpaired reads" %(skipped)
--- a/test-data/PEsortedSAM2readprofile.xml Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ -<tool id="PEsortedSAM2readprofile" name="Combine mapped flaked bases" version="1.0.0"> - <description> from SAM file sorted by readname </description> - <command interpreter="python2.7">PEsortedSAM2readprofile.py $flankedbasesSAM $twobitref $maxTRlength $maxoriginalreadlength $output </command> - - <inputs> - <param name="flankedbasesSAM" type="data" format="sam" label="Select sorted SAM file (by readname) of flaked bases" /> - <param name="twobitref" type="data" label="Select twobit file reference genome" /> - <param name="maxTRlength" type="integer" value="100" label="Maximum expected microsatellite length (bp)" /> - <param name="maxoriginalreadlength" type="integer" value="101" label="Maxinum original read length" /> - - </inputs> - <outputs> - <data name="output" format="tabular" /> - </outputs> - <tests> - <!-- Test data with valid values --> - <test> - <param name="flankedbasesSAM" value="samplesortedPESAM_C.sam"/> - <param name="twobitref" value="shifted.2bit"/> - <param name="maxTRlength" value="100"/> - <param name="maxoriginalreadlength" value="250"/> - <output name="output" file="samplePESAM_2_profile_C.txt"/> - </test> - - </tests> - <help> - - -.. class:: infomark - -**What it does** - -- This tool will take SAM file sorted by read name, remove unpaired reads, report microsatellites sequences in the reference genome that correspond to the space between paired end reads. Coordinate of start and stop for left and right flanking regions of microsatellites and microsatellite itself as inferred from paired end reads will also be reported. -- These microsatellites in reference can be used to filter out reads that do not contain microsatellites that concur with microsatellites in reference where the reads mapped to. 
- -**Citation** - -When you use this tool, please cite **Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research** - -**Input** - -- Sorted SAM files by read name - -**Output** - -The output will combined two lines of input which are paired. The output format is as follow. - -- Column 1 = read name -- Column 2 = chromosome -- Column 3 = left flanking region start -- Column 4 = left flanking region stop -- Column 5 = microsatellite start -- Column 6 = microsatellite stop -- Column 7 = right flanking region start -- Column 8 = right flanking region stop -- Column 9 = microsatellite length in reference -- Column 10= microsatellite sequence in reference - - - -</help> -</tool>
--- a/test-data/README.md Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -# STR-FM
--- a/test-data/STR-FM/.git/COMMIT_EDITMSG Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ -initial commit - -# Please enter the commit message for your changes. Lines starting -# with '#' will be ignored, and an empty message aborts the commit. -# -# Committer: Arkarachai Fungtammasan <arkarachaifungtammasan@Arkarachai-Fungtammasans-MacBook-Pro.local> -# -# On branch master -# Changes to be committed: -# (use "git reset HEAD <file>..." to unstage) -# -# new file: GenotypeTRcorrection.py -# new file: GenotypingSTR.xml -# new file: PEsortedSAM2readprofile.py -# new file: PEsortedSAM2readprofile.xml -# new file: changespacetounderscore_readname.py -# new file: combinedprobforallelecombination.py -# new file: combineprobforallelecombination.xml -# new file: fetchflank.xml -# new file: heteroprob.py -# new file: microsatcompat.py -# new file: microsatcompat.xml -# new file: microsatellite.py -# new file: microsatellite.xml -# new file: microsatpurity.py -# new file: microsatpurity.xml -# new file: pair_fetch_DNA_ff.py -# new file: probvalueforhetero.xml -# new file: profilegenerator.py -# new file: profilegenerator.xml -# new file: readdepth2sequencingdepth.xml -# new file: sequencingdepthconversion_G.py -# new file: space2underscore_readname.xml -# new file: test-data/.DS_Store -# new file: test-data/C_sample_fastq -# new file: test-data/C_sample_snoope -# new file: test-data/PCRinclude.allrate.bymajorallele -# new file: test-data/combineprob_out.txt -# new file: test-data/microsatcompat_in.txt -# new file: test-data/microsatcompat_out.txt -# new file: test-data/microsatellite_flanking_L.fastq -# new file: test-data/microsatellite_flanking_R.fastq -# new file: test-data/microsatpurity_in.txt -# new file: test-data/microsatpurity_out.txt -# new file: test-data/nice1tab.py -# new file: test-data/probvalueforhetero_in.txt -# new file: test-data/probvalueforhetero_out.txt -# new file: test-data/profilegenerator_in.txt -# new 
file: test-data/profilegenerator_out.txt -# new file: test-data/readdepth2seqdepth.out -# new file: test-data/samplePESAM_2_profile_C.txt -# new file: test-data/sampleTRgenotypingcorrection -# new file: test-data/sampleTRprofile_C.txt -# new file: test-data/samplefq.snoope -# new file: test-data/samplefq.snoope.new -# new file: test-data/sampleprofilegenerator_in -# new file: test-data/sampleprofilegenerator_out -# new file: test-data/samplesortedPESAM_C.sam -# new file: test-data/shifted.2bit -#
--- a/test-data/STR-FM/.git/FETCH_HEAD Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -cebc3ab80ab25aa2af4ae265bd89387c2225a708 branch 'master' of https://github.com/Arkarachai/STR-FM
--- a/test-data/STR-FM/.git/HEAD Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -ref: refs/heads/master
--- a/test-data/STR-FM/.git/ORIG_HEAD Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -d1b92cb33cf7d2942655e776f5499c5bbff18bde
--- a/test-data/STR-FM/.git/config Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ -[core] - repositoryformatversion = 0 - filemode = true - bare = false - logallrefupdates = true - ignorecase = true - precomposeunicode = false -[remote "origin"] - url = https://github.com/Arkarachai/STR-FM.git - fetch = +refs/heads/*:refs/remotes/origin/* -[branch "master"] - remote = origin - merge = refs/heads/master
--- a/test-data/STR-FM/.git/description Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -Unnamed repository; edit this file 'description' to name the repository.
--- a/test-data/STR-FM/.git/hooks/applypatch-msg.sample Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -#!/bin/sh -# -# An example hook script to check the commit log message taken by -# applypatch from an e-mail message. -# -# The hook should exit with non-zero status after issuing an -# appropriate message if it wants to stop the commit. The hook is -# allowed to edit the commit message file. -# -# To enable this hook, rename this file to "applypatch-msg". - -. git-sh-setup -test -x "$GIT_DIR/hooks/commit-msg" && - exec "$GIT_DIR/hooks/commit-msg" ${1+"$@"} -:
--- a/test-data/STR-FM/.git/hooks/commit-msg.sample Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ -#!/bin/sh -# -# An example hook script to check the commit log message. -# Called by "git commit" with one argument, the name of the file -# that has the commit message. The hook should exit with non-zero -# status after issuing an appropriate message if it wants to stop the -# commit. The hook is allowed to edit the commit message file. -# -# To enable this hook, rename this file to "commit-msg". - -# Uncomment the below to add a Signed-off-by line to the message. -# Doing this in a hook is a bad idea in general, but the prepare-commit-msg -# hook is more suited to it. -# -# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') -# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1" - -# This example catches duplicate Signed-off-by lines. - -test "" = "$(grep '^Signed-off-by: ' "$1" | - sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || { - echo >&2 Duplicate Signed-off-by lines. - exit 1 -}
--- a/test-data/STR-FM/.git/hooks/post-update.sample Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ -#!/bin/sh -# -# An example hook script to prepare a packed repository for use over -# dumb transports. -# -# To enable this hook, rename this file to "post-update". - -exec git update-server-info
--- a/test-data/STR-FM/.git/hooks/pre-applypatch.sample Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ -#!/bin/sh -# -# An example hook script to verify what is about to be committed -# by applypatch from an e-mail message. -# -# The hook should exit with non-zero status after issuing an -# appropriate message if it wants to stop the commit. -# -# To enable this hook, rename this file to "pre-applypatch". - -. git-sh-setup -test -x "$GIT_DIR/hooks/pre-commit" && - exec "$GIT_DIR/hooks/pre-commit" ${1+"$@"} -:
--- a/test-data/STR-FM/.git/hooks/pre-commit.sample Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ -#!/bin/sh -# -# An example hook script to verify what is about to be committed. -# Called by "git commit" with no arguments. The hook should -# exit with non-zero status after issuing an appropriate message if -# it wants to stop the commit. -# -# To enable this hook, rename this file to "pre-commit". - -if git rev-parse --verify HEAD >/dev/null 2>&1 -then - against=HEAD -else - # Initial commit: diff against an empty tree object - against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 -fi - -# If you want to allow non-ascii filenames set this variable to true. -allownonascii=$(git config hooks.allownonascii) - -# Redirect output to stderr. -exec 1>&2 - -# Cross platform projects tend to avoid non-ascii filenames; prevent -# them from being added to the repository. We exploit the fact that the -# printable range starts at the space character and ends with tilde. -if [ "$allownonascii" != "true" ] && - # Note that the use of brackets around a tr range is ok here, (it's - # even required, for portability to Solaris 10's /usr/bin/tr), since - # the square bracket bytes happen to fall in the designated range. - test $(git diff --cached --name-only --diff-filter=A -z $against | - LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0 -then - echo "Error: Attempt to add a non-ascii file name." - echo - echo "This can cause problems if you want to work" - echo "with people on other platforms." - echo - echo "To be portable it is advisable to rename the file ..." - echo - echo "If you know what you are doing you can disable this" - echo "check using:" - echo - echo " git config hooks.allownonascii true" - echo - exit 1 -fi - -# If there are whitespace errors, print the offending file names and fail. -exec git diff-index --check --cached $against --
--- a/test-data/STR-FM/.git/hooks/pre-push.sample Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -#!/bin/sh - -# An example hook script to verify what is about to be pushed. Called by "git -# push" after it has checked the remote status, but before anything has been -# pushed. If this script exits with a non-zero status nothing will be pushed. -# -# This hook is called with the following parameters: -# -# $1 -- Name of the remote to which the push is being done -# $2 -- URL to which the push is being done -# -# If pushing without using a named remote those arguments will be equal. -# -# Information about the commits which are being pushed is supplied as lines to -# the standard input in the form: -# -# <local ref> <local sha1> <remote ref> <remote sha1> -# -# This sample shows how to prevent push of commits where the log message starts -# with "WIP" (work in progress). - -remote="$1" -url="$2" - -z40=0000000000000000000000000000000000000000 - -IFS=' ' -while read local_ref local_sha remote_ref remote_sha -do - if [ "$local_sha" = $z40 ] - then - # Handle delete - else - if [ "$remote_sha" = $z40 ] - then - # New branch, examine all commits - range="$local_sha" - else - # Update to existing branch, examine new commits - range="$remote_sha..$local_sha" - fi - - # Check for WIP commit - commit=`git rev-list -n 1 --grep '^WIP' "$range"` - if [ -n "$commit" ] - then - echo "Found WIP commit in $local_ref, not pushing" - exit 1 - fi - fi -done - -exit 0
--- a/test-data/STR-FM/.git/hooks/pre-rebase.sample Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,169 +0,0 @@ -#!/bin/sh -# -# Copyright (c) 2006, 2008 Junio C Hamano -# -# The "pre-rebase" hook is run just before "git rebase" starts doing -# its job, and can prevent the command from running by exiting with -# non-zero status. -# -# The hook is called with the following parameters: -# -# $1 -- the upstream the series was forked from. -# $2 -- the branch being rebased (or empty when rebasing the current branch). -# -# This sample shows how to prevent topic branches that are already -# merged to 'next' branch from getting rebased, because allowing it -# would result in rebasing already published history. - -publish=next -basebranch="$1" -if test "$#" = 2 -then - topic="refs/heads/$2" -else - topic=`git symbolic-ref HEAD` || - exit 0 ;# we do not interrupt rebasing detached HEAD -fi - -case "$topic" in -refs/heads/??/*) - ;; -*) - exit 0 ;# we do not interrupt others. - ;; -esac - -# Now we are dealing with a topic branch being rebased -# on top of master. Is it OK to rebase it? - -# Does the topic really exist? -git show-ref -q "$topic" || { - echo >&2 "No such branch $topic" - exit 1 -} - -# Is topic fully merged to master? -not_in_master=`git rev-list --pretty=oneline ^master "$topic"` -if test -z "$not_in_master" -then - echo >&2 "$topic is fully merged to master; better remove it." - exit 1 ;# we could allow it, but there is no point. -fi - -# Is topic ever merged to next? If so you should not be rebasing it. -only_next_1=`git rev-list ^master "^$topic" ${publish} | sort` -only_next_2=`git rev-list ^master ${publish} | sort` -if test "$only_next_1" = "$only_next_2" -then - not_in_topic=`git rev-list "^$topic" master` - if test -z "$not_in_topic" - then - echo >&2 "$topic is already up-to-date with master" - exit 1 ;# we could allow it, but there is no point. 
- else - exit 0 - fi -else - not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"` - /usr/bin/perl -e ' - my $topic = $ARGV[0]; - my $msg = "* $topic has commits already merged to public branch:\n"; - my (%not_in_next) = map { - /^([0-9a-f]+) /; - ($1 => 1); - } split(/\n/, $ARGV[1]); - for my $elem (map { - /^([0-9a-f]+) (.*)$/; - [$1 => $2]; - } split(/\n/, $ARGV[2])) { - if (!exists $not_in_next{$elem->[0]}) { - if ($msg) { - print STDERR $msg; - undef $msg; - } - print STDERR " $elem->[1]\n"; - } - } - ' "$topic" "$not_in_next" "$not_in_master" - exit 1 -fi - -exit 0 - -################################################################ - -This sample hook safeguards topic branches that have been -published from being rewound. - -The workflow assumed here is: - - * Once a topic branch forks from "master", "master" is never - merged into it again (either directly or indirectly). - - * Once a topic branch is fully cooked and merged into "master", - it is deleted. If you need to build on top of it to correct - earlier mistakes, a new topic branch is created by forking at - the tip of the "master". This is not strictly necessary, but - it makes it easier to keep your history simple. - - * Whenever you need to test or publish your changes to topic - branches, merge them into "next" branch. - -The script, being an example, hardcodes the publish branch name -to be "next", but it is trivial to make it configurable via -$GIT_DIR/config mechanism. - -With this workflow, you would want to know: - -(1) ... if a topic branch has ever been merged to "next". Young - topic branches can have stupid mistakes you would rather - clean up before publishing, and things that have not been - merged into other branches can be easily rebased without - affecting other people. But once it is published, you would - not want to rewind it. - -(2) ... if a topic branch has been fully merged to "master". - Then you can delete it. 
More importantly, you should not - build on top of it -- other people may already want to - change things related to the topic as patches against your - "master", so if you need further changes, it is better to - fork the topic (perhaps with the same name) afresh from the - tip of "master". - -Let's look at this example: - - o---o---o---o---o---o---o---o---o---o "next" - / / / / - / a---a---b A / / - / / / / - / / c---c---c---c B / - / / / \ / - / / / b---b C \ / - / / / / \ / - ---o---o---o---o---o---o---o---o---o---o---o "master" - - -A, B and C are topic branches. - - * A has one fix since it was merged up to "next". - - * B has finished. It has been fully merged up to "master" and "next", - and is ready to be deleted. - - * C has not merged to "next" at all. - -We would want to allow C to be rebased, refuse A, and encourage -B to be deleted. - -To compute (1): - - git rev-list ^master ^topic next - git rev-list ^master next - - if these match, topic has not merged in next at all. - -To compute (2): - - git rev-list master..topic - - if this is empty, it is fully merged to "master".
--- a/test-data/STR-FM/.git/hooks/prepare-commit-msg.sample Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -#!/bin/sh -# -# An example hook script to prepare the commit log message. -# Called by "git commit" with the name of the file that has the -# commit message, followed by the description of the commit -# message's source. The hook's purpose is to edit the commit -# message file. If the hook fails with a non-zero status, -# the commit is aborted. -# -# To enable this hook, rename this file to "prepare-commit-msg". - -# This hook includes three examples. The first comments out the -# "Conflicts:" part of a merge commit. -# -# The second includes the output of "git diff --name-status -r" -# into the message, just before the "git status" output. It is -# commented because it doesn't cope with --amend or with squashed -# commits. -# -# The third example adds a Signed-off-by line to the message, that can -# still be edited. This is rarely a good idea. - -case "$2,$3" in - merge,) - /usr/bin/perl -i.bak -ne 's/^/# /, s/^# #/#/ if /^Conflicts/ .. /#/; print' "$1" ;; - -# ,|template,) -# /usr/bin/perl -i.bak -pe ' -# print "\n" . `git diff --cached --name-status -r` -# if /^#/ && $first++ == 0' "$1" ;; - - *) ;; -esac - -# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') -# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1"
--- a/test-data/STR-FM/.git/hooks/update.sample Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,128 +0,0 @@ -#!/bin/sh -# -# An example hook script to blocks unannotated tags from entering. -# Called by "git receive-pack" with arguments: refname sha1-old sha1-new -# -# To enable this hook, rename this file to "update". -# -# Config -# ------ -# hooks.allowunannotated -# This boolean sets whether unannotated tags will be allowed into the -# repository. By default they won't be. -# hooks.allowdeletetag -# This boolean sets whether deleting tags will be allowed in the -# repository. By default they won't be. -# hooks.allowmodifytag -# This boolean sets whether a tag may be modified after creation. By default -# it won't be. -# hooks.allowdeletebranch -# This boolean sets whether deleting branches will be allowed in the -# repository. By default they won't be. -# hooks.denycreatebranch -# This boolean sets whether remotely creating branches will be denied -# in the repository. By default this is allowed. -# - -# --- Command line -refname="$1" -oldrev="$2" -newrev="$3" - -# --- Safety check -if [ -z "$GIT_DIR" ]; then - echo "Don't run this script from the command line." 
>&2 - echo " (if you want, you could supply GIT_DIR then run" >&2 - echo " $0 <ref> <oldrev> <newrev>)" >&2 - exit 1 -fi - -if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then - echo "usage: $0 <ref> <oldrev> <newrev>" >&2 - exit 1 -fi - -# --- Config -allowunannotated=$(git config --bool hooks.allowunannotated) -allowdeletebranch=$(git config --bool hooks.allowdeletebranch) -denycreatebranch=$(git config --bool hooks.denycreatebranch) -allowdeletetag=$(git config --bool hooks.allowdeletetag) -allowmodifytag=$(git config --bool hooks.allowmodifytag) - -# check for no description -projectdesc=$(sed -e '1q' "$GIT_DIR/description") -case "$projectdesc" in -"Unnamed repository"* | "") - echo "*** Project description file hasn't been set" >&2 - exit 1 - ;; -esac - -# --- Check types -# if $newrev is 0000...0000, it's a commit to delete a ref. -zero="0000000000000000000000000000000000000000" -if [ "$newrev" = "$zero" ]; then - newrev_type=delete -else - newrev_type=$(git cat-file -t $newrev) -fi - -case "$refname","$newrev_type" in - refs/tags/*,commit) - # un-annotated tag - short_refname=${refname##refs/tags/} - if [ "$allowunannotated" != "true" ]; then - echo "*** The un-annotated tag, $short_refname, is not allowed in this repository" >&2 - echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2 - exit 1 - fi - ;; - refs/tags/*,delete) - # delete tag - if [ "$allowdeletetag" != "true" ]; then - echo "*** Deleting a tag is not allowed in this repository" >&2 - exit 1 - fi - ;; - refs/tags/*,tag) - # annotated tag - if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1 - then - echo "*** Tag '$refname' already exists." >&2 - echo "*** Modifying a tag is not allowed in this repository." 
>&2 - exit 1 - fi - ;; - refs/heads/*,commit) - # branch - if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then - echo "*** Creating a branch is not allowed in this repository" >&2 - exit 1 - fi - ;; - refs/heads/*,delete) - # delete branch - if [ "$allowdeletebranch" != "true" ]; then - echo "*** Deleting a branch is not allowed in this repository" >&2 - exit 1 - fi - ;; - refs/remotes/*,commit) - # tracking branch - ;; - refs/remotes/*,delete) - # delete tracking branch - if [ "$allowdeletebranch" != "true" ]; then - echo "*** Deleting a tracking branch is not allowed in this repository" >&2 - exit 1 - fi - ;; - *) - # Anything else (is there anything else?) - echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2 - exit 1 - ;; -esac - -# --- Finished -exit 0
--- a/test-data/STR-FM/.git/info/exclude Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -# git ls-files --others --exclude-from=.git/info/exclude -# Lines that start with '#' are comments. -# For a project mostly in C, the following would be a good set of -# exclude patterns (uncomment them if you want to use them): -# *.[oa] -# *~
--- a/test-data/STR-FM/.git/logs/HEAD Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -0000000000000000000000000000000000000000 3b65f228ef1817a991dcd2a7f0bc35eeabf56cd9 Arkarachai Fungtammasan <arkarachaifungtammasan@Arkarachai-Fungtammasans-MacBook-Pro.local> 1427821659 -0400 clone: from https://github.com/Arkarachai/STR-FM.git -3b65f228ef1817a991dcd2a7f0bc35eeabf56cd9 d1b92cb33cf7d2942655e776f5499c5bbff18bde Arkarachai Fungtammasan <arkarachaifungtammasan@Arkarachai-Fungtammasans-MacBook-Pro.local> 1427821928 -0400 commit: initial commit -d1b92cb33cf7d2942655e776f5499c5bbff18bde cebc3ab80ab25aa2af4ae265bd89387c2225a708 Arkarachai Fungtammasan <arkarachaifungtammasan@Arkarachai-Fungtammasans-MacBook-Pro.local> 1427900437 -0400 pull: Fast-forward
--- a/test-data/STR-FM/.git/logs/refs/heads/master Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -0000000000000000000000000000000000000000 3b65f228ef1817a991dcd2a7f0bc35eeabf56cd9 Arkarachai Fungtammasan <arkarachaifungtammasan@Arkarachai-Fungtammasans-MacBook-Pro.local> 1427821659 -0400 clone: from https://github.com/Arkarachai/STR-FM.git -3b65f228ef1817a991dcd2a7f0bc35eeabf56cd9 d1b92cb33cf7d2942655e776f5499c5bbff18bde Arkarachai Fungtammasan <arkarachaifungtammasan@Arkarachai-Fungtammasans-MacBook-Pro.local> 1427821928 -0400 commit: initial commit -d1b92cb33cf7d2942655e776f5499c5bbff18bde cebc3ab80ab25aa2af4ae265bd89387c2225a708 Arkarachai Fungtammasan <arkarachaifungtammasan@Arkarachai-Fungtammasans-MacBook-Pro.local> 1427900437 -0400 pull: Fast-forward
--- a/test-data/STR-FM/.git/logs/refs/remotes/origin/HEAD Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -0000000000000000000000000000000000000000 3b65f228ef1817a991dcd2a7f0bc35eeabf56cd9 Arkarachai Fungtammasan <arkarachaifungtammasan@Arkarachai-Fungtammasans-MacBook-Pro.local> 1427821659 -0400 clone: from https://github.com/Arkarachai/STR-FM.git
--- a/test-data/STR-FM/.git/logs/refs/remotes/origin/master Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -3b65f228ef1817a991dcd2a7f0bc35eeabf56cd9 d1b92cb33cf7d2942655e776f5499c5bbff18bde Arkarachai Fungtammasan <arkarachaifungtammasan@Arkarachai-Fungtammasans-MacBook-Pro.local> 1427822061 -0400 update by push -d1b92cb33cf7d2942655e776f5499c5bbff18bde cebc3ab80ab25aa2af4ae265bd89387c2225a708 Arkarachai Fungtammasan <arkarachaifungtammasan@Arkarachai-Fungtammasans-MacBook-Pro.local> 1427900437 -0400 pull: fast-forward
--- a/test-data/STR-FM/.git/objects/19/637cfe7d85e2a9b41e6272003dcca6e01d9b58 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -x¥ÎËMÄ0€aΩ °cK+: €ñc6qG¢|¶®ÿáÓŸ{kÛTèéeŽZ•4.%ÃAlôZ<fr…(o“ƒ`CÒl–“G=¦Ê’ÄiÍ‘{ŒX‚K0'¤Å@)_síC}ŒoœWÞÔçu<&·Æ?|¨ûe ¼¯}6Þö[îíMi‹"EÔ+X€åYŸ¿³þ_Z¾Î³ªstÙöú¨G<û¸ý¶ýlµU” \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/44/5252a9df64ead9f59e0ebe4004b53091d614ad Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -x¥ÎKNÅ0@QÆ]E6À“ãæg !˜°ÆÈI\ZAš*Ï•X>e Lïàè–ÞÚ¦=è1‹#ÆDc¦Rh)”1¡DëgN³³a:xÈ®&Uk)dɾZ 6ùºTO ÒŒ\‹ÍžgôiâS×>ÌëøâÁeåͼû§rk|çÝ<.‚Ç/kׯÛ÷ôöl¬û[‰€Þ<‚˜®zýªü_šÞÊ*¦meô;ë%¬ýÔ›þè/TU« \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/45/945a50d69e1f140444ed42393e6b2b08429a30 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -x¥MA‚@ôÌ+¸i0*]ØVvËn=Cï$$ÞL4þÿ`áÎ4í43i—×{IU¡vý4ŽE©u£Î•ÎW˜ô1)™'hÔyeÁšÂêZ)•©¡žaþ<¿0CÂŒÌäQJ„c"b‡Ò±P<qܶ®|rüç§Ü"êbŒ›QˆÑ‡ˆ!ø¡÷}ÖµÍõpiõÞ´¥í[ò,; \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/4c/f8e3de5255184c12e076995a8bce030669a5f0 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -xMË1 -€0Fa×x -¥Ñ?M½J¥ÐAŠ÷G©òÞú¥ãLƒt{©B–å6*ùÊõ¤1L˜W¨¸ªk$YÚèQŽ_ÆÆÎþƒ‚É p(È(’÷7´óg \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/62/9c7d519f699ff169c0574d47f18907fb537a40 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -x}RMOã0ݳÅH’ˆ(»{ ’nœ¸e«•›L¨UÉž¢åßïØQE-9DñÌ{“7Ïogüî~ÿÐ6øHÞ“¸‚ˆÖ¿!(cøSM@{E°W\::«h܃ÕcôIƒÇ=ŽðμóÁh~1\i‹bÂFoƒA‹ŽþîTÂ6ÏìÖø½18’lšrœ} Ú•ßVDFétcÛIÙlš¯ÖÉkÙ¼Ô1¹ˆfÁz¹ÈÚ|Ãz¼Èzú†õt‘õXYéÝçúy÷â”Ud׃OIïÌY§†íÂ)å^±5誾fÈYŒôz{MÖº²sûÃú^±¯njÝ•ÉgzËÌØ -^ì!Ä<IÏóZR¯âëÛ°Úv"g"ÉyêÿeåÔvbÖ1«8Z'µ£/Êý¶»]ÁÕ/‘pôn:ùY!°º93%t›2º†ƒ×Ïç>QÔ¡íúŒ¦¶ùCMÕ^³6ızæèÞäBm•x¯d.'2«y¥y_›§ -k—SWÙ9ÆËû¬Ô“; -‘7/²?eþ&1Ä \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/71/4758ec266030e060e7bf7881c32167a3fbd5b5 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -xMO± €0c-Wp@‡¤béÈ+°0 !ñÿ@¶‰7;qlç|ÞsÛE–ëþRÍ5kG*5`P%»xJ€]£4oØç%p‹¤Ð¦X¸«‘yë8¾ïHIàÇÁJ5Ì1Lå§ø/üå,?ŒAÙJw:Z#ºÅa§?£Vd \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/84/c85bf7959dbcbae4ae3a4bb528f262adc1c352 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -xÊÁ À À~)§$Èëð®Äþ¿,?N–×·WË|ýÄØKGäqí2ÕWóºNN^:‚Ã~ógV \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/8a/df7c6fbfab0ca7297f7081c30ce8a0d91f8500 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -x¥ÎAnÄ @Ñ®s -.0#c ÆRUµ›Þ phP‡1Ž4½}s‡nÿâé§ÞZUƒsxÑ‘³!¶Ùz|à -dΖÁŠó æ’âRN»Œ¼©¡â#ÌÈ!,ÙGâØ‹$&WÐ%òÌÄ'9tíÃ|Œ’V©æóؾUZ“‡læõð Þ×®Mêýšz{3Ö#Å€`.঳ž¿šÿ/M_ûM4›VÓèÑýU¯Ïvÿ¹çS“ \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/8d/1196beb5d1e6185dfd5980832adc1b59923258 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -x¥ÎMNÄ0@aÖ=E.À(qþ%„`à X#Çqi4¤©2®Äñ騾ŧG£÷& -Bz’ɬŠ%íK±)» zJ‰4 ’3‘jªv9pò.ªš’е´Æ -ÙAðžc«w9Ó%«I¥ò‚§lcª÷ylj´aSçþ-Ø;>pW/§‹ÚCÔoÛŽíçF£¿*ã ¦2xõ¬ÖËU¯_áÿKËçQQXõFs<P.ù@ùjûM~å¿”W% \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/8e/fad9a0398c76c27ed5a32d2823c2291a694461 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -xULA -Â0ô_áMÔ&MÒ6Zë6MÖs›{ ^ÅÿÜT)8„ÌÌÎìŽç¸…\h&3¦$C–±¡ïs©T%ö…Êt¼‚xÇ+yVntnT)„á….yy|ÝÞ<rÀZèa°Ødh6%©qŸ†N©éý€T'J×`òH{ä2²ÿð³õn–ˆ÷Ý/"ùUœ÷óÖ¹¶»´`ÛmSWÇÍ¡Vk]KÓœíNËññIv \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/98/365f3b89ce6a5102ba19e722cc0fa3fadba996 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -xK; -Ã0íìSh‹½Ò¥PðIÚ ¶‹Šc IzûÚyËûïvX¯·Lb ?u5'âÒýÐq“÷÷±¾‚kØ‹¦š£”-ŸÆWI`jÀ~²ÞX98Í<ª ²Q¹¡ÃlY°,O[⇰û¹<R,ð \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/9f/02b33db90fbe53f6b1cb8e1624946b90c91336 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -xeŠAƒ E»ÅSxhbºò,„/¬×—4vÕÍËÏûo‰iÍóõ(Ȱ"$´·*°þ·YÖT6ÛøúzÜ“9&wTªŽWvT±Çè$m “=þÞÁhe´VS§2Óp•ÿ0h \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/c9/ba1d8a454cbc8fd3ebbf002e33ced5f51b363f Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -x]PAnà ì5¼©°â Úi{°ÄúγN©0 X7éï±Û¨>±;³»3Ìɸ}®Ÿôè]@¿#”t,OµèÂùëPµqú WDÝ–ì -Ò;3]ÿÑÙ3Hmñ¾¼o‹]Eé#U0t“Aª#}ÝUò…m!ÊA‰k€NÝ:^Áškªíí Ù Œ^fPDÚóBDo4rvĤ>Óƒ6¿9ež_ðZææðßa+xÓõÀe%{ÏG|pjêqoVó´è|ŠFñ|´¾o ¸â/¼ÒÚVMŠo3ç'.A#Ì_ÍæÅ§Ó–/ÒÅ–m²’é‹ÂX¶–öº” \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/ce/bc3ab80ab25aa2af4ae265bd89387c2225a708 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -x¥KNÄ0Yç}FíOÛ±„l¸hÛ1Žƒ§s"®Àî©¥z¥·¶)Øžtˆ€¡–B¡PAO—è8y¬XÙ:fÂÙqÓÁCv…%ä9ù”³,~ªÖØìƒË±`M.˜¸XSÓħ®}ÀûøæÁeå >ÎýK¹5~ð/§H6âÛÚµñv¿•Þ^Áx¢‹žÑ#N½zUþoš>Ê*0äº#‡®ö!?ãÖOý5V& \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/cf/0fbf511a978a6292d3205bf82cd5f71f30dd70 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -x¥ÎANÄ0@QÖ=E.ÀÈqÒ:‘FˆÙpà&6hš*“"ŽOïÀö/ž~ª¥¬Ýà_z1EÝ-föÙs˜(O‚lGääQIœ*7Ù» œ•Ò¤³ò‰ #)A°ÉA’À£Õ0|ö¥6óhßÜ8-¼šsÿê\ -?y7÷ÓŒHð¾Ô^xÝn©–7c=Rˆ5¯à/æª×o—ÿKÃ瑹‹9Zx;Ek[ä’ëí·l"}WŒ \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/d0/4a2cd3a9a020709cacd39e082f3159804e01b8 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -xeŠAƒ D»ÅSôâÂtÓ³ü ŒøüP@½~©ÑU“™dòæM!NO=¼ ¦R€øº¨ãîÍ2ǼšÊ;èä©)ïíV¨$XžÙRÁgƒXO—à!qìð÷vºWºÿU -c÷Çm0˜ \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/d1/b92cb33cf7d2942655e776f5499c5bbff18bde Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -x½ANÃ0EYû¾@=™‰c©B”;$®0žŒ©Õ$®\÷þ '`ûÿûzúR·t;"=ô¦j 8;™‚'!A§Du9\¢&@‡æÂM÷¯aš(ÌšýìÇèY€CvIFRå”i’%¾õSmöØÎÜXN\ìëmÿè¼m|åÝø·Èwùó?Üó×áå¥ÖóðÞêãZ…×'ëÂ>Âl‡Îù>×õŸµ¦ì¥^íß|ÁsvE \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/ea/6162c79ad57153ed2a5961f77875f075a878d3 Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -x¥ŽKnÃ0»ö)xÔÏ” h7½A@Ktl4²…züèY¾fðR-eS°cøÐ&ì¦`Ù1aÊB—‘rŠ3ã‚âg°è‡ƒ›ì -ˆc`çlœK2g -̘'ZHL·#¥yàS×Úà§ýqã´ò¿ç~W.…Ÿ¼Ãõô„Á~¯UoKªåŒ·'Š~‚OôˆC§ý¯Êû¥ávdV²¥VŸ¬½|°^þËã+<U9 \ No newline at end of file
--- a/test-data/STR-FM/.git/objects/ff/125a58870195e28b6927b4a492f1df3dd5c97d Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -x¥ŽËmÃ0@{öZ E}(EÑ\ºA`$:6É®B¿AWèñ½ÃÃ+[k«¤ø¢CÄD’yö–PˆÕ%‡Î1ˆÌ>Që,ˆnÚyHWcstTf¡š‚ ç«·‘ÀÕR8 -Øš¯!M|è² s_<¸,¼š£ß”[ãwózx‚€ï˦×û¹líÍX”2儿`zÚç¯ÊÿKÓç^YÅáZe×ò}H/k¿ýñù§Ý' X§ \ No newline at end of file
--- a/test-data/STR-FM/.git/packed-refs Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -# pack-refs with: peeled fully-peeled -3b65f228ef1817a991dcd2a7f0bc35eeabf56cd9 refs/remotes/origin/master
--- a/test-data/STR-FM/.git/refs/heads/master Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -cebc3ab80ab25aa2af4ae265bd89387c2225a708
--- a/test-data/STR-FM/.git/refs/remotes/origin/HEAD Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -ref: refs/remotes/origin/master
--- a/test-data/STR-FM/.git/refs/remotes/origin/master Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -cebc3ab80ab25aa2af4ae265bd89387c2225a708
--- a/test-data/changespacetounderscore_readname.py Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -import sys -fd=open(sys.argv[1]) -output=open(sys.argv[2],'w') -columntochange=int(sys.argv[3])-1 # default is 6-1=5 -lines=fd.xreadlines() -for line in lines: - temp=line.strip().split('\t') - temp=filter(None,temp) - temp2=temp[columntochange].replace(' ','_') - product=temp[:columntochange] - product.append(temp2) - product.extend(temp[columntochange+1:]) - output.writelines('\t'.join(product)+'\n') -fd.close() -output.close() \ No newline at end of file
--- a/test-data/combinedprobforallelecombination.py Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -import sys -import collections -import math -SAMPLINGCOL=11 -ALLELE1COL=7 -ALLELE2COL=8 -SIGNCOL=4 -readprofileCOL=2 -motifCOL=3 -filaname=sys.argv[1] -fd=open(filaname) -lines=fd.readlines() -binomialcombine=collections.defaultdict(list) -for line in lines: - temp=line.strip().split('\t') - allelelist=[] - allelelist.append(int(temp[ALLELE1COL-1])) - allelelist.append(int(temp[ALLELE2COL-1])) - allelelist.sort() - #allelelist=map(str,allelelist) - alleleave=str(allelelist[0])+'_'+str(allelelist[1]) - #alleleave=str(sum(allelelist)/2.0) - ##alleleave=str(allelelist[0])+'_'+str(allelelist[1]) - totalcov=len(temp[readprofileCOL-1].split(',')) - motif=temp[motifCOL-1] - samplingvalue=float(temp[SAMPLINGCOL-1]) - SIGN=1 - binomialcombine[(totalcov,alleleave,motif)].append(SIGN*samplingvalue) -allkeys= binomialcombine.keys() -allkeys.sort() -##print allkeys -print 'read_depth'+'\t'+'allele'+'\t'+'heterozygous_prob'+'\t'+'motif' -for key in allkeys: - ##templist=[str(key[0]),key[1],str(sum(binomialcombine[key])),key[2],str(map(str,(binomialcombine[key])))] - templist=[str(key[0]),key[1],str(sum(binomialcombine[key])),key[2]] - - print '\t'.join(templist) -#print allkeys#,binomialcombine - - -
--- a/test-data/combineprobforallelecombination.xml Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,67 +0,0 @@ -<tool id="combineproballelecom" name="Combine probability to generate read profile " version="2.0.0"> - <description>from the same allele combination</description> - <command interpreter="python2.7">combinedprobforallelecombination.py $input > $output </command> - - <inputs> - <param name="input" type="data" label="Select microsatellite length profile" /> - - </inputs> - <outputs> - <data name="output" format="tabular" /> - </outputs> - <tests> - <!-- Test data with valid values --> - <test> - <param name="input" value="probvalueforhetero_out.txt"/> - <output name="output" file="combineprob_out.txt"/> - </test> - - </tests> - <help> - - -.. class:: infomark - -**What it does** - -- This tool will combine probability that the allele combination can generated any read profile in the input. This is the last step to calculate probability to detect heterozygous for each allele combination and each depth. - -**Citation** - -When you use this tool, please cite **Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research** - -**Input** - -The input format is the same as output from **Evaluate the probability of the allele combination to generate read profile** tool. - -- Column 1 = location of microsatellite locus. -- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format). -- Column 3 = motif of microsatellite in this locus. The input file can contain more than three column. -- Column 4 = homozygous/heterozygous label. -- Column 5 = log based 10 of (the probability of homozygous/the probability of heterozygous) -- Column 6 = Allele for most probable homozygous form. 
-- Column 7 = Allele 1 for most probable heterozygous form. -- Column 8 = Allele 2 for most probable heterozygous form. -- Column 9 = Probability of the allele combination to generate given read profile. -- Column 10 = Number of possible rearrangement of given read profile. -- Column 11 = Probability of the allele combination to generate read profile with any rearrangement (Product of column 9 and column 10) -- Column 12 = Read depth - -Only column 2,3,4,7,8,11 were used in calculation. - -**Output** - - -The output will contain the following header and column - -- Line 1 header: read_depth allele heterozygous_prob motif -- Column 1 = read depth -- Column 2 = allele combination -- Column 3 = probability to detect heterozygous of that allele combination -- Column 4 = motif - - - - -</help> -</tool>
--- a/test-data/fetchflank.xml Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -<tool id="fetchflank" name="Fetch flanking bases" version="1.0.0"> - <description> of microsatellites and output as two fastq files in forward-forward orientation</description> - <command interpreter="python">pair_fetch_DNA_ff.py $microsat_in_read $Leftflanking $Rightflanking $qualitycutoff $lengthofbasetocheckquality </command> - - <inputs> - <param name="microsat_in_read" type="data" label="Select data of microsatellites in reads" /> - <param name="qualitycutoff" type="integer" value="20" label="Minimum quality score (Phred+33) for microsatellites and flanking regions" /> - <param name="lengthofbasetocheckquality" type="integer" value="20" label="Length of flanking regions that require quality screening" /> - </inputs> - <outputs> - <data format="fastq" name="Leftflanking" /> - <data format="fastq" name="Rightflanking" /> - </outputs> - <tests> - <!-- Test data with valid values --> - <test> - <param name="microsat_in_read" value="samplefq.snoope"/> - <param name="qualitycutoff" value="20"/> - <param name="lengthofbasetocheckquality" value="20"/> - <output name="Leftflanking" file="microsatellite_flanking_L.fastq"/> - <output name="Rightflanking" file="microsatellite_flanking_R.fastq"/> - </test> - - </tests> - <help> - - -.. class:: infomark - -**What it does** - -This tool will fetch flanking regions around microsatellites, screen for quality score at microsatellites and adjacent flanking regions, and output two fastq files containing flanking regions in forward-forward direction. - -- This tool assumes that the quality score is Phred+33, such as Sanger fastq. -- Reads that have either left or right flanking regions shorter than the length of flanking regions that require quality screening will be removed. 
- -**Citation** -When you use this tool, please cite **Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research** - -**Input** - -The input files need to be in the same format as output from **microsatellite detection program**. This format contains **length of repeat**, **length of left flanking region**, **length of right flanking region**, **repeat motif**, **hamming (editing) distance**, **read name**, **read sequence**, **read quality score** - -**Output** - -The output will be the two fastq files. The first file contains left flank regions. The second file contains right flanking regions. - -**Example** - -- Suppose we detected the microsatellites from short reads :: - - 6 40 54 G 0 SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCTggggggTTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG?FFDFGGGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=99(/=5'6=4:CCC*AA - - -- We want to get fastq files of flanking regions around microsatellite with quality score at least 20 on Phred +33 - -- Then the program will report these two fastq files :: - - @SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 - TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCT - +SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 - GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG - - - @SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 - TTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG - +SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 - GGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=99(/=5'6=4:CCC*AA - - - -</help> -</tool>
### import libraries ###
import sys
import collections, math
import heapq
import itertools
from functools import reduce  # builtin on Python 2; only in functools on Python 3


### basic function ###
def stop_err(msg):
    '''str--> (never returns)
    Write msg to stderr and terminate with a non-zero exit status.
    '''
    sys.stderr.write(msg + '\n')
    sys.exit(1)

def permuterepeat(n,rlist):
    '''int,list of int--> int
    Multinomial coefficient n! / (r1! * r2! * ...): the number of distinct
    orderings of n items in which the i-th distinct item occurs rlist[i] times.
    '''
    f = math.factorial
    nfac=f(n)
    for r in rlist:
        # every partial quotient is an exact integer; '//' keeps the result an
        # int on Python 3 as well (plain '/' would silently produce a float)
        nfac=nfac//f(r)
    return nfac

def nCr(n,r):
    '''int,int--> int
    Binomial coefficient "n choose r".
    '''
    f = math.factorial
    return f(n) // f(r) // f(n-r)

def averagelist(a,b,expectedlevelofminor):
    '''list,list,float--> list
    Element-wise weighted mean of a and b: (1-w)*a[i] + w*b[i] where
    w = expectedlevelofminor. The result has len(a) elements, so b must be at
    least as long as a.
    '''
    return [(1-expectedlevelofminor)*a[i]+expectedlevelofminor*b[i] for i in range(len(a))]

def complement_base(read):
    '''str--> str
    Base-wise complement of read (NOT reversed). The result is upper case and
    any character other than A/C/G/T (case-insensitive) is dropped.
    '''
    pairs={'A':'T','T':'A','C':'G','G':'C'}
    return ''.join(pairs.get(base.upper(),'') for base in read)

def makeallpossible(read):
    '''str--> list of str
    Every rotation of read, each immediately followed by its complement.
    '''
    collect=[]
    for i in range(len(read)):
        tmp= read[i:]+read[:i]
        collect.append(tmp)
        collect.append(complement_base(tmp))
    return collect

def motifsimplify(base):
    '''str--> str
    Map a motif onto the representative listed in ALLMOTIF that equals one of
    its rotations or the complement of one of its rotations.

    Raises ValueError when the motif length is not in ALLMOTIF or when no
    representative matches (previously a bare KeyError / IndexError).
    '''
    motiflength=len(base)
    if motiflength not in ALLMOTIF:
        raise ValueError("unsupported motif length %d for motif %r" % (motiflength,base))
    temp=list(set(ALLMOTIF[motiflength]).intersection(set(makeallpossible(base))))
    if not temp:
        raise ValueError("motif %r has no representative in ALLMOTIF" % base)
    # NOTE(review): more than one representative can match (e.g. 'ACGT' matches
    # both 'ACGT' and 'ATGC'); which one is returned then depends on set
    # ordering. The original selection expression is kept unchanged on purpose.
    return temp[0]

def majorallele(seq):
    '''list--> int
    Most frequent element of seq as an int; ties are resolved in favour of the
    element that sorts last ("highly mutate mode").
    '''
    binseq=list(set(seq))
    binseq.sort(reverse=True) # highly mutate mode
    #binseq.sort() # majority mode
    storeform=''
    storevalue=0
    for i in binseq:
        occurrences=seq.count(i)
        if occurrences>storevalue:
            storeform=i
            storevalue=occurrences

    return int(storeform)

### decide global parameter ###
COORDINATECOLUMN=1
ALLELECOLUMN=2
MOTIFCOLUMN=3
MINIMUMMUTABLE=0 ###1.2*(1.0/(10**8)) #http://www.ncbi.nlm.nih.gov/pubmed/22914163 Kong et al 2012


## Fixed global variable
ALLREPEATTYPE=[1,2,3,4]
ALLREPEATTYPENAME=['mono','di','tri','tetra']
monomotif=['A','C']
dimotif=['AC','AG','AT','CG']
trimotif=['AAC','AAG','AAT','ACC','ACG','ACT','AGC','AGG','ATC','CCG']
tetramotif=['AAAC','AAAG','AAAT','AACC','AACG','AACT','AAGC','AAGG','AAGT','AATC','AATG','AATT',\
'ACAG','ACAT','ACCC','ACCG','ACCT','ACGC','ACGG','ACGT','ACTC','ACTG','AGAT','AGCC','AGCG','AGCT',\
'AGGC','AGGG','ATCC','ATCG','ATGC','CCCG','CCGG','AGTC']
ALLMOTIF={1:monomotif,2:dimotif,3:trimotif,4:tetramotif}
monorange=range(5,60)
dirange=range(6,60)
trirange=range(9,60)
tetrarange=range(12,80)
ALLRANGE={1:monorange,2:dirange,3:trirange,4:tetrarange}

#########################################
######## Prob calculation sector ########
#########################################
def multinomial_prob(majorallele,STRlength,motif,probdatabase):
    '''int,int,str,dict-->float
    ### get prob for each STRlength to be generated from major allele
    Plain lookup probdatabase[len(motif)][motif][majorallele][STRlength];
    raises KeyError when any of the keys is absent.
    '''
    return probdatabase[len(motif)][motif][majorallele][STRlength]

################################################
######## error model database sector ###########
################################################
def build_error_model(profile_lines):
    '''iterable of str--> dict
    Build the error model database[motif length][motif][major allele][read length]
    = fraction of the reads counted for that major allele that had that length.

    Each non-blank line holds tab separated: major allele length, observed read
    length, read count, motif. Combinations that never occur in the profile
    keep the value MINIMUMMUTABLE. A motif or major allele length that is not
    listed in ALLMOTIF / ALLRANGE raises KeyError.
    '''
    ## structure generator
    errormodeldatabase={}
    sumbymajoralleledatabase={}
    for repeattype in ALLREPEATTYPE:
        errormodeldatabase[repeattype]={}
        sumbymajoralleledatabase[repeattype]={}
        for motif in ALLMOTIF[repeattype]:
            errormodeldatabase[repeattype][motif]={}
            sumbymajoralleledatabase[repeattype][motif]={}
            for motifsize1 in ALLRANGE[repeattype]:
                errormodeldatabase[repeattype][motif][motifsize1]=dict(
                    (motifsize2,MINIMUMMUTABLE) for motifsize2 in ALLRANGE[repeattype])
                sumbymajoralleledatabase[repeattype][motif][motifsize1]=0

    ## read database once; the two passes below replace reading the file twice
    records=[]
    for line in profile_lines:
        stripped=line.strip()
        if not stripped:
            continue
        temp=stripped.split('\t')
        records.append((int(temp[0]),int(temp[1]),int(temp[2]),temp[3]))

    ## get read count for each major allele
    for t_major,t_read,t_count,motif in records:
        sumbymajoralleledatabase[len(motif)][motif][t_major]+=t_count

    ## get probability
    for t_major,t_read,t_count,motif in records:
        total=sumbymajoralleledatabase[len(motif)][motif][t_major]
        if total>0:
            errormodeldatabase[len(motif)][motif][t_major][t_read]=t_count/(total*1.0)
    return errormodeldatabase

#########################################
######## input reading sector ###########
#########################################
def process_line(line,errormodeldatabase,expectedlevelofminor):
    '''str,dict,float--> str or None
    Evaluate one input record and return the output row (no trailing newline):
    the stripped input line followed by four tab separated columns
      - probability that the allele pair generates the read profile as ordered,
      - number of distinct orderings of the read profile,
      - the product of the two,
      - read depth.
    Returns None for a blank line and for a record in which some read length
    has probability 0 under the allele pair (such records are not reported).

    Column 2 is the comma separated read length profile, column 3 the motif,
    columns 7 and 8 the two alleles of the heterozygous form.
    '''
    stripped=line.strip()
    if not stripped:
        return None
    temp=stripped.split('\t')
    i_motif=motifsimplify(temp[MOTIFCOLUMN-1])
    i_read=[int(x) for x in temp[ALLELECOLUMN-1].split(',')]
    depth=len(i_read)
    heteromajor1=int(temp[6])
    heteromajor2=int(temp[7])

    ### calculate the chance to detect combination (using error profile)
    alist=[multinomial_prob(heteromajor1,x,i_motif,errormodeldatabase) for x in i_read]
    blist=[multinomial_prob(heteromajor2,x,i_motif,errormodeldatabase) for x in i_read]
    ablist=averagelist(alist,blist,expectedlevelofminor)
    if 0 in ablist:
        return None
    heterozygous_collector=reduce(lambda y, z: y*z,ablist)

    ### prob of combination (using multinomial distribution)
    # Count every distinct length over the whole profile. itertools.groupby only
    # merges ADJACENT equal values, so it over-counted orderings whenever the
    # profile was not sorted (10,11,10 was treated as three distinct lengths).
    frequency_distribution=list(collections.Counter(i_read).values())
    expandbypermutation=permuterepeat(depth,frequency_distribution)

    return stripped+'\t'+str(heterozygous_collector)+'\t'+str(expandbypermutation)+'\t'+str(expandbypermutation*heterozygous_collector)+'\t'+str(depth)

def main(argv):
    '''list of str--> None
    argv[1] = input file, argv[2] = error profile file,
    argv[3] = expected contribution of the minor allele (0 to 0.5).
    Output rows are written to stdout.
    '''
    if len(argv)<4:
        stop_err("usage: heteroprob.py <input> <error profile> <expected level of minor allele>")
    inputname=argv[1]
    errorprofile=argv[2]
    expectedlevelofminor=float(argv[3])
    # written so that NaN is rejected too
    if not (0 <= expectedlevelofminor <= 0.5):
        stop_err("Expected contribution of minor allele must be at least 0 and not more than 0.5")

    with open(errorprofile) as fd:
        errormodeldatabase=build_error_model(fd)

    with open(inputname) as fd:
        for line in fd:
            result=process_line(line,errormodeldatabase,expectedlevelofminor)
            if result is not None:
                sys.stdout.write(result+'\n')

if __name__ == '__main__':
    main(sys.argv)
import sys
# remove all read that have unmatch microsat
# check only one line at a time

def complement_base(read):
    """Return the base-wise complement of read (NOT reversed).

    The result is upper case; characters other than A/C/G/T
    (case-insensitive) are dropped.
    """
    pairs = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return ''.join(pairs.get(base.upper(), '') for base in read)


def makeallpossible(read):
    """Return every rotation of read, each followed by its complement."""
    collect = []
    for i in range(len(read)):
        tmp = read[i:] + read[:i]
        collect.append(tmp)
        collect.append(complement_base(tmp))
    return collect


def compatible_lines(lines, firstcolumn, secondcolumn):
    """Yield the stripped lines whose two motif columns are compatible.

    lines        -- iterable of tab separated text lines
    firstcolumn  -- 0-based index of the first motif column
    secondcolumn -- 0-based index of the second motif column

    Empty fields are removed before the columns are indexed (as the original
    script did), so an empty field shifts every later column one to the left.
    A line is kept when the first motif equals a rotation of the second motif
    or the complement of such a rotation. Blank lines are skipped; a line with
    too few non-empty fields raises ValueError naming the line number instead
    of an anonymous IndexError.
    """
    needed = max(firstcolumn, secondcolumn) + 1
    for number, line in enumerate(lines, 1):
        stripped = line.strip()
        fields = [field for field in stripped.split('\t') if field]
        if not fields:
            continue
        if len(fields) < needed:
            raise ValueError("line %d has %d non-empty columns, need at least %d"
                             % (number, len(fields), needed))
        if fields[firstcolumn] in makeallpossible(fields[secondcolumn]):
            yield stripped


def main(argv):
    """argv[1] = input file, argv[2] / argv[3] = 1-based motif column numbers.

    Compatible lines are written to stdout.
    """
    if len(argv) < 4:
        sys.exit("usage: microsatcompat.py <input> <first column> <second column>")
    firstcolumn = int(argv[2]) - 1   # 4
    secondcolumn = int(argv[3]) - 1  # 10
    try:
        # 'with' closes the input even when a malformed line aborts the run
        with open(argv[1]) as fd:
            for kept in compatible_lines(fd, firstcolumn, secondcolumn):
                sys.stdout.write(kept + '\n')
    except ValueError as err:
        sys.exit(str(err))


if __name__ == '__main__':
    main(sys.argv)
--- a/test-data/microsatcompat.xml Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ -<tool id="microsatcompat" name="Check microsatellites motif compatibility" version="1.0.0"> - <description> </description> - <command interpreter="python">microsatcompat.py $input $column1 $column2 > $output </command> - - <inputs> - <param name="input" type="data" label="Select input" /> - <param name="column1" type="integer" value="4" label="First column number" /> - <param name="column2" type="integer" value="10" label="Second column number" /> - </inputs> - <outputs> - <data format="tabular" name="output" /> - - </outputs> - <tests> - <!-- Test data with valid values --> - <test> - <param name="input" value="microsatcompat_in.txt"/> - <param name="column1" value="4"/> - <param name="column2" value="10"/> - <output name="output" file="microsatcompat_out.txt"/> - </test> - - </tests> - <help> - - -.. class:: infomark - -**What it does** - -This tool is used to select only the input lines which have compatible microsatellite motifs between two columns. Two motifs are considered compatible when they are complementary to each other or become the same sequence when the starting point of the motif is shifted. For example, **A** is the same as **T**. Also, **AGG** is the same as **GAG**. - -For the TRFM pipeline (profiling microsatellites in short read data), this tool can be used to make sure that the microsatellites in the reads have the same motif as the microsatellites in the reference at the corresponding mapped location. - -**Citation** - -When you use this tool, please cite **Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research** - -**Input** - -The input file can be any tab-delimited file. 
- -If this tool is used in TRFM microsatellite profiling, it should contain: - -- Column 1 = microsatellite location in reference chromosome -- Column 2 = microsatellite location in reference start -- Column 3 = microsatellite location in reference stop -- Column 4 = microsatellite location in reference motif -- Column 5 = microsatellite location in reference length -- Column 6 = microsatellite location in reference motif size -- Column 7 = length of microsatellites (bp) -- Column 8 = length of left flanking regions (bp) -- Column 9 = length of right flanking regions (bp) -- Column 10 = repeat motif (bp) -- Column 11 = hamming distance -- Column 12 = read name -- Column 13 = read sequence with soft masking of microsatellites -- Column 14 = read quality (the same Phred score scale as input) -- Column 15 = read name (The same as column 12) -- Column 16 = chromosome -- Column 17 = left flanking region start -- Column 18 = left flanking region stop -- Column 19 = microsatellite start as inferred from paired-end -- Column 20 = microsatellite stop as inferred from paired-end -- Column 21 = right flanking region start -- Column 22 = right flanking region stop -- Column 23 = microsatellite length in reference -- Column 24 = microsatellite sequence in reference - -**Output** - -The same as input format. - - -</help> -</tool>
--- a/test-data/microsatellite.py Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1271 +0,0 @@ -#!/usr/bin/env python -""" -Snoop thru a fasta file looking for microsatellite repeats of given periods -Output format: length_of_repeat left_flank_length right_flank_length repeat_motif hamming_distance read_name read_sequence read_quality (additional columns) - -If --r option turned on, output format will have additional columns behind: -read_name read_chr pre_s pre_e tr_s tr_e suf_s suf_e tr_len tr_ref_seq - -pre_s where the read start -pre_e the last position before microsatellite -tr_s where microsatellite start -tr_e where microsatellite end -suf_s first base after microsatellite -tr_ref_seq reference sequence corresponding to microsatellite - -* output positions are 0 based - -:Author: Chen Sun (cxs1031@cse.psu.edu); Bob Harris (rsharris@bx.psu.edu) - -modifing log: - -09/27/2013 -replace function dense_intervals with function non_negative_intervals, which do not need to import such file. 
- -10/18/2013 -modify function find_repeat_element to get a quick speed, under the condition that hamming_distance = 0, which means do not allowed any mutation/indel - -02/25/2014 -add function that can deal with mapped reads -with additional output - -02/28/2014 -modify the 0-based end point, as in 0-base area, it is half-open [ ) -so the 0-based site, should always be added by 1 - -03/05/2014 -deal with multi-fasta -""" -from sys import argv,stdin,stderr,exit -from string import maketrans -from md5 import new as md5_new -import re -#from pyfracluster import dense_intervals - -def usage(s=None): - message = """ -usage: microsat_snoop [fasta_file] [options] - <fasta_file> Name of file to read sequences from; if absent, - sequences are read from stdin - --fasta Input file is in fasta format - (this is the default) - --fastq Input file is in fastq format - (default is fasta unless filename is .fastq) - --fastq:noquals Input file is in fastq format, but discard quals - --sam Input file is SAM file - --r Indicate additional output information, if indicated, - --ref option is mendatory - --ref=<filepath> Reference file (absolute) path - --period=<length> (mandatory,cumulative) repeat length(s) to be - searched for - <length> is expected to be small, less than 10 - <length> can also be a comma-separated list, or - a range <low>..<high> - --rate=<fraction> control the candidate repeat interval detector; - it will consider intervals with at least - <fraction> of matches when shifted by the period; - <fraction> is between 0 and 1 and can be either a - real number or <n>/<d> - (default is 6/7) - --minlength=<length> minimum length of intervals reported, in bp - (default is 20) - --progress=<count> how often to report the sequence we're searching - (default is no progress report) - --allowduplicates process all input sequences - (this is the default) - --noduplicates ignore any input sequence that's the same as an - earlier sequence - --nonearduplicates ignore any input 
sequence that has the same first - 100 bp as an earlier sequence - --nonearduplicate=<length> ignore any input sequence that has the same first - <length> bp as an earlier sequence - --hamming=<count> Don't report candidate repeat intervals that have - more than <count> mismatches - (default is to do no such filtering) - --prefix=<length> Don't report candidate repeat intervals that - start within <length> of the sequence start - (default is to do no such filtering) - --suffix=<length> Don't report candidate repeat intervals that - end within <length> of the sequence end - (default is to do no such filtering) - --subsample=<k>/<n> Process only the <k>th sequence of every group of - <n> sequences; <k> ranges from 1 to <n> - --multipleruns Consider all candidate intervals in a sequence - (default is to consider only the longest) - --partialmotifs Consider microatelites with a partial motif - (default is to consider only whole motifs) - --splitbyvalidity Preprocess sequences, splitting at Ns; this - prevents candidates from including Ns - (default is not to split) - --noflankdisplay Show entire sequence as flanking regions - (this is the default) - --flankdisplay=<length> Limit length of flanking regions shown - --readnamesuffix=<string> Root of suffix to append to read names; e.g. 1 - for forward, 2 for reverse; this triggers other - info to be included in the suffix - (default is "1" for fastq; no suffix for fasta) - --head=<number> limit the number of sequences processed - --markend Write a marker line upon completion - (default is not to write a marker) - --help=details Describe the process, and quit""" - - if (s == None): exit (message) - else: exit ("%s\n%s" % (s,message)) - - -detailedDescription = """In broad terms, the process works as follows: - -(1) Identify intervals that are highly correlated with the interval shifted by - P (the repeat period). These intervals are called "runs" or "candidates". 
- The level of correlation required is controlled by rateThreshold. - Depending on whether we want to look for more than one microsat, we either - find the longest such run (simple algorithm) or many runs (more complicated - algorithm). The following steps are then performed on each run. - -(2) Find the most likely repeat motif in the run. This is done by counting - all kmers (of length P) and choosing the most frequent. If that kmer is - itself covered by a sub-repeat we discard this run. The idea is that we - can ignore a 6-mer like ACGACG because we will find it when we are looking - for 3-mers. - -(3) Once we identify the most likely repeat motif, we then modify the - interval, adjusting start and end to find the interval that has the fewest - mismatches vs. a sequence of the motif repeated (hamming distance). Only - whole copies of the motif are considered. - -(4) At this point we have a valid microsat interval (in the eyes of the - program). It is subjected to some filtering stages (hamming distance or too - close to an end), and if it satisfies those conditions, it's reported to - the user.""" - -def main(): - global debug - - #=== parse the command line === - - inputFilename = None - referenceFileName = None #add by Chen Sun on 02/25 - inputFormat = None - repeatPeriods = [] - rateThreshold = 6 / 7.0 - lengthThreshold = 20 - reportProgress = None - discardDuplicates = False - discardNearDuplicates = False - nearDuplicatePrefix = 100 - hammingThreshold = 0 - prefixThreshold = None - suffixThreshold = None - subsampleK = None - subsampleN = None - reportMultipleRuns = False - allowPartialMotifs = False - splitByValidity = False - flankDisplayLimit = None - readNameSuffix = None - headLimit = None - markEndOfFile = False - additionalInfo = False - debug = [] - - for arg in argv[1:]: - if (arg == "--fasta"): - inputFormat = "fasta" - elif (arg == "--fastq"): - inputFormat = "fastq" - elif (arg == "--fastq:noquals"): - inputFormat = "fastq:noquals" - elif (arg 
== "--sam"): - inputFormat = "sam" - elif (arg == "--r"): - additionalInfo = True - elif (arg.startswith("--ref=")): - referenceFileName = arg.split("=",1)[1] - elif (arg.startswith("--period=")): - val = arg.split("=",1)[1] - for period in val.split(","): - if (".." in period): - (lowPeriod,highPeriod) = period.split("..",1) - lowPeriod = int(lowPeriod) - highPeriod = int(highPeriod) - for period in xrange(lowPeriod,highPeriod+1): - repeatPeriods += [period] - else: - repeatPeriods += [int(period)] - elif (arg.startswith("--rate=")): - val = arg.split("=",1)[1] - rateThreshold = float_or_fraction(val) - assert (0.0 < rateThreshold <= 1.0), "%s not a valid rate" % val - elif (arg.startswith("--minlength=")): - val = arg.split("=",1)[1] - lengthThreshold = int(val) - assert (lengthThreshold >= 0) - elif (arg.startswith("--progress=")): - val = arg.split("=",1)[1] - reportProgress = int(val) - elif (arg == "--allowduplicates"): - discardDuplicates = False - discardNearDuplicates = False - elif (arg == "--noduplicates"): - discardDuplicates = True - discardNearDuplicates = False - elif (arg == "--nonearduplicates"): - discardDuplicates = False - discardNearDuplicates = True - elif (arg.startswith("--nonearduplicate=")): - val = arg.split("=",1)[1] - discardDuplicates = False - discardNearDuplicates = True - nearDuplicatePrefix = int(val) - assert (nearDuplicatePrefix > 0) - elif (arg.startswith("--hamming=")): - val = arg.split("=",1)[1] - hammingThreshold = int(val) - assert (hammingThreshold >= 0) - elif (arg.startswith("--prefix=")): - val = arg.split("=",1)[1] - prefixThreshold = int(val) - assert (prefixThreshold >= 0) - elif (arg.startswith("--suffix=")): - val = arg.split("=",1)[1] - suffixThreshold = int(val) - assert (suffixThreshold >= 0) - elif (arg.startswith("--subsample=")): - val = arg.split("=",1)[1] - (k,n) = val.split("/",2) - subsampleK = int(k) - subsampleN = int(n) - assert (0 < subsampleK <= subsampleN) - elif (arg == "--multipleruns"): - 
reportMultipleRuns = True - elif (arg == "--partialmotifs"): - allowPartialMotifs = True - elif (arg == "--splitbyvalidity"): - splitByValidity = True - elif (arg == "--noflankdisplay"): - flankDisplayLimit = None - elif (arg.startswith("--flankdisplay=")): - val = arg.split("=",1)[1] - flankDisplayLimit = int(val) - assert (flankDisplayLimit >= 0) - elif (arg.startswith("--readnamesuffix")): - readNameSuffix = arg.split("=",1)[1] - elif (arg.startswith("--head=")): - headLimit = int_with_unit(arg.split("=",1)[1]) - elif (arg == "--markend"): - markEndOfFile = True - elif (arg == "--help=details"): - exit (detailedDescription) - elif (arg.startswith("--debug=")): - debug += (arg.split("=",1)[1]).split(",") - elif (arg.startswith("--")): - usage("unrecognized option: %s" % arg) - elif (inputFilename == None): - inputFilename = arg - else: - usage("unrecognized option: %s" % arg) - - #=== determine periods of interest === - - if (repeatPeriods == []): - usage("you gotta give me a repeat period") - - if (additionalInfo == True): - if (referenceFileName == None): - usage("reference file path needed. 
use --ref=<reference> to indicate") - - periodSeed = {} - for period in repeatPeriods: - if (period < 1): usage("period %d is not valid" % period) - periodSeed[period] = True - - repeatPeriods = [period for period in periodSeed] - repeatPeriods.sort() - - #=== determine input format === - - if (inputFormat == "fasta"): sequence_reader = fasta_sequences - elif (inputFormat == "fastq"): sequence_reader = fastq_sequences - elif (inputFormat == "fastq:noquals"): sequence_reader = fastq_sequences - elif (inputFormat == "sam"): sequence_reader = sam_sequences - elif (inputFilename == None): sequence_reader = fasta_sequences - elif (inputFilename.endswith(".fastq")): sequence_reader = fastq_sequences - elif (inputFilename.endswith(".fq")): sequence_reader = fastq_sequences - elif (inputFilename.endswith(".sam")): sequence_reader = sam_sequences - else: sequence_reader = fasta_sequences - - if (inputFilename != None): inputF = file(inputFilename,"rt") - else: inputF = stdin - - if (readNameSuffix == None) \ - and (sequence_reader == fastq_sequences) \ - and (inputFormat != "fastq:noquals"): - readNameSuffix = "1" - - #=== process the sequences === - - refSequence = {} - rightName = "" - sequence = "" - if additionalInfo: - firstFasta = True - originalRefF = open(referenceFileName) - for line in originalRefF.readlines(): - line = line.replace('\r','') - line = line.replace('\n','') - if line.startswith(">"): - if firstFasta: - firstFasta = False - else: - refSequence[rightName] = sequence - rightName = line[1:] - sequence = "" - continue - sequence += line - originalRefF.close() - refSequence[rightName] = sequence - - sequenceSeen = {} - - numSequences = 0 - for seqInfo in sequence_reader(inputF): - numSequences += 1 - if (headLimit != None) and (numSequences > headLimit): - print >>stderr, "limit of %d sequences reached" % headLimit - break - - if (sequence_reader == sam_sequences): - #seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar - (name, 
sequence, refName, pre_s, cigar) = seqInfo - quals = None - elif (sequence_reader == fastq_sequences): - (name,sequence,quals) = seqInfo - if (inputFormat == "fastq:noquals"): quals = None - else: - (name,sequence) = seqInfo - quals = None - - if (reportProgress != None) and (numSequences % reportProgress == 0): - print >>stderr, "%s %d" % (name,numSequences) - - # if we're subsampling and not interested in this sequence, skip it - - if (subsampleN != None): - if ((numSequences-1) % subsampleN != (subsampleK-1)): - continue - - # if this sequence is shorter than the length of interest, skip it - - seqLen = len(sequence) - if (seqLen < period) or (seqLen < lengthThreshold): continue - - # if we're not interested in duplicates and this is one, skip it; - # note that we assume no hash collisions occur, i.e. that all hash - # matches are truly sequence matches - - if (discardDuplicates): - h = hash108(sequence) - if (h in sequenceSeen): continue - sequenceSeen[h] = True - elif (discardNearDuplicates): - h = hash108(sequence[:nearDuplicatePrefix]) - if (h in sequenceSeen): continue - sequenceSeen[h] = True - - # split the sequence into chunks of valid nucleotides - - if (splitByValidity): - chunks = [(start,end) for (start,end) in nucleotide_runs(sequence)] - else: - chunks = [(0,len(sequence))] - - # evaluate for each period of interest - - for period in repeatPeriods: - - # operate on each chunk - - for (chunkStart,chunkEnd) in chunks: - chunkLen = chunkEnd - chunkStart - if (chunkLen < period) or (chunkLen < lengthThreshold): continue - - if ("validity" in debug) or ("correlation" in debug) or ("runs" in debug): - print >>stderr, ">%s_%d_%d" % (name,chunkStart,chunkEnd) - - # compute correlation sequence - - corr = correlation_sequence(sequence,period,chunkStart,chunkEnd) - - if ("correlation" in debug) or ("runs" in debug): - print >>stderr, sequence[chunkStart:chunkEnd] - print >>stderr, corr - - # find runs (candidates for being a microsat) - - if 
(reportMultipleRuns): - runs = all_suitable_runs(corr,lengthThreshold-period,rateThreshold, hammingThreshold) - else: - runs = longest_suitable_run(corr,lengthThreshold,rateThreshold) - if (runs == []): continue - - - if ("runs" in debug): - for (start,end) in runs: - run = [" "] * seqLen - for ix in xrange(start-period,end): - run[ix] = "*" - print >>stderr, "".join(run) - - if ("candidates" in debug): - for (start,end) in runs: - print >>stderr, "%s %d %d" % (name,start,end) - - # process runs and report those that pass muster - - runCount = 0 - for (start,end) in runs: - runCount += 1 - - start = chunkStart + start - period - end = chunkStart + end - - (kmer,d,start,end) = find_repeat_element(hammingThreshold, period,sequence,start,end,allowPartials=allowPartialMotifs) - if (kmer == None): continue # (no useful repeat kmer was found) - - rptExtent = end - start - prefixLen = start - suffixLen = seqLen - end - if (rptExtent <= period): continue - if (hammingThreshold != None) and (d > hammingThreshold): continue - if (prefixThreshold != None) and (prefixLen < prefixThreshold): continue - if (suffixThreshold != None) and (suffixLen < suffixThreshold): continue - - if (flankDisplayLimit == None): - seq = sequence[:start] \ - + sequence[start:end].lower() \ - + sequence[end:] - else: - seq = sequence[max(chunkStart,start-flankDisplayLimit):start] \ - + sequence[start:end].lower() \ - + sequence[end:min(chunkEnd,end+flankDisplayLimit)] - reportName = name - if (readNameSuffix != None): - reportName += "_"+readNameSuffix+"_per"+str(period)+"_"+str(runCount) - if (quals == None or quals == "." or quals == "\t."): quals = "\t." 
- else: quals = "\t" + quals - if not additionalInfo: - print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s" \ - % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals) - else: - #pre_e = pre_s + prefixLen - 1 - refPoint = pre_s - donorPoint = 0 - - donorBeforeStart = prefixLen - 1 #pre_e - donorMicroStart = prefixLen #tr_s - donorMicroEnd = donorMicroStart + rptExtent - 1 #tr_e - donorAfterMicro = donorMicroEnd + 1 #suf_s - donorEnd = len(seq) - 1 #suf_e - - set_pre_e = False - set_tr_s = False - set_tr_e = False - set_suf_s = False - set_suf_e = False - - pre_e = 0 - tr_s = 0 - tr_e = 0 - suf_s = 0 - suf_e = 0 - - matchList = re.findall('(\d+)([IDM])', cigar) - unCognitiveCigar = False - for matchN, matchType in matchList: - matchNum = int(matchN) - if matchType == "M": - donorPoint = donorPoint + matchNum - refPoint = refPoint + matchNum - elif matchType == "D": - refPoint = refPoint + matchNum - continue - elif matchType == "I": - donorPoint = donorPoint + matchNum - else: - unCognitiveCigar = True - break - - if not set_pre_e: - if donorPoint >= donorBeforeStart: - pre_e = refPoint - (donorPoint - donorBeforeStart) - set_pre_e = True - else: - continue - - if not set_tr_s: - if donorPoint >= donorMicroStart: - tr_s = refPoint - (donorPoint - donorMicroStart) - set_tr_s = True - else: - continue - - if not set_tr_e: - if donorPoint >= donorMicroEnd: - tr_e = refPoint - (donorPoint - donorMicroEnd) - set_tr_e = True - else: - continue - - if not set_suf_s: - if donorPoint >= donorAfterMicro: - suf_s = refPoint - (donorPoint - donorAfterMicro) - set_suf_s = True - else: - continue - - if not set_suf_e: - if donorPoint >= donorEnd: - suf_e = refPoint - (donorPoint - donorEnd) - set_suf_e = True - else: - continue - - if unCognitiveCigar: - break - tr_len = tr_e - tr_s + 1 - - if refName not in refSequence: - tr_ref_seq = "." - else: - if refSequence[refName] == "": - tr_ref_seq = "." - elif len(refSequence[refName]) <= tr_e: - tr_ref_seq = "." 
- else: - tr_ref_seq = refSequence[refName][tr_s:tr_e+1] - - pre_e += 1 - tr_e += 1 - suf_e += 1 - print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s" \ - % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals,reportName,refName,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq) - - if (markEndOfFile): - print "# microsat_snoop end-of-file" - - if (inputF != stdin): - inputF.close() - -# non_negative_intervals -# find intervals with exactly + and no - -# from string like this : +++++++++---+++++++++ -def non_negative_intervals(seq, minLength=None): - - start = -1 - end = -1 - firstPlus = 1 - #print seq - for ix in range(len(seq)): # for every char in seq - ch = seq[ix] - if(ch == "+"): - if(firstPlus): - firstPlus = 0 - start = ix - else: - continue - elif(ch == "-"): - if(start >= 0): - end = ix-1 - if((end - start + 1) >= minLength): - yield (start,end+1) - start = -1 - firstPlus = 1 - if(start > 0): - if((ix - start + 1) >= minLength): - yield (start, ix+1) - - -################################################################### -# modified by Chen Sun on 7/11/2014 -# We do not want other modules, so parse these functions inside -# -################################################################### - -# parse a string of the form {positives}/{positives_and_neutrals} - -def parse_spec(s): - if ("/" not in s): raise ValueError - (n,d) = s.split("/",1) - if (not n.startswith("{")) or (not n.endswith("}")): raise ValueError - if (not d.startswith("{")) or (not d.endswith("}")): raise ValueError - - positives = n[1:-1] - d = d[1:-1] - - for ch in positives: - if (ch not in d): raise ValueError - - neutrals = [ch for ch in d if (ch not in positives)] - return (positives,neutrals) - - -# convert a string to a number, allowing fractions - -def float_or_fraction(s): - if ("/" in s): - (numer,denom) = s.split("/",1) - return float(numer)/float(denom) - else: - return float(s) - - -# dense_intervals-- -# Find all non-overlapping runs with 
a good enough rate (of positives), and -# which meet our length threshold. -# -# The algorithm used is adapted from Zhang, Berman, Miller, "Post-processing -# long pairwise alignments", Bioinformatics Vol. 15 no. 12 1999. -# -# $$$ we use the denominator as the threshold, but we really should use the -# $$$ .. numerator, comparing it to minLength*rate - -def dense_intervals(seq,rate,positives,neutrals,blockers="",minLength=None): - - if (blockers == None): - blockers = "".join([chr(n) for n in range(1,256) - if (chr(n) not in positives) - and (chr(n) not in neutrals)]) - - stackLeft = [None] # stack with each entry containing five - stackRight = [None] # .. elements; note that entry zero is not - stackLeftScore = [None] # .. used - stackRightScore = [None] - stackLower = [None] - top = 0 - score = 0 - - for ix in range(len(seq)): - ch = seq[ix] - if (ch in blockers): - # emit intervals - - for sp in range(1,top+1): - left = stackLeft [sp] + 1 - right = stackRight[sp] - - while (left < right) and (seq[left] not in positives): left += 1 - while (right > left) and (seq[right] not in positives): right -= 1 - - right += 1 - if (minLength == None) or (right - left >= minLength): - yield (left,right) - - #empty stack - - stackLeft = [None] - stackRight = [None] - stackLeftScore = [None] - stackRightScore = [None] - stackLower = [None] - top = 0 - score = 0 - continue - - if (ch in positives): weight = 1-rate - elif (ch in neutrals): weight = -rate - else: raise ValueError - - score += weight - #if ("algorithm" in debug): - # print >>sys.stderr, "%3d: %c %5.2f" % (ix, ch, score), - - if (weight < 0): - #if ("algorithm" in debug): - # print >>sys.stderr - continue - - if (top > 0) and (stackRight[top] == ix-1): - # add this site to the interval on top of the stack - - stackRight [top] = ix - stackRightScore[top] = score - - #if ("algorithm" in debug): - # print >>sys.stderr, \ - # " extending [%d] %d-%d %4.1f %4.1f" \ - # % (top, - # stackLeft [top], stackRight [top], - # 
stackLeftScore[top], stackRightScore[top]), - - else: - # create a one site interval - - top += 1 - if (top >= len(stackLeft)): - stackLeft += [None] - stackRight += [None] - stackLeftScore += [None] - stackRightScore += [None] - stackLower += [None] - - stackLeft [top] = ix - 1 - stackLeftScore [top] = score - weight - stackRight [top] = ix - stackRightScore[top] = score - stackLower [top] = top - 1 - - while (stackLower[top] > 0) \ - and (stackLeftScore[stackLower[top]] > stackLeftScore[top]): - stackLower[top] = stackLower[stackLower[top]] - - #if ("algorithm" in debug): - # print >>sys.stderr, \ - # " creating [%d] %d-%d %4.1f %4.1f -> %d" \ - # % (top, - # stackLeft [top], stackRight [top], - # stackLeftScore[top], stackRightScore[top], - # stackLower [top]), - - # merge intervals; if there is a previous interval with a no-higher - # left score and no-higher right score, merge this interval (and all - # intervening ones) into that one - - while (top > 1) \ - and (stackLower[top] > 0) \ - and (stackRightScore[stackLower[top]] <= stackRightScore[top]): - stackRight [stackLower[top]] = stackRight [top] - stackRightScore[stackLower[top]] = stackRightScore[top] - top = stackLower[top] - - #if ("algorithm" in debug): - # print >>sys.stderr, \ - # "\n%*s merging [%d] %d-%d %4.1f %4.1f" \ - # % (13, "", top, - # stackLeft[top], stackRight [top], - # stackLeftScore[top], stackRightScore[top]), - - #if ("algorithm" in debug): - # print >>sys.stderr - - # emit intervals - - for sp in range(1,top+1): - left = stackLeft [sp] + 1 - right = stackRight[sp] - - while (left < right) and (seq[left] not in positives): left += 1 - while (right > left) and (seq[right] not in positives): right -= 1 - - right += 1 - if (minLength == None) or (right - left >= minLength): - yield (left,right) - - -################################################################### -# modified by Chen Sun on 7/11/2014 -# -################################################################### - -# 
# correlation_sequence--
#   Compute the correlation sequence for a given period.  This is a sequence
#   of + and - indicating whether the base at a given position matches the one
#   P positions earlier (where P is the period).  The first P positions are
#   blank.  Positions with single character runs longer than the period are
#   considered as non-matches, unless the period is 1.

# (compatibility) xrange does not exist on Python 3; fall back to range there
# so the lazy index iteration used below works on either interpreter
try:
    xrange
except NameError:
    xrange = range


def correlation_sequence(sequence,period,start=None,end=None):
    """Return the +/- correlation string for sequence[start:end].

    sequence: the nucleotide string being scanned
    period:   repeat period P; position ix is compared with position ix-P
    start:    first index of the chunk (defaults to 0)
    end:      index just past the chunk (defaults to len(sequence))

    The result has one character per chunk position: P leading blanks, then
    "+" where the base is one of ACGT, equals the base P positions earlier,
    and (unless P is 1) the current homopolymer run is still shorter than P;
    "-" everywhere else.

    The chunk must hold at least P characters; the caller in this file
    skips shorter chunks before calling.
    """
    if (start is None): start = 0
    if (end is None): end = len(sequence)

    # prime the homopolymer run length over the first P positions, which
    # have nothing P positions earlier to be compared with
    prevCh = sequence[start]
    run = 1
    for ix in xrange(start+1,start+period):
        ch = sequence[ix]
        if (ch != prevCh): run = 1
        else: run += 1
        prevCh = ch

    corr = [" "] * period
    for ix in xrange(start+period,end):
        rptCh = sequence[ix-period]
        ch = sequence[ix]
        if (ch != prevCh): run = 1
        else: run += 1
        if (ch in "ACGT") \
           and (ch == rptCh) \
           and ((period == 1) or (run < period)):
            corr.append("+")
        else:
            corr.append("-")
        prevCh = ch

    return "".join(corr)


# longest_suitable_run--
#   Find longest run with a good enough rate (of positives).
#
# We score a "+" as 1-r and anything else as -r.  This is based on the fol-
# lowing derivation (p is the number of "+"s, n is the number of non-"+"s):
#   p/(p+n) >= r
#   ==> p >= rp + rn
#   ==> (1-r)p - rn >= 0
#
# We adapt an algorithm from "Programming Pearls", pg. 81 (2000 printing).
#
# $$$ we use the denominator as the threshold, but we really should use the
# $$$ .. numerator, comparing it to minLength*rate
#
# $$$ this needs to account for $$$ this situation:
# $$$   sequence: ACGACGACGACGTTATTATTATTA
# $$$   matches:     +++++++++---+++++++++
# $$$ this is currently considered to be one interval (if rate <= 6/7), but it
# $$$ ought to be two;  we can't just post-process, though, because some other
# $$$ interval might be longer than the longest half of this;  maybe what we
# $$$ need to do is consider matches at distances -P and -2P, or if we match
# $$$ -P but that itself was a mismatch, we should carry the mismatch forward

def longest_suitable_run(seq,minLength,rate):
    """Return the best-scoring run of seq as [(start,end)], or [].

    seq:       correlation string; "+" is a positive, anything else is not
    minLength: minimum run length (end - start) for the run to be returned
    rate:      required fraction of positives within the run

    Returns a list holding the single half-open interval (start,end) with
    the highest score (later runs win ties), or an empty list when no run
    was found or the best one is shorter than minLength.
    """
    maxEndingHere = 0
    maxSoFar = 0
    start = None
    end = None
    # index of the last position at which the running score was reset; the
    # candidate run begins just after it.  Starting at -1 lets a run begin at
    # index 0 (previously 'block' was unbound until the first reset, so a
    # sequence whose first symbol scored >= 0 raised NameError)
    block = -1

    for ix in xrange(len(seq)):
        if (seq[ix] == "+"): s = 1-rate
        else: s = -rate

        if (maxEndingHere+s < 0):
            maxEndingHere = 0
            block = ix
        else:
            maxEndingHere += s
            if (maxEndingHere >= maxSoFar):
                maxSoFar = maxEndingHere
                start = block + 1
                end = ix + 1

    if (start is None) or (end - start < minLength):
        return []
    return [(start,end)]


# all_suitable_runs--
#   Find all non-overlapping runs with a good enough rate (of positives), and
#   which meet our length threshold.
# $$$ this needs to post-process the intervals, splitting them to account for
# $$$ this situation:
# $$$   sequence: ACGACGACGACGTTATTATTATTA
# $$$   matches:     +++++++++---+++++++++
# $$$ this is currently reported as one interval (if rate <= 6/7), but it
# $$$ ought to be two

def all_suitable_runs(seq,minCorrLength,rate, hammingThreshold):
    """Return every candidate run in a correlation string as a list.

    seq:              correlation string of "+" and "-" (with leading blanks)
    minCorrLength:    minimum run length, passed through as minLength
    rate:             required fraction of positives; used only by the
                      mismatch-tolerant search
    hammingThreshold: when greater than zero, runs may contain mismatches and
                      are found with dense_intervals(); otherwise only
                      uninterrupted runs of "+" are returned, found with
                      non_negative_intervals()

    Always returns a list of (start,end) tuples, possibly empty.
    """

    ################################################################
    # modified by Chen Sun on 07/11/2014
    #
    ################################################################

    # A threshold of None (or a negative one) used to match neither branch,
    # so the function returned None and the caller failed when it iterated
    # over the result.  Anything that is not a positive threshold now takes
    # the exact-match path, the same way find_repeat_element() treats it.
    if (hammingThreshold is not None) and (hammingThreshold > 0):
        return list(dense_intervals(seq,rate,"+","-",blockers=None,minLength=minCorrLength))
    return list(non_negative_intervals(seq, minLength=minCorrLength))


# find_repeat_element--
#   Find the most plausible repeat element for a run, and nudge the ends of
#   the run if needed.  Note that we will not consider kmers that represent
#   shorter repeats.  For example, we won't report ACTACT as a 6-mer since we
#   consider this to have a shorter period than 6.
- -def find_repeat_element(hammingThreshold, period,seq,start,end,allowPartials=False): - - if hammingThreshold > 0: - (kmer,bestD,bestStart,bestEnd) = find_hamming_repeat_element(period,seq,start,end,allowPartials) - return (kmer,bestD,bestStart,bestEnd) - # count the number of occurences of each k-mer; note that we can't - # reject kmers containing smaller repeats yet, since for a sequence like - # ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best - # 6-mer, and THEN reject it; if we reject ACACAC while counting, we'd end - # up reporting something like ACACAA as the best motif - - if ("element" in debug): - print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end) - - if ("partial" in debug): - print period, seq, start, end, allowPartials; - print seq[start:end] - - kmerToCount = {} - kmerToFirst = {} - for ix in xrange(start,end-(period-1)): - kmer = seq[ix:ix+period] - if ("N" in kmer): continue - if (kmer not in kmerToCount): - kmerToCount[kmer] = 1 - kmerToFirst[kmer] = ix - else: - kmerToCount[kmer] += 1 - #if ("element" in debug): - # print >>stderr, " %d: %s" % (ix,kmer) - - # choose the best k-mer; this is simply the most frequently occurring one, - # with ties broken by whichever one came first - - kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount] - if (kmers == []): return (None,None,start,end) - kmers.sort() - - if ("element" in debug): - for (count,first,kmer) in kmers: - print >>stderr, " %s: %d" % (kmer,-count) - - (count,first,kmer) = kmers[0] - if (contains_repeat(kmer)): return (None,None,start,end) - - # determine the hamming distance between the run and a simple repeat, for - # each "plausible" start and end; we compute the distance for each such - # interval, and choose the one with the lowest hamming distance; ties are - # broken in a deterministic-but-unspecified manner - - bestD = bestStart = bestEnd = None - 
################################################################################### - # modified by Chen Sun(cxs1031@cse.psu.edu) on 10/18/2013 - # since we do not allow hamming_distance > 0, which means we do not allow mutation, - # we do not need this section to produce bestStart and End - ################################################################################### - - #for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials): - # d = hamming_distance(seq,s,e,kmer) - # if (d == None): continue - # if (bestD == None) or (d <= bestD): - # (bestD,bestStart,bestEnd) = (d,s,e) - - - - bestStart = start - - if(allowPartials): - bestEnd = end - elif(not allowPartials): - bestEnd = start - pattern = seq[start:start+period] - if ("partial" in debug): - print "kmer:", kmer - if(pattern != kmer): - print "pattern:", pattern - - while(bestEnd <= end-period): - bestEnd += period - - # bestD will always be 0, as we do not allow mutation - bestD = 0 - - if ("partial" in debug): - print bestD, bestStart, bestEnd - - ################################################################################### - # modified by Chen Sun(cxs1031@cse.psu.edu) on 10/10 - # - ################################################################################### - return (kmer,bestD,bestStart,bestEnd) - - -def find_hamming_repeat_element(period,seq,start,end,allowPartials=False): - - # count the number of occurences of each k-mer; note that we can't - # reject kmers containing smaller repeats yet, since for a sequence like - # ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best - # 6-mer, and THEN reject it; if we reject ACACAC while counting, we'd end - # up reporting something like ACACAA as the best motif - - if ("element" in debug): - print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end) - - kmerToCount = {} - kmerToFirst = {} - for ix in xrange(start,end-(period-1)): - kmer = seq[ix:ix+period] - if ("N" in kmer): continue 
- if (kmer not in kmerToCount): - kmerToCount[kmer] = 1 - kmerToFirst[kmer] = ix - else: - kmerToCount[kmer] += 1 - #if ("element" in debug): - # print >>stderr, " %d: %s" % (ix,kmer) - - # choose the best k-mer; this is simply the most frequently occurring one, - # with ties broken by whichever one came first - - kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount] - if (kmers == []): return (None,None,start,end) - kmers.sort() - - if ("element" in debug): - for (count,first,kmer) in kmers: - print >>stderr, " %s: %d" % (kmer,-count) - - (count,first,kmer) = kmers[0] - if (contains_repeat(kmer)): return (None,None,start,end) - - # determine the hamming distance between the run and a simple repeat, for - # each "plausible" start and end; we compute the distance for each such - # interval, and choose the one with the lowest hamming distance; ties are - # broken in a deterministic-but-unspecified manner - - bestD = bestStart = bestEnd = None - - for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials): - d = hamming_distance(seq,s,e,kmer) - if (d == None): continue - if (bestD == None) or (d <= bestD): - (bestD,bestStart,bestEnd) = (d,s,e) - - return (kmer,bestD,bestStart,bestEnd) - -# plausible_intervals-- -# Yield all plausible intervals intersecting with a run. We generate all -# starts within P bp of the run's start. For each of these, we either (a) try -# all ends within P bp of run's end, or (b) trim the new interval to a whole -# multiple of the period, and report this short interval and the longer -# interval with one more period appended. Case (a) allows partial motifs, -# while case (b) only allows whole motifs. 

def plausible_intervals(start,end,period,seqLen,allowPartials=False):
    """Yield candidate (start,end) intervals around a run.

    start, end:    the run's half-open interval within the sequence
    period:        repeat period P
    seqLen:        length of the whole sequence; candidate ends never exceed it
    allowPartials: True to allow a trailing partial copy of the motif

    Candidate starts lie within P-1 positions either side of the run's start
    (negative starts are skipped).  With allowPartials, every end within P-1
    positions of the run's end is paired with each start, keeping only
    intervals longer than one period.  Otherwise each start yields the
    interval trimmed to a whole number of periods and, if it still fits in
    the sequence, the interval one period longer.
    """

    # generate intervals that allow a partial copy of the motif

    if (allowPartials):
        for candStart in range(start-(period-1),start+period):
            if (candStart < 0): continue
            for candEnd in range(end-(period-1),end+period):
                if (candEnd > seqLen): continue
                if (candEnd <= candStart+period): continue
                yield (candStart,candEnd)

    # -OR- generate intervals that allow only whole copies of the motif

    else:
        for candStart in range(start-(period-1),start+period):
            if (candStart < 0): continue
            # explicit floor division: identical to the old int "/" on
            # Python 2, and still an int under true division
            candEnd = candStart + ((end-candStart)//period)*period
            yield (candStart,candEnd)
            candEnd += period
            if (candEnd <= seqLen): yield (candStart,candEnd)


# hamming_distance--
#   Determine the hamming distance between the run and a simple repeat.
# $$$ improve this by allowing gaps, and stopping when we reach a threshold

kmerToDiffs = {}  # (this is used for memo-ization)


def _motif_mismatches(kmer,qmer):
    """Count positions where qmer differs from the same-length prefix of
    kmer, memoising the answer in kmerToDiffs[kmer]."""
    memo = kmerToDiffs[kmer]
    if (qmer not in memo):
        diffs = 0
        for iy in range(len(qmer)):
            if (qmer[iy] != kmer[iy]): diffs += 1
        memo[qmer] = diffs
    return memo[qmer]


def hamming_distance(seq,start,end,kmer):
    """Return the number of mismatches between seq[start:end] and kmer
    repeated end-to-end, or None if the interval is shorter than one kmer.

    The interval is compared one whole period at a time; a trailing partial
    period, if any, is compared against the matching prefix of kmer.
    """
    period = len(kmer)
    if (end < start + period): return None

    wholeEnd = start + ((end-start)//period)*period

    if (kmer not in kmerToDiffs):
        kmerToDiffs[kmer] = { kmer:0 }

    d = 0
    for ix in range(start,wholeEnd,period):
        d += _motif_mismatches(kmer,seq[ix:ix+period])  # same size as the kmer motif

    if (end > wholeEnd):
        d += _motif_mismatches(kmer,seq[wholeEnd:end])  # shorter than the kmer motif

    return d


# fasta_sequences--
#   Read the fasta sequences from a file.  Note that we convert to upper case,
#   and convert any letter other than ACGT to N.

# translation table mapping every upper-case letter other than A, C, G, T and
# N to N.  maketrans lives in the string module on Python 2 and on str on
# Python 3; resolve it locally so this table can be built on either.
try:
    from string import maketrans
except ImportError:
    maketrans = str.maketrans

nonDnaMap = maketrans("BDEFHIJKLMOPQRSUVWXYZ","NNNNNNNNNNNNNNNNNNNNN")


def _as_dna(nucs):
    """Upper-case a nucleotide string and replace non-ACGT letters with N."""
    return nucs.upper().translate(nonDnaMap)


def fasta_sequences(f):
    """Yield (name,sequence) for each record of the fasta file f.

    f is any iterable of lines.  The name is the first word of the header
    line; the sequence is the concatenation of the record's lines, upper
    cased with non-ACGT letters converted to N.  Raises AssertionError if
    sequence data appears before the first header.
    """
    seqName = None
    seqNucs = None

    for line in f:
        line = line.strip()
        if (line.startswith(">")):
            if (seqName is not None):
                # every record is normalised, not only the last one; this
                # yield used to emit the raw, unconverted text
                yield (seqName,_as_dna("".join(seqNucs)))
            seqName = sequence_name(line)
            seqNucs = []
        elif (seqName is None):
            assert (False), "first sequence has no header"
        else:
            seqNucs.append(line)

    if (seqName is not None):
        yield (seqName,_as_dna("".join(seqNucs)))


# fastq_sequences--
#   Read the fastq sequences from a file.  Note that we convert to upper case,
#   and convert any letter other than ACGT to N.

def fastq_sequences(f):
    """Yield (name,sequence,quals) for each four-line record of fastq file f.

    f is any iterable of lines.  The name is the header line without its
    leading "@"; the sequence is upper cased with non-ACGT letters converted
    to N; quals is the quality line as read.  Raises AssertionError on a
    malformed header or separator line, on a sequence/quality length
    mismatch, or if the file ends part way through a record.
    """
    lineNum = 0
    for line in f:
        lineNum += 1
        line = line.strip()

        if (lineNum % 4 == 1):
            assert (line.startswith("@")), \
                   "bad read name at line %d" % lineNum
            seqName = line[1:]
            continue

        if (lineNum % 4 == 2):
            seqNucs = line
            continue

        if (lineNum % 4 == 3):
            assert (line.startswith("+")), \
                   "can't understand line %d:\n%s" % (lineNum,line)
            continue

        quals = line
        assert (len(quals) == len(seqNucs)), \
               "length mismatch read vs. qualities at line %d" % lineNum
        yield (seqName,_as_dna(seqNucs),quals)

    assert (lineNum % 4 == 0), \
           "incomplete read at end of file"


def sam_sequences(f):
    """Yield (name,sequence,refName,pre_s,cigar) for each alignment line of f.

    f is any iterable of tab-separated lines.  Lines starting with "@" and
    blank lines are skipped.  Values come from columns 1, 10, 3, 4 and 6
    respectively; pre_s is column 4 minus one, and the sequence is upper
    cased with non-ACGT letters converted to N.  Raises AssertionError for a
    line with fewer than ten columns.
    """
    lineNum = 0
    for line in f:
        lineNum += 1
        line = line.strip()

        if (line == "") or line.startswith("@"):
            continue

        columns = line.split("\t")
        # columns up to index 9 are read below; report a short line by
        # number instead of letting it surface as a bare IndexError
        assert (len(columns) >= 10), \
               "not enough columns at line %d" % lineNum
        seqName = columns[0]
        refName = columns[2]
        pre_s = int(columns[3]) - 1
        cigar = columns[5]
        seqNucs = columns[9]

        yield (seqName,_as_dna(seqNucs), refName, pre_s, cigar)


# sequence_name--
#   Extract the sequence name from a fasta header.
# $$$ this may need to be improved $$$

def sequence_name(s):
    """Return the first whitespace-delimited word after the leading
    character of header line s, or "" if nothing follows it."""
    s = s[1:].strip()
    if (s == ""): return ""
    else: return s.split()[0]


# nucleotide_runs--
#   Yield (start,end) for all runs of valid nucleotides in a sequence.

def nucleotide_runs(s):
    """Yield half-open (start,end) intervals for each maximal run of A, C, G
    or T characters in s."""
    start = None
    for (ix,nuc) in enumerate(s):
        if (nuc in "ACGT"):
            if (start is None):
                start = ix
        elif (start is not None):
            yield (start,ix)
            start = None

    # a run that reaches the end of the sequence is closed here
    if (start is not None): yield (start,len(s))


# contains_repeat--
#   Determine whether a short sequence contains a repeated element, such as a
#   6-mer containing a repeated 2-mer (ACACAC) or 3-mer (ACTACT).  The repeat
#   must cover the entire sequence, without mismatches.

def contains_repeat(kmer):
    """Return True if kmer is two or more exact copies of a shorter unit."""
    kmerLength = len(kmer)
    # only unit lengths that divide the kmer and fit at least twice can tile it
    for rptLen in range(1,kmerLength//2+1):
        if (kmerLength % rptLen != 0): continue
        if (kmer == kmer[:rptLen] * (kmerLength//rptLen)):
            return True
    return False


# hash108--
#   Return a 108-bit hash "value" of a string

def hash108(s):
    """Return the first 27 hex digits (27*4 = 108 bits) of the digest of s.

    md5_new is defined elsewhere in this file -- presumably the MD5
    constructor; confirm against the import block.
    """
    m = md5_new()
    m.update(s)
    return m.hexdigest()[:27]


# float_or_fraction--
#   Convert a string to a number, allowing fractions
#   (note: an identical definition also appears earlier in this file; this
#   later one is the binding in effect once the module has loaded)

def float_or_fraction(s):
    """Return s as a float; "a/b" is evaluated as float(a)/float(b)."""
    if ("/" in s):
        (numer,denom) = s.split("/",1)
        return float(numer)/float(denom)
    return float(s)


# int_with_unit--
#   Parse a string as an integer, allowing unit suffixes

def int_with_unit(s):
    """Return s as an int, honouring a trailing K, M or G (powers of 1000).

    A non-integer number such as "1.5K" is multiplied out and rounded up.
    Raises ValueError if the numeric part cannot be parsed.
    """
    if (s.endswith("K")):
        multiplier = 1000
        s = s[:-1]
    elif (s.endswith("M")):
        multiplier = 1000 * 1000
        s = s[:-1]
    elif (s.endswith("G")):
        multiplier = 1000 * 1000 * 1000
        s = s[:-1]
    else:
        multiplier = 1

    try: return int(s) * multiplier
    except ValueError: return int(math.ceil(float(s) * multiplier))


if __name__ == "__main__": main()
--- a/test-data/microsatellite.xml Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,178 +0,0 @@ -<tool id="microsatellite" name="Microsatellite detection" version="1.0.0"> - <description>for short read, reference, and mapped data</description> - <command interpreter="python2.7"> microsatellite.py - "${filePath}" - #if $inputFileSource.inputFileType == "fasta" - --fasta - #elif $inputFileSource.inputFileType == "fastq" - --fastq - #elif $inputFileSource.inputFileType == "fastq_noquals" - --fastq:noquals - #elif $inputFileSource.inputFileType == "sam" - --sam - #end if - - #if $inputFileSource.inputFileType == "sam" - #if $inputFileSource.referenceFileSource.requireReference - --r --ref="${inputFileSource.referenceFileSource.referencePath}" - #end if - #end if - - --period="${period}" - - #if $partialmotifs == "true" - --partialmotifs - #end if - - --minlength="${minlength}" - - - --prefix="${prefix}" - --suffix="${surfix}" - - --hamming="${hammingThreshold}" - - #if $multipleruns - --multipleruns - #end if - - #if $flankSetting.noflankdisplay - --noflankdisplay - #else - --flankdisplay=${flankSetting.flankdisplay} - #end if - > $stdout - </command> - - <inputs> - <param name="filePath" label="Select input file" type="data"/> - <conditional name="inputFileSource"> - <param name="inputFileType" type="select" label="Select input file type"> - <option value="fasta">Fasta File</option> - <option value="fastq">Fastq File</option> - <option value="fastq_noquals">Fastq File without Quality Information</option> - <option value="sam">SAM File</option> - </param> - <when value="sam"> - <conditional name="referenceFileSource"> - <param name="requireReference" label="Do you want to extract correspond microsatellites in reference for comparison?" 
type="boolean"> - </param> - <when value="true"> - <param name="referencePath" label="Select reference file" type="data"/> - </when> - </conditional> - </when> - </conditional> - - <param name="period" label="Motif size of microsatellites of interest (e.g. Mononucleotide microsatellite =1) (must be less than 10)" type="integer" size="2" value="1"/> - <param name="partialmotifs" label="Consider microsatellites with a partial motif?" type="boolean" checked="True"/> - <param name="minlength" label="Minimal length (bp) of microsatellite sequence reported" type="integer" size="2" value="5"/> - - - <param name="prefix" label="Do not report candidate repeat intervals that have left flanking region less than (bp):" type="integer" size="4" value="20"/> - <param name="surfix" label="Do not report candidate repeat intervals that have left flanking region less than (bp):" type="integer" size="4" value="20"/> - - - <param name="hammingThreshold" label="Hamming threshold of microsatellite, If greater than 0, interrupted microsatellites will also be reported" type="integer" size="2" value="0"/> - <param name="multipleruns" label="Consider all candidate intervals in a sequence. 
If not check, only the longest one will be considered" type="boolean" checked="True"> </param> - <conditional name="flankSetting"> - <param name="noflankdisplay" label="Show the entire flanking regions" type="boolean" checked="True"/> - <when value="false"> - <param name="flankdisplay" label="Limit length (bp) of flanking regions shown" type="integer" size="4" value="5"/> - </when> - </conditional> - - </inputs> - <outputs> - <data name="stdout" format="tabular"/> - </outputs> - <tests> - <!-- Test data with valid values --> - <test> - <param name="filePath" value="C_sample_fastq"/> - <param name="period" value="1"/> - <param name="partialmotifs" value="true" /> - <param name="minlength" value="3" /> - <param name="prefix" value="5"/> - <param name="surfix" value="5"/> - <param name="hammingThreshold" value="0"/> - <param name="multipleruns" value="true"> </param> - <output name="microsatellite" file="C_sample_snoope"/> - </test> - - </tests> - <help> - - -.. class:: infomark - -**What it does** - -We use different algorithms to detect microsatellites depending on the hamming distance parameter. -If hamming distance is set to zero, the program will only consider uninterrupted microsatellites. The process works as follows. - -1) Scanning reads using sliding windows. For a given repeat period ‘k’ (e.g. k=2 for dinucleotide TRs), we compare consecutive k-mer window size sequences, with a step size of k. If a base at a given position matches the one k positions earlier it is marked with a plus; if the corresponding sites have different bases it is marked with a minus. The first k positions are blank. - -2) Since we do not allow mutations in reported TRs, a sequence of consecutive “+” signals means that a k-mer TR is present in this sample. - -3) Report k-mer TRs if the length is larger than a threshold provided by the user. - -If hamming distance is set to an integer greater than zero, the program will consider both uninterrupted and interrupted microsatellites. 
The process works as follows: - -(1) Identify intervals that are highly correlated with the interval shifted by ‘k’ (the repeat period). These intervals are called "runs" or "candidates". The allowed level of correlation is 6/7. Depending on whether we want to look for more than one microsat, we either find the longest such run (simple algorithm) or many runs (more complicated algorithm). The following steps are then performed on each run. - -(2) Find the most likely repeat motif in the run. This is done by counting all kmers (of length P) and choosing the most frequent. If that kmer is itself covered by a sub-repeat we discard this run. The idea is that we can ignore a 6-mer like ACGACG because we will find it when we are looking for 3-mers. - -(3) Once we identify the most likely repeat motif, we then modify the interval, adjusting start and end to find the interval that has the fewest mismatches vs. a sequence of the motif repeated (hamming distance). - -(4) At this point we have a valid microsat interval (in the eyes of the program). It is subjected to some filtering stages (hamming distance or too close to an end), and if it satisfies those conditions, it is reported to the user. - -For more options, the script that runs this program can be downloaded and run with python independently from Galaxy. More options are available in script mode; a help page is built into the script. - -**Citation** - -When you use this tool, please cite **Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research** -This tool was developed by Chen Sun (cxs1031@cse.psu.edu) and Bob Harris (rsharris@bx.psu.edu) - -**Input** - -- The input files can be in fastq, fasta, fastq without quality score, or SAM format. 
- -**Output** - -For fastq, the output will contain the following columns: - -- Column 1 = length of microsatellites (bp) -- Column 2 = length of left flanking regions (bp) -- Column 3 = length of right flanking regions (bp) -- Column 4 = repeat motif (bp) -- Column 5 = hamming distance -- Column 6 = read name -- Column 7 = read sequence with soft masking of microsatellites -- Column 8 = read quality (the same Phred score scale as input) - -For fasta, fastq without quality score and SAM format, column 8 will be replaced with a dot (.). - -If users have a mapped file (SAM) and would like to profile microsatellites from premapped data instead of using the flank-based mapping approach, they can select SAM format input and specify that they want the corresponding microsatellites in the reference for comparison. The output will be as follows: - -- Column 1 = length of microsatellites (bp) -- Column 2 = length of left flanking regions (bp) -- Column 3 = length of right flanking regions (bp) -- Column 4 = repeat motif (bp) -- Column 5 = hamming distance -- Column 6 = read name -- Column 7 = read sequence with soft masking of microsatellites -- Column 8 = read quality (the same Phred score scale as input) -- Column 9 = read name (The same as column 6) -- Column 10 = chromosome -- Column 11 = left flanking region start -- Column 12 = left flanking region stop -- Column 13 = microsatellite start as inferred from paired-end -- Column 14 = microsatellite stop as inferred from paired-end -- Column 15 = right flanking region start -- Column 16 = right flanking region stop -- Column 17 = microsatellite length in reference -- Column 18 = microsatellite sequence in reference - -</help> -</tool>
--- a/test-data/microsatpurity.py Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ -import sys -# remove all read that have impure microsat -# check only one line at a time - - -fd=open(sys.argv[1]) -lines=fd.xreadlines() -##motifIx=int(sys.argv[2]) -period=int(sys.argv[2]) -tr_ref_seqIx=int(sys.argv[3])-1 -##output=(sys.argv[4]) -##fout=open(output,'w') -for line in lines: - temp=line.strip().split('\t') - temp=filter(None,temp) - #motif=temp[motifIx] - tr_ref_seq=temp[tr_ref_seqIx] - ##period=len(motif) - cand_motif=tr_ref_seq[:period] - len_microsat=len(tr_ref_seq) - expand_microsat_cand=cand_motif*(len_microsat/period) + cand_motif[:(len_microsat%period)] - if tr_ref_seq == expand_microsat_cand: - print line.strip() - ##print line.strip() >> fout \ No newline at end of file
<!-- Removed by this changeset: test-data/microsatpurity.xml (previous revision dated Wed Apr 22 12:19:28 2015 -0400). -->
<tool id="microsatpurity" name="Select uninterrupted microsatellites" version="1.0.0">
  <description> of a specific column</description>
  <!-- Arguments: input table, motif size, column number; stdout is redirected to the output dataset. -->
  <command interpreter="python">microsatpurity.py $input $period $column_n &gt; $output </command>

  <inputs>
    <param name="input" type="data" label="Select input" />
    <param name="period" type="integer" label="motif size" value="1"/>
    <param name="column_n" type="integer" value="0" label="Select column that contains microsatellites of interest (0 = last column)" />
  </inputs>
  <outputs>
    <data format="tabular" name="output" />
  </outputs>
  <tests>
    <!-- Test data with valid values -->
    <test>
      <param name="input" value="microsatpurity_in.txt"/>
      <param name="period" value="2"/>
      <param name="column_n" value="0"/>
      <output name="output" file="microsatpurity_out.txt"/>
    </test>
  </tests>
  <help>


.. class:: infomark

**What it does**

This tool is used to select only the uninterrupted microsatellites. Interrupted microsatellites (e.g. ATATATATAATATAT) or sequences of microsatellites with non-microsatellite parts (e.g. ATATATATATG) will be removed.

For the TRFM pipeline (profiling microsatellites in short read data), this tool can be used to avoid the cases in which flanking bases were misread as part of the microsatellite. Thus, the read profile will only reflect the variation of TR length from expansion/contraction.
For example, suppose that the sequence around a microsatellite is AGCGACGaaaaaaGCGATCA. If we observe a read with the sequence AGCGACGaaaaaaaaaaGCGATCA, we can infer that this is a microsatellite expansion. However, if we observe AGCGACGaaaaaaaCGATCA, this is more likely a substitution of G to A. Such cases can be removed with this tool.
You can use the tool **combine mapped flaked bases** to get the microsatellites in the reference that correspond to the sequence between mapped reads. If the user maps these reads around the uninterrupted microsatellites in the reference, the corresponding sequences between these pairs should be the uninterrupted microsatellites regardless of expansion/contraction of microsatellites in the short read data. However, if there is a substitution in a flanking base, or if the fluorescent signal from the previous run makes it look like a substitution, the corresponding sequences in the reference between the pairs will not be uninterrupted microsatellites. Thus this tool can remove those cases and keep only microsatellite expansion/contraction.


**Citation**

When you use this tool, please cite **Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research**

**Input**

The input files can be any tab delimited file.

If this tool is used in TRFM microsatellite profiling, it should contain:

- Column 1 = microsatellite location in reference chromosome
- Column 2 = microsatellite location in reference start
- Column 3 = microsatellite location in reference stop
- Column 4 = microsatellite location in reference motif
- Column 5 = microsatellite location in reference length
- Column 6 = microsatellite location in reference motif size
- Column 7 = length of microsatellites (bp)
- Column 8 = length of left flanking regions (bp)
- Column 9 = length of right flanking regions (bp)
- Column 10 = repeat motif (bp)
- Column 11 = hamming distance
- Column 12 = read name
- Column 13 = read sequence with soft masking of microsatellites
- Column 14 = read quality (the same Phred score scale as input)
- Column 15 = read name (The same as column 12)
- Column 16 = chromosome
- Column 17 = left flanking region start
- Column 18 = left flanking region stop
- Column 19 = microsatellite start as inferred from paired-end
- Column 20 = microsatellite stop as inferred from paired-end
- Column 21 = right flanking region start
- Column 22 = right flanking region stop
- Column 23 = microsatellite length in reference
- Column 24 = microsatellite sequence in reference

**Output**

The same as input format.


  </help>
</tool>
#!/usr/bin/env python
# pair_fetch_DNA_ff.py
# Reconstructed from the file removed by this changeset:
#   --- a/test-data/pair_fetch_DNA_ff.py   Wed Apr 22 12:19:28 2015 -0400
#   +++ /dev/null                          Thu Jan 01 00:00:00 1970 +0000
# Function: filter microsat and flanking region by quality score;
# remove read with any base that has lower quality score than "quality_require" within "flanking_base" and convert from snoope to fastq
# Note that require flanking length need to be screen by Bob snoope script first

# Author: Arkarachai Fungtammasan
# Version 1.0.0 (15 July 2012)
# Input format: length_of_repeat[0] left_flank_length[1] right_flank_length[2] repeat_motif[3] hamming_distance[4] read_name[5] read_sequence[6] read_quality[7]
# Output format: two fastq file. First file contain left flank. Second file contain right flank.
# Command: python pair_fetch_DNA_ff.py input.txt left_flank.fastq right_flank.fastq quality_require flanking_base

import sys

try:
    # The original imported this unconditionally although nothing in the
    # script uses it; tolerate its absence so the documented stand-alone
    # command line works outside a Galaxy installation.
    from galaxy import eggs  # noqa: F401
except ImportError:
    eggs = None

# Quality characters are compared against quality_require + 33.
PHRED_OFFSET = 33


def stop_err(msg):
    """Write ``msg`` to stderr and terminate with a non-zero exit status."""
    sys.stderr.write(msg)
    sys.exit(1)


def split_flanks(fields, quality_require, flanking_base):
    """Cut one snoope record into its left and right flank.

    ``fields`` is the list of non-empty tab-separated columns of a record
    (see "Input format" above).  Returns ``None`` when either flank is
    shorter than ``flanking_base`` or when any base of the microsatellite,
    or of the ``flanking_base`` bases on each side of it, has a quality
    character below ``quality_require + 33``.  Otherwise returns
    ``(name, (left_seq, left_qual), (right_seq, right_qual))``.
    """
    repeat_len = int(fields[0])
    left_len = int(fields[1])
    right_len = int(fields[2])
    if right_len < flanking_base or left_len < flanking_base:
        return None

    ms_start = left_len
    ms_stop = ms_start + repeat_len
    right_stop = ms_stop + right_len
    sequence, quality = fields[6], fields[7]

    threshold = quality_require + PHRED_OFFSET
    window = quality[ms_start - flanking_base:ms_stop + flanking_base]
    # any() stops at the first failing base; the original kept scanning.
    if any(ord(symbol) < threshold for symbol in window):
        return None

    return (fields[5],
            (sequence[:ms_start], quality[:ms_start]),
            (sequence[ms_stop:right_stop], quality[ms_stop:right_stop]))


def _write_fastq(handle, name, seq, qual):
    """Write one four-line FASTQ record."""
    handle.write('@' + name + '\n' + seq + '\n' + '+' + name + '\n' + qual + '\n')


def main(argv):
    """Command-line entry point; ``argv`` has the layout of ``sys.argv``."""
    if len(argv) < 6:
        stop_err("Usage: pair_fetch_DNA_ff.py input.txt left_flank.fastq "
                 "right_flank.fastq quality_require flanking_base\n")
    filename, L_filename, R_filename = argv[1], argv[2], argv[3]
    try:
        quality_require = int(argv[4])
        flanking_base = int(argv[5])
    except ValueError as eee:
        sys.stdout.write(str(eee) + '\n')
        stop_err("Quality score cutoff and Length of flanking regions that require quality screening must be integer")

    # Nested context managers close all three files even if a record is malformed.
    with open(filename) as fd:
        with open(L_filename, 'w') as fdd1:
            with open(R_filename, 'w') as fdd2:
                for line in fd:
                    fields = [field for field in line.strip().split('\t') if field]
                    if not fields:
                        continue  # blank line; the original raised IndexError here
                    flanks = split_flanks(fields, quality_require, flanking_base)
                    if flanks is None:
                        continue
                    name, left, right = flanks
                    _write_fastq(fdd1, name, left[0], left[1])
                    _write_fastq(fdd2, name, right[0], right[1])


if __name__ == '__main__':
    main(sys.argv)
<!-- Removed by this changeset: test-data/probvalueforhetero.xml (previous revision dated Wed Apr 22 12:19:28 2015 -0400). -->
<tool id="heteroprob" name="Evaluate the probability of the allele combination to generate read profile" version="2.0.0">
  <description></description>
  <command interpreter="python2.7">heteroprob.py $microsat_raw $microsat_error_profile $expectedminorallele &gt; $microsat_corrected </command>

  <inputs>
    <param name="microsat_raw" type="data" label="Select microsatellite length profile and allele combination file" />
    <param name="microsat_error_profile" type="data" label="Select microsatellite error profile that correspond to this dataset" />
    <param name="expectedminorallele" type="float" value="0.5" label="Expected contribution of minor allele when present (0.5 for genotyping)" />
  </inputs>
  <outputs>
    <data name="microsat_corrected" format="tabular" />
  </outputs>
  <tests>
    <!-- Test data with valid values -->
    <test>
      <param name="microsat_raw" value="probvalueforhetero_in.txt"/>
      <param name="microsat_error_profile" value="PCRinclude.allrate.bymajorallele"/>
      <param name="expectedminorallele" value="0.5"/>
      <output name="microsat_corrected" file="probvalueforhetero_out.txt"/>
    </test>
  </tests>
  <help>


.. class:: infomark

**What it does**

- This tool will calculate the probability that the allele combination can generate the given read profile. This tool is part of the pipeline to estimate minimum read depth.
- The calculation of probability is very similar to the tool **Correct genotype for microsatellite errors**. However, this tool will restrict the calculation to only the allele combination indicated in the input. Also, when it encounters an allele combination that cannot be generated from the error profile, the total probability will be zero instead of using the base substitution rate.

**Citation**

When you use this tool, please cite **Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research**

**Input**

The input format is the same as the output from the **Correct genotype for microsatellite errors** tool.

- Column 1 = location of microsatellite locus.
- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format).
- Column 3 = motif of microsatellite in this locus. The input file can contain more than three columns.
- Column 4 = homozygous/heterozygous label.
- Column 5 = log base 10 of (the probability of homozygous/the probability of heterozygous)
- Column 6 = Allele for most probable homozygous form.
- Column 7 = Allele 1 for most probable heterozygous form.
- Column 8 = Allele 2 for most probable heterozygous form.

Only columns 2, 3, 7 and 8 are used in the calculation.

**Output**


The output will contain the original eight columns from the input. It will also add the following columns:

- Column 9 = Probability of the allele combination to generate given read profile.
- Column 10 = Number of possible rearrangement of given read profile.
- Column 11 = Probability of the allele combination to generate read profile with any rearrangement (Product of column 9 and column 10)
- Column 12 = Read depth




  </help>
</tool>
# Reconstructed from the file removed by this changeset:
#   --- a/test-data/profilegenerator.py   Wed Apr 22 12:19:28 2015 -0400
#   +++ /dev/null                         Thu Jan 01 00:00:00 1970 +0000
"""Generate every read-length profile reachable from consecutive alleles.

Usage: profilegenerator.py ERROR_PROFILE MOTIF MAX_DEPTH MIN_PROB

ERROR_PROFILE -- whitespace-delimited rows: major allele length, observed
                 length, number of observations (extra columns are ignored)
MOTIF         -- motif string; only its length is used for filtering, the
                 text itself is echoed in the third output column
MAX_DEPTH     -- profiles are produced for depths 2..MAX_DEPTH
MIN_PROB      -- an observed length is kept when
                 count / (total count for its major allele) >= MIN_PROB

One line is written per profile: ``chr<TAB>len,len,...<TAB>MOTIF``.
"""
import collections
import itertools
import sys

# An observed length also needs at least this many observations.
MININUMCOUNT = 1


def read_error_profile(filename):
    """Return ``[(major, observed, count), ...]`` read from ``filename``.

    Rows are split on any whitespace.  The original split on tabs in its
    first pass and on any whitespace in its second, so a space-separated
    profile crashed; one parser for both uses removes that inconsistency.
    """
    rows = []
    with open(filename) as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                continue
            rows.append((int(fields[0]), int(fields[1]), int(fields[2])))
    return rows


def observed_by_major(rows, motif_size, min_prob, min_count=MININUMCOUNT):
    """Map each eligible major allele to the observed lengths that pass.

    Only major alleles that are a multiple of ``motif_size`` are kept.
    """
    totals = collections.defaultdict(int)
    for major, _observed, count in rows:
        totals[major] += count

    kept = collections.defaultdict(list)
    for major, observed, count in rows:
        if major % motif_size != 0:
            continue
        total = totals[major]
        if total <= 0:
            continue  # no observations at all: nothing to normalise by
        if count / float(total) >= min_prob and count >= min_count:
            kept[major].append(observed)
    return kept


def consecutive_allele_lists(kept):
    """Merge observed lengths of neighbouring major alleles.

    Neighbours are adjacent entries of the sorted major-allele list.
    Duplicate lists and lists that are strict subsets of another list are
    dropped.  The result is ordered by major allele, so the output no
    longer depends on set iteration order.
    """
    majors = sorted(kept)
    merged = []
    for lower, upper in zip(majors, majors[1:]):
        combined = tuple(sorted(set(kept[lower]) | set(kept[upper])))
        if combined not in merged:
            merged.append(combined)
    return [candidate for candidate in merged
            if not any(set(candidate) < set(other) for other in merged)]


def generate_profiles(filename, motif, max_depth, min_prob):
    """Yield the output lines for the given error profile and settings."""
    if not motif:
        raise ValueError("motif must not be empty")
    rows = read_error_profile(filename)
    length_lists = consecutive_allele_lists(
        observed_by_major(rows, len(motif), min_prob))
    for depth in range(2, max_depth + 1):
        for lengths in length_lists:
            for profile in itertools.combinations_with_replacement(lengths, depth):
                yield 'chr' + '\t' + ','.join(map(str, profile)) + '\t' + motif


def main(argv):
    """Command-line entry point; ``argv`` has the layout of ``sys.argv``."""
    if len(argv) < 5:
        sys.stderr.write("Usage: profilegenerator.py ERROR_PROFILE MOTIF MAX_DEPTH MIN_PROB\n")
        sys.exit(1)
    for row in generate_profiles(argv[1], argv[2], int(argv[3]), float(argv[4])):
        sys.stdout.write(row + '\n')


if __name__ == '__main__':
    main(sys.argv)
<!-- Removed by this changeset: test-data/profilegenerator.xml (previous revision dated Wed Apr 22 12:19:28 2015 -0400). -->
<tool id="Profilegenerator" name="Generate all possible combination of read profile" version="2.0.0">
  <description> of the consecutive allele from given error profile </description>
  <command interpreter="python2.7">profilegenerator.py $error_profile $MOTIF $Maxdepth $minprob &gt; $output </command>

  <inputs>
    <param name="error_profile" type="data" label="Select error profile" />
    <param name="MOTIF" type="text" value="A" label="Type in a motif of interest (e.g. AGC)" />
    <param name="Maxdepth" type="integer" value="30" label="Maximum read depth of interest" />
    <param name="minprob" type="float" value="0.00000001" label="Minimum error rate to be considered" />
  </inputs>
  <outputs>
    <data name="output" format="tabular" />
  </outputs>
  <tests>
    <!-- Test data with valid values -->
    <test>
      <param name="error_profile" value="sampleprofilegenerator_in"/>
      <param name="MOTIF" value="A"/>
      <param name="Maxdepth" value="3"/>
      <!-- minprob is a float parameter, so it is set with value=; the original used file= here. -->
      <param name="minprob" value="0.00000001"/>
      <output name="output" file="sampleprofilegenerator_out"/>
    </test>
  </tests>
  <help>


.. class:: infomark

**What it does**

This tool will generate all possible combinations of observed read profiles of the consecutive alleles from a given error profile. The range of observed read lengths can be filtered to contain only those that occur frequently, using the "Minimum error rate to be considered" parameter.

This program will collect the lists of valid (passing the "Minimum error rate to be considered" threshold) observed length profiles from combinations of consecutive allele lengths. The lists that are equivalent to, or a subset of, other lists will be removed. For each depth and each list, length profiles are generated as combinations with replacement, which requires python 2.7. There could be redundant error profiles generated from different lists if more than one combination of alleles is generated due to overlapping ranges of observed microsatellite lengths. The user needs to remove them, which can be done easily using the **sort | uniq** command in unix.


**Citation**

When you use this tool, please cite **Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research**

**Input**

- The error profile needs to contain these three columns.
- Column 1 = Correct microsatellite length
- Column 2 = Observed microsatellite length
- Column 3 = Number of observation

**Output**

- Column 1 = Place holder for location of microsatellite locus. (just "chr")
- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format).
- Column 3 = motif of microsatellite in this locus.

**Example**

- Suppose that we provide the following read profile ::

    9 9 100000
    10 10 91456
    10 9 1259
    11 11 39657
    11 10 1211
    11 12 514


- Using the default minimum probability to be considered and motif = A, all observed read lengths are valid. The program will generate lists of observed length profiles from consecutive allele lengths. ::

    9:10 = [9,10]
    10:11 = [9,10,11,12]

- Lists that are subsets of other lists will be removed. Thus, [9,10] will not be considered.

- Then the program will generate all combinations with replacement for each depth from each list. Using **maximum read depth =3**, we will get the following output. ::


    chr 9,9 A
    chr 9,10 A
    chr 9,11 A
    chr 9,12 A
    chr 10,10 A
    chr 10,11 A
    chr 10,12 A
    chr 11,11 A
    chr 11,12 A
    chr 12,12 A
    chr 9,9,9 A
    chr 9,9,10 A
    chr 9,9,11 A
    chr 9,9,12 A
    chr 9,10,10 A
    chr 9,10,11 A
    chr 9,10,12 A
    chr 9,11,11 A
    chr 9,11,12 A
    chr 9,12,12 A
    chr 10,10,10 A
    chr 10,10,11 A
    chr 10,10,12 A
    chr 10,11,11 A
    chr 10,11,12 A
    chr 10,12,12 A
    chr 11,11,11 A
    chr 11,11,12 A
    chr 11,12,12 A
    chr 12,12,12 A


  </help>
</tool>
<!-- Removed by this changeset: test-data/readdepth2sequencingdepth.xml (previous revision dated Wed Apr 22 12:19:28 2015 -0400). -->
<tool id="readdepth2seqdepth" name="Convert informative read depth to sequencing depth" version="1.0.0">
  <description>for flank-based mapping of microsatellites</description>
  <command interpreter="python2.7">sequencingdepthconversion_G.py $repeatlength $flanksize $readlength $infodepth $probprediction &gt; $output </command>

  <inputs>
    <param name="repeatlength" type="integer" value="10" label="Repeat length (bp)" />
    <param name="flanksize" type="integer" value="20" label="Required flank bases on each side in mapping" />
    <param name="readlength" type="integer" value="100" label="Read length (treat all read as single end read)" />
    <param name="infodepth" type="integer" value="5" label="Required read depth" />
    <param name="probprediction" type="float" value="0.9" label="Proportion of genome that need certain level of read depth" />
  </inputs>
  <outputs>
    <!-- This tool has no dataset input to inherit a format from (the original said format="input");
         the script writes a tab-separated header row and value row. -->
    <data format="tabular" name="output" />
  </outputs>
  <tests>
    <!-- Test data with valid values -->
    <test>
      <param name="repeatlength" value="10"/>
      <param name="flanksize" value="20" />
      <param name="readlength" value="100" />
      <param name="infodepth" value="5" />
      <param name="probprediction" value="0.9" />
      <output name="output" file="readdepth2seqdepth.out"/>
    </test>
  </tests>
  <help>


.. class:: infomark

**What it does**

This tool is used to convert informative read depth (specified by user) to sequencing depth when the microsatellites are mapped using the TRFM pipeline.
The locus specific sequencing depth is the sequencing depth that will make a certain locus have a certain read depth, assuming reads map uniformly. It is calculated as: ::

    yrequired = ( X * L ) / (L - (2F+r-1))

Where X = read depth, L = read length, F = the number of flanked bases required on each flanking region, r = the expected repeat length of the microsatellite of interest. The result is rounded up to the next integer.

The genome wide sequencing depth is the sequencing depth that will make a certain percentage of the genome (e.g. 90 percent or 95 percent) reach the locus specific sequencing depth. It is calculated by numerical search, starting at lambda = yrequired and increasing lambda in steps of one, for the smallest lambda such that: ::

    P(Y=0) + P(Y=1) + ... + P(Y=yrequired-1) &lt;= 1 - 0.90 (or 1 - other proportion specified by user)

    P(Y=y) = (lambda^(y) * e ^(-lambda)) /y!

    y = specific level of sequencing depth. Lambda = genome wide sequencing depth


**Citation**

When you use this tool, please cite **Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research**


  </help>
</tool>
# Reconstructed from the file removed by this changeset:
#   --- a/test-data/sequencingdepthconversion_G.py   Wed Apr 22 12:19:28 2015 -0400
#   +++ /dev/null                                    Thu Jan 01 00:00:00 1970 +0000
"""Convert an informative read depth into sequencing depths.

Usage: sequencingdepthconversion_G.py REPEATLENGTH FLANKSIZE READLENGTH INFODEPTH PROPORTION

Writes a header row and one value row (tab-separated) to standard output:
repeat length, read length, informative read depth, locus-specific
sequencing depth and genome-wide sequencing depth.
"""
import math
import sys


def stop_err(msg):
    """Write ``msg`` to stderr and terminate with a non-zero exit status."""
    sys.stderr.write(msg)
    sys.exit(1)


def info2require(X, L, F, r):
    """Return ceil(X*L / (L - (2F + r - 1))) as an int.

    Arguments are, in order: infodepth, readlength, flanksize, repeatlength.
    The caller must ensure ``L > 2F + r - 1``; otherwise the division fails
    or the result is negative.
    """
    usable_positions = L - ((2 * F) + r - 1)
    return int(math.ceil((X * L * 1.0) / usable_positions))


def poissondef(meancov, specificcov):
    """Poisson probability of exactly ``specificcov`` given mean ``meancov``."""
    numerator = 1.0 * (meancov ** specificcov) * (math.e ** (-1 * meancov))
    return numerator / math.factorial(specificcov)


def require2recommend(needprob, mindepth):
    """Return the smallest integer mean coverage ``m >= mindepth`` for which
    ``P(Y < mindepth) <= 1 - needprob`` under a Poisson(m) model.

    When ``1 - needprob >= 1`` the search loop never runs and
    ``mindepth - 1`` is returned, exactly as in the original script.
    """
    allowed_shortfall = 1 - needprob
    meancov = mindepth
    shortfall = 1
    while shortfall > allowed_shortfall:
        shortfall = 0
        for depth in range(0, mindepth):
            shortfall += poissondef(meancov, depth)
        meancov += 1
    # meancov was advanced once past the value that satisfied the condition.
    return meancov - 1


def main(argv):
    """Command-line entry point; ``argv`` has the layout of ``sys.argv``."""
    if len(argv) < 6:
        stop_err("Usage: sequencingdepthconversion_G.py REPEATLENGTH FLANKSIZE "
                 "READLENGTH INFODEPTH PROPORTION\n")
    repeatlength = int(argv[1])
    flanksize = int(argv[2])      # e.g. 20
    readlength = int(argv[3])     # e.g. 100
    infodepth = int(argv[4])      # e.g. 5
    probdetection = float(argv[5])  # e.g. 0.90

    # The original only rejected values above 1, and did so by raising and
    # printing an unrelated int('probvalue') error first.
    if probdetection < 0 or probdetection > 1:
        stop_err("Proportion of genome to have certain locus specific must be between 0 and 1")
    if readlength - ((2 * flanksize) + repeatlength - 1) <= 0:
        stop_err("Read length must be larger than 2*flanksize + repeatlength - 1")

    sys.stdout.write('repeat_length' + '\t' + 'read_length' + '\t' + 'informative_read_depth' + '\t'
                     + '=locus_specific_sequencing_depth' + '\t' + '=genome_wide_sequencing_depth' + '\n')
    t_requiredepth = info2require(infodepth, readlength, flanksize, repeatlength)
    t_recomendseq = require2recommend(probdetection, t_requiredepth)
    values = [repeatlength, readlength, infodepth, t_requiredepth, t_recomendseq]
    sys.stdout.write('\t'.join(map(str, values)) + '\n')


if __name__ == '__main__':
    main(sys.argv)
<!-- Removed by this changeset: test-data/space2underscore_readname.xml (previous revision dated Wed Apr 22 12:19:28 2015 -0400). -->
<tool version="1.0.0" name="Read name modifier" id="space2underscore_readname">
    <description>--change space to underscore of a specific column</description>
    <!-- Arguments passed to the script: input dataset, output dataset, column number. -->
    <command interpreter="python">changespacetounderscore_readname.py $input $output $column_n </command>

    <inputs>
        <param label="Select input" type="data" name="input"/>
        <param label="Select column to modify" value="6" type="integer" name="column_n"/>
    </inputs>

    <outputs>
        <data name="output" format="tabular"/>
    </outputs>

    <tests>
        <!-- Test data with valid values -->
        <test>
            <param value="samplefq.snoope" name="input"/>
            <param value="6" name="column_n"/>
            <output file="samplefq.snoope.new" name="output"/>
        </test>
    </tests>

    <help>


.. class:: infomark

**What it does**

This tool is used to change space to underscore. For TRFM pipeline (profiling microsatellites in short read data), this tool is used to change space in read name to underscore to prevent the downstream tools which might recognize incorrect column number due to space in read name. If the input do not have space in read name, this step can be skipped.

**Citation**

When you use this tool, please cite **Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research**

**Input**

The input files can be any tab delimited file.

If this tool is used in TRFM microsatellite profiling, it should be in the same format as output from **microsatellite detection program**. This format contains **length of repeat**, **length of left flanking region**, **length of right flanking region**, **repeat motif**, **hamming (editing) distance**, **read name**, **read sequence**, **read quality score**

**Output**

The same as input format.


    </help>
</tool>
--- a/test-data/test-data/C_sample_fastq Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ -@IL2_40_2_1_735_755 -ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTAAAGTGCTGAAATAACAT -+ -IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5* -@IL2_40_2_1_919_700 -ATAAGGAAAAAAAAAAAAAAAACCAGGTCTTTTTTTTTTTTTTTTTGTTAT -+ -IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$&
--- a/test-data/test-data/C_sample_snoope Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -3 33 15 A 0 IL2_40_2_1_735_755_1_per1_2 ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTaaaGTGCTGAAATAACAT IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5* -3 42 6 A 0 IL2_40_2_1_735_755_1_per1_3 ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTAAAGTGCTGaaaTAACAT IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5* -16 6 29 A 0 IL2_40_2_1_919_700_1_per1_1 ATAAGGaaaaaaaaaaaaaaaaCCAGGTCTTTTTTTTTTTTTTTTTGTTAT IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$& -17 29 5 T 0 IL2_40_2_1_919_700_1_per1_2 ATAAGGAAAAAAAAAAAAAAAACCAGGTCtttttttttttttttttGTTAT IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$&
--- a/test-data/test-data/PCRinclude.allrate.bymajorallele Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,997 +0,0 @@ -10 10 91456 A -10 9 1259 A -10 11 605 A -10 8 16 A -10 12 8 A -10 7 2 A -11 11 39657 A -11 10 1211 A -11 12 514 A -11 9 54 A -11 13 9 A -11 8 3 A -11 14 1 A -12 12 18850 A -12 11 986 A -12 13 417 A -12 10 73 A -12 14 8 A -12 9 1 A -12 8 1 A -13 13 10201 A -13 12 885 A -13 14 320 A -13 11 83 A -13 15 12 A -13 10 8 A -14 14 3649 A -14 13 409 A -14 15 151 A -14 12 62 A -14 11 6 A -14 16 5 A -14 10 1 A -15 15 847 A -15 14 140 A -15 16 60 A -15 13 20 A -15 17 4 A -15 12 3 A -16 16 182 A -16 15 60 A -16 17 14 A -16 14 12 A -16 13 1 A -16 12 1 A -16 18 1 A -17 17 11 A -17 16 5 A -17 15 2 A -17 18 1 A -18 18 4 A -18 17 2 A -5 5 10047169 A -5 6 44 A -6 6 2808071 A -6 5 195 A -6 7 69 A -7 7 1097174 A -7 6 313 A -7 8 83 A -7 5 6 A -8 8 369496 A -8 7 387 A -8 9 248 A -8 6 3 A -8 10 2 A -9 9 184958 A -9 8 707 A -9 10 486 A -9 7 5 A -9 11 4 A -10 10 46 C -10 9 3 C -5 5 1354993 C -5 6 7 C -6 6 193431 C -6 5 14 C -6 7 2 C -7 7 22171 C -7 6 4 C -8 8 2966 C -8 9 3 C -8 7 3 C -9 9 638 C -9 8 8 C -9 7 1 C -10 10 21211 AC -10 8 3 AC -10 12 1 AC -11 11 15048 AC -11 9 10 AC -12 12 6043 AC -12 10 15 AC -12 14 1 AC -13 13 5070 AC -13 11 40 AC -13 15 1 AC -14 14 3093 AC -14 12 44 AC -14 10 1 AC -15 15 2848 AC -15 13 31 AC -15 17 1 AC -16 16 1273 AC -16 14 30 AC -16 12 2 AC -17 17 1297 AC -17 15 27 AC -18 18 1269 AC -18 16 43 AC -18 20 2 AC -18 14 1 AC -19 19 679 AC -19 17 17 AC -19 21 1 AC -20 20 645 AC -20 18 34 AC -20 22 2 AC -20 16 1 AC -21 21 723 AC -21 19 28 AC -21 17 1 AC -21 23 1 AC -22 22 499 AC -22 20 29 AC -22 18 3 AC -23 23 540 AC -23 21 30 AC -23 19 2 AC -23 25 1 AC -24 24 385 AC -24 22 38 AC -24 26 2 AC -24 20 1 AC -25 25 407 AC -25 23 22 AC -25 27 2 AC -25 21 1 AC -26 26 257 AC -26 24 30 AC -26 22 3 AC -26 28 1 AC -26 20 1 AC -27 27 339 AC -27 25 28 AC -27 23 3 AC -27 29 2 AC -28 28 202 AC -28 26 17 AC -28 30 6 AC -29 29 277 
AC -29 27 29 AC -29 31 6 AC -29 25 3 AC -30 30 117 AC -30 28 12 AC -30 32 3 AC -30 18 1 AC -31 31 144 AC -31 29 18 AC -31 27 4 AC -31 33 2 AC -32 32 101 AC -32 30 23 AC -32 28 2 AC -32 34 2 AC -32 26 1 AC -33 33 106 AC -33 31 15 AC -33 35 3 AC -33 29 1 AC -34 34 33 AC -34 32 7 AC -35 35 21 AC -35 33 4 AC -35 31 1 AC -36 36 12 AC -36 34 1 AC -37 37 10 AC -37 35 3 AC -37 31 1 AC -37 39 1 AC -38 38 4 AC -38 36 1 AC -6 6 1521439 AC -7 7 513952 AC -8 8 134603 AC -8 6 2 AC -9 9 60741 AC -9 7 3 AC -9 11 1 AC -10 10 21772 AG -10 8 3 AG -10 12 1 AG -11 11 13880 AG -11 9 10 AG -11 13 1 AG -12 12 5628 AG -12 10 13 AG -12 14 4 AG -13 13 4494 AG -13 11 17 AG -14 14 1898 AG -14 12 15 AG -15 15 2427 AG -15 13 18 AG -16 16 1076 AG -16 14 24 AG -16 12 1 AG -17 17 874 AG -17 15 12 AG -17 19 1 AG -17 13 1 AG -18 18 536 AG -18 16 20 AG -18 14 1 AG -19 19 563 AG -19 17 25 AG -20 20 201 AG -20 18 14 AG -21 21 260 AG -21 19 10 AG -22 22 83 AG -22 20 5 AG -23 23 147 AG -23 21 5 AG -23 25 1 AG -24 24 99 AG -24 22 4 AG -24 18 1 AG -25 25 62 AG -25 23 3 AG -25 27 1 AG -26 26 38 AG -26 24 8 AG -27 27 24 AG -27 25 3 AG -27 23 1 AG -28 28 14 AG -28 26 2 AG -29 29 12 AG -29 27 5 AG -29 31 1 AG -30 30 7 AG -30 28 2 AG -31 31 7 AG -31 27 3 AG -31 23 1 AG -32 32 4 AG -32 28 1 AG -6 6 1880822 AG -7 7 684837 AG -7 9 1 AG -8 8 183381 AG -9 9 75547 AG -9 7 6 AG -9 11 1 AG -10 10 18179 AT -10 8 7 AT -10 12 4 AT -11 11 8969 AT -11 9 5 AT -11 13 2 AT -12 12 4888 AT -12 10 8 AT -12 14 2 AT -13 13 2785 AT -13 11 17 AT -13 15 1 AT -14 14 2310 AT -14 12 40 AT -14 16 4 AT -14 10 2 AT -15 15 1461 AT -15 13 33 AT -15 11 1 AT -15 17 1 AT -16 16 879 AT -16 14 42 AT -16 18 2 AT -16 12 1 AT -17 17 599 AT -17 15 38 AT -17 19 2 AT -17 13 1 AT -18 18 367 AT -18 16 29 AT -18 20 7 AT -18 14 1 AT -19 19 223 AT -19 17 34 AT -19 21 3 AT -20 20 97 AT -20 18 14 AT -20 16 2 AT -20 22 1 AT -21 21 60 AT -21 19 18 AT -21 17 1 AT -22 22 53 AT -22 20 15 AT -22 24 5 AT -22 18 3 AT -23 23 11 AT -23 21 1 AT -24 24 7 AT -24 20 2 AT -24 
22 2 AT -6 6 1671932 AT -6 8 1 AT -7 7 595145 AT -8 8 195533 AT -8 10 5 AT -8 6 2 AT -9 9 52576 AT -9 7 3 AT -10 10 17 CG -11 11 17 CG -12 12 6 CG -6 6 4097 CG -7 7 678 CG -8 8 184 CG -9 9 19 CG -10 10 19552 AAC -11 11 19003 AAC -12 12 6245 AAC -12 9 1 AAC -13 13 3406 AAC -14 14 8448 AAC -14 11 2 AAC -15 15 2356 AAC -15 12 6 AAC -16 16 1373 AAC -16 13 4 AAC -17 17 3140 AAC -17 14 5 AAC -18 18 944 AAC -18 15 2 AAC -19 19 456 AAC -19 16 1 AAC -20 20 1474 AAC -20 17 3 AAC -21 21 328 AAC -21 18 1 AAC -22 22 178 AAC -23 23 538 AAC -23 26 1 AAC -24 24 112 AAC -25 25 60 AAC -26 26 239 AAC -26 23 1 AAC -27 27 45 AAC -28 28 58 AAC -28 25 2 AAC -29 29 77 AAC -30 30 17 AAC -31 31 38 AAC -31 28 1 AAC -32 32 94 AAC -32 29 3 AAC -33 33 15 AAC -35 35 55 AAC -35 32 1 AAC -38 38 12 AAC -41 41 6 AAC -9 9 57212 AAC -10 10 31455 AAG -11 11 11876 AAG -12 12 3458 AAG -12 9 6 AAG -13 13 1141 AAG -14 14 928 AAG -15 15 548 AAG -15 12 4 AAG -16 16 189 AAG -17 17 235 AAG -18 18 63 AAG -19 19 66 AAG -20 20 122 AAG -22 22 11 AAG -23 23 33 AAG -9 9 104524 AAG -10 10 69106 AAT -11 11 30381 AAT -12 12 12001 AAT -12 9 1 AAT -13 13 7168 AAT -13 10 2 AAT -14 14 5470 AAT -14 11 3 AAT -15 15 2524 AAT -15 12 3 AAT -16 16 1733 AAT -16 13 1 AAT -17 17 1324 AAT -17 14 3 AAT -18 18 1022 AAT -18 15 3 AAT -19 19 502 AAT -19 16 3 AAT -20 20 570 AAT -20 17 2 AAT -21 21 370 AAT -21 18 1 AAT -22 22 98 AAT -23 23 164 AAT -23 20 3 AAT -24 24 143 AAT -24 21 1 AAT -25 25 122 AAT -25 22 1 AAT -26 26 45 AAT -26 23 2 AAT -27 27 32 AAT -27 24 1 AAT -28 28 6 AAT -29 29 64 AAT -29 26 1 AAT -30 30 28 AAT -30 24 1 AAT -31 31 9 AAT -32 32 9 AAT -32 29 1 AAT -38 38 6 AAT -9 9 179182 AAT -9 12 1 AAT -10 10 14290 ACC -11 11 5692 ACC -12 12 1795 ACC -13 13 1141 ACC -14 14 545 ACC -15 15 308 ACC -16 16 162 ACC -17 17 107 ACC -18 18 23 ACC -19 19 35 ACC -20 20 44 ACC -21 21 5 ACC -22 22 5 ACC -22 19 1 ACC -23 23 11 ACC -25 25 7 ACC -26 26 7 ACC -27 27 10 ACC -28 28 24 ACC -28 25 1 ACC -35 35 5 ACC -9 9 46614 ACC -10 10 2865 ACG 
-11 11 900 ACG -12 12 325 ACG -13 13 82 ACG -14 14 83 ACG -9 9 9465 ACG -10 10 6269 ACT -11 11 2284 ACT -12 12 634 ACT -13 13 441 ACT -14 14 295 ACT -15 15 118 ACT -16 16 60 ACT -17 17 71 ACT -18 18 58 ACT -19 19 42 ACT -20 20 24 ACT -24 24 5 ACT -37 37 8 ACT -41 41 5 ACT -41 35 1 ACT -9 9 20025 ACT -10 10 2897 AGC -11 11 948 AGC -12 12 320 AGC -13 13 97 AGC -14 14 87 AGC -15 15 13 AGC -16 16 9 AGC -17 17 25 AGC -17 14 1 AGC -9 9 9579 AGC -10 10 21141 AGG -11 11 8128 AGG -12 12 2964 AGG -13 13 1209 AGG -14 14 860 AGG -15 15 320 AGG -16 16 190 AGG -17 17 225 AGG -18 18 147 AGG -20 20 80 AGG -21 21 9 AGG -22 22 35 AGG -23 23 27 AGG -24 24 8 AGG -26 26 9 AGG -9 9 57350 AGG -10 10 5964 ATC -11 11 2346 ATC -12 12 789 ATC -13 13 386 ATC -14 14 285 ATC -15 15 165 ATC -16 16 93 ATC -17 17 149 ATC -18 18 51 ATC -19 19 6 ATC -20 20 15 ATC -21 21 15 ATC -22 22 29 ATC -23 23 25 ATC -24 24 24 ATC -26 26 34 ATC -27 27 9 ATC -28 28 30 ATC -29 29 8 ATC -30 30 8 ATC -31 31 11 ATC -34 34 11 ATC -34 31 1 ATC -36 36 5 ATC -9 9 19837 ATC -10 10 11 CCG -11 11 24 CCG -14 14 5 CCG -16 16 5 CCG -9 9 135 CCG -12 12 10192 AAAC -13 13 4917 AAAC -14 14 4704 AAAC -15 15 12713 AAAC -16 16 2415 AAAC -17 17 1431 AAAC -18 18 1861 AAAC -18 14 2 AAAC -19 19 5254 AAAC -19 15 2 AAAC -19 23 1 AAAC -20 20 913 AAAC -20 16 1 AAAC -21 21 615 AAAC -22 22 509 AAAC -22 18 2 AAAC -23 23 2249 AAAC -23 19 5 AAAC -23 15 1 AAAC -24 24 329 AAAC -24 20 2 AAAC -25 25 230 AAAC -25 21 1 AAAC -26 26 175 AAAC -27 27 548 AAAC -27 23 2 AAAC -28 28 195 AAAC -28 24 1 AAAC -29 29 62 AAAC -30 30 67 AAAC -31 31 165 AAAC -31 27 1 AAAC -32 32 64 AAAC -33 33 63 AAAC -34 34 21 AAAC -35 35 40 AAAC -36 36 55 AAAC -37 37 6 AAAC -38 38 8 AAAC -39 39 10 AAAC -40 40 7 AAAC -45 45 7 AAAC -12 12 12855 AAAG -12 16 13 AAAG -12 20 9 AAAG -12 18 2 AAAG -13 13 6727 AAAG -14 14 3699 AAAG -14 13 8 AAAG -15 15 3858 AAAG -15 17 6 AAAG -15 13 1 AAAG -16 16 1244 AAAG -17 17 750 AAAG -17 13 1 AAAG -18 18 380 AAAG -18 20 5 AAAG -18 14 1 AAAG -19 19 1164 
AAAG -19 15 1 AAAG -20 20 153 AAAG -21 21 186 AAAG -22 22 115 AAAG -23 23 321 AAAG -23 19 1 AAAG -24 24 82 AAAG -25 25 89 AAAG -26 26 26 AAAG -26 13 3 AAAG -27 27 64 AAAG -28 28 36 AAAG -29 29 32 AAAG -31 31 31 AAAG -33 33 19 AAAG -35 35 10 AAAG -36 36 11 AAAG -38 38 16 AAAG -41 41 5 AAAG -12 12 23143 AAAT -13 13 10045 AAAT -14 14 6815 AAAT -15 15 8439 AAAT -16 16 3102 AAAT -16 12 2 AAAT -17 17 2018 AAAT -17 13 2 AAAT -18 18 2044 AAAT -19 19 2955 AAAT -19 15 1 AAAT -19 14 1 AAAT -20 20 909 AAAT -21 21 711 AAAT -21 17 2 AAAT -22 22 500 AAAT -22 18 2 AAAT -23 23 993 AAAT -23 19 3 AAAT -24 24 382 AAAT -24 20 3 AAAT -25 25 190 AAAT -26 26 185 AAAT -26 22 1 AAAT -27 27 281 AAAT -27 23 2 AAAT -28 28 165 AAAT -28 24 2 AAAT -29 29 48 AAAT -30 30 46 AAAT -31 31 101 AAAT -32 32 28 AAAT -33 33 19 AAAT -34 34 24 AAAT -34 30 1 AAAT -35 35 41 AAAT -35 31 2 AAAT -36 36 16 AAAT -37 37 6 AAAT -38 38 5 AAAT -39 39 20 AAAT -39 35 1 AAAT -40 40 5 AAAT -41 41 10 AAAT -42 42 6 AAAT -45 45 6 AAAT -12 12 1468 AACC -13 13 590 AACC -14 14 318 AACC -15 15 163 AACC -16 16 102 AACC -17 17 106 AACC -18 18 18 AACC -19 19 34 AACC -20 20 7 AACC -22 22 7 AACC -23 23 13 AACC -24 24 16 AACC -25 25 9 AACC -31 31 9 AACC -12 12 214 AACG -13 13 135 AACG -14 14 39 AACG -15 15 45 AACG -12 12 522 AACT -13 13 142 AACT -14 14 143 AACT -15 15 88 AACT -16 16 16 AACT -17 17 51 AACT -18 18 7 AACT -20 20 21 AACT -21 21 27 AACT -23 23 7 AACT -24 24 11 AACT -30 30 5 AACT -12 12 346 AAGC -13 13 83 AAGC -14 14 60 AAGC -15 15 40 AAGC -16 16 21 AAGC -18 18 9 AAGC -19 19 7 AAGC -12 12 4943 AAGG -13 13 2714 AAGG -14 14 1385 AAGG -14 15 3 AAGG -15 15 949 AAGG -16 16 612 AAGG -16 14 4 AAGG -17 17 331 AAGG -18 18 362 AAGG -19 19 204 AAGG -20 20 138 AAGG -21 21 149 AAGG -22 22 68 AAGG -23 23 49 AAGG -24 24 27 AAGG -25 25 44 AAGG -26 26 8 AAGG -27 27 14 AAGG -28 28 14 AAGG -29 29 14 AAGG -30 30 12 AAGG -31 31 23 AAGG -34 34 11 AAGG -43 43 6 AAGG -12 12 2676 AAGT -13 13 1438 AAGT -14 14 940 AAGT -15 15 649 AAGT -16 16 305 AAGT 
-17 17 291 AAGT -18 18 181 AAGT -19 19 55 AAGT -20 20 73 AAGT -21 21 8 AAGT -22 22 43 AAGT -22 26 1 AAGT -23 23 32 AAGT -23 19 1 AAGT -24 24 18 AAGT -25 25 19 AAGT -26 26 8 AAGT -27 27 12 AAGT -29 29 18 AAGT -30 30 12 AAGT -31 31 12 AAGT -32 32 11 AAGT -33 33 35 AAGT -34 34 9 AAGT -35 35 6 AAGT -12 12 594 AATC -13 13 205 AATC -14 14 88 AATC -15 15 112 AATC -16 16 20 AATC -17 17 81 AATC -18 18 23 AATC -21 21 13 AATC -22 22 8 AATC -24 24 19 AATC -26 26 7 AATC -28 28 9 AATC -33 33 6 AATC -12 12 2293 AATG -13 13 1226 AATG -14 14 678 AATG -15 15 455 AATG -16 16 222 AATG -17 17 211 AATG -18 18 104 AATG -19 19 79 AATG -20 20 40 AATG -21 21 33 AATG -22 22 73 AATG -23 23 24 AATG -24 24 16 AATG -25 25 18 AATG -26 26 15 AATG -27 27 22 AATG -27 23 1 AATG -28 28 5 AATG -32 32 17 AATG -33 33 16 AATG -12 12 2633 AATT -13 13 1086 AATT -14 14 1052 AATT -15 15 386 AATT -16 16 393 AATT -17 17 98 AATT -18 18 104 AATT -19 19 105 AATT -20 20 34 AATT -21 21 12 AATT -22 22 20 AATT -25 25 18 AATT -26 26 25 AATT -27 27 7 AATT -29 29 7 AATT -35 35 12 AATT -12 12 1406 ACAG -13 13 964 ACAG -14 14 300 ACAG -15 15 130 ACAG -16 16 102 ACAG -17 17 49 ACAG -18 18 30 ACAG -19 19 88 ACAG -20 20 5 ACAG -23 23 5 ACAG -12 12 4868 ACAT -12 15 4 ACAT -13 13 3216 ACAT -14 14 957 ACAT -15 15 1052 ACAT -16 16 588 ACAT -17 17 422 ACAT -18 18 239 ACAT -19 19 238 ACAT -19 15 1 ACAT -20 20 25 ACAT -21 21 79 ACAT -22 22 20 ACAT -23 23 38 ACAT -27 27 42 ACAT -29 29 18 ACAT -31 31 5 ACAT -32 32 5 ACAT -35 35 6 ACAT -36 36 9 ACAT -41 41 14 ACAT -44 44 8 ACAT -44 40 1 ACAT -50 50 12 ACAT -12 12 833 ACCC -13 13 345 ACCC -14 14 190 ACCC -15 15 60 ACCC -16 16 12 ACCC -17 17 15 ACCC -19 19 8 ACCG -12 12 416 ACCT -13 13 123 ACCT -14 14 140 ACCT -15 15 69 ACCT -16 16 41 ACCT -17 17 45 ACCT -19 19 18 ACCT -20 20 27 ACCT -21 21 19 ACCT -22 22 6 ACCT -27 27 13 ACCT -28 28 7 ACCT -29 29 9 ACCT -30 30 7 ACCT -34 34 6 ACCT -45 45 5 ACCT -12 12 84 ACGC -13 13 52 ACGC -15 15 63 ACGC -12 12 433 ACGG -13 13 163 ACGG -14 14 38 ACGG 
-15 15 44 ACGG -16 16 7 ACGG -17 17 11 ACGG -19 19 6 ACGG -25 25 10 ACGG -12 12 1119 ACGT -13 13 509 ACGT -14 14 338 ACGT -15 15 16 ACGT -16 16 66 ACGT -17 17 7 ACGT -19 19 27 ACGT -12 12 2211 ACTC -13 13 685 ACTC -14 14 188 ACTC -15 15 151 ACTC -16 16 91 ACTC -18 18 17 ACTC -19 19 24 ACTC -20 20 23 ACTC -21 21 13 ACTC -23 23 19 ACTC -45 45 8 ACTC -12 12 161 ACTG -13 13 69 ACTG -14 14 7 ACTG -15 15 14 ACTG -16 16 15 ACTG -12 12 3118 AGAT -13 13 1216 AGAT -14 14 1084 AGAT -15 15 869 AGAT -16 16 508 AGAT -17 17 322 AGAT -18 18 159 AGAT -19 19 258 AGAT -20 20 63 AGAT -21 21 84 AGAT -22 22 69 AGAT -22 14 6 AGAT -23 23 112 AGAT -24 24 107 AGAT -25 25 36 AGAT -26 26 113 AGAT -27 27 42 AGAT -28 28 58 AGAT -29 29 37 AGAT -30 30 16 AGAT -31 31 32 AGAT -32 32 24 AGAT -33 33 10 AGAT -34 34 43 AGAT -35 35 6 AGAT -36 36 13 AGAT -36 32 1 AGAT -37 37 35 AGAT -38 38 34 AGAT -39 39 20 AGAT -39 35 2 AGAT -40 40 27 AGAT -41 41 29 AGAT -42 42 30 AGAT -43 43 87 AGAT -44 44 67 AGAT -45 45 20 AGAT -46 46 15 AGAT -47 47 28 AGAT -48 48 26 AGAT -49 49 13 AGAT -50 50 11 AGAT -52 52 5 AGAT -54 54 6 AGAT -12 12 236 AGCC -13 13 109 AGCC -14 14 17 AGCC -15 15 14 AGCC -16 16 8 AGCC -18 18 12 AGCC -21 21 18 AGCC -23 23 13 AGCC -12 12 23 AGCG -13 13 19 AGCG -18 18 9 AGCG -12 12 272 AGCT -13 13 89 AGCT -14 14 108 AGCT -15 15 49 AGCT -16 16 19 AGCT -17 17 19 AGCT -18 18 19 AGCT -19 19 44 AGCT -22 22 12 AGCT -27 27 16 AGCT -12 12 87 AGGC -13 13 19 AGGC -14 14 16 AGGC -18 18 7 AGGC -12 12 3610 AGGG -13 13 1980 AGGG -14 14 1095 AGGG -15 15 624 AGGG -16 16 159 AGGG -17 17 59 AGGG -18 18 43 AGGG -19 19 60 AGGG -20 20 49 AGGG -21 21 12 AGGG -23 23 10 AGGG -12 12 531 ATCC -13 13 323 ATCC -14 14 221 ATCC -15 15 58 ATCC -16 16 78 ATCC -17 17 38 ATCC -18 18 12 ATCC -19 19 19 ATCC -20 20 17 ATCC -21 21 44 ATCC -22 22 12 ATCC -23 23 39 ATCC -24 24 11 ATCC -25 25 12 ATCC -27 27 10 ATCC -32 32 6 ATCC -39 39 8 ATCC -40 40 6 ATCC -48 48 7 ATCC -12 12 272 ATCG -13 13 89 ATCG -14 14 108 ATCG -15 15 49 ATCG -16 16 19 
ATCG -17 17 19 ATCG -18 18 19 ATCG -19 19 44 ATCG -22 22 12 ATCG -27 27 16 ATCG -12 12 1119 ATGC -13 13 509 ATGC -14 14 338 ATGC -15 15 16 ATGC -16 16 66 ATGC -17 17 7 ATGC -19 19 27 ATGC -12 12 13 CCCG -12 12 178 AGTC -13 13 77 AGTC -14 14 13 AGTC -15 15 12 AGTC
--- a/test-data/test-data/combineprob_out.txt Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -read_depth allele heterozygous_prob motif -2 10_11 0.485943568663 A -2 11_12 0.472130683091 A -2 9_10 0.494635026326 A -3 10_11 0.71878954705 A -3 11_12 0.688571908761 A -3 9_10 0.73801798345 A
--- a/test-data/test-data/microsatcompat_in.txt Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -15 64416346 64416378 AT 32 16 18 22 61 TA 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT -17 52191125 52191133 GA 8 4 8 26 67 AC 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC -17 52191125 52191133 AC 8 4 8 26 67 AG 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 AGAGAGAG
--- a/test-data/test-data/microsatcompat_out.txt Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -15 64416346 64416378 AT 32 16 18 22 61 TA 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT
--- a/test-data/test-data/microsatellite_flanking_L.fastq Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -@SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 -TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCT -+SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 -GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG
--- a/test-data/test-data/microsatellite_flanking_R.fastq Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -@SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 -TTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG -+SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 -GGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
--- a/test-data/test-data/microsatpurity_in.txt Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -15 64416346 64416378 AT 32 16 18 22 61 AT 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT -15 64416346 64416378 AT 32 16 18 22 61 AT 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATTATATATATATAT -17 52191125 52191133 AC 8 4 8 26 67 AC 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
--- a/test-data/test-data/microsatpurity_out.txt Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -15 64416346 64416378 AT 32 16 18 22 61 AT 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT -17 52191125 52191133 AC 8 4 8 26 67 AC 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
--- a/test-data/test-data/nice1tab.py Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -import sys -fd=open(sys.argv[1]) -lines=fd.readlines() -for line in lines: - temp=line.strip().split() - print '\t'.join(temp) \ No newline at end of file
--- a/test-data/test-data/probvalueforhetero_in.txt Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -chr 9,10 A hetero -1.27220836321 10 10 9 -chr 10,11 A hetero -0.939119957032 11 11 10 -chr 11,12 A hetero -0.720375026792 12 12 11 -chr 9,9,10 A hetero -1.6841441619 9 9 10 -chr 9,10,10 A hetero -0.97233405327 10 10 9 -chr 10,10,11 A hetero -1.29451118958 10 10 11 -chr 10,11,11 A hetero -0.641022011041 11 11 10 -chr 11,11,12 A hetero -1.01921634129 11 11 12 -chr 11,12,12 A hetero -0.425116661902 12 12 11
--- a/test-data/test-data/probvalueforhetero_out.txt Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -chr 9,10 A hetero -1.27220836321 10 10 9 0.247317513163 2 0.494635026326 2 -chr 10,11 A hetero -0.939119957032 11 11 10 0.242971784331 2 0.485943568663 2 -chr 11,12 A hetero -0.720375026792 12 12 11 0.236065341545 2 0.472130683091 2 -chr 9,9,10 A hetero -1.6841441619 9 9 10 0.124528157268 3 0.373584471803 3 -chr 9,10,10 A hetero -0.97233405327 10 10 9 0.121477837216 3 0.364433511647 3 -chr 10,10,11 A hetero -1.29451118958 10 10 11 0.122575544751 3 0.367726634253 3 -chr 10,11,11 A hetero -0.641022011041 11 11 10 0.117020970932 3 0.351062912797 3 -chr 11,11,12 A hetero -1.01921634129 11 11 12 0.11865253007 3 0.35595759021 3 -chr 11,12,12 A hetero -0.425116661902 12 12 11 0.110871439517 3 0.332614318551 3
--- a/test-data/test-data/profilegenerator_in.txt Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -9 9 100000 -10 10 91456 -10 9 1259 -11 11 39657 -11 10 1211 -11 12 514
--- a/test-data/test-data/profilegenerator_out.txt Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -chr 9,9 A -chr 9,10 A -chr 9,11 A -chr 9,12 A -chr 10,10 A -chr 10,11 A -chr 10,12 A -chr 11,11 A -chr 11,12 A -chr 12,12 A -chr 9,9,9 A -chr 9,9,10 A -chr 9,9,11 A -chr 9,9,12 A -chr 9,10,10 A -chr 9,10,11 A -chr 9,10,12 A -chr 9,11,11 A -chr 9,11,12 A -chr 9,12,12 A -chr 10,10,10 A -chr 10,10,11 A -chr 10,10,12 A -chr 10,11,11 A -chr 10,11,12 A -chr 10,12,12 A -chr 11,11,11 A -chr 11,11,12 A -chr 11,12,12 A -chr 12,12,12 A
--- a/test-data/test-data/readdepth2seqdepth.out Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -repeat_length read_length informative_read_depth =locus_specific_sequencing_depth =genome_wide_sequencing_depth -10 100 5 10 15
--- a/test-data/test-data/samplePESAM_2_profile_C.txt Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1 shifted 540 713 713 719 719 759 6 GGGGGG -M01368:22:000000000-A4T24:1:1101:10015:2849_1:N:0:2_1_per1_2 shifted 4007 4082 4082 4088 4088 4258 6 TTTTTT -M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1 shifted 1849 1930 1930 1936 1936 2100 6 CCCCCC -M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2 shifted 1849 2025 2025 2030 2030 2100 5 GGGGG -M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1 shifted 1428 1517 1517 1522 1522 1543 5 AAAAA
--- a/test-data/test-data/sampleTRgenotypingcorrection Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -chr1 14,13,13,13 A hetero -0.429451855856 13 13 14 -chr1 5,6,6,6,6,7,7,8,8 A hetero -14.8744881854 7 6 8
--- a/test-data/test-data/sampleTRprofile_C.txt Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -chr1 14,13,13,13 A -chr1 5,6,6,6,6,7,7,8,8 A
--- a/test-data/test-data/samplefq.snoope Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -6 40 54 G 0 SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCTggggggTTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG?FFDFGGGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
--- a/test-data/test-data/samplefq.snoope.new Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -6 40 54 G 0 SRR345592.75000006_HS2000-192_107:1:63:5822:176818_1_per1_1 TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCTggggggTTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG?FFDFGGGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
--- a/test-data/test-data/sampleprofilegenerator_in Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -9 9 100000 -10 10 91456 -10 9 1259 -11 11 39657 -11 10 1211 -11 12 514
--- a/test-data/test-data/sampleprofilegenerator_out Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -chr 9,9 A -chr 9,10 A -chr 9,11 A -chr 9,12 A -chr 10,10 A -chr 10,11 A -chr 10,12 A -chr 11,11 A -chr 11,12 A -chr 12,12 A -chr 9,9,9 A -chr 9,9,10 A -chr 9,9,11 A -chr 9,9,12 A -chr 9,10,10 A -chr 9,10,11 A -chr 9,10,12 A -chr 9,11,11 A -chr 9,11,12 A -chr 9,12,12 A -chr 10,10,10 A -chr 10,10,11 A -chr 10,10,12 A -chr 10,11,11 A -chr 10,11,12 A -chr 10,12,12 A -chr 11,11,11 A -chr 11,11,12 A -chr 11,12,12 A -chr 12,12,12 A
--- a/test-data/test-data/samplesortedPESAM_C.sam Wed Apr 22 12:19:28 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1 113 shifted 720 37 40M = 541 -46 TTCGTGCACACAGCCCAGCTTGGAGCGAACGACCTACACC HHFG@IIHHHHHIHHFHHGFGGGGDBDDEDDDBBB????? XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:40 -M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1 177 shifted 541 37 173M = 720 46 CTTCTAGTGTAGCCGTAGTTAGGCCACCACTTCAAGAACTCTGTAGCACCGCCTACATACCTCGCTCTGCTAATCCTGTTACCAGTGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAAC ::GECC:*:)D<GEGGGECCCEC?00E?::CCCCEEECC:C*GEC4'.>ACGGEC:CC?>><DCE?C:EC?GECE?:CCECGEEC*GEECEC:GEEGE?GGECC:ECA2CC*CCC8DEGGEGC=CGECEAEGEEDGGEDEGD=EBGGGFDHHHHHHHHEEHHHHHIIHFIIHH XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:173 -M01368:22:000000000-A4T24:1:1101:10015:2849_1:N:0:2_1_per1_2 113 shifted 4089 37 170M = 4008 -176 GCACAACATGGGGGATCATGTAACTCGCCTTGATCGTTGGGAACCGGAGCTGAATGAAGCCATACCAAACGACGAGCGTGACACCACGATGCCTGTAGCAATGGCAACAACGTTGCGCAAACTATTAACTGGCGAACTACTTACTCTAGCTTCCCGGCAACAATTAATAG GECGGGGGGGGGGGGEGEGGGGD>2GEGGGGGEEGGGGGGGGGGGGGEEECEGEAGGEEGEB>=GGFGEAGHHHEHHHFHFF?ED;HFIHHIIIIHIIHHHHIHHHHIHHHHHHHHIIIIHIHHHHIHHHHHIIHHIIHHIIHIIIIIGGGGGGDDDDDDDDBBB????< XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:170 -M01368:22:000000000-A4T24:1:1101:10015:2849_1:N:0:2_1_per1_2 177 shifted 4008 37 75M = 4089 176 TGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCGC CEGGEEEECC?:EEGECGGGGECGGGGEEGGEEGCCGEGGGGGGGGGGDGGGGGE>EEGGGGGGGGGGGAGGGGE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:75 -M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1 129 shifted 1937 37 164M = 1850 -87 
TCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGT HHHHIHHHHHHHHHHHHHHHHHHHHHGGFGGGGGGGHGGGGGGGGGGGGEGGGGGGAEEGGGEGGGGGGEGEEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGECGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGCEGEGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:138T25 -M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1 65 shifted 1850 37 81M = 1937 87 CCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGA ?????BBBEEDBBDDDGGGGGGIIIIIIIIIIIIIHHHHHIIIIIIIIIIIIIIIIIIIIIIIIIIIHIHHHIIIIIIHGH XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:81 -M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2 129 shifted 2031 37 70M = 1850 -181 TAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGT GGGGGGGGECGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGCEGEGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:44T25 -M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2 65 shifted 1850 37 176M = 2031 181 CCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTT ?????BBBEEDBBDDDGGGGGGIIIIIIIIIIIIIHHHHHIIIIIIIIIIIIIIIIIIIIIIIIIIIHIHHHIIIIIIHGHIIIHHHHHHHIHHHHHHHHHHHHHHHHHHHHHGGFGGGGGGGHGGGGGGGGGGGGEGGGGGGAEEGGGEGGGGGGEGEEGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:176 -M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1 129 shifted 1523 37 21M = 1429 -94 GTCTTTAACTCCACCATTAGC GGGEGGEGGGGGCGGGGGEGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:21 -M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1 65 shifted 1429 37 89M = 1523 94 CTATGCATCCAACGCGTTGGGAGCTCTCCCATATGGTCGACCTGCAGGCGGCCGCGAATTCACTAGTGATTTCCAAGGACAAATCAGAG 
?????BBBDDDDDDDDGGGFGGFEHIIIIIIIHIIIHIHHHHHIIHFHHHHHHHHHHHHHHHHHHHHGGGGGGGGGGGGGGGGGGEGEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:89
