changeset 0:20ab85af9505

Uploaded
author arkarachai-fungtammasan
date Fri, 03 Oct 2014 20:54:30 -0400
parents
children f265e26ab550
files .DS_Store GenotypeTRcorrection.py GenotypingSTR.xml PEsortedSAM2readprofile.py PEsortedSAM2readprofile.xml changespacetounderscore_readname.py combinedprobforallelecombination.py combineprobforallelecombination.xml fetchflank.xml heteroprob.py microsatcompat.py microsatcompat.xml microsatellite.py microsatellite.xml microsatpurity.py microsatpurity.xml pair_fetch_DNA_ff.py probvalueforhetero.xml profilegenerator.py profilegenerator.xml readdepth2sequencingdepth.xml sequencingdepthconversion_G.py space2underscore_readname.xml test-data/.DS_Store test-data/C_sample_fastq test-data/C_sample_snoope test-data/PCRinclude.allrate.bymajorallele test-data/combineprob_out.txt test-data/microsatcompat_in.txt test-data/microsatcompat_out.txt test-data/microsatellite_flanking_L.fastq test-data/microsatellite_flanking_R.fastq test-data/microsatpurity_in.txt test-data/microsatpurity_out.txt test-data/nice1tab.py test-data/probvalueforhetero_in.txt test-data/probvalueforhetero_out.txt test-data/profilegenerator_in.txt test-data/profilegenerator_out.txt test-data/readdepth2seqdepth.out test-data/samplePESAM_2_profile_C.txt test-data/sampleTRgenotypingcorrection test-data/sampleTRprofile_C.txt test-data/samplefq.snoope test-data/samplefq.snoope.new test-data/sampleprofilegenerator_in test-data/sampleprofilegenerator_out test-data/samplesortedPESAM_C.sam test-data/shifted.2bit
diffstat 49 files changed, 4173 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
Binary file .DS_Store has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GenotypeTRcorrection.py	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,250 @@
+### import libraries ###
+import sys
+import collections, math
+import heapq
+from galaxy import eggs
+
+
+    
+
+
+### basic function ###
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+        
+def averagelist(a,b,expectedlevelofminor):  # element-wise weighted mix of two lists
+    product=[]
+    for i in range(len(a)):  # b is indexed with a's positions, so b must be at least as long as a
+        product.append((1-expectedlevelofminor)*a[i]+expectedlevelofminor*b[i])  # weight (1-w) on a[i], w on b[i]
+  
+    return product
+        
+def complement_base(read):  # base-by-base complement (A<->T, C<->G); the order of bases is not reversed
+    collect=''
+    for i in read:
+        if i.upper()=='A':  # input is matched case-insensitively, output is always upper case
+            collect+='T'
+        elif i.upper()=='T':
+            collect+='A'
+        elif i.upper()=='C':
+            collect+='G'
+        elif i.upper()=='G':
+            collect+='C'
+    return collect  # characters other than A/T/C/G add nothing, so the result can be shorter than read
+def makeallpossible(read):  # list of every rotation of read, each followed by its complement
+    collect=[]
+    for i in range(len(read)):
+        tmp= read[i:]+read[:i]  # rotate left by i positions
+        collect.append(tmp)
+        collect.append(complement_base(tmp))  # complement only; the rotation is not reversed
+    return collect
+
+def motifsimplify(base):  # map a motif onto the spelling listed in ALLMOTIF for its length
+    '''str--> str
+    '''
+    motiflength=len(base)  # ALLMOTIF is keyed by motif length; an unlisted length raises KeyError
+    temp=list(set(ALLMOTIF[motiflength]).intersection(set(makeallpossible(base))))  # listed motifs reachable by rotation/complement
+    
+    return temp[0]  # IndexError when nothing matches; with several matches the pick follows set ordering
+
+def majorallele(seq):  # most frequent value of seq, returned as int
+    binseq=list(set(seq))  
+    binseq.sort(reverse=True)   # highly mutate mode
+    #binseq.sort()              # majority mode
+    storeform=''
+    storevalue=0
+    for i in binseq:  # values are visited in descending order
+        if seq.count(i)>storevalue:  # strict '>' keeps the first winner, so ties go to the larger value
+            storeform=i
+            storevalue=seq.count(i)
+            
+    return int(storeform)  # int('') raises ValueError when seq is empty
+
+### decide global parameter ###
+COORDINATECOLUMN=1
+ALLELECOLUMN=2
+MOTIFCOLUMN=3
+  ##(0.01-0.5)
+MINIMUMMUTABLE=1.2*(1.0/(10**8))  #http://www.ncbi.nlm.nih.gov/pubmed/22914163 Kong et al 2012
+
+
+## Fixed global variable
+inputname=sys.argv[1]
+errorprofile=sys.argv[2]
+Genotypingcorrected=sys.argv[3]
+EXPECTEDLEVELOFMINOR=float(sys.argv[4])
+if EXPECTEDLEVELOFMINOR >0.5:
+	try:
+		expected_contribution_of_minor_allele=int('expected_contribution_of_minor_allele')
+	except Exception, eee:
+		print eee
+		stop_err("Expected contribution of minor allele must be at least 0 and not more than 0.5")
+ALLREPEATTYPE=[1,2,3,4]
+ALLREPEATTYPENAME=['mono','di','tri','tetra']
+monomotif=['A','C']
+dimotif=['AC','AG','AT','CG']
+trimotif=['AAC','AAG','AAT','ACC','ACG','ACT','AGC','AGG','ATC','CCG']
+tetramotif=['AAAC','AAAG','AAAT','AACC','AACG','AACT','AAGC','AAGG','AAGT','AATC','AATG','AATT',\
+'ACAG','ACAT','ACCC','ACCG','ACCT','ACGC','ACGG','ACGT','ACTC','ACTG','AGAT','AGCC','AGCG','AGCT',\
+'AGGC','AGGG','ATCC','ATCG','ATGC','CCCG','CCGG','AGTC']
+ALLMOTIF={1:monomotif,2:dimotif,3:trimotif,4:tetramotif}
+monorange=range(5,60)
+dirange=range(6,60)
+trirange=range(9,60)
+tetrarange=range(12,80)
+ALLRANGE={1:monorange,2:dirange,3:trirange,4:tetrarange}
+
+#########################################
+######## Prob calculation sector ########
+#########################################
+def multinomial_prob(majorallele,STRlength,motif,probdatabase):  # plain table lookup, nothing is computed here
+    '''int,int,str,dict-->int
+    ### get prob for each STRlength to be generated from major allele
+    '''
+    #print (majorallele,STRlength,motif)
+    prob=probdatabase[len(motif)][motif][majorallele][STRlength]  # keys: motif length, motif, major allele, observed length; KeyError if any is absent
+    return prob  # whatever the table stores; this script fills it with floats, despite "int" in the docstring
+
+################################################
+######## error model database sector ###########
+################################################
+
+## structure generator
+errormodeldatabase={1:{},2:{},3:{},4:{}}  # [motif length][motif][major allele][read length] -> probability
+sumbymajoralleledatabase={1:{},2:{},3:{},4:{}}  # [motif length][motif][major allele] -> summed read count
+for repeattype in ALLREPEATTYPE:
+    for motif in ALLMOTIF[repeattype]:
+        errormodeldatabase[repeattype][motif]={}
+        sumbymajoralleledatabase[repeattype][motif]={}
+        for motifsize1 in ALLRANGE[repeattype]:  # motifsize1 plays the role of the major allele length
+            errormodeldatabase[repeattype][motif][motifsize1]={}
+            sumbymajoralleledatabase[repeattype][motif][motifsize1]=0
+            for motifsize2 in ALLRANGE[repeattype]:  # motifsize2 plays the role of the observed read length
+                errormodeldatabase[repeattype][motif][motifsize1][motifsize2]=MINIMUMMUTABLE  # floor value kept for pairs the profile never mentions
+
+#print errormodeldatabase
+## read database
+
+
+## get read count for each major allele
+fd=open(errorprofile)
+lines=fd.readlines()
+for line in lines:  # first pass: total read count per (motif, major allele)
+    temp=line.strip().split('\t')
+    t_major=int(temp[0])  # 1st field, used as the major-allele key
+    t_count=int(temp[2])  # 3rd field, the count being summed
+    motif=temp[3]  # 4th field, used as-is: it must already be one of the pre-built motif keys
+    sumbymajoralleledatabase[len(motif)][motif][t_major]+=t_count  # KeyError if motif or t_major was not pre-built
+fd.close()
+##print sumbymajoralleledatabase
+
+## get probability
+fd=open(errorprofile)
+lines=fd.readlines()
+for line in lines:  # second pass: turn each count into a fraction of its major-allele total
+    temp=line.strip().split('\t')
+    t_major=int(temp[0])
+    t_read=int(temp[1])  # 2nd field, used as the observed-length key
+    t_count=int(temp[2])
+    motif=temp[3]
+    if sumbymajoralleledatabase[len(motif)][motif][t_major]>0:  # a zero total would divide by zero; the floor value is left in place
+        errormodeldatabase[len(motif)][motif][t_major][t_read]=t_count/(sumbymajoralleledatabase[len(motif)][motif][t_major]*1.0)  # *1.0 forces float division
+        #errormodeldatabase[repeattype][motif][t_major][t_read]=math.log(t_count/(sumbymajorallele[t_major]*1.0))
+        
+    #else:
+    #    errormodeldatabase[repeattype][motif][t_major][t_read]=0
+fd.close()
+
+#########################################
+######## input reading sector ###########
+#########################################
+fdout=open(Genotypingcorrected,'w')
+
+fd = open(inputname)
+
+lines=fd.xreadlines()  # Python 2 file method: iterate the lines lazily
+for line in lines:  # one locus per input line
+    i_read=[]
+    i2_read=[]
+    temp=line.strip().split('\t')
+    i_coordinate=temp[COORDINATECOLUMN-1]
+    i_motif=motifsimplify(temp[MOTIFCOLUMN-1])  # motif in the spelling used as key of the error table
+    i_read=temp[ALLELECOLUMN-1].split(',')
+    i_read=map(int,i_read)  # a list under Python 2; .count() below relies on that
+    coverage=len(i_read)
+
+### Evaluate 1 major allele ###    
+    i_all_allele=list(set(i_read))
+    i_major_allele=majorallele(i_read)
+    f_majorallele=i_read.count(i_major_allele)
+### Evaluate 2 major allele ### 
+    if len(i_all_allele)>1:
+        i2_read=filter(lambda a: a != i_major_allele, i_read)  # drop the top allele, then rank what is left
+        i_major2_allele=majorallele(i2_read)
+        f_majorallele2=i_read.count(i_major2_allele)
+        ### Evaluate 3 major allele ### 
+        if len(i_all_allele)>2:
+            i3_read=filter(lambda a: a != i_major2_allele, i2_read)  # drop the second allele as well
+            i_major3_allele=majorallele(i3_read)
+            f_majorallele3=i_read.count(i_major3_allele)
+        ### No 3 major allele ### 
+        elif len(i_all_allele)==2:
+            i_major3_allele=i_major2_allele  # reuse candidate 2 so that three candidates always exist
+    ### No 2 major allele ### 
+    elif len(i_all_allele)==1:
+        #i_major2_allele=majorallele(i_read)
+        i_major2_allele=i_major_allele+len(i_motif)  # stand-in candidate: one motif length longer than the only observed length
+        i_major3_allele=i_major2_allele
+        #print line.strip()+'\t'+'\t'.join(['homo','only',str(i_major_allele),str(i_major_allele),'NA'])
+        #continue
+    else:
+        print("no allele is reading")
+        sys.exit()
+    
+## scope filter
+
+#########################################
+######## prob calculation option ########
+#########################################
+    homozygous_collector=0
+    heterozygous_collector=0
+
+      
+    alist=[multinomial_prob(i_major_allele,x,i_motif,errormodeldatabase)for x in i_read]  # per-read table value with candidate 1 as major allele
+    blist=[multinomial_prob(i_major2_allele,x,i_motif,errormodeldatabase)for x in i_read]  # same for candidate 2
+    clist=[multinomial_prob(i_major3_allele,x,i_motif,errormodeldatabase)for x in i_read]  # same for candidate 3
+    
+    ablist=averagelist(alist,blist,EXPECTEDLEVELOFMINOR)  # per-read mix for each candidate pair; the second list gets the minor weight
+    bclist=averagelist(blist,clist,EXPECTEDLEVELOFMINOR)
+    aclist=averagelist(alist,clist,EXPECTEDLEVELOFMINOR)
+    
+    #print alist,blist,clist
+    majora=sum([math.log(i,10) for i in alist])  # summed log10 over reads; math.log raises ValueError on a 0 entry
+    majorb=sum([math.log(i,10) for i in blist])    
+    majorc=sum([math.log(i,10) for i in clist])
+    homozygous_collector=max(majora,majorb,majorc)
+    
+    homomajor1=max([(majora,i_major_allele),(majorb,i_major2_allele),(majorc,i_major3_allele)])[1]  # tuple max: best score, ties broken by the larger allele
+    homomajordict={i_major_allele:majora,i_major2_allele:majorb,i_major3_allele:majorc}  # duplicated candidates collapse into one key
+    
+    majorab=sum([math.log(i,10) for i in ablist])
+    majorbc=sum([math.log(i,10) for i in bclist])    
+    majorac=sum([math.log(i,10) for i in aclist])
+    heterozygous_collector=max(majorab,majorbc,majorac)
+    bothheteromajor=max([(majorab,(i_major_allele,i_major2_allele)),(majorbc,(i_major2_allele,i_major3_allele)),(majorac,(i_major_allele,i_major3_allele))])[1]  # allele pair with the best mixed score
+    ##heteromajor1=max(bothheteromajor)
+    ##heteromajor2=min(bothheteromajor)
+    pre_heteromajor1=bothheteromajor[0]
+    pre_heteromajor2=bothheteromajor[1]
+    heteromajor1=max((homomajordict[pre_heteromajor1],pre_heteromajor1),(homomajordict[pre_heteromajor2],pre_heteromajor2))[1]  # of the pair, the allele with the higher single-allele score
+    heteromajor2=min((homomajordict[pre_heteromajor1],pre_heteromajor1),(homomajordict[pre_heteromajor2],pre_heteromajor2))[1]  # and the one with the lower score
+    
+    logratio_homo=homozygous_collector-heterozygous_collector  # difference of log10 scores, i.e. log10 of the ratio
+    
+    if logratio_homo>0:
+        fdout.writelines(line.strip()+'\t'+'\t'.join(['homo',str(logratio_homo),str(homomajor1),str(heteromajor1),str(heteromajor2)])+'\n')
+    elif logratio_homo<0:  # a ratio of exactly 0 matches neither branch: the locus is not written
+        fdout.writelines(line.strip()+'\t'+'\t'.join(['hetero',str(logratio_homo),str(homomajor1),str(heteromajor1),str(heteromajor2)])+'\n')
+fd.close()
+fdout.close()  
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GenotypingSTR.xml	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,72 @@
+<tool id="GenotypeSTR" name="Correct genotype for microsatellite errors" version="2.0.0">
+  <description> during sequencing and library prep </description>
+  <command interpreter="python2.7">GenotypeTRcorrection.py  $microsat_raw $microsat_error_profile $microsat_corrected  $expectedminorallele </command>
+
+  <inputs>
+    <param name="microsat_raw" type="data" label="Select microsatellite length profile that need to refine genotyping" />
+    <param name="microsat_error_profile" type="data" label="Select microsatellite error profile that correspond to this dataset" />
+	<param name="expectedminorallele" type="float" value="0.5" label="Expected contribution of minor allele when present (0.5 for genotyping)" />
+
+  </inputs>
+  <outputs>
+    <data name="microsat_corrected" format="tabular" />
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="microsat_raw" value="sampleTRprofile_C.txt"/>
+      <param name="microsat_error_profile" value="PCRinclude.allrate.bymajorallele"/>
+      <param name="expectedminorallele" value="0.5"/>
+      <output name="microsat_corrected" file="sampleTRgenotypingcorrection"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+- This tool will correct for microsatellite sequencing and library preparation errors using error rates estimated from hemizygous male X chromosome or any rates provided by user. The read profile for each locus will be processed independently. 
+- First, this tool will find the three most common read lengths from the input read length profile. If the read profile has only one length of TR, the length of one motif longer than the observed length will be used as the second most common read length. 
+- Second, it will calculate the probability of each of the three homozygous forms and use the form which gives the highest probability. The same goes for the heterozygous forms. 
+- Third, this tool will calculate the log base 10 of (the probability of homozygous/the probability of heterozygous). If this value is more than 0, it will predict this locus to be homozygous. If this value is less than 0, it will predict this locus to be heterozygous. If this value is 0, the read profile at this locus will be discarded. 
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+ 
+**Input**
+
+- The input files need to contain at least three columns. 
+- Column 1 = location of microsatellite locus. 
+- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format). 
+- Column 3 = motif of microsatellite in this locus. The input file can contain more than three columns. 
+
+**Output**
+
+The output will contain the original three (or more) columns of the input, followed by these additional columns. 
+
+- Additional column 1 = homozygous/heterozygous label.
+- Additional column 2 = log base 10 of (the probability of homozygous/the probability of heterozygous)
+- Additional column 3 = Allele for most probable homozygous form.
+- Additional column 4 = Allele 1 for most probable heterozygous form.
+- Additional column 5 = Allele 2 for most probable heterozygous form.
+
+**Example**
+
+- Suppose that we sequence one locus of microsatellite with NGS. This locus has **A** motif and the following length (bp) profile. ::
+
+	chr1_100_106	5, 6, 6, 6, 6, 7, 7, 8, 8	A
+	
+- We want to figure out whether this locus is homozygous or heterozygous, and the corresponding allele(s). Therefore, we use this tool to refine the genotype.
+- This tool will calculate the probability of homozygous A6A6, A7A7, and A8A8 generating the observed length profile. Among these, A7A7 has the highest probability. Therefore, we use this form as the representative for homozygous.
+- Then, this tool will calculate the probability of heterozygous A6A7, A7A8, and A6A8 generating the observed length profile. Among these, A6A8 has the highest probability. Therefore, we use this form as the representative for heterozygous.    
+- A6A8 has a higher probability than A7A7. Therefore, the program will report that this locus is heterozygous. ::
+
+	chr1	5,6,6,6,6,7,7,8,8	A	hetero	-14.8744881854	7	6	8
+
+
+</help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PEsortedSAM2readprofile.py	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+import sys
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import bx.seq.twobit
+
+##output columns: read_name chr prefix_start    prefix_end  TR_start    TR_end  suffix_start    suffix_end  TR_length   TR_sequence
+
+samf = open(sys.argv[1],'r') #assumes sam file is sorted by readname
+seq_path = sys.argv[2] #Path to the reference genome in 2bit format
+
+##maxTRlength=int(sys.argv[4])
+##maxoriginalreadlength=int(sys.argv[5])
+maxTRlength=int(sys.argv[3])  # pairs whose gap between the two mates is longer than this are dropped
+maxoriginalreadlength=int(sys.argv[4])  # added to maxTRlength to bound the distance between the two mate start positions
+outfile=sys.argv[5]
+fout = open(outfile,'w')
+
+twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) )  # file() is the Python 2 builtin equivalent of open()
+
+skipped=0  # number of reads passed over because the following line carried a different read name
+while True:
+    read = samf.readline().strip()
+    if not(read): #EOF reached
+        break
+    if read[0] == "@":
+        #print read
+        continue
+    mate = samf.readline().strip()
+    if not(mate): #EOF reached
+        break
+    read_elems = read.split()
+    mate_elems = mate.split()
+    read_name = read_elems[0].strip()
+    mate_name = mate_elems[0].strip()
+    while True:
+        if read_name == mate_name:
+            break
+        elif read_name != mate_name:
+            #print >>sys.stderr, "Input SAM file doesn't seem to be sorted by readname. Please sort and retry."
+            #break
+            skipped += 1
+            read = mate
+            read_elems = mate_elems
+            mate = samf.readline().strip()
+            read_name = read_elems[0].strip()
+            mate_name = mate_elems[0].strip()
+            if not(mate): #EOF reached
+                break
+            mate_elems = mate.split()
+    #extract XT:A tag
+    #for e in  read_elems:
+    #    if e.startswith('XT:A'):
+    #        read_xt = e
+    #for e in  mate_elems:
+    #    if e.startswith('XT:A'):
+    #        mate_xt = e
+    #if 'XT:A:U' not in read_elems or 'XT:A:U' not in mate_elems:   #both read and it's mate need to be mapped uniquely
+    #    continue
+    read_chr = read_elems[2]
+    read_start = int(read_elems[3])
+    read_cigar = read_elems[5]
+    if len(read_cigar.split('M')) != 2:     #we want perfect matches only..cigar= <someInt>M
+        continue
+    read_len = int(read_cigar.split('M')[0])
+    mate_chr = mate_elems[2]
+    mate_start = int(mate_elems[3])
+    mate_cigar = mate_elems[5]
+    if len(mate_cigar.split('M')) != 2:     #we want perfect matches only..cigar= <someInt>M
+        continue
+    mate_len = int(mate_cigar.split('M')[0])
+    if read_chr != mate_chr:            # check that they were mapped to the same chromosome
+        continue
+    if abs(read_start - mate_start) > (maxoriginalreadlength+maxTRlength):
+        continue
+    if read_start < mate_start:
+        pre_s = read_start-1
+        pre_e = read_start-1+read_len
+        tr_s = read_start-1+read_len
+        tr_e = mate_start-1
+        suf_s = mate_start-1
+        suf_e = mate_start-1+mate_len
+    else:
+        pre_s = mate_start-1
+        pre_e = mate_start-1+mate_len
+        tr_s = mate_start-1+mate_len
+        tr_e = read_start-1
+        suf_s = read_start-1
+        suf_e = read_start-1+read_len
+    tr_len = abs(tr_e - tr_s)
+    if tr_len > maxTRlength:
+        continue
+    if pre_e >= suf_s:  #overlapping prefix and suffix
+        continue
+    tr_ref_seq = twobitfile[read_chr][tr_s:tr_e]
+    ##print >>fout, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq)
+    fout.writelines('\t'.join(map(str,[read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq]))+'\n')
+
+print  "Skipped %d unpaired reads" %(skipped)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PEsortedSAM2readprofile.xml	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,62 @@
+<tool id="PEsortedSAM2readprofile" name="Combine mapped flaked bases" version="1.0.0">
+  <description> from SAM file sorted by readname  </description>
+  <command interpreter="python2.7">PEsortedSAM2readprofile.py  $flankedbasesSAM $twobitref $maxTRlength $maxoriginalreadlength $output </command>
+
+  <inputs>
+    <param name="flankedbasesSAM" type="data" format="sam" label="Select sorted SAM file (by readname) of flaked bases" />
+    <param name="twobitref" type="data" label="Select twobit file reference genome" />
+	<param name="maxTRlength" type="integer" value="100" label="Maximum expected microsatellite length (bp)" />
+	<param name="maxoriginalreadlength" type="integer" value="101" label="Maxinum original read length" />
+
+  </inputs>
+  <outputs>
+    <data name="output" format="tabular" />
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="flankedbasesSAM" value="samplesortedPESAM_C.sam"/>
+      <param name="twobitref" value="shifted.2bit"/>
+      <param name="maxTRlength" value="100"/>
+      <param name="maxoriginalreadlength" value="250"/>
+      <output name="output" file="samplePESAM_2_profile_C.txt"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+- This tool will take a SAM file sorted by read name, remove unpaired reads, and report the microsatellite sequences in the reference genome that correspond to the space between paired-end reads. The start and stop coordinates of the left and right flanking regions and of the microsatellite itself, as inferred from the paired-end reads, will also be reported.
+- These microsatellites in reference can be used to filter out reads that do not contain microsatellites that concur with microsatellites in reference where the reads mapped to.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+ 
+**Input**
+
+- Sorted SAM files by read name
+
+**Output**
+
+Each output line combines the two paired input lines. The output format is as follows.
+
+- Column 1 = read name
+- Column 2 = chromosome 
+- Column 3 = left flanking region start
+- Column 4 = left flanking region stop
+- Column 5 = microsatellite start
+- Column 6 = microsatellite stop
+- Column 7 = right flanking region start
+- Column 8 = right flanking region stop
+- Column 9 = microsatellite length in reference
+- Column 10= microsatellite sequence in reference
+
+
+
+</help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/changespacetounderscore_readname.py	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,15 @@
+import sys
+fd=open(sys.argv[1])
+output=open(sys.argv[2],'w')
+columntochange=int(sys.argv[3])-1  # default is 6-1=5
+lines=fd.xreadlines()  # Python 2 file method: iterate the lines lazily
+for line in lines:
+	temp=line.strip().split('\t')
+	temp=filter(None,temp)  # removes empty fields, so later columns shift left when a field is blank
+	temp2=temp[columntochange].replace(' ','_')  # only this one column has its spaces replaced
+	product=temp[:columntochange]
+	product.append(temp2)
+	product.extend(temp[columntochange+1:])
+	output.writelines('\t'.join(product)+'\n')
+fd.close()
+output.close()
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/combinedprobforallelecombination.py	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,41 @@
+import sys
+import collections
+import math
+SAMPLINGCOL=11  # all *COL constants are 1-based column numbers of the input table
+ALLELE1COL=7
+ALLELE2COL=8
+SIGNCOL=4  # defined but not read anywhere below
+readprofileCOL=2
+motifCOL=3
+filaname=sys.argv[1]
+fd=open(filaname)
+lines=fd.readlines()
+binomialcombine=collections.defaultdict(list)  # (depth, allele pair, motif) -> list of values from SAMPLINGCOL
+for line in lines:
+    temp=line.strip().split('\t')
+    allelelist=[]
+    allelelist.append(int(temp[ALLELE1COL-1]))
+    allelelist.append(int(temp[ALLELE2COL-1]))
+    allelelist.sort()  # smaller allele first, so the pair's order in the input does not matter
+    #allelelist=map(str,allelelist)
+    alleleave=str(allelelist[0])+'_'+str(allelelist[1])
+    #alleleave=str(sum(allelelist)/2.0)
+    ##alleleave=str(allelelist[0])+'_'+str(allelelist[1])
+    totalcov=len(temp[readprofileCOL-1].split(','))  # read depth = number of comma-separated lengths
+    motif=temp[motifCOL-1]
+    samplingvalue=float(temp[SAMPLINGCOL-1])
+    SIGN=1 
+    binomialcombine[(totalcov,alleleave,motif)].append(SIGN*samplingvalue)
+allkeys= binomialcombine.keys()  # a list under Python 2, hence the in-place sort on the next line
+allkeys.sort()
+##print allkeys
+print 'read_depth'+'\t'+'allele'+'\t'+'heterozygous_prob'+'\t'+'motif'
+for key in allkeys:
+    ##templist=[str(key[0]),key[1],str(sum(binomialcombine[key])),key[2],str(map(str,(binomialcombine[key])))]
+    templist=[str(key[0]),key[1],str(sum(binomialcombine[key])),key[2]]  # third field is the sum over all rows sharing the key
+
+    print '\t'.join(templist)
+#print allkeys#,binomialcombine
+    
+    
+        
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/combineprobforallelecombination.xml	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,67 @@
+<tool id="combineproballelecom" name="Combine probability to generate read profile " version="2.0.0">
+  <description>from the same allele combination</description>
+  <command interpreter="python2.7">combinedprobforallelecombination.py  $input > $output </command>
+
+  <inputs>
+    <param name="input" type="data" label="Select microsatellite length profile" />
+ 
+  </inputs>
+  <outputs>
+    <data name="output" format="tabular" />
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="input" value="probvalueforhetero_out.txt"/>
+      <output name="output" file="combineprob_out.txt"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+- This tool will combine the probabilities that a given allele combination generates any of the read profiles in the input. This is the last step in calculating the probability of detecting heterozygosity for each allele combination and each read depth.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+ 
+**Input**
+
+The input format is the same as output from **Evaluate the probability of the allele combination to generate read profile** tool.
+
+- Column 1 = location of microsatellite locus. 
+- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format). 
+- Column 3 = motif of microsatellite in this locus. The input file can contain more than three columns. 
+- Column 4 = homozygous/heterozygous label.
+- Column 5 = log base 10 of (the probability of homozygous/the probability of heterozygous)
+- Column 6 = Allele for most probable homozygous form.
+- Column 7 = Allele 1 for most probable heterozygous form.
+- Column 8 = Allele 2 for most probable heterozygous form.
+- Column 9 = Probability of the allele combination to generate given read profile.
+- Column 10 = Number of possible rearrangement of given read profile.
+- Column 11 = Probability of the allele combination to generate read profile with any rearrangement (Product of column 9 and column 10)
+- Column 12 = Read depth
+
+Only columns 2, 3, 4, 7, 8, and 11 are used in the calculation. 
+
+**Output**
+
+
+The output will contain the following header and column
+ 
+- Line 1 header: read_depth	allele	heterozygous_prob	motif
+- Column 1 = read depth
+- Column 2 = allele combination
+- Column 3 = probability to detect heterozygous of that allele combination
+- Column 4 = motif
+
+
+
+
+</help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fetchflank.xml	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,73 @@
+<tool id="fetchflank" name="Fetch flanking bases" version="1.0.0">
+  <description> of microsatellites and output as two fastq files in forward-forward orientation</description>
+  <command interpreter="python">pair_fetch_DNA_ff.py  $microsat_in_read $Leftflanking $Rightflanking $qualitycutoff $lengthofbasetocheckquality  </command>
+
+  <inputs>
+    <param name="microsat_in_read" type="data" label="Select data of microsatellites in reads" />
+    <param name="qualitycutoff" type="integer" value="20" label="Minimum quality score (Phred+33) for microsatellites and flanking regions" />
+    <param name="lengthofbasetocheckquality" type="integer" value="20" label="Length of flanking regions that require quality screening" />        
+  </inputs>
+  <outputs>
+    <data format="fastq" name="Leftflanking" />
+    <data format="fastq" name="Rightflanking" />
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="microsat_in_read" value="samplefq.snoope"/>
+      <param name="qualitycutoff" value="20"/>
+      <param name="lengthofbasetocheckquality" value="20"/>
+      <output name="Leftflanking" file="microsatellite_flanking_L.fastq"/>
+      <output name="Rightflanking" file="microsatellite_flanking_R.fastq"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool will fetch flanking regions around microsatellites, screen for quality score at microsatellites and adjacent flanking regions, and output two fastq files containing flanking regions in forward-forward direction.
+
+- This tool assumes that the quality score is Phred+33, such as Sanger fastq.
+- Reads that have either left or right flanking regions shorter than the length of flanking regions that require quality screening will be removed.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+ 
+**Input**
+
+The input files need to be in the same format as output from **microsatellite detection program**. This format contains **length of repeat**, **length of left flanking region**, **length of right flanking region**, **repeat motif**, **hamming (editing) distance**, **read name**, **read sequence**, **read quality score**
+
+**Output**
+
+The output will be two fastq files. The first file contains left flanking regions. The second file contains right flanking regions.
+
+**Example**
+
+- Suppose we detected the microsatellites from short reads ::
+
+	6	40	54	G	0	SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1	TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCTggggggTTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG	GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG?FFDFGGGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=99(/=5'6=4:CCC*AA
+    
+
+- We want to get fastq files of flanking regions around microsatellite with quality score at least 20 on Phred +33  
+  
+- Then the program will report these two fastq files ::
+
+	@SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+	TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCT
+	+SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+	GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG
+
+
+	@SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+	TTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG
+	+SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+	GGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=99(/=5'6=4:CCC*AA
+  
+
+
+</help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/heteroprob.py	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,199 @@
+### import libraries ###
+import sys
+import collections, math
+import heapq
+import itertools
+
+
+
+### basic function ###
+def permuterepeat(n,rlist):
+    f = math.factorial
+    nfac=f(n)
+    rfaclist=[f(i) for i in rlist]
+    for rfac in rfaclist:
+        nfac=nfac/rfac
+    return nfac
+
+def nCr(n,r):
+    f = math.factorial
+    return f(n) / f(r) / f(n-r)
+    
+def averagelist(a,b,expectedlevelofminor):
+    product=[]
+    for i in range(len(a)):
+        product.append((1-expectedlevelofminor)*a[i]+expectedlevelofminor*b[i])
+  
+    return product
+        
+def complement_base(read):
+    collect=''
+    for i in read:
+        if i.upper()=='A':
+            collect+='T'
+        elif i.upper()=='T':
+            collect+='A'
+        elif i.upper()=='C':
+            collect+='G'
+        elif i.upper()=='G':
+            collect+='C'
+    return collect
+def makeallpossible(read):
+    collect=[]
+    for i in range(len(read)):
+        tmp= read[i:]+read[:i]
+        collect.append(tmp)
+        collect.append(complement_base(tmp))
+    return collect
+
+def motifsimplify(base):
+    '''str--> str
+    '''
+    motiflength=len(base)
+    temp=list(set(ALLMOTIF[motiflength]).intersection(set(makeallpossible(base))))
+    
+    return temp[0]
+
+def majorallele(seq):
+    binseq=list(set(seq))  
+    binseq.sort(reverse=True)   # highly mutate mode
+    #binseq.sort()              # majority mode
+    storeform=''
+    storevalue=0
+    for i in binseq:
+        if seq.count(i)>storevalue:
+            storeform=i
+            storevalue=seq.count(i)
+            
+    return int(storeform)
+
+### decide global parameter ###
+COORDINATECOLUMN=1
+ALLELECOLUMN=2
+MOTIFCOLUMN=3
+inputname=sys.argv[1]
+errorprofile=sys.argv[2]
+EXPECTEDLEVELOFMINOR=float(sys.argv[3])
+if EXPECTEDLEVELOFMINOR >0.5:
+	try:
+		errorexpectcontribution=int('a')
+	except Exception, eee:
+		print eee
+		stop_err("Expected contribution of minor allele must be at least 0 and not more than 0.5")
+MINIMUMMUTABLE=0 ###1.2*(1.0/(10**8))  #http://www.ncbi.nlm.nih.gov/pubmed/22914163 Kong et al 2012
+
+
+## Fixed global variables: motif tables and allele ranges used to build the error model
+ALLREPEATTYPE=[1,2,3,4]  # motif lengths handled by this script
+ALLREPEATTYPENAME=['mono','di','tri','tetra']  # names listed in the same order as ALLREPEATTYPE
+monomotif=['A','C']  # motifs of length 1 that key the databases
+dimotif=['AC','AG','AT','CG']  # motifs of length 2
+trimotif=['AAC','AAG','AAT','ACC','ACG','ACT','AGC','AGG','ATC','CCG']  # motifs of length 3
+tetramotif=['AAAC','AAAG','AAAT','AACC','AACG','AACT','AAGC','AAGG','AAGT','AATC','AATG','AATT',\
+'ACAG','ACAT','ACCC','ACCG','ACCT','ACGC','ACGG','ACGT','ACTC','ACTG','AGAT','AGCC','AGCG','AGCT',\
+'AGGC','AGGG','ATCC','ATCG','ATGC','CCCG','CCGG','AGTC']  # motifs of length 4
+ALLMOTIF={1:monomotif,2:dimotif,3:trimotif,4:tetramotif}  # motif length --> motif list (used by motifsimplify)
+monorange=range(5,60)  # allele values given a database cell for length-1 motifs
+dirange=range(6,60)  # same, for length-2 motifs
+trirange=range(9,60)  # same, for length-3 motifs
+tetrarange=range(12,80)  # same, for length-4 motifs
+ALLRANGE={1:monorange,2:dirange,3:trirange,4:tetrarange}  # motif length --> allele range
+
+#########################################
+######## Prob calculation sector ########
+#########################################
+def multinomial_prob(majorallele,STRlength,motif,probdatabase):
+    '''int,int,str,dict-->int
+    ### get prob for each STRlength to be generated from major allele
+    '''
+    #print (majorallele,STRlength,motif)
+    prob=probdatabase[len(motif)][motif][majorallele][STRlength]
+    return prob
+
+################################################
+######## error model database sector ###########
+################################################
+
+## structure generator
+errormodeldatabase={1:{},2:{},3:{},4:{}}
+sumbymajoralleledatabase={1:{},2:{},3:{},4:{}}
+for repeattype in ALLREPEATTYPE:
+    for motif in ALLMOTIF[repeattype]:
+        errormodeldatabase[repeattype][motif]={}
+        sumbymajoralleledatabase[repeattype][motif]={}
+        for motifsize1 in ALLRANGE[repeattype]:
+            errormodeldatabase[repeattype][motif][motifsize1]={}
+            sumbymajoralleledatabase[repeattype][motif][motifsize1]=0
+            for motifsize2 in ALLRANGE[repeattype]:
+                errormodeldatabase[repeattype][motif][motifsize1][motifsize2]=MINIMUMMUTABLE
+#print errormodeldatabase
+## read database
+
+## get read count for each major allele
+fd=open(errorprofile)
+lines=fd.readlines()
+for line in lines:
+    temp=line.strip().split('\t')
+    t_major=int(temp[0])
+    t_count=int(temp[2])
+    motif=temp[3]
+    sumbymajoralleledatabase[len(motif)][motif][t_major]+=t_count
+fd.close()
+##print sumbymajoralleledatabase
+
+## get probability
+fd=open(errorprofile)
+lines=fd.readlines()
+for line in lines:
+    temp=line.strip().split('\t')
+    t_major=int(temp[0])
+    t_read=int(temp[1])
+    t_count=int(temp[2])
+    motif=temp[3]
+    if sumbymajoralleledatabase[len(motif)][motif][t_major]>0:
+        errormodeldatabase[len(motif)][motif][t_major][t_read]=t_count/(sumbymajoralleledatabase[len(motif)][motif][t_major]*1.0)
+        #errormodeldatabase[repeattype][motif][t_major][t_read]=math.log(t_count/(sumbymajorallele[t_major]*1.0))
+        
+    #else:
+    #    errormodeldatabase[repeattype][motif][t_major][t_read]=0
+fd.close()
+#print errormodeldatabase    
+#print math.log(100,10)
+#########################################
+######## input reading sector ###########
+#########################################
+
+
+
+fd = open(inputname)
+##fd=open('sampleinput_C.txt')
+lines=fd.xreadlines()
+for line in lines:
+    i_read=[]
+    i2_read=[]
+    temp=line.strip().split('\t')
+    i_coordinate=temp[COORDINATECOLUMN-1]
+    i_motif=motifsimplify(temp[MOTIFCOLUMN-1])
+    i_read=temp[ALLELECOLUMN-1].split(',')
+    i_read=map(int,i_read)
+    depth=len(i_read)
+    heteromajor1=int(temp[6])
+    heteromajor2=int(temp[7])
+
+### calculate the change to detect combination (using error profile)
+    heterozygous_collector=0  
+    alist=[multinomial_prob(heteromajor1,x,i_motif,errormodeldatabase)for x in i_read]
+    blist=[multinomial_prob(heteromajor2,x,i_motif,errormodeldatabase)for x in i_read]
+      
+    ablist=averagelist(alist,blist,EXPECTEDLEVELOFMINOR)
+       
+    if 0 in ablist:
+        continue
+    heterozygous_collector=reduce(lambda y, z: y*z,ablist )
+
+### prob of combination (using multinomial distribution)
+    frequency_distribution=[len(list(group)) for key, group in itertools.groupby(i_read)]
+    ## print frequency_distribution
+    expandbypermutation=permuterepeat(depth,frequency_distribution)
+
+    print line.strip()+'\t'+str(heterozygous_collector)+'\t'+str(expandbypermutation)+'\t'+str(expandbypermutation*heterozygous_collector)+'\t'+str(depth)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatcompat.py	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,36 @@
+import sys
+# remove all read that have unmatch microsat
+# check only one line at a time
+def complement_base(read):
+    collect=''
+    for i in read:
+        if i.upper()=='A':
+            collect+='T'
+        elif i.upper()=='T':
+            collect+='A'
+        elif i.upper()=='C':
+            collect+='G'
+        elif i.upper()=='G':
+            collect+='C'
+    return collect
+   
+def makeallpossible(read):
+    collect=[]
+    for i in range(len(read)):
+        tmp= read[i:]+read[:i]
+        collect.append(tmp)
+        collect.append(complement_base(tmp))
+    return collect
+
+
+fd=open(sys.argv[1])
+lines=fd.xreadlines()
+firstcolumn=int(sys.argv[2])-1 #4
+secondcolumn=int(sys.argv[3])-1 # 10
+for line in lines:
+    temp=line.strip().split('\t')
+    temp=filter(None,temp)
+    micro1=temp[firstcolumn]
+    micro2=temp[secondcolumn]
+    if micro1 in makeallpossible(micro2):
+        print line.strip()
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatcompat.xml	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,76 @@
+<tool id="microsatcompat" name="Check microsatellites motif compatibility" version="1.0.0">
+  <description> </description>
+  <command interpreter="python">microsatcompat.py $input $column1 $column2 > $output </command>
+
+  <inputs>
+    <param name="input" type="data" label="Select input" />
+    <param name="column1" type="integer" value="4" label="First column number" />
+    <param name="column2" type="integer" value="10" label="Second column number" />
+  </inputs>
+  <outputs>
+    <data format="tabular" name="output" />
+    
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="input" value="microsatcompat_in.txt"/>
+      <param name="column1" value="4"/>      
+      <param name="column2" value="10"/>
+      <output name="output" file="microsatcompat_out.txt"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool is used to select only the input lines which have compatible microsatellite motifs between two columns. Two motifs are considered compatible when they are complementary to each other or become the same sequence after changing the starting point of the motif. For example, **A** is the same as **T**. Also, **AGG** is the same as **GAG**.
+
+For the TRFM pipeline (profiling microsatellites in short read data), this tool can be used to make sure that the microsatellites in the reads have the same motif as the microsatellites in the reference at the corresponding mapped location. 
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+ 
+**Input**
+
+The input files can be any tab delimited file. 
+
+If this tool is used in TRFM microsatellite profiling, it should contain:
+
+- Column 1 = microsatellite location in reference chromosome
+- Column 2 = microsatellite location in reference start
+- Column 3 = microsatellite location in reference stop
+- Column 4 = microsatellite location in reference motif
+- Column 5 = microsatellite location in reference length
+- Column 6 = microsatellite location in reference motif size
+- Column 7 = length of microsatellites (bp)
+- Column 8 = length of left flanking regions (bp)
+- Column 9 = length of right flanking regions (bp)
+- Column 10 = repeat motif (bp)
+- Column 11 = hamming distance 
+- Column 12 = read name
+- Column 13 = read sequence with soft masking of microsatellites
+- Column 14 = read quality (the same Phred score scale as input)
+- Column 15 = read name (The same as column 12)
+- Column 16 = chromosome 
+- Column 17 = left flanking region start
+- Column 18 = left flanking region stop
+- Column 19 = microsatellite start as inferred from paired-end reads
+- Column 20 = microsatellite stop as inferred from paired-end reads
+- Column 21 = right flanking region start
+- Column 22 = right flanking region stop
+- Column 23 = microsatellite length in reference
+- Column 24 = microsatellite sequence in reference
+
+**Output**
+
+The same as input format.
+
+
+</help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatellite.py	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,1271 @@
+#!/usr/bin/env python
+"""
+Snoop thru a fasta file looking for microsatellite repeats of given periods
+Output format: length_of_repeat left_flank_length right_flank_length  repeat_motif  hamming_distance  read_name read_sequence read_quality  (additional columns)
+
+If the --r option is turned on, the output will have these additional columns appended:
+read_name read_chr  pre_s pre_e tr_s  tr_e  suf_s suf_e tr_len  tr_ref_seq
+
+pre_s           where the read start
+pre_e           the last position before microsatellite
+tr_s            where microsatellite start
+tr_e            where microsatellite end
+suf_s           first base after microsatellite
+tr_ref_seq      reference sequence corresponding to microsatellite
+
+* output positions are 0 based
+
+:Author: Chen Sun (cxs1031@cse.psu.edu); Bob Harris (rsharris@bx.psu.edu)
+          
+modification log:
+
+09/27/2013
+replace function dense_intervals with function non_negative_intervals, which does not need to import that file.
+
+10/18/2013
+modify function find_repeat_element to run faster when hamming_distance = 0, i.e. when no mutation/indel is allowed
+
+02/25/2014
+add function that can deal with mapped reads
+with additional output
+
+02/28/2014
+modify the 0-based end points: 0-based intervals are half-open [ ),
+so 1 should always be added to the 0-based end position
+
+03/05/2014
+deal with multi-fasta
+"""
+from sys          import argv,stdin,stderr,exit
+from string       import maketrans
+from md5          import new as md5_new
+import re
+#from pyfracluster import dense_intervals
+
+def usage(s=None):
+    message = """
+usage: microsat_snoop [fasta_file] [options]
+  <fasta_file>                Name of file to read sequences from;  if absent,
+                              sequences are read from stdin
+  --fasta                     Input file is in fasta format
+                              (this is the default)
+  --fastq                     Input file is in fastq format
+                              (default is fasta unless filename is .fastq)
+  --fastq:noquals             Input file is in fastq format, but discard quals
+  --sam                       Input file is SAM file 
+  --r                         Indicate additional output information, if indicated,
+                              --ref option is mendatory
+  --ref=<filepath>            Reference file (absolute) path
+  --period=<length>           (mandatory,cumulative) repeat length(s) to be
+                              searched for
+                              <length> is expected to be small, less than 10
+                              <length> can also be a comma-separated list, or
+                              a range <low>..<high>
+  --rate=<fraction>           control the candidate repeat interval detector;
+                              it will consider intervals with at least
+                              <fraction> of matches when shifted by the period;
+                              <fraction> is between 0 and 1 and can be either a
+                              real number or <n>/<d>
+                              (default is 6/7)
+  --minlength=<length>        minimum length of intervals reported, in bp
+                              (default is 20)
+  --progress=<count>          how often to report the sequence we're searching
+                              (default is no progress report)
+  --allowduplicates           process all input sequences
+                              (this is the default)
+  --noduplicates              ignore any input sequence that's the same as an
+                              earlier sequence
+  --nonearduplicates          ignore any input sequence that has the same first
+                              100 bp as an earlier sequence
+  --nonearduplicate=<length>  ignore any input sequence that has the same first
+                              <length> bp as an earlier sequence
+  --hamming=<count>           Don't report candidate repeat intervals that have
+                              more than <count> mismatches
+                              (default is to do no such filtering)
+  --prefix=<length>           Don't report candidate repeat intervals that
+                              start within <length> of the sequence start
+                              (default is to do no such filtering)
+  --suffix=<length>           Don't report candidate repeat intervals that
+                              end within <length> of the sequence end
+                              (default is to do no such filtering)
+  --subsample=<k>/<n>         Process only the <k>th sequence of every group of
+                              <n> sequences;  <k> ranges from 1 to <n>
+  --multipleruns              Consider all candidate intervals in a sequence
+                              (default is to consider only the longest)
+  --partialmotifs             Consider microatelites with a partial motif
+                              (default is to consider only whole motifs)
+  --splitbyvalidity           Preprocess sequences, splitting at Ns;  this
+                              prevents candidates from including Ns
+                              (default is not to split)
+  --noflankdisplay            Show entire sequence as flanking regions
+                              (this is the default)
+  --flankdisplay=<length>     Limit length of flanking regions shown
+  --readnamesuffix=<string>   Root of suffix to append to read names;  e.g. 1
+                              for forward, 2 for reverse;  this triggers other
+                              info to be included in the suffix
+                              (default is "1" for fastq;  no suffix for fasta)
+  --head=<number>             limit the number of sequences processed
+  --markend                   Write a marker line upon completion
+                              (default is not to write a marker)
+  --help=details              Describe the process, and quit"""
+
+    if (s == None): exit (message)
+    else:           exit ("%s\n%s" % (s,message))
+
+# Long description of the algorithm; main() exits with this text for --help=details.
+detailedDescription = """In broad terms, the process works as follows:
+
+(1) Identify intervals that are highly correlated with the interval shifted by
+    P (the repeat period).  These intervals are called "runs" or "candidates".
+    The level of correlation required is controlled by rateThreshold. 
+    Depending on whether we want to look for more than one microsat, we either
+    find the longest such run (simple algorithm) or many runs (more complicated
+    algorithm). The following steps are then performed on each run.
+
+(2) Find the most likely repeat motif in the run.  This is done by counting
+    all kmers (of length P) and choosing the most frequent.  If that kmer is
+    itself covered by a sub-repeat we discard this run.  The idea is that we
+    can ignore a 6-mer like ACGACG because we will find it when we are looking
+    for 3-mers.
+
+(3) Once we identify the most likely repeat motif, we then modify the
+    interval, adjusting start and end to find the interval that has the fewest
+    mismatches vs. a sequence of the motif repeated (hamming distance).  Only
+    whole copies of the motif are considered.
+
+(4) At this point we have a valid microsat interval (in the eyes of the
+    program). It is subjected to some filtering stages (hamming distance or too
+    close to an end), and if it satisfies those conditions, it's reported to
+    the user."""
+
+def main():
+    global debug
+
+    #=== parse the command line ===
+
+    inputFilename         = None
+    referenceFileName     = None #add by Chen Sun on 02/25
+    inputFormat           = None
+    repeatPeriods         = []
+    rateThreshold         = 6 / 7.0
+    lengthThreshold       = 20
+    reportProgress        = None
+    discardDuplicates     = False
+    discardNearDuplicates = False
+    nearDuplicatePrefix   = 100
+    hammingThreshold      = 0
+    prefixThreshold       = None
+    suffixThreshold       = None
+    subsampleK            = None
+    subsampleN            = None
+    reportMultipleRuns    = False
+    allowPartialMotifs    = False
+    splitByValidity       = False
+    flankDisplayLimit     = None
+    readNameSuffix        = None
+    headLimit             = None
+    markEndOfFile         = False
+    additionalInfo        = False
+    debug                 = []
+
+    for arg in argv[1:]:
+        if (arg == "--fasta"):
+            inputFormat = "fasta"
+        elif (arg == "--fastq"):
+            inputFormat = "fastq"
+        elif (arg == "--fastq:noquals"):
+            inputFormat = "fastq:noquals"
+        elif (arg == "--sam"):
+            inputFormat = "sam"
+        elif (arg == "--r"):
+            additionalInfo = True
+        elif (arg.startswith("--ref=")):
+            referenceFileName = arg.split("=",1)[1]
+        elif (arg.startswith("--period=")):
+            val = arg.split("=",1)[1]
+            for period in val.split(","):
+                if (".." in period):
+                    (lowPeriod,highPeriod) = period.split("..",1)
+                    lowPeriod  = int(lowPeriod)
+                    highPeriod = int(highPeriod)
+                    for period in xrange(lowPeriod,highPeriod+1):
+                        repeatPeriods += [period]
+                else:
+                    repeatPeriods += [int(period)]
+        elif (arg.startswith("--rate=")):
+            val = arg.split("=",1)[1]
+            rateThreshold = float_or_fraction(val)
+            assert (0.0 < rateThreshold <= 1.0), "%s not a valid rate" % val
+        elif (arg.startswith("--minlength=")):
+            val = arg.split("=",1)[1]
+            lengthThreshold = int(val)
+            assert (lengthThreshold >= 0)
+        elif (arg.startswith("--progress=")):
+            val = arg.split("=",1)[1]
+            reportProgress = int(val)
+        elif (arg == "--allowduplicates"):
+            discardDuplicates     = False
+            discardNearDuplicates = False
+        elif (arg == "--noduplicates"):
+            discardDuplicates     = True
+            discardNearDuplicates = False
+        elif (arg == "--nonearduplicates"):
+            discardDuplicates     = False
+            discardNearDuplicates = True
+        elif (arg.startswith("--nonearduplicate=")):
+            val = arg.split("=",1)[1]
+            discardDuplicates     = False
+            discardNearDuplicates = True
+            nearDuplicatePrefix   = int(val)
+            assert (nearDuplicatePrefix > 0)
+        elif (arg.startswith("--hamming=")):
+            val = arg.split("=",1)[1]
+            hammingThreshold = int(val)
+            assert (hammingThreshold >= 0)
+        elif (arg.startswith("--prefix=")):
+            val = arg.split("=",1)[1]
+            prefixThreshold = int(val)
+            assert (prefixThreshold >= 0)
+        elif (arg.startswith("--suffix=")):
+            val = arg.split("=",1)[1]
+            suffixThreshold = int(val)
+            assert (suffixThreshold >= 0)
+        elif (arg.startswith("--subsample=")):
+            val = arg.split("=",1)[1]
+            (k,n) = val.split("/",2)
+            subsampleK = int(k)
+            subsampleN = int(n)
+            assert (0 < subsampleK <= subsampleN)
+        elif (arg == "--multipleruns"):
+            reportMultipleRuns = True
+        elif (arg == "--partialmotifs"):
+            allowPartialMotifs = True
+        elif (arg == "--splitbyvalidity"):
+            splitByValidity = True
+        elif (arg == "--noflankdisplay"):
+            flankDisplayLimit = None
+        elif (arg.startswith("--flankdisplay=")):
+            val = arg.split("=",1)[1]
+            flankDisplayLimit = int(val)
+            assert (flankDisplayLimit >= 0)
+        elif (arg.startswith("--readnamesuffix")):
+            readNameSuffix = arg.split("=",1)[1]
+        elif (arg.startswith("--head=")):
+            headLimit = int_with_unit(arg.split("=",1)[1])
+        elif (arg == "--markend"):
+            markEndOfFile = True
+        elif (arg == "--help=details"):
+            exit (detailedDescription)
+        elif (arg.startswith("--debug=")):
+            debug += (arg.split("=",1)[1]).split(",")
+        elif (arg.startswith("--")):
+            usage("unrecognized option: %s" % arg)
+        elif (inputFilename == None):
+            inputFilename = arg
+        else:
+            usage("unrecognized option: %s" % arg)
+
+    #=== determine periods of interest ===
+
+    if (repeatPeriods == []):
+        usage("you gotta give me a repeat period")
+
+    if (additionalInfo == True):
+        if (referenceFileName == None):
+            usage("reference file path needed. use --ref=<reference> to indicate")
+
+    periodSeed = {}
+    for period in repeatPeriods:
+        if (period < 1): usage("period %d is not valid" % period)
+        periodSeed[period] = True
+
+    repeatPeriods = [period for period in periodSeed]
+    repeatPeriods.sort()
+
+    #=== determine input format ===
+
+    if   (inputFormat == "fasta"):           sequence_reader = fasta_sequences
+    elif (inputFormat == "fastq"):           sequence_reader = fastq_sequences
+    elif (inputFormat == "fastq:noquals"):   sequence_reader = fastq_sequences
+    elif (inputFormat == "sam"):             sequence_reader = sam_sequences
+    elif (inputFilename == None):            sequence_reader = fasta_sequences
+    elif (inputFilename.endswith(".fastq")): sequence_reader = fastq_sequences
+    elif (inputFilename.endswith(".fq")):    sequence_reader = fastq_sequences
+    elif (inputFilename.endswith(".sam")):   sequence_reader = sam_sequences
+    else:                                    sequence_reader = fasta_sequences
+
+    if (inputFilename != None): inputF = file(inputFilename,"rt")
+    else:                       inputF = stdin
+
+    if   (readNameSuffix == None) \
+     and (sequence_reader == fastq_sequences) \
+     and (inputFormat != "fastq:noquals"):
+        readNameSuffix = "1"
+
+    #=== process the sequences ===
+    
+    refSequence = {}
+    rightName = ""
+    sequence = ""
+    if additionalInfo:
+        firstFasta = True
+        originalRefF = open(referenceFileName)
+        for line in originalRefF.readlines():
+            line = line.replace('\r','')
+            line = line.replace('\n','')
+            if line.startswith(">"):
+                if firstFasta:
+                    firstFasta = False
+                else:
+                    refSequence[rightName] = sequence
+                rightName = line[1:]
+                sequence = ""
+                continue
+            sequence += line
+        originalRefF.close()
+        refSequence[rightName] = sequence
+
+    sequenceSeen = {}
+
+    numSequences = 0
+    for seqInfo in sequence_reader(inputF):
+        numSequences += 1
+        if (headLimit != None) and (numSequences > headLimit):
+            print >>stderr, "limit of %d sequences reached" % headLimit
+            break
+
+        if (sequence_reader == sam_sequences):
+            #seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar
+            (name, sequence, refName, pre_s, cigar) = seqInfo
+            quals = None
+        elif (sequence_reader == fastq_sequences):
+            (name,sequence,quals) = seqInfo
+            if (inputFormat == "fastq:noquals"): quals = None
+        else:
+            (name,sequence) = seqInfo
+            quals = None
+
+        if (reportProgress != None) and (numSequences % reportProgress == 0):
+            print >>stderr, "%s %d" % (name,numSequences)
+
+        # if we're subsampling and not interested in this sequence, skip it
+
+        if (subsampleN != None):
+            if ((numSequences-1) % subsampleN != (subsampleK-1)):
+                continue
+
+        # if this sequence is shorter than the length of interest, skip it
+
+        seqLen = len(sequence)
+        if (seqLen < period) or (seqLen < lengthThreshold): continue
+
+        # if we're not interested in duplicates and this is one, skip it;
+        # note that we assume no hash collisions occur, i.e. that all hash
+        # matches are truly sequence matches
+
+        if (discardDuplicates):
+            h = hash108(sequence)
+            if (h in sequenceSeen): continue
+            sequenceSeen[h] = True
+        elif (discardNearDuplicates):
+            h = hash108(sequence[:nearDuplicatePrefix])
+            if (h in sequenceSeen): continue
+            sequenceSeen[h] = True
+
+        # split the sequence into chunks of valid nucleotides
+
+        if (splitByValidity):
+            chunks = [(start,end) for (start,end) in nucleotide_runs(sequence)]
+        else:
+            chunks = [(0,len(sequence))]
+
+        # evaluate for each period of interest
+
+        for period in repeatPeriods:
+
+            # operate on each chunk
+
+            for (chunkStart,chunkEnd) in chunks:
+                chunkLen = chunkEnd - chunkStart
+                if (chunkLen < period) or (chunkLen < lengthThreshold): continue
+
+                if ("validity" in debug) or ("correlation" in debug) or ("runs" in debug):
+                    print >>stderr, ">%s_%d_%d" % (name,chunkStart,chunkEnd)
+
+                # compute correlation sequence
+
+                corr = correlation_sequence(sequence,period,chunkStart,chunkEnd)
+
+                if ("correlation" in debug) or ("runs" in debug):
+                    print >>stderr, sequence[chunkStart:chunkEnd]
+                    print >>stderr, corr
+
+                # find runs (candidates for being a microsat) 
+
+                if (reportMultipleRuns):
+                    runs = all_suitable_runs(corr,lengthThreshold-period,rateThreshold, hammingThreshold)
+                else:
+                    runs = longest_suitable_run(corr,lengthThreshold,rateThreshold)
+                if (runs == []): continue
+
+
+                if ("runs" in debug):
+                    for (start,end) in runs:
+                        run = [" "] * seqLen
+                        for ix in xrange(start-period,end):
+                            run[ix] = "*"
+                        print >>stderr, "".join(run)
+
+                if ("candidates" in debug):
+                    for (start,end) in runs:
+                        print >>stderr, "%s %d %d" % (name,start,end)
+
+                # process runs and report those that pass muster
+
+                runCount = 0
+                for (start,end) in runs:
+                    runCount += 1
+
+                    start = chunkStart + start - period
+                    end   = chunkStart + end
+
+                    (kmer,d,start,end) = find_repeat_element(hammingThreshold, period,sequence,start,end,allowPartials=allowPartialMotifs)
+                    if (kmer == None): continue    # (no useful repeat kmer was found)
+
+                    rptExtent = end - start
+                    prefixLen = start
+                    suffixLen = seqLen - end
+                    if (rptExtent <= period): continue
+                    if (hammingThreshold != None) and (d         > hammingThreshold): continue
+                    if (prefixThreshold  != None) and (prefixLen < prefixThreshold):  continue
+                    if (suffixThreshold  != None) and (suffixLen < suffixThreshold):  continue
+
+                    if (flankDisplayLimit == None):
+                        seq = sequence[:start] \
+                            + sequence[start:end].lower() \
+                            + sequence[end:]
+                    else:
+                        seq = sequence[max(chunkStart,start-flankDisplayLimit):start] \
+                            + sequence[start:end].lower() \
+                            + sequence[end:min(chunkEnd,end+flankDisplayLimit)]
+                    reportName = name
+                    if (readNameSuffix != None):
+                        reportName += "_"+readNameSuffix+"_per"+str(period)+"_"+str(runCount)
+                    if (quals == None or quals == "." or quals == "\t."): quals = "\t."
+                    else:               quals = "\t" + quals
+                    if not additionalInfo:
+                        print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s" \
+                            % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals)
+                    else:
+                        #pre_e = pre_s + prefixLen - 1
+                        refPoint = pre_s
+                        donorPoint = 0
+                        
+                        donorBeforeStart = prefixLen - 1 #pre_e
+                        donorMicroStart = prefixLen     #tr_s
+                        donorMicroEnd = donorMicroStart + rptExtent - 1 #tr_e
+                        donorAfterMicro = donorMicroEnd + 1 #suf_s
+                        donorEnd = len(seq) - 1    #suf_e
+                        
+                        set_pre_e = False
+                        set_tr_s = False
+                        set_tr_e = False
+                        set_suf_s = False
+                        set_suf_e = False
+                        
+                        pre_e = 0
+                        tr_s = 0
+                        tr_e = 0
+                        suf_s = 0
+                        suf_e = 0
+                        
+                        matchList = re.findall('(\d+)([IDM])', cigar)
+                        unCognitiveCigar = False
+                        for matchN, matchType in matchList:
+                            matchNum = int(matchN)
+                            if matchType == "M":
+                                donorPoint = donorPoint + matchNum
+                                refPoint = refPoint + matchNum
+                            elif matchType == "D":
+                                refPoint = refPoint + matchNum
+                                continue
+                            elif matchType == "I":
+                                donorPoint = donorPoint + matchNum
+                            else:
+                                unCognitiveCigar = True
+                                break
+
+                            if not set_pre_e:
+                                if donorPoint >= donorBeforeStart:
+                                    pre_e = refPoint - (donorPoint - donorBeforeStart)
+                                    set_pre_e = True
+                                else:
+                                    continue
+                                    
+                            if not set_tr_s:
+                                if donorPoint >= donorMicroStart:
+                                    tr_s = refPoint - (donorPoint - donorMicroStart)
+                                    set_tr_s = True
+                                else:
+                                    continue
+                                    
+                            if not set_tr_e:
+                                if donorPoint >= donorMicroEnd:
+                                    tr_e = refPoint - (donorPoint - donorMicroEnd)
+                                    set_tr_e = True
+                                else:
+                                    continue
+                                    
+                            if not set_suf_s:
+                                if donorPoint >= donorAfterMicro:
+                                    suf_s = refPoint - (donorPoint - donorAfterMicro)
+                                    set_suf_s = True
+                                else:
+                                    continue
+                                    
+                            if not set_suf_e:
+                                if donorPoint >= donorEnd:
+                                    suf_e = refPoint - (donorPoint - donorEnd)
+                                    set_suf_e = True
+                                else:
+                                    continue
+                                
+                        if unCognitiveCigar:
+                            break
+                        tr_len = tr_e - tr_s + 1
+
+                        if refName not in refSequence:
+                            tr_ref_seq = "."
+                        else:
+                            if refSequence[refName] == "":
+                                tr_ref_seq = "."
+                            elif len(refSequence[refName]) <= tr_e:
+                                tr_ref_seq = "."
+                            else:
+                                tr_ref_seq = refSequence[refName][tr_s:tr_e+1]
+
+                        pre_e += 1
+                        tr_e += 1
+                        suf_e += 1
+                        print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s" \
+                            % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals,reportName,refName,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq)
+
+    if (markEndOfFile):
+        print "# microsat_snoop end-of-file"
+
+    if (inputF != stdin):
+        inputF.close()
+
+# non_negative_intervals--
+#    Yield (start,end) for every stretch of a correlation string, such as
+#    +++++++++---+++++++++, that begins with a "+" and contains no "-".  The
+#    intervals are half-open.  Only a "+" opens a stretch and only a "-"
+#    closes one;  any other character (a blank, for instance) does neither.
+#    A stretch that is still open when the string ends is reported as running
+#    to the end of the string.  Stretches shorter than minLength are dropped;
+#    a minLength of None disables the length filter.
+
+def non_negative_intervals(seq, minLength=None):
+
+    start = None    # index of the "+" that opened the current stretch
+    for (ix,ch) in enumerate(seq):
+        if (ch == "+"):
+            if (start == None): start = ix
+        elif (ch == "-"):
+            if (start != None):
+                if (minLength == None) or (ix - start >= minLength):
+                    yield (start,ix)
+                start = None
+
+    # report the stretch that reaches the end of the string;  note that a
+    # stretch opened at position zero is a legitimate one and must be reported
+
+    if (start != None):
+        if (minLength == None) or (len(seq) - start >= minLength):
+            yield (start,len(seq))
+
+
+###################################################################
+# modified by Chen Sun on 7/11/2014
+# We do not want other modules, so parse these functions inside
+#
+###################################################################
+
+# parse_spec--
+#    Parse a string of the form {positives}/{positives_and_neutrals}.  The
+#    result is the pair (positives,neutrals), where positives is the text
+#    inside the first pair of braces and neutrals is a list of the characters
+#    from the second pair of braces that are not positives.  ValueError is
+#    raised if the slash or any brace is missing, or if some positive does not
+#    also appear inside the second pair of braces.
+
+def parse_spec(s):
+    (numer,slash,denom) = s.partition("/")
+    if (slash == ""): raise ValueError
+
+    for part in (numer,denom):
+        if not (part.startswith("{") and part.endswith("}")): raise ValueError
+
+    positives  = numer[1:-1]
+    everything = denom[1:-1]
+
+    if any(ch not in everything for ch in positives): raise ValueError
+
+    neutrals = [ch for ch in everything if (ch not in positives)]
+    return (positives,neutrals)
+
+
+# float_or_fraction--
+#    Convert a string to a floating point number.  The string is either a
+#    plain number ("0.85") or a fraction ("6/7");  only the first slash
+#    separates the numerator from the denominator.
+
+def float_or_fraction(s):
+    if ("/" not in s):
+        return float(s)
+
+    (top,bottom) = s.split("/",1)
+    return float(top)/float(bottom)
+
+
+# dense_intervals--
+#    Find all non-overlapping runs with a good enough rate (of positives), and
+#    which meet our length threshold.
+#
+#    The algorithm used is adapted from Zhang, Berman, Miller, "Post-processing
+#    long pairwise alignments", Bioinformatics Vol. 15 no. 12 1999.
+#
+#    This is a generator;  each run is yielded as a half-open (left,right)
+#    pair of indexes into seq, after leading and trailing non-positives have
+#    been trimmed from it.  A character in positives scores 1-rate, a
+#    character in neutrals scores -rate, and a character in blockers ends (and
+#    reports) every run collected so far.  A character in none of the three
+#    sets raises ValueError.  Passing blockers=None makes every character with
+#    code 1 through 255 that is neither a positive nor a neutral a blocker.
+#    Runs shorter than minLength are not reported, unless minLength is None.
+#
+# $$$ we use the denominator as the threshold, but we really should use the
+# $$$ .. numerator, comparing it to minLength*rate
+
+def dense_intervals(seq,rate,positives,neutrals,blockers="",minLength=None):
+
+    # blockers=None means "everything else blocks"
+
+    if (blockers == None):
+        blockers = "".join([chr(n) for n in range(1,256)
+                                   if  (chr(n) not in positives)
+                                   and (chr(n) not in neutrals)])
+
+    # the stack is kept as five parallel lists, indexed 1..top;  for entry sp,
+    # stackLeft[sp] is the index just before the interval's first site,
+    # stackRight[sp] is the index of its last site, the two score lists hold
+    # the running score just before the first site and at the last site, and
+    # stackLower[sp] links to an earlier stack entry (zero means none)
+
+    stackLeft       = [None]    # stack with each entry containing five
+    stackRight      = [None]    # .. elements;  note that entry zero is not
+    stackLeftScore  = [None]    # .. used
+    stackRightScore = [None]
+    stackLower      = [None]
+    top   = 0
+    score = 0                   # running sum of the weights seen so far
+
+    for ix in range(len(seq)):
+        ch = seq[ix]
+        if (ch in blockers):
+            # emit intervals
+
+            for sp in range(1,top+1):
+                left  = stackLeft [sp] + 1
+                right = stackRight[sp]
+
+                # trim non-positives from both ends
+
+                while (left < right) and (seq[left]  not in positives): left  += 1
+                while (right > left) and (seq[right] not in positives): right -= 1
+
+                right += 1      # make the interval half-open
+                if (minLength == None) or (right - left >= minLength):
+                    yield (left,right)
+
+            #empty stack
+
+            stackLeft       = [None]
+            stackRight      = [None]
+            stackLeftScore  = [None]
+            stackRightScore = [None]
+            stackLower      = [None]
+            top   = 0
+            score = 0
+            continue
+
+        if   (ch in positives): weight = 1-rate
+        elif (ch in neutrals):  weight = -rate
+        else: raise ValueError
+
+        score += weight
+        #if ("algorithm" in debug):
+        #    print >>sys.stderr, "%3d: %c %5.2f" % (ix, ch, score),
+
+        # a site with a negative weight only lowers the running score;  it
+        # never starts or extends an interval
+
+        if (weight < 0):
+            #if ("algorithm" in debug):
+            #    print >>sys.stderr
+            continue
+
+        if (top > 0) and (stackRight[top] == ix-1):
+            # add this site to the interval on top of the stack
+
+            stackRight     [top] = ix
+            stackRightScore[top] = score
+
+            #if ("algorithm" in debug):
+            #    print >>sys.stderr, \
+            #          " extending [%d] %d-%d %4.1f %4.1f" \
+            #        % (top,
+            #           stackLeft     [top], stackRight     [top],
+            #           stackLeftScore[top], stackRightScore[top]),
+
+        else:
+            # create a one site interval
+
+            top += 1
+            if (top >= len(stackLeft)):
+                stackLeft       += [None]
+                stackRight      += [None]
+                stackLeftScore  += [None]
+                stackRightScore += [None]
+                stackLower      += [None]
+
+            stackLeft      [top] = ix - 1
+            stackLeftScore [top] = score - weight
+            stackRight     [top] = ix
+            stackRightScore[top] = score
+            stackLower     [top] = top - 1
+
+            # follow the links down the stack, skipping entries whose left
+            # score is higher than this interval's left score
+
+            while (stackLower[top] > 0) \
+              and (stackLeftScore[stackLower[top]] > stackLeftScore[top]):
+                stackLower[top] = stackLower[stackLower[top]]
+
+            #if ("algorithm" in debug):
+            #    print >>sys.stderr, \
+            #          " creating  [%d] %d-%d %4.1f %4.1f -> %d" \
+            #        % (top,
+            #           stackLeft     [top], stackRight     [top],
+            #           stackLeftScore[top], stackRightScore[top],
+            #           stackLower    [top]),
+
+        # merge intervals;  if there is a previous interval with a no-higher
+        # left score and no-higher right score, merge this interval (and all
+        # intervening ones) into that one
+
+        while (top > 1) \
+          and (stackLower[top] > 0) \
+          and (stackRightScore[stackLower[top]] <= stackRightScore[top]):
+            stackRight     [stackLower[top]] = stackRight     [top]
+            stackRightScore[stackLower[top]] = stackRightScore[top]
+            top = stackLower[top]
+
+            #if ("algorithm" in debug):
+            #    print >>sys.stderr, \
+            #          "\n%*s merging   [%d] %d-%d %4.1f %4.1f" \
+            #        % (13, "", top,
+            #           stackLeft[top],      stackRight     [top],
+            #           stackLeftScore[top], stackRightScore[top]),
+
+        #if ("algorithm" in debug):
+        #    print >>sys.stderr
+
+    # emit intervals
+    # (these are whatever is still on the stack at the end of seq;  the
+    # trimming and length test are the same as for a blocker)
+
+    for sp in range(1,top+1):
+        left  = stackLeft [sp] + 1
+        right = stackRight[sp]
+
+        while (left < right) and (seq[left]  not in positives): left  += 1
+        while (right > left) and (seq[right] not in positives): right -= 1
+
+        right += 1
+        if (minLength == None) or (right - left >= minLength):
+            yield (left,right)
+            
+            
+###################################################################
+# modified by Chen Sun on 7/11/2014
+#
+###################################################################
+
+# correlation_sequence--
+#    Build the correlation string of a sequence for a given period P.  The
+#    string has P blanks followed by one character for each position from
+#    start+P up to (but not including) end:  "+" if the base there is one of
+#    ACGT and equals the base P positions earlier, "-" otherwise.  Unless P is
+#    1, a position that ends a single-character run of P or more bases is
+#    also marked "-".  start defaults to zero and end defaults to the length
+#    of the sequence.
+
+def correlation_sequence(sequence,period,start=None,end=None):
+    if (start == None): start = 0
+    if (end   == None): end   = len(sequence)
+
+    # prime the single-character run length over the first period, which
+    # has nothing earlier to be compared to
+
+    lastCh = sequence[start]
+    runLen = 1
+    for pos in xrange(start+1,start+period):
+        if (sequence[pos] == lastCh): runLen += 1
+        else:                         runLen =  1
+        lastCh = sequence[pos]
+
+    marks = [" "] * period
+    for pos in xrange(start+period,end):
+        earlierCh = sequence[pos-period]
+        ch        = sequence[pos]
+
+        if (ch == lastCh): runLen += 1
+        else:              runLen =  1
+        lastCh = ch
+
+        isMatch = (ch in "ACGT") and (ch == earlierCh)
+        if (isMatch) and (period != 1):
+            isMatch = (runLen < period)
+
+        if (isMatch): marks.append("+")
+        else:         marks.append("-")
+
+    return "".join(marks)
+
+
+# longest_suitable_run--
+#    Find longest run with a good enough rate (of positives).  The result is
+#    a list holding the single half-open interval (start,end), or an empty
+#    list if there is no run or the best run is shorter than minLength.
+#
+#    We score a "+" as 1-r and anything else as -r.  This is based on the fol-
+#    lowing derivation (p is the number of "+"s, n is the number of non-"+"s):
+#        p/(p+n) >= r
+#        ==> p >= rp + rn
+#        ==> (1-r)p - rn >= 0
+#
+#    We adapt an algorithm from "Programming Pearls", pg. 81 (2000 printing).
+#
+# $$$ we use the denominator as the threshold, but we really should use the
+# $$$ .. numerator, comparing it to minLength*rate
+#
+# $$$ this needs to account for $$$ this situation:
+# $$$   sequence: ACGACGACGACGTTATTATTATTA
+# $$$   matches:     +++++++++---+++++++++
+# $$$ this is currently considered to be one interval (if rate <= 6/7), but it
+# $$$ ought to be two;  we can't just post-process, though, because some other
+# $$$ interval might be longer than the longest half of this;  maybe what we
+# $$$ need to do is consider matches at distances -P and -2P, or if we match
+# $$$ -P but that itself was a mismatch, we should carry the mismatch forward
+
+def longest_suitable_run(seq,minLength,rate):
+    maxEndingHere = 0
+    maxSoFar      = 0
+    start = end   = None
+
+    # block is the last position at which the running score was reset;  it
+    # starts just before the sequence, so that a run beginning at position
+    # zero (no reset has happened yet) gets start zero instead of failing on
+    # an unassigned variable
+
+    block = -1
+
+    for (ix,ch) in enumerate(seq):
+        if (ch == "+"): s = 1-rate
+        else:           s = -rate
+
+        if (maxEndingHere+s < 0):
+            maxEndingHere = 0
+            block         = ix
+        else:
+            maxEndingHere += s
+            # ties go to the later (longer) candidate
+            if (maxEndingHere >= maxSoFar):
+                maxSoFar = maxEndingHere
+                start    = block + 1
+                end      = ix + 1
+
+    if (start == None) or (end - start < minLength):
+        return []
+    else:
+        return [(start,end)]
+
+
+# all_suitable_runs--
+#    Find all non-overlapping runs with a good enough rate (of positives), and
+#    which meet our length threshold.  The runs are returned as a list of
+#    (start,end) pairs.
+#
+#    (modified by Chen Sun on 07/11/2014)  The search depends on the hamming
+#    threshold:  a positive threshold uses dense_intervals, with "+" as the
+#    positive, "-" as the neutral and everything else as a blocker;  a zero
+#    threshold uses non_negative_intervals.  For any other threshold nothing
+#    is computed and None is returned.
+#
+# $$$ this needs to post-process the intervals, splitting them to account for
+# $$$ this situation:
+# $$$   sequence: ACGACGACGACGTTATTATTATTA
+# $$$   matches:     +++++++++---+++++++++
+# $$$ this is currently reported as one interval (if rate <= 6/7), but it
+# $$$ ought to be two
+
+def all_suitable_runs(seq,minCorrLength,rate, hammingThreshold):
+
+    if hammingThreshold > 0:
+        runs = dense_intervals(seq,rate,"+","-",blockers=None,minLength=minCorrLength)
+        return list(runs)
+
+    if hammingThreshold == 0:
+        runs = non_negative_intervals(seq, minLength=minCorrLength)
+        return list(runs)
+
+    return None
+
+
+# find_repeat_element--
+#    Find the most plausible repeat element (motif) for a run, and settle the
+#    ends of the run.  The result is (kmer,distance,start,end).  We will not
+#    report a kmer that represents a shorter repeat;  for example, ACTACT is
+#    not reported as a 6-mer since its period is shorter than 6.  In that case,
+#    and when the run contains no N-free kmer at all, the result is
+#    (None,None,start,end) with the run's ends unchanged.
+#
+#    With a positive hamming threshold the whole job is handed over to
+#    find_hamming_repeat_element.  Otherwise no mismatches are allowed, so
+#    (as modified by Chen Sun, cxs1031@cse.psu.edu, on 10/18/2013) there is no
+#    search over plausible intervals:  the distance is always zero, the start
+#    is the run's start, and the end is either the run's end (partial motifs
+#    allowed) or the start plus as many whole periods as fit into the run.
+
+def find_repeat_element(hammingThreshold, period,seq,start,end,allowPartials=False):
+
+    if hammingThreshold > 0:
+        return find_hamming_repeat_element(period,seq,start,end,allowPartials)
+
+    if ("element" in debug):
+        print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end)
+
+    if ("partial" in debug):
+        print period, seq, start, end, allowPartials
+        print seq[start:end]
+
+    # count the occurrences of each k-mer, remembering where each was first
+    # seen;  note that kmers containing smaller repeats can't be rejected yet,
+    # since for a sequence like ACACACACACAAACACACACACACACACAC we must first
+    # discover ACACAC as the best 6-mer, and THEN reject it;  rejecting ACACAC
+    # while counting would end up reporting something like ACACAA as the motif
+
+    occurrences = {}
+    firstSeen   = {}
+    for pos in xrange(start,end-(period-1)):
+        word = seq[pos:pos+period]
+        if ("N" in word): continue
+        occurrences[word] = occurrences.get(word,0) + 1
+        firstSeen.setdefault(word,pos)
+
+    # the best k-mer is the most frequent one, with ties broken by whichever
+    # one came first
+
+    ranked = sorted([(-occurrences[word],firstSeen[word],word) for word in occurrences])
+    if (not ranked): return (None,None,start,end)
+
+    if ("element" in debug):
+        for (negCount,_,word) in ranked:
+            print >>stderr, "    %s: %d" % (word,-negCount)
+
+    kmer = ranked[0][2]
+    if (contains_repeat(kmer)): return (None,None,start,end)
+
+    # settle the ends of the run
+
+    bestStart = start
+    if (allowPartials):
+        bestEnd = end
+    else:
+        if ("partial" in debug):
+            print "kmer:", kmer
+            pattern = seq[start:start+period]
+            if (pattern != kmer):
+                print "pattern:", pattern
+
+        # step forward one whole period at a time, for as long as it fits
+        bestEnd = start
+        while (bestEnd + period <= end):
+            bestEnd += period
+
+    # the distance is always zero, as we do not allow mutation
+    bestD = 0
+
+    if ("partial" in debug):
+        print bestD, bestStart, bestEnd
+
+    return (kmer,bestD,bestStart,bestEnd)
+
+
+# find_hamming_repeat_element--
+#    Find the most plausible repeat element (motif) for a run when mismatches
+#    are tolerated, and nudge the ends of the run.  The result is
+#    (kmer,distance,start,end), where the ends are those of the plausible
+#    interval with the lowest hamming distance to a simple repeat of the
+#    kmer;  all three are None if no plausible interval has a distance.  If
+#    the run contains no N-free kmer, or if the best kmer represents a shorter
+#    repeat, the result is (None,None,start,end) with the run's ends unchanged.
+
+def find_hamming_repeat_element(period,seq,start,end,allowPartials=False):
+
+    if ("element" in debug):
+        print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end)
+
+    # count the occurrences of each k-mer, remembering where each was first
+    # seen;  note that kmers containing smaller repeats can't be rejected yet,
+    # since for a sequence like ACACACACACAAACACACACACACACACAC we must first
+    # discover ACACAC as the best 6-mer, and THEN reject it;  rejecting ACACAC
+    # while counting would end up reporting something like ACACAA as the motif
+
+    occurrences = {}
+    firstSeen   = {}
+    for pos in xrange(start,end-(period-1)):
+        word = seq[pos:pos+period]
+        if ("N" in word): continue
+        occurrences[word] = occurrences.get(word,0) + 1
+        firstSeen.setdefault(word,pos)
+
+    # the best k-mer is the most frequent one, with ties broken by whichever
+    # one came first
+
+    ranked = sorted([(-occurrences[word],firstSeen[word],word) for word in occurrences])
+    if (not ranked): return (None,None,start,end)
+
+    if ("element" in debug):
+        for (negCount,_,word) in ranked:
+            print >>stderr, "    %s: %d" % (word,-negCount)
+
+    kmer = ranked[0][2]
+    if (contains_repeat(kmer)): return (None,None,start,end)
+
+    # measure the hamming distance between a simple repeat of the kmer and
+    # each plausible interval, keeping the interval with the lowest distance;
+    # among equally good intervals the one generated last is kept
+
+    best = (None,None,None)
+    for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials):
+        d = hamming_distance(seq,s,e,kmer)
+        if (d == None): continue
+        if (best[0] == None) or (d <= best[0]):
+            best = (d,s,e)
+
+    (bestD,bestStart,bestEnd) = best
+    return (kmer,bestD,bestStart,bestEnd)
+
+# plausible_intervals--
+#    Yield all plausible intervals intersecting with a run, as (start,end)
+#    pairs.  Every non-negative start within P bp of the run's start is tried
+#    (P is the period).  If partial motifs are allowed, each start is paired
+#    with every end within P bp of the run's end that is not beyond the
+#    sequence and is more than P bp past the start.  Otherwise each start is
+#    paired with two ends:  the one that trims the interval to a whole number
+#    of periods, and (if it is not beyond the sequence) the one that is one
+#    period longer.
+
+def plausible_intervals(start,end,period,seqLen,allowPartials=False):
+
+    # candidate starts never fall before the beginning of the sequence
+
+    lowestStart = max(0,start-(period-1))
+
+    if (allowPartials):
+        # intervals that may end with a partial copy of the motif
+
+        for candStart in xrange(lowestStart,start+period):
+            for candEnd in xrange(end-(period-1),end+period):
+                if (candStart+period < candEnd <= seqLen):
+                    yield (candStart,candEnd)
+        return
+
+    # intervals that consist of whole copies of the motif only
+
+    for candStart in xrange(lowestStart,start+period):
+        wholeEnd = candStart + ((end-candStart)//period)*period
+        yield (candStart,wholeEnd)
+        if (wholeEnd+period <= seqLen):
+            yield (candStart,wholeEnd+period)
+
+
+# hamming_distance--
+#    Determine the hamming distance between the run seq[start:end] and a
+#    simple repeat of the kmer.  The run is compared to the kmer one period at
+#    a time;  a leftover piece shorter than the kmer is compared to the
+#    beginning of the kmer.  The result is None if the run is shorter than
+#    one period.
+# $$$ improve this by allowing gaps, and stopping when we reach a threshold
+
+kmerToDiffs = {}  # (this is used for memo-ization)
+
+def hamming_distance(seq,start,end,kmer):
+    period = len(kmer)
+    if (end < start + period): return None
+
+    wholeEnd = start + ((end-start)//period)*period
+
+    # per-kmer memo, mapping a piece of sequence to its number of differences
+    memo = kmerToDiffs.setdefault(kmer,{ kmer:0 })
+
+    total = 0
+
+    # whole copies of the motif
+
+    for pos in xrange(start,wholeEnd,period):
+        piece = seq[pos:pos+period]
+        if (piece not in memo):
+            memo[piece] = sum(1 for iy in xrange(period)
+                                if (piece[iy] != kmer[iy]))
+        total += memo[piece]
+
+    # trailing partial copy of the motif, if there is one
+
+    if (end > wholeEnd):
+        piece = seq[wholeEnd:end]
+        if (piece not in memo):
+            memo[piece] = sum(1 for iy in xrange(len(piece))
+                                if (piece[iy] != kmer[iy]))
+        total += memo[piece]
+
+    return total
+
+
+# fasta_sequences--
+#    Read the fasta sequences from a file, yielding a (name,nucleotides) pair
+#    for each.  Note that we convert to upper case, and convert any letter
+#    other than ACGT to N.  The name is whatever sequence_name extracts from
+#    the header line.  Nucleotide lines that appear before the first header
+#    line fail an assertion.
+
+nonDnaMap = maketrans("BDEFHIJKLMOPQRSUVWXYZ","NNNNNNNNNNNNNNNNNNNNN")
+
+def fasta_sequences(f):
+    seqName = None
+    seqNucs = None
+
+    for line in f:
+        line = line.strip()
+        if (line.startswith(">")):
+            # a new header finishes the previous sequence;  it is normalized
+            # in exactly the same way as the last sequence of the file is
+            if (seqName != None):
+                yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap))
+            seqName = sequence_name(line)
+            seqNucs = []
+        elif (seqName == None):
+            assert (False), "first sequence has no header"
+        else:
+            seqNucs += [line]
+
+    if (seqName != None):
+        yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap))
+
+
+# fastq_sequences--
+#    Read the fastq sequences from a file, yielding a (name,nucleotides,
+#    qualities) triple for each four-line record.  Note that we convert the
+#    nucleotides to upper case, and convert any letter other than ACGT to N.
+#    Assertions fail if a record's first line does not start with "@", if its
+#    third line does not start with "+", if its qualities and nucleotides
+#    differ in length, or if the file ends in the middle of a record.
+
+def fastq_sequences(f):
+    lineNum = 0
+    for (lineNum,line) in enumerate(f,1):
+        line  = line.strip()
+        phase = lineNum % 4     # 1=name, 2=nucleotides, 3=separator, 0=quals
+
+        if (phase == 1):
+            assert (line.startswith("@")), \
+                   "bad read name at line %d" % lineNum
+            seqName = line[1:]
+        elif (phase == 2):
+            seqNucs = line
+        elif (phase == 3):
+            assert (line.startswith("+")), \
+                   "can't understand line %d:\n%s" % (lineNum,line)
+        else:
+            quals = line
+            assert (len(quals) == len(seqNucs)), \
+                   "length mismatch read vs. qualities at line %d" % lineNum
+            yield (seqName,seqNucs.upper().translate(nonDnaMap),quals)
+
+    assert (lineNum % 4 == 0), \
+           "incomplete read at end of file"
+
+# sam_sequences--
+#    Read the aligned reads from a SAM file, yielding
+#    (name,nucleotides,refName,pre_s,cigar) for each alignment line.  Header
+#    lines (those starting with "@") and blank lines are skipped.  Note that
+#    we convert the nucleotides to upper case, and convert any letter other
+#    than ACGT to N.  pre_s is the position from the fourth column, less one.
+
+def sam_sequences(f):
+    for line in f:
+        line = line.strip()
+
+        # a blank line (at the end of the file, say) has no columns to parse
+        if (line == "") or (line.startswith("@")):
+            continue
+
+        columns = line.split("\t")
+        seqName = columns[0]
+        refName = columns[2]
+        pre_s   = int(columns[3]) - 1
+        cigar   = columns[5]
+        seqNucs = columns[9]
+
+        yield (seqName,seqNucs.upper().translate(nonDnaMap), refName, pre_s, cigar)
+
+# sequence_name--
+#    Extract the sequence name from a fasta header.  The name is the first
+#    whitespace-delimited word after the header's leading character;  it is
+#    the empty string if there is no such word.
+#    $$$ this may need to be improved $$$
+
+def sequence_name(s):
+    words = s[1:].split()
+    if (words == []): return ""
+    return words[0]
+
+
+# nucleotide_runs--
+#    Yield (start,end) for all runs of valid nucleotides (A, C, G or T) in a
+#    sequence.  The intervals are half-open.
+
+def nucleotide_runs(s):
+    seqLen = len(s)
+    ix = 0
+    while (ix < seqLen):
+        # skip whatever separates one run from the next
+        if (s[ix] not in "ACGT"):
+            ix += 1
+            continue
+
+        # a run begins here;  find the position just past its last nucleotide
+        runStart = ix
+        while (ix < seqLen) and (s[ix] in "ACGT"):
+            ix += 1
+        yield (runStart,ix)
+
+
+# contains_repeat--
+#    Determine whether a short sequence is made up of a repeated element, such
+#    as a 6-mer made of a repeated 2-mer (ACACAC) or 3-mer (ACTACT).  The
+#    repeat must cover the entire sequence, without mismatches, and must
+#    occur at least twice.
+
+def contains_repeat(kmer):
+    kmerLength = len(kmer)
+
+    # try every element length that fits at least twice and divides evenly
+
+    for unitLen in range(1,kmerLength//2+1):
+        if (kmerLength % unitLen != 0): continue
+        copies = kmerLength // unitLen
+        if (kmer == kmer[:unitLen] * copies):
+            return True
+
+    return False
+
+
+# hash108--
+#    Return a 108-bit hash "value" of a string, as text:  the first 27
+#    hexadecimal digits (four bits apiece) of the string's md5 digest.
+
+def hash108(s):
+    digester = md5_new()
+    digester.update(s)
+    hexDigits = digester.hexdigest()
+    return hexDigits[:27]
+
+
+# float_or_fraction--
+#    Convert a string to a number, allowing fractions.  Whatever follows the
+#    first slash, if there is one, is the denominator.
+
+def float_or_fraction(s):
+    (numer,slash,denom) = s.partition("/")
+    value = float(numer)
+    if (slash != ""):
+        value = value/float(denom)
+    return value
+
+
+# int_with_unit--
+#    Parse a string as an integer, allowing unit suffixes.  A trailing K, M
+#    or G multiplies the number by a thousand, a million or a billion.  If
+#    the text in front of the unit is not an integer it is read as a floating
+#    point number, and the scaled value is rounded up.
+
+def int_with_unit(s):
+    unitToMultiplier = { "K":1000, "M":1000 * 1000, "G":1000 * 1000 * 1000 }
+
+    multiplier = unitToMultiplier.get(s[-1:])
+    if (multiplier == None): multiplier = 1
+    else:                    s = s[:-1]
+
+    try:
+        return int(s) * multiplier
+    except ValueError:
+        return int(math.ceil(float(s) * multiplier))
+
+
+if __name__ == "__main__": main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatellite.xml	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,178 @@
+<tool id="microsatellite" name="Microsatellite detection" version="1.0.0">
+	<description>for short read, reference, and mapped data</description>
+	<command interpreter="python2.7"> microsatellite.py
+	"${filePath}"
+	#if $inputFileSource.inputFileType == "fasta"
+		--fasta
+    #elif $inputFileSource.inputFileType == "fastq"
+		--fastq
+    #elif $inputFileSource.inputFileType == "fastq_noquals"
+		--fastq:noquals
+	#elif $inputFileSource.inputFileType == "sam"
+		--sam
+    #end if
+	
+	#if $inputFileSource.inputFileType == "sam"
+		#if $inputFileSource.referenceFileSource.requireReference
+			--r --ref="${inputFileSource.referenceFileSource.referencePath}"
+		#end if
+    #end if
+	
+	--period="${period}"
+	
+	#if $partialmotifs == "true"
+		--partialmotifs
+    #end if
+	
+	--minlength="${minlength}"
+
+
+	--prefix="${prefix}"
+	--suffix="${surfix}"
+	
+	--hamming="${hammingThreshold}"
+	
+	#if $multipleruns
+		--multipleruns
+        #end if
+
+	#if $flankSetting.noflankdisplay
+		--noflankdisplay
+	#else
+		--flankdisplay=${flankSetting.flankdisplay}
+	#end if
+	&gt; $stdout
+	</command>
+	
+  <inputs>
+	<param name="filePath" label="Select input file" type="data"/>
+	<conditional name="inputFileSource">
+		<param name="inputFileType" type="select" label="Select input file type">
+			<option value="fasta">Fasta File</option>
+			<option value="fastq">Fastq File</option>
+			<option value="fastq_noquals">Fastq File without Quality Information</option>
+			<option value="sam">SAM File</option>
+		</param>
+		<when value="sam">
+		    <conditional name="referenceFileSource">
+				<param name="requireReference" label="Do you want to extract correspond microsatellites in reference for comparison?" type="boolean">
+				</param>
+				<when value="true">
+					<param name="referencePath" label="Select reference file" type="data"/>
+				</when>
+			</conditional>
+		</when>
+	</conditional>
+	
+	<param name="period" label="Motif size of microsatellites of interest (e.g. Mononucleotide microsatellite =1) (must be less than 10)" type="integer" size="2" value="1"/>
+  <param name="partialmotifs" label="Consider microsatellites with a partial motif?" type="boolean" checked="True"/>
+	<param name="minlength" label="Minimal length (bp) of microsatellite sequence reported" type="integer" size="2" value="5"/>
+	
+
+	<param name="prefix" label="Do not report candidate repeat intervals that have left flanking region less than (bp):" type="integer" size="4" value="20"/>
+	<param name="surfix" label="Do not report candidate repeat intervals that have right flanking region less than (bp):" type="integer" size="4" value="20"/>
+
+	
+	<param name="hammingThreshold" label="Hamming threshold of microsatellite, If greater than 0,  interrupted microsatellites will also be reported" type="integer" size="2" value="0"/>
+	<param name="multipleruns" label="Consider all candidate intervals in a sequence. If not check, only the longest one will be considered" type="boolean" checked="True"> </param>
+	<conditional name="flankSetting">
+        	<param name="noflankdisplay" label="Show the entire flanking regions" type="boolean" checked="True"/>
+		<when value="false">
+			<param name="flankdisplay" label="Limit length (bp) of flanking regions shown" type="integer" size="4" value="5"/>
+		</when>
+	</conditional>
+    
+  </inputs>
+  <outputs>
+    <data name="stdout" format="tabular"/>
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="filePath" value="C_sample_fastq"/>
+	  <param name="period" value="1"/>
+      <param name="partialmotifs" value="true" />
+	  <param name="minlength" value="3" />
+	  <param name="prefix" value="5"/>
+	  <param name="surfix" value="5"/>
+	  <param name="hammingThreshold"  value="0"/>
+	  <param name="multipleruns" value="true"> </param>
+      <output name="stdout" file="C_sample_snoope"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+We use different algorithms to detect microsatellites depending on the Hamming distance parameter. 
+If the Hamming distance is set to zero, the program will only consider uninterrupted microsatellites. The process works as follows.
+
+1) Scanning reads using sliding windows. For a given repeat period ‘k’ (e.g. k=2 for dinucleotide TRs), we compared consecutive k-mer window size sequences, with a step size of k. If a base at a given position matches one k positions earlier it was marked with a plus, if corresponding sites had different bases it was marked with a minus. The first k position is blank.
+
+2) Since we do not allow mutations in reported TR, consecutive “+” signal sequence means that a k-mer TR is present in this sample. 
+
+3) Report k-mer TRs if the length is larger than a threshold provided by the user.
+
+If the Hamming distance is set to an integer greater than zero, the program will consider both uninterrupted and interrupted microsatellites. The process works as follows:
+
+(1) Identify intervals that are highly correlated with the interval shifted by ‘k’ (the repeat period).  These intervals are called "runs" or "candidates". The allowed level of correlation is 6/7. Depending on whether we want to look for more than one microsat, we either find the longest such run (simple algorithm) or many runs (more complicated algorithm). The following steps are then performed on each run.
+
+(2) Find the most likely repeat motif in the run.  This is done by counting all kmers (of length P) and choosing the most frequent.  If that kmer is itself covered by a sub-repeat we discard this run.  The idea is that we can ignore a 6-mer like ACGACG because we will find it when we are looking for 3-mers.
+
+(3) Once we identify the most likely repeat motif, we then modify the interval, adjusting start and end to find the interval that has the fewest mismatches vs. a sequence of the motif repeated (hamming distance). 
+
+(4) At this point we have a valid microsat interval (in the eyes of the program). It is subjected to some filtering stages (hamming distance or too close to an end), and if it satisfies those conditions, it's reported to the user
+    
+For more options, the script for this program can be downloaded and run with Python independently of Galaxy. The script mode offers additional options, and a help page is built into the script.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+This tool is developed by Chen Sun (cxs1031@cse.psu.edu) and Bob Harris (rsharris@bx.psu.edu)
+ 
+**Input**
+
+- The input files can be fastq, fasta, fastq without quality score, and SAM format.
+
+**Output**
+
+For fastq, the output will contain the following columns:
+
+- Column 1 = length of microsatellites (bp)
+- Column 2 = length of left flanking regions (bp)
+- Column 3 = length of right flanking regions (bp)
+- Column 4 = repeat motif (bp)
+- Column 5 = hamming distance 
+- Column 6 = read name
+- Column 7 = read sequence with soft masking of microsatellites
+- Column 8 = read quality (the same Phred score scale as input)
+
+For fasta, fastq without quality score and sam format, column 8 will be replaced with dot(.).
+
+If users have a mapped file (SAM) and would like to profile microsatellites from pre-mapped data instead of using the flank-based mapping approach, they can select SAM format input and specify that they want the corresponding microsatellites in the reference for comparison. The output will be as follows:
+
+- Column 1 = length of microsatellites (bp)
+- Column 2 = length of left flanking regions (bp)
+- Column 3 = length of right flanking regions (bp)
+- Column 4 = repeat motif (bp)
+- Column 5 = hamming distance 
+- Column 6 = read name
+- Column 7 = read sequence with soft masking of microsatellites
+- Column 8 = read quality (the same Phred score scale as input)
+- Column 9 = read name (The same as column 6)
+- Column 10 = chromosome 
+- Column 11 = left flanking region start
+- Column 12 = left flanking region stop
+- Column 13 = microsatellite start as infer from pair-end
+- Column 14 = microsatellite stop as infer from pair-end
+- Column 15 = right flanking region start
+- Column 16 = right flanking region stop
+- Column 17 = microsatellite length in reference
+- Column 18 = microsatellite sequence in reference
+
+</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatpurity.py	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,24 @@
+import sys
+# Keep only the reads whose microsatellite is pure (uninterrupted); reads with
+# an impure microsatellite are removed.
+# Each line is checked independently of the others.
+#
+# Usage: microsatpurity.py <input> <period> <column>
+#   input  = tab-delimited text file
+#   period = motif size (bp)
+#   column = 1-based number of the column holding the microsatellite sequence;
+#            0 becomes index -1 below, which selects the last column
+# Lines that pass are printed to stdout.
+
+
+fd=open(sys.argv[1])
+lines=fd.xreadlines()
+##motifIx=int(sys.argv[2])
+period=int(sys.argv[2])
+tr_ref_seqIx=int(sys.argv[3])-1
+##output=(sys.argv[4])
+##fout=open(output,'w')
+for line in lines:
+    temp=line.strip().split('\t')
+    # drop empty fields; column numbers therefore count non-empty fields only
+    temp=filter(None,temp)
+    #motif=temp[motifIx]
+    tr_ref_seq=temp[tr_ref_seqIx]
+    ##period=len(motif)
+    # rebuild what a pure repeat of the same length would look like: the first
+    # "period" bases repeated a whole number of times, plus a partial motif for
+    # any remainder
+    cand_motif=tr_ref_seq[:period]
+    len_microsat=len(tr_ref_seq)
+    expand_microsat_cand=cand_motif*(len_microsat/period) + cand_motif[:(len_microsat%period)]
+    # the sequence is pure only if it matches that reconstruction exactly
+    if tr_ref_seq == expand_microsat_cand:
+    	print line.strip()
+        ##print line.strip() >> fout
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatpurity.xml	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,79 @@
+<tool id="microsatpurity" name="Select uninterrupted microsatellites" version="1.0.0">
+  <description> of a specific column</description>
+  <command interpreter="python">microsatpurity.py $input $period $column_n > $output </command>
+
+  <inputs>
+    <param name="input" type="data" label="Select input" />
+    <param name="period" type="integer" label="motif size" value="1"/>
+    <param name="column_n" type="integer" value="0" label="Select column that contains microsatellites of interest (0 = last column)" />
+  </inputs>
+  <outputs>
+    <data format="tabular" name="output" />
+    
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="input" value="microsatpurity_in.txt"/>
+      <param name="period" value="2"/>      
+      <param name="column_n" value="0"/>
+      <output name="output" file="microsatpurity_out.txt"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool is used to select only the uninterrupted microsatellites. Interrupted microsatellites (e.g. ATATATATAATATAT) or sequences of microsatellites with non-microsatellite parts (e.g. ATATATATATG) will be removed.
+
+For the TRFM pipeline (profiling microsatellites in short read data), this tool can be used to exclude cases in which flanking bases were misread as part of the microsatellite. Thus, the read profile will only reflect the variation of TR length from expansion/contraction.
+For example, suppose that the sequence around a microsatellite is AGCGACGaaaaaaGCGATCA. If we observe a read with sequence AGCGACGaaaaaaaaaaGCGATCA, we can infer that this is a microsatellite expansion. However, if we observe AGCGACGaaaaaaaCGATCA, this is more likely a substitution of G to A. These incidents can be removed with this tool.
+You can use the tool **combine mapped flaked bases** to get the microsatellites in the reference that correspond to the sequence between mapped reads. If the user maps these reads around the uninterrupted microsatellites in the reference, the corresponding sequences between these pairs should be uninterrupted microsatellites regardless of expansion/contraction of microsatellites in the short read data. However, if a flanking base is substituted, or if the fluorescent signal from the previous run makes it look like a substitution, the corresponding sequences in the reference between the pairs will not be uninterrupted microsatellites. Thus, this tool can remove those cases and keep only microsatellite expansion/contraction.
+
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+ 
+**Input**
+
+The input files can be any tab delimited file. 
+
+If this tool is used in TRFM microsatellite profiling, it should contain:
+
+- Column 1 = microsatellite location in reference chromosome
+- Column 2 = microsatellite location in reference start
+- Column 3 = microsatellite location in reference stop
+- Column 4 = microsatellite location in reference motif
+- Column 5 = microsatellite location in reference length
+- Column 6 = microsatellite location in reference motif size
+- Column 7 = length of microsatellites (bp)
+- Column 8 = length of left flanking regions (bp)
+- Column 9 = length of right flanking regions (bp)
+- Column 10 = repeat motif (bp)
+- Column 11 = hamming distance 
+- Column 12 = read name
+- Column 13 = read sequence with soft masking of microsatellites
+- Column 14 = read quality (the same Phred score scale as input)
+- Column 15 = read name (The same as column 12)
+- Column 16 = chromosome 
+- Column 17 = left flanking region start
+- Column 18 = left flanking region stop
+- Column 19 = microsatellite start as infer from pair-end
+- Column 20 = microsatellite stop as infer from pair-end
+- Column 21 = right flanking region start
+- Column 22 = right flanking region stop
+- Column 23 = microsatellite length in reference
+- Column 24 = microsatellite sequence in reference
+
+**Output**
+
+The same as input format.
+
+
+</help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pair_fetch_DNA_ff.py	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+# pair_fetch_DNA_ff.py
+# Function: filter microsatellites and flanking regions by quality score;
+# remove any read with a base whose quality score is lower than "quality_require" within "flanking_base" of the microsatellite, and convert from snoope format to fastq
+# Note that the required flanking length needs to be screened by Bob's snoope script first
+
+# Author: Arkarachai Fungtammasan
+# Version 1.0.0 (15 July 2012)
+# Input format: length_of_repeat[0] 	 left_flank_length[1]	right_flank_length[2]	repeat_motif[3]	hamming_distance[4]	read_name[5]	read_sequence[6]	read_quality[7]
+# Output format: two fastq file. First file contain left flank. Second file contain right flank.
+# Command: python pair_fetch_DNA_ff.py input.txt left_flank.fastq right_flank.fastq quality_require flanking_base
+
+import sys
+from galaxy import eggs
+
+def stop_err(msg):
+    # Write msg to stderr and terminate the script.
+    # NOTE(review): sys.exit() is called without an argument, so the exit
+    # status is 0 even though this is the error path -- confirm this is intended.
+    sys.stderr.write(msg)
+    sys.exit()
+    
+# read file name
+
+
+	
+# command-line arguments: input table, output fastq for left flanks, output
+# fastq for right flanks, quality score cutoff, and the number of bases on each
+# side of the microsatellite that must pass the quality cutoff
+filename=sys.argv[1]
+L_filename=sys.argv[2]
+R_filename=sys.argv[3]
+quality_require=sys.argv[4]
+flanking_base=sys.argv[5]
+# both numeric arguments must parse as integers; otherwise print the exception
+# and stop
+try:
+	quality_require=int(quality_require)
+	flanking_base=int(flanking_base)
+except Exception, eee:
+	print eee
+	stop_err("Quality score cutoff and Length of flanking regions that require quality screening must be integer")
+	
+fd=open(filename)
+fdd1=open(L_filename,'w')
+fdd2=open(R_filename,'w')
+lines=fd.xreadlines()
+for line in lines:
+    temp=line.strip().split('\t')
+    # drop empty fields before indexing columns
+    temp=filter(None,temp)
+    #get index
+    # (start,end) offsets into the read: temp[1] = left flank length,
+    # temp[0] = repeat length, temp[2] = right flank length
+    left_flank=(0,int(temp[1]))
+    microsat=(int(temp[1]),int(temp[1])+int(temp[0]))
+    right_flank=(int(temp[1])+int(temp[0]),int(temp[1])+int(temp[0])+int(temp[2]))
+    flag=0
+    #filter length of left and right flank
+    # skip reads whose flank on either side is shorter than flanking_base
+    if (right_flank[1]-right_flank[0])<flanking_base:
+    	continue
+    if (left_flank[1]-left_flank[0])<flanking_base:
+    	continue
+    #filter quality score
+    # scan the microsatellite plus flanking_base bases on each side; the "+33"
+    # assumes Phred+33 encoded quality characters -- TODO confirm for the input
+    for i in temp[7][microsat[0]-flanking_base:microsat[1]+flanking_base]:
+        if ord(i)<(quality_require+33):
+            flag=1
+        else:
+            flag=flag
+    #print out to separate files
+    # both files get a four-line fastq record under the same read name
+    # (temp[5]): sequence from temp[6], quality from temp[7]
+    if flag ==0:
+        newname= temp[5]##+'_'+temp[3]+'_'+temp[0]
+        fdd1.writelines('@'+newname+'\n')
+        fdd2.writelines('@'+newname+'\n')
+        fdd1.writelines(temp[6][left_flank[0]:left_flank[1]]+'\n')
+        fdd2.writelines(temp[6][right_flank[0]:right_flank[1]]+'\n')
+        fdd1.writelines('+'+newname+'\n')
+        fdd2.writelines('+'+newname+'\n')
+        fdd1.writelines(temp[7][left_flank[0]:left_flank[1]]+'\n')
+        fdd2.writelines(temp[7][right_flank[0]:right_flank[1]]+'\n')
+
+fd.close()
+fdd1.close()
+fdd2.close()
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/probvalueforhetero.xml	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,66 @@
+<tool id="heteroprob" name="Evaluate the probability of the allele combination to generate read profile" version="2.0.0">
+  <description></description>
+  <command interpreter="python2.7">heteroprob.py  $microsat_raw $microsat_error_profile  $expectedminorallele > $microsat_corrected </command>
+
+  <inputs>
+    <param name="microsat_raw" type="data" label="Select microsatellite length profile and allele combination file" />
+    <param name="microsat_error_profile" type="data" label="Select microsatellite error profile that correspond to this dataset" />
+	<param name="expectedminorallele" type="float" value="0.5" label="Expected contribution of minor allele when present (0.5 for genotyping)" />
+
+  </inputs>
+  <outputs>
+    <data name="microsat_corrected" format="tabular" />
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="microsat_raw" value="probvalueforhetero_in.txt"/>
+      <param name="microsat_error_profile" value="PCRinclude.allrate.bymajorallele"/>
+      <param name="expectedminorallele" value="0.5"/>
+      <output name="microsat_corrected" file="probvalueforhetero_out.txt"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+- This tool will calculate the probability that the allele combination can generate the given read profile. This tool is part of the pipeline to estimate minimum read depth.
+- The calculation of probability is very similar to the tool **Correct genotype for microsatellite errors**. However, this tool restricts the calculation to only the allele combination indicated in the input. Also, when it encounters an allele combination that cannot be generated from the error profile, the total probability will be zero instead of using the base substitution rate. 
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+ 
+**Input**
+
+The input format is the same as output from **Correct genotype for microsatellite errors** tool.
+
+- Column 1 = location of microsatellite locus. 
+- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format). 
+- Column 3 = motif of microsatellite in this locus. The input file can contain more than three column. 
+- Column 4 = homozygous/heterozygous label.
+- Column 5 = log based 10 of (the probability of homozygous/the probability of heterozygous)
+- Column 6 = Allele for most probable homozygous form.
+- Column 7 = Allele 1 for most probable heterozygous form.
+- Column 8 = Allele 2 for most probable heterozygous form.
+
+Only column 2,3,7,8 were used in calculation. 
+
+**Output**
+
+
+The output will contain the original eight columns from the input. It will also add the following columns.
+
+- Column 9 = Probability of the allele combination to generate given read profile.
+- Column 10 = Number of possible rearrangement of given read profile.
+- Column 11 = Probability of the allele combination to generate read profile with any rearrangement (Product of column 9 and column 10)
+- Column 12 = Read depth
+
+
+
+
+</help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/profilegenerator.py	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,66 @@
+import collections
+import itertools
+import sys
+
+# command-line arguments: error profile file, motif, maximum read depth, and
+# minimum relative frequency for an observed length to be kept
+filename=sys.argv[1]
+MOTIF=sys.argv[2]
+MOTIFSIZE=len(MOTIF)
+MaxDEPTH=int(sys.argv[3])
+MINIMUMPROB=float(sys.argv[4])##1.0/(10**4)
+MININUMCOUNT=1
+# first pass: total number of observations for each correct (major) allele
+# length; column 1 = correct length, column 3 = number of observations
+fd=open(filename)
+lines=fd.readlines()
+countbymajorallele=collections.defaultdict(list)
+for line in lines:
+    temp=line.strip().split('\t')
+    t_major=int(temp[0])
+    t_count=int(temp[2])
+    countbymajorallele[t_major].append(t_count)
+fd.close()
+sumbymajorallele=collections.defaultdict(int)
+for t_majorallele in countbymajorallele.keys():
+    sumbymajorallele[t_majorallele]=sum(countbymajorallele[t_majorallele])
+
+# second pass: for each correct length, collect the observed lengths (column 2)
+# that pass the filters below
+# NOTE(review): this second file handle is never closed, and this pass splits
+# on any whitespace whereas the first pass splits on tabs only
+fd=open(filename)
+##fd=open('PCRinclude.mono.A.bymajorallele')
+lines=fd.readlines()
+allmajor=collections.defaultdict(list)
+for line in lines:
+    temp=line.strip().split()
+    # keep only correct lengths that are a whole number of motifs
+    if int(temp[0])%MOTIFSIZE==0:
+        # observations / total for this correct length must reach MINIMUMPROB
+        if (int(temp[2])/(sumbymajorallele[int(temp[0])]*1.0))>=MINIMUMPROB:
+            if int(temp[2])>=MININUMCOUNT:
+                allmajor[int(temp[0])].append(int(temp[1]))
+##print allmajor
+allkey=allmajor.keys()
+allkey.sort()
+#print allkey
+# for each pair of neighbouring correct lengths (in sorted order), build the
+# sorted, de-duplicated union of their observed lengths
+keycount=0
+combinelist_collection=[]
+for dummycount in range(len(allkey)-1):
+    pair1,pair2=allkey[keycount],allkey[keycount+1]
+    pair1list=allmajor[pair1]
+    pair2list=allmajor[pair2]
+    #print pair1list,pair2list
+    # extend() modifies the list stored in allmajor[pair1] in place
+    pair1list.extend(pair2list)
+    combinelist=list(set(pair1list))
+    combinelist.sort()
+    ##print combinelist
+    combinelist_collection.append(tuple(combinelist))
+    keycount+=1
+# drop duplicate lists, then drop every list that is a proper subset of another;
+# removal is done on a copy so the lists being iterated are not modified
+combinelist_collection=list(set(combinelist_collection))
+newcombinelist_collection=combinelist_collection[:]
+#combinelist_collection=set(combinelist_collection)
+for smallset1 in combinelist_collection:
+    for smallset2 in combinelist_collection:
+        if set(smallset1).issubset(set(smallset2)) and smallset1 != smallset2:
+            newcombinelist_collection.remove(smallset1)
+            break
+##print combinelist_collection
+    
+# print every combination with replacement of observed lengths, of size 2 up to
+# MaxDEPTH, drawn from each remaining list; column 1 is the fixed placeholder
+# 'chr' and column 3 is the motif
+for depth in range(2,MaxDEPTH+1):
+    for member_list in newcombinelist_collection:
+        for member in itertools.combinations_with_replacement(member_list,depth):
+            print 'chr'+'\t'+','.join(map(str,member))+'\t'+MOTIF
+                
+    
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/profilegenerator.xml	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,110 @@
+<tool id="Profilegenerator" name="Generate all possible combination of read profile" version="2.0.0">
+  <description> of the consecutive allele from given error profile </description>
+  <command interpreter="python2.7">profilegenerator.py  $error_profile $MOTIF $Maxdepth $minprob > $output </command>
+
+  <inputs>
+    <param name="error_profile" type="data" label="Select error profile" />
+    <param name="MOTIF" type="text" value="A" label="Type in a motif of interest (e.g. AGC)" />
+	<param name="Maxdepth" type="integer" value="30" label="Maximum read depth of interest" />
+	<param name="minprob" type="float" value="0.00000001" label="Minimum error rate to be considered" />
+
+  </inputs>
+  <outputs>
+    <data name="output" format="tabular" />
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="error_profile" value="sampleprofilegenerator_in"/>
+      <param name="MOTIF" value="A"/>
+      <param name="Maxdepth" value="3"/>
+      <param name="minprob" value="0.00000001"/>
+      <output name="output" file="sampleprofilegenerator_out"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool will generate all possible combinations of observed read profiles of the consecutive alleles from a given error profile. The range of observed read lengths can be filtered to contain only those that occur frequently, using the "Minimum error rate to be considered" parameter.
+
+This program will collect the lists of valid (passing the "Minimum error rate to be considered" threshold) observed lengths from each pair of consecutive allele lengths. Lists that are equivalent to, or a subset of, other lists will be removed. For each depth and each list, length profiles are generated using combinations with replacement, which is available in Python 2.7. Redundant length profiles may be generated from different lists when the ranges of observed microsatellite lengths overlap. The user needs to remove them, which can be done easily using the **sort | uniq** command in Unix.
+
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+ 
+**Input**
+
+- The error profile needs to contain these three columns. 
+- Column 1 = Correct microsatellite length 
+- Column 2 = Observed microsatellite length 
+- Column 3 = Number of observation
+
+**Output**
+
+- Column 1 = Place holder for location of microsatellite locus. (just "chr")
+- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format). 
+- Column 3 = motif of microsatellite in this locus. 
+ 
+**Example**
+
+- Suppose that we provide the following error profile ::
+
+	9	9	100000
+	10	10	91456
+	10	9	1259	
+	11	11	39657
+	11	10	1211
+	11	12	514
+	
+
+- Using the default minimum probability to be considered and motif = A, all observed read lengths are valid. The program will generate lists of observed lengths from consecutive allele lengths. ::
+
+	9:10 = [9,10]
+	10:11 = [9,10,11,12]
+	
+- Lists that are subsets of other lists will be removed. Thus, [9,10] will not be considered. 
+
+- Then the program will generate all combinations with replacement for each depth from each list. Using **maximum read depth = 3**, we will get the following output. ::
+
+	
+	chr	9,9	A
+	chr	9,10	A
+	chr	9,11	A
+	chr	9,12	A
+	chr	10,10	A
+	chr	10,11	A
+	chr	10,12	A
+	chr	11,11	A
+	chr	11,12	A
+	chr	12,12	A
+	chr	9,9,9	A
+	chr	9,9,10	A
+	chr	9,9,11	A
+	chr	9,9,12	A
+	chr	9,10,10	A
+	chr	9,10,11	A
+	chr	9,10,12	A
+	chr	9,11,11	A
+	chr	9,11,12	A
+	chr	9,12,12	A
+	chr	10,10,10	A
+	chr	10,10,11	A
+	chr	10,10,12	A
+	chr	10,11,11	A
+	chr	10,11,12	A
+	chr	10,12,12	A
+	chr	11,11,11	A
+	chr	11,11,12	A
+	chr	11,12,12	A
+	chr	12,12,12	A
+
+
+</help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/readdepth2sequencingdepth.xml	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,57 @@
+<tool id="readdepth2seqdepth" name="Convert informative read depth to sequencing depth" version="1.0.0">
+  <description>for flank-based mapping of microsatellites</description>
+  <command interpreter="python2.7">sequencingdepthconversion_G.py $repeatlength $flanksize $readlength $infodepth $probprediction > $output </command>
+
+  <inputs>
+    <param name="repeatlength" type="integer" value="10" label="Repeat length (bp)" />
+    <param name="flanksize" type="integer" value="20" label="Required flank bases on each side in mapping" />
+    <param name="readlength" type="integer" value="100" label="Read length (treat all read as single end read)" />
+    <param name="infodepth" type="integer" value="5" label="Required read depth" />
+    <param name="probprediction" type="float" value="0.9" label="Proportion of genome that need certain level of read depth" />
+  </inputs>
+  <outputs>
+    <data format="input" name="output" />
+    
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+		<param name="repeatlength" value="10"/>
+    	<param name="flanksize" value="20" />
+    	<param name="readlength" value="100" />
+    	<param name="infodepth" value="5" />
+		<param name="probprediction"  value="0.9" />
+		<output name="output" file="readdepth2seqdepth.out"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool is used to convert informative read depth (specified by the user) to sequencing depth when microsatellites are mapped using the TRFM pipeline.
+The locus-specific sequencing depth is the sequencing depth that will give a certain locus a certain read depth, assuming reads are mapped uniformly. It is calculated as: ::
+
+	yrequired = ( X * L ) / (L - (2F+r-1))
+	
+Where X = read depth, L = read length, F = the number of flanked bases required on each flanking regions, r = the expected repeat length of microsatellite of interest.
+
+The genome-wide sequencing depth is the sequencing depth at which a certain percentage of the genome (e.g. 90 percent or 95 percent) reaches the locus-specific sequencing depth. It is calculated by trying successive integer values of lambda, starting from yrequired, to find the smallest one such that: ::
+	 
+	 1 - 0.90 (or other proportion specified by user) &gt;= P(Y=0) + P(Y=1) + … + P(Y=yrequired-1)
+	 
+	 P(Y=y) = (lambda^(y) * e ^(-lambda)) /y!
+
+ y = specific level of sequencing depth. Lambda = genome wide sequencing depth
+
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+ 
+
+</help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sequencingdepthconversion_G.py	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,54 @@
+def stop_err(msg):
+    # Write msg to stderr and terminate the script.
+    # NOTE(review): sys.exit() is called without an argument, so the exit
+    # status is 0 even though this is the error path -- confirm this is intended.
+    sys.stderr.write(msg)
+    sys.exit()
+        
+def info2require(X,L,F,r):
+    '''Convert informative read depth to locus specific sequencing depth.
+
+    Arguments are infodepth,readlength,flanksize,repeatlength.
+    Returns ceil((X*L) / (L - (2*F + r - 1))) as an int.
+    Raises ZeroDivisionError when L equals 2*F + r - 1.
+    '''
+    return int(math.ceil((X*L*1.0)/(L-(1*((2*F)+r-1)))))
+    
+def poissondef(meancov,specificcov):
+    # Poisson probability of observing exactly specificcov when the mean is
+    # meancov: (meancov ** specificcov) * e ** (-meancov) / specificcov!
+    # specificcov must be a non-negative integer for math.factorial.
+    nominator=1.0*(meancov**specificcov)*(math.e**(-1*meancov))
+    denominator=math.factorial(specificcov)
+    return nominator/denominator
+
+def require2recommend(needprob,mindepth):
+    # Find the smallest integer mean coverage, searching upward from mindepth,
+    # for which the Poisson probability of a coverage below mindepth
+    # (sum over 0..mindepth-1) is at most 1-needprob, and return that mean.
+    i=mindepth
+    reverseneedprob=1-needprob
+    # start above any reachable threshold so the loop body runs at least once
+    # whenever reverseneedprob is below 1
+    sumprob=1
+    while sumprob>reverseneedprob: #mean cov
+        sumprob=0
+        for j in range(0,mindepth): #specific cov
+            sumprob+=poissondef(i,j)
+        i+=1
+        
+    # i was incremented once after the last evaluation, so step back by one
+    return i-1
+
+import sys,math
+
+# command-line arguments: repeat length, flank size, read length, informative
+# read depth, and the proportion of the genome that must reach that depth
+repeatlength=int(sys.argv[1])
+flanksize=int(sys.argv[2])#20
+readlength=int(sys.argv[3])#100
+infodepth=int(sys.argv[4])#5
+probdetection=float(sys.argv[5])#0.90
+
+# reject proportions above 1
+# NOTE(review): int('probvalue') always raises ValueError; it appears to be
+# there only so that the except branch runs, prints the exception, and stops.
+# Values below 0 are not rejected, although the message says "between 0 and 1".
+if probdetection >1:
+    try:
+        probvalue=int('probvalue')
+    except Exception, eee:
+        print eee
+        stop_err("Proportion of genome to have certain locus specific must be between 0 and 1")
+
+# header row; the adjacent literals 'informative_read_depth''\t' are joined by
+# implicit string concatenation, so the header is still tab separated
+print 'repeat_length'+'\t'+'read_length'+'\t'+'informative_read_depth''\t'+'=locus_specific_sequencing_depth'+'\t'+'=genome_wide_sequencing_depth'
+# locus specific depth first, then the genome wide depth derived from it
+t_requiredepth=info2require(infodepth,readlength,flanksize,repeatlength)
+t_recomendseq=require2recommend(probdetection,t_requiredepth)
+preplotlist=[repeatlength,readlength,infodepth,t_requiredepth,t_recomendseq]
+plotlist=map(str,preplotlist)
+print '\t'.join(plotlist)
+
+#print info2require(infodepth,readlength,flanksize,repeatlength)
+#print poissondef(10,3)
+#print require2recommend(0.90,80)
+#informative_read_depth
+#required_seq_depth
+#recommend_seq_depth
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/space2underscore_readname.xml	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,47 @@
+<tool id="space2underscore_readname" name="Read name modifier" version="1.0.0">
+  <description>--change space to underscore of a specific column</description>
+  <command interpreter="python">changespacetounderscore_readname.py  $input $output $column_n </command>
+
+  <inputs>
+    <param name="input" type="data" label="Select input" />
+    <param name="column_n" type="integer" value="6" label="Select column to modify" />
+  </inputs>
+  <outputs>
+    <data format="tabular" name="output" />
+    
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="input" value="samplefq.snoope"/>
+      <param name="column_n" value="6"/>
+      <output name="output" file="samplefq.snoope.new"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool is used to change spaces to underscores. For the TRFM pipeline (profiling microsatellites in short read data), this tool is used to change spaces in read names to underscores, which prevents downstream tools from identifying the wrong column because of spaces in read names. If the input does not have spaces in read names, this step can be skipped.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+ 
+**Input**
+
+The input files can be any tab delimited file. 
+
+If this tool is used in TRFM microsatellite profiling, it should be in the same format as output from **microsatellite detection program**. This format contains **length of repeat**, **length of left flanking region**, **length of right flanking region**, **repeat motif**, **hamming (editing) distance**, **read name**, **read sequence**, **read quality score**
+
+**Output**
+
+The same as input format.
+
+
+</help>
+</tool>
\ No newline at end of file
Binary file test-data/.DS_Store has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/C_sample_fastq	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,8 @@
+@IL2_40_2_1_735_755
+ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTAAAGTGCTGAAATAACAT
++
+IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5*
+@IL2_40_2_1_919_700
+ATAAGGAAAAAAAAAAAAAAAACCAGGTCTTTTTTTTTTTTTTTTTGTTAT
++
+IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$&
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/C_sample_snoope	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,4 @@
+3	33	15	A	0	IL2_40_2_1_735_755_1_per1_2	ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTaaaGTGCTGAAATAACAT	IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5*
+3	42	6	A	0	IL2_40_2_1_735_755_1_per1_3	ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTAAAGTGCTGaaaTAACAT		IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5*
+16	6	29	A	0	IL2_40_2_1_919_700_1_per1_1	ATAAGGaaaaaaaaaaaaaaaaCCAGGTCTTTTTTTTTTTTTTTTTGTTAT	IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$&
+17	29	5	T	0	IL2_40_2_1_919_700_1_per1_2	ATAAGGAAAAAAAAAAAAAAAACCAGGTCtttttttttttttttttGTTAT		IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$&
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/PCRinclude.allrate.bymajorallele	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,997 @@
+10	10	91456	A
+10	9	1259	A
+10	11	605	A
+10	8	16	A
+10	12	8	A
+10	7	2	A
+11	11	39657	A
+11	10	1211	A
+11	12	514	A
+11	9	54	A
+11	13	9	A
+11	8	3	A
+11	14	1	A
+12	12	18850	A
+12	11	986	A
+12	13	417	A
+12	10	73	A
+12	14	8	A
+12	9	1	A
+12	8	1	A
+13	13	10201	A
+13	12	885	A
+13	14	320	A
+13	11	83	A
+13	15	12	A
+13	10	8	A
+14	14	3649	A
+14	13	409	A
+14	15	151	A
+14	12	62	A
+14	11	6	A
+14	16	5	A
+14	10	1	A
+15	15	847	A
+15	14	140	A
+15	16	60	A
+15	13	20	A
+15	17	4	A
+15	12	3	A
+16	16	182	A
+16	15	60	A
+16	17	14	A
+16	14	12	A
+16	13	1	A
+16	12	1	A
+16	18	1	A
+17	17	11	A
+17	16	5	A
+17	15	2	A
+17	18	1	A
+18	18	4	A
+18	17	2	A
+5	5	10047169	A
+5	6	44	A
+6	6	2808071	A
+6	5	195	A
+6	7	69	A
+7	7	1097174	A
+7	6	313	A
+7	8	83	A
+7	5	6	A
+8	8	369496	A
+8	7	387	A
+8	9	248	A
+8	6	3	A
+8	10	2	A
+9	9	184958	A
+9	8	707	A
+9	10	486	A
+9	7	5	A
+9	11	4	A
+10	10	46	C
+10	9	3	C
+5	5	1354993	C
+5	6	7	C
+6	6	193431	C
+6	5	14	C
+6	7	2	C
+7	7	22171	C
+7	6	4	C
+8	8	2966	C
+8	9	3	C
+8	7	3	C
+9	9	638	C
+9	8	8	C
+9	7	1	C
+10	10	21211	AC
+10	8	3	AC
+10	12	1	AC
+11	11	15048	AC
+11	9	10	AC
+12	12	6043	AC
+12	10	15	AC
+12	14	1	AC
+13	13	5070	AC
+13	11	40	AC
+13	15	1	AC
+14	14	3093	AC
+14	12	44	AC
+14	10	1	AC
+15	15	2848	AC
+15	13	31	AC
+15	17	1	AC
+16	16	1273	AC
+16	14	30	AC
+16	12	2	AC
+17	17	1297	AC
+17	15	27	AC
+18	18	1269	AC
+18	16	43	AC
+18	20	2	AC
+18	14	1	AC
+19	19	679	AC
+19	17	17	AC
+19	21	1	AC
+20	20	645	AC
+20	18	34	AC
+20	22	2	AC
+20	16	1	AC
+21	21	723	AC
+21	19	28	AC
+21	17	1	AC
+21	23	1	AC
+22	22	499	AC
+22	20	29	AC
+22	18	3	AC
+23	23	540	AC
+23	21	30	AC
+23	19	2	AC
+23	25	1	AC
+24	24	385	AC
+24	22	38	AC
+24	26	2	AC
+24	20	1	AC
+25	25	407	AC
+25	23	22	AC
+25	27	2	AC
+25	21	1	AC
+26	26	257	AC
+26	24	30	AC
+26	22	3	AC
+26	28	1	AC
+26	20	1	AC
+27	27	339	AC
+27	25	28	AC
+27	23	3	AC
+27	29	2	AC
+28	28	202	AC
+28	26	17	AC
+28	30	6	AC
+29	29	277	AC
+29	27	29	AC
+29	31	6	AC
+29	25	3	AC
+30	30	117	AC
+30	28	12	AC
+30	32	3	AC
+30	18	1	AC
+31	31	144	AC
+31	29	18	AC
+31	27	4	AC
+31	33	2	AC
+32	32	101	AC
+32	30	23	AC
+32	28	2	AC
+32	34	2	AC
+32	26	1	AC
+33	33	106	AC
+33	31	15	AC
+33	35	3	AC
+33	29	1	AC
+34	34	33	AC
+34	32	7	AC
+35	35	21	AC
+35	33	4	AC
+35	31	1	AC
+36	36	12	AC
+36	34	1	AC
+37	37	10	AC
+37	35	3	AC
+37	31	1	AC
+37	39	1	AC
+38	38	4	AC
+38	36	1	AC
+6	6	1521439	AC
+7	7	513952	AC
+8	8	134603	AC
+8	6	2	AC
+9	9	60741	AC
+9	7	3	AC
+9	11	1	AC
+10	10	21772	AG
+10	8	3	AG
+10	12	1	AG
+11	11	13880	AG
+11	9	10	AG
+11	13	1	AG
+12	12	5628	AG
+12	10	13	AG
+12	14	4	AG
+13	13	4494	AG
+13	11	17	AG
+14	14	1898	AG
+14	12	15	AG
+15	15	2427	AG
+15	13	18	AG
+16	16	1076	AG
+16	14	24	AG
+16	12	1	AG
+17	17	874	AG
+17	15	12	AG
+17	19	1	AG
+17	13	1	AG
+18	18	536	AG
+18	16	20	AG
+18	14	1	AG
+19	19	563	AG
+19	17	25	AG
+20	20	201	AG
+20	18	14	AG
+21	21	260	AG
+21	19	10	AG
+22	22	83	AG
+22	20	5	AG
+23	23	147	AG
+23	21	5	AG
+23	25	1	AG
+24	24	99	AG
+24	22	4	AG
+24	18	1	AG
+25	25	62	AG
+25	23	3	AG
+25	27	1	AG
+26	26	38	AG
+26	24	8	AG
+27	27	24	AG
+27	25	3	AG
+27	23	1	AG
+28	28	14	AG
+28	26	2	AG
+29	29	12	AG
+29	27	5	AG
+29	31	1	AG
+30	30	7	AG
+30	28	2	AG
+31	31	7	AG
+31	27	3	AG
+31	23	1	AG
+32	32	4	AG
+32	28	1	AG
+6	6	1880822	AG
+7	7	684837	AG
+7	9	1	AG
+8	8	183381	AG
+9	9	75547	AG
+9	7	6	AG
+9	11	1	AG
+10	10	18179	AT
+10	8	7	AT
+10	12	4	AT
+11	11	8969	AT
+11	9	5	AT
+11	13	2	AT
+12	12	4888	AT
+12	10	8	AT
+12	14	2	AT
+13	13	2785	AT
+13	11	17	AT
+13	15	1	AT
+14	14	2310	AT
+14	12	40	AT
+14	16	4	AT
+14	10	2	AT
+15	15	1461	AT
+15	13	33	AT
+15	11	1	AT
+15	17	1	AT
+16	16	879	AT
+16	14	42	AT
+16	18	2	AT
+16	12	1	AT
+17	17	599	AT
+17	15	38	AT
+17	19	2	AT
+17	13	1	AT
+18	18	367	AT
+18	16	29	AT
+18	20	7	AT
+18	14	1	AT
+19	19	223	AT
+19	17	34	AT
+19	21	3	AT
+20	20	97	AT
+20	18	14	AT
+20	16	2	AT
+20	22	1	AT
+21	21	60	AT
+21	19	18	AT
+21	17	1	AT
+22	22	53	AT
+22	20	15	AT
+22	24	5	AT
+22	18	3	AT
+23	23	11	AT
+23	21	1	AT
+24	24	7	AT
+24	20	2	AT
+24	22	2	AT
+6	6	1671932	AT
+6	8	1	AT
+7	7	595145	AT
+8	8	195533	AT
+8	10	5	AT
+8	6	2	AT
+9	9	52576	AT
+9	7	3	AT
+10	10	17	CG
+11	11	17	CG
+12	12	6	CG
+6	6	4097	CG
+7	7	678	CG
+8	8	184	CG
+9	9	19	CG
+10	10	19552	AAC
+11	11	19003	AAC
+12	12	6245	AAC
+12	9	1	AAC
+13	13	3406	AAC
+14	14	8448	AAC
+14	11	2	AAC
+15	15	2356	AAC
+15	12	6	AAC
+16	16	1373	AAC
+16	13	4	AAC
+17	17	3140	AAC
+17	14	5	AAC
+18	18	944	AAC
+18	15	2	AAC
+19	19	456	AAC
+19	16	1	AAC
+20	20	1474	AAC
+20	17	3	AAC
+21	21	328	AAC
+21	18	1	AAC
+22	22	178	AAC
+23	23	538	AAC
+23	26	1	AAC
+24	24	112	AAC
+25	25	60	AAC
+26	26	239	AAC
+26	23	1	AAC
+27	27	45	AAC
+28	28	58	AAC
+28	25	2	AAC
+29	29	77	AAC
+30	30	17	AAC
+31	31	38	AAC
+31	28	1	AAC
+32	32	94	AAC
+32	29	3	AAC
+33	33	15	AAC
+35	35	55	AAC
+35	32	1	AAC
+38	38	12	AAC
+41	41	6	AAC
+9	9	57212	AAC
+10	10	31455	AAG
+11	11	11876	AAG
+12	12	3458	AAG
+12	9	6	AAG
+13	13	1141	AAG
+14	14	928	AAG
+15	15	548	AAG
+15	12	4	AAG
+16	16	189	AAG
+17	17	235	AAG
+18	18	63	AAG
+19	19	66	AAG
+20	20	122	AAG
+22	22	11	AAG
+23	23	33	AAG
+9	9	104524	AAG
+10	10	69106	AAT
+11	11	30381	AAT
+12	12	12001	AAT
+12	9	1	AAT
+13	13	7168	AAT
+13	10	2	AAT
+14	14	5470	AAT
+14	11	3	AAT
+15	15	2524	AAT
+15	12	3	AAT
+16	16	1733	AAT
+16	13	1	AAT
+17	17	1324	AAT
+17	14	3	AAT
+18	18	1022	AAT
+18	15	3	AAT
+19	19	502	AAT
+19	16	3	AAT
+20	20	570	AAT
+20	17	2	AAT
+21	21	370	AAT
+21	18	1	AAT
+22	22	98	AAT
+23	23	164	AAT
+23	20	3	AAT
+24	24	143	AAT
+24	21	1	AAT
+25	25	122	AAT
+25	22	1	AAT
+26	26	45	AAT
+26	23	2	AAT
+27	27	32	AAT
+27	24	1	AAT
+28	28	6	AAT
+29	29	64	AAT
+29	26	1	AAT
+30	30	28	AAT
+30	24	1	AAT
+31	31	9	AAT
+32	32	9	AAT
+32	29	1	AAT
+38	38	6	AAT
+9	9	179182	AAT
+9	12	1	AAT
+10	10	14290	ACC
+11	11	5692	ACC
+12	12	1795	ACC
+13	13	1141	ACC
+14	14	545	ACC
+15	15	308	ACC
+16	16	162	ACC
+17	17	107	ACC
+18	18	23	ACC
+19	19	35	ACC
+20	20	44	ACC
+21	21	5	ACC
+22	22	5	ACC
+22	19	1	ACC
+23	23	11	ACC
+25	25	7	ACC
+26	26	7	ACC
+27	27	10	ACC
+28	28	24	ACC
+28	25	1	ACC
+35	35	5	ACC
+9	9	46614	ACC
+10	10	2865	ACG
+11	11	900	ACG
+12	12	325	ACG
+13	13	82	ACG
+14	14	83	ACG
+9	9	9465	ACG
+10	10	6269	ACT
+11	11	2284	ACT
+12	12	634	ACT
+13	13	441	ACT
+14	14	295	ACT
+15	15	118	ACT
+16	16	60	ACT
+17	17	71	ACT
+18	18	58	ACT
+19	19	42	ACT
+20	20	24	ACT
+24	24	5	ACT
+37	37	8	ACT
+41	41	5	ACT
+41	35	1	ACT
+9	9	20025	ACT
+10	10	2897	AGC
+11	11	948	AGC
+12	12	320	AGC
+13	13	97	AGC
+14	14	87	AGC
+15	15	13	AGC
+16	16	9	AGC
+17	17	25	AGC
+17	14	1	AGC
+9	9	9579	AGC
+10	10	21141	AGG
+11	11	8128	AGG
+12	12	2964	AGG
+13	13	1209	AGG
+14	14	860	AGG
+15	15	320	AGG
+16	16	190	AGG
+17	17	225	AGG
+18	18	147	AGG
+20	20	80	AGG
+21	21	9	AGG
+22	22	35	AGG
+23	23	27	AGG
+24	24	8	AGG
+26	26	9	AGG
+9	9	57350	AGG
+10	10	5964	ATC
+11	11	2346	ATC
+12	12	789	ATC
+13	13	386	ATC
+14	14	285	ATC
+15	15	165	ATC
+16	16	93	ATC
+17	17	149	ATC
+18	18	51	ATC
+19	19	6	ATC
+20	20	15	ATC
+21	21	15	ATC
+22	22	29	ATC
+23	23	25	ATC
+24	24	24	ATC
+26	26	34	ATC
+27	27	9	ATC
+28	28	30	ATC
+29	29	8	ATC
+30	30	8	ATC
+31	31	11	ATC
+34	34	11	ATC
+34	31	1	ATC
+36	36	5	ATC
+9	9	19837	ATC
+10	10	11	CCG
+11	11	24	CCG
+14	14	5	CCG
+16	16	5	CCG
+9	9	135	CCG
+12	12	10192	AAAC
+13	13	4917	AAAC
+14	14	4704	AAAC
+15	15	12713	AAAC
+16	16	2415	AAAC
+17	17	1431	AAAC
+18	18	1861	AAAC
+18	14	2	AAAC
+19	19	5254	AAAC
+19	15	2	AAAC
+19	23	1	AAAC
+20	20	913	AAAC
+20	16	1	AAAC
+21	21	615	AAAC
+22	22	509	AAAC
+22	18	2	AAAC
+23	23	2249	AAAC
+23	19	5	AAAC
+23	15	1	AAAC
+24	24	329	AAAC
+24	20	2	AAAC
+25	25	230	AAAC
+25	21	1	AAAC
+26	26	175	AAAC
+27	27	548	AAAC
+27	23	2	AAAC
+28	28	195	AAAC
+28	24	1	AAAC
+29	29	62	AAAC
+30	30	67	AAAC
+31	31	165	AAAC
+31	27	1	AAAC
+32	32	64	AAAC
+33	33	63	AAAC
+34	34	21	AAAC
+35	35	40	AAAC
+36	36	55	AAAC
+37	37	6	AAAC
+38	38	8	AAAC
+39	39	10	AAAC
+40	40	7	AAAC
+45	45	7	AAAC
+12	12	12855	AAAG
+12	16	13	AAAG
+12	20	9	AAAG
+12	18	2	AAAG
+13	13	6727	AAAG
+14	14	3699	AAAG
+14	13	8	AAAG
+15	15	3858	AAAG
+15	17	6	AAAG
+15	13	1	AAAG
+16	16	1244	AAAG
+17	17	750	AAAG
+17	13	1	AAAG
+18	18	380	AAAG
+18	20	5	AAAG
+18	14	1	AAAG
+19	19	1164	AAAG
+19	15	1	AAAG
+20	20	153	AAAG
+21	21	186	AAAG
+22	22	115	AAAG
+23	23	321	AAAG
+23	19	1	AAAG
+24	24	82	AAAG
+25	25	89	AAAG
+26	26	26	AAAG
+26	13	3	AAAG
+27	27	64	AAAG
+28	28	36	AAAG
+29	29	32	AAAG
+31	31	31	AAAG
+33	33	19	AAAG
+35	35	10	AAAG
+36	36	11	AAAG
+38	38	16	AAAG
+41	41	5	AAAG
+12	12	23143	AAAT
+13	13	10045	AAAT
+14	14	6815	AAAT
+15	15	8439	AAAT
+16	16	3102	AAAT
+16	12	2	AAAT
+17	17	2018	AAAT
+17	13	2	AAAT
+18	18	2044	AAAT
+19	19	2955	AAAT
+19	15	1	AAAT
+19	14	1	AAAT
+20	20	909	AAAT
+21	21	711	AAAT
+21	17	2	AAAT
+22	22	500	AAAT
+22	18	2	AAAT
+23	23	993	AAAT
+23	19	3	AAAT
+24	24	382	AAAT
+24	20	3	AAAT
+25	25	190	AAAT
+26	26	185	AAAT
+26	22	1	AAAT
+27	27	281	AAAT
+27	23	2	AAAT
+28	28	165	AAAT
+28	24	2	AAAT
+29	29	48	AAAT
+30	30	46	AAAT
+31	31	101	AAAT
+32	32	28	AAAT
+33	33	19	AAAT
+34	34	24	AAAT
+34	30	1	AAAT
+35	35	41	AAAT
+35	31	2	AAAT
+36	36	16	AAAT
+37	37	6	AAAT
+38	38	5	AAAT
+39	39	20	AAAT
+39	35	1	AAAT
+40	40	5	AAAT
+41	41	10	AAAT
+42	42	6	AAAT
+45	45	6	AAAT
+12	12	1468	AACC
+13	13	590	AACC
+14	14	318	AACC
+15	15	163	AACC
+16	16	102	AACC
+17	17	106	AACC
+18	18	18	AACC
+19	19	34	AACC
+20	20	7	AACC
+22	22	7	AACC
+23	23	13	AACC
+24	24	16	AACC
+25	25	9	AACC
+31	31	9	AACC
+12	12	214	AACG
+13	13	135	AACG
+14	14	39	AACG
+15	15	45	AACG
+12	12	522	AACT
+13	13	142	AACT
+14	14	143	AACT
+15	15	88	AACT
+16	16	16	AACT
+17	17	51	AACT
+18	18	7	AACT
+20	20	21	AACT
+21	21	27	AACT
+23	23	7	AACT
+24	24	11	AACT
+30	30	5	AACT
+12	12	346	AAGC
+13	13	83	AAGC
+14	14	60	AAGC
+15	15	40	AAGC
+16	16	21	AAGC
+18	18	9	AAGC
+19	19	7	AAGC
+12	12	4943	AAGG
+13	13	2714	AAGG
+14	14	1385	AAGG
+14	15	3	AAGG
+15	15	949	AAGG
+16	16	612	AAGG
+16	14	4	AAGG
+17	17	331	AAGG
+18	18	362	AAGG
+19	19	204	AAGG
+20	20	138	AAGG
+21	21	149	AAGG
+22	22	68	AAGG
+23	23	49	AAGG
+24	24	27	AAGG
+25	25	44	AAGG
+26	26	8	AAGG
+27	27	14	AAGG
+28	28	14	AAGG
+29	29	14	AAGG
+30	30	12	AAGG
+31	31	23	AAGG
+34	34	11	AAGG
+43	43	6	AAGG
+12	12	2676	AAGT
+13	13	1438	AAGT
+14	14	940	AAGT
+15	15	649	AAGT
+16	16	305	AAGT
+17	17	291	AAGT
+18	18	181	AAGT
+19	19	55	AAGT
+20	20	73	AAGT
+21	21	8	AAGT
+22	22	43	AAGT
+22	26	1	AAGT
+23	23	32	AAGT
+23	19	1	AAGT
+24	24	18	AAGT
+25	25	19	AAGT
+26	26	8	AAGT
+27	27	12	AAGT
+29	29	18	AAGT
+30	30	12	AAGT
+31	31	12	AAGT
+32	32	11	AAGT
+33	33	35	AAGT
+34	34	9	AAGT
+35	35	6	AAGT
+12	12	594	AATC
+13	13	205	AATC
+14	14	88	AATC
+15	15	112	AATC
+16	16	20	AATC
+17	17	81	AATC
+18	18	23	AATC
+21	21	13	AATC
+22	22	8	AATC
+24	24	19	AATC
+26	26	7	AATC
+28	28	9	AATC
+33	33	6	AATC
+12	12	2293	AATG
+13	13	1226	AATG
+14	14	678	AATG
+15	15	455	AATG
+16	16	222	AATG
+17	17	211	AATG
+18	18	104	AATG
+19	19	79	AATG
+20	20	40	AATG
+21	21	33	AATG
+22	22	73	AATG
+23	23	24	AATG
+24	24	16	AATG
+25	25	18	AATG
+26	26	15	AATG
+27	27	22	AATG
+27	23	1	AATG
+28	28	5	AATG
+32	32	17	AATG
+33	33	16	AATG
+12	12	2633	AATT
+13	13	1086	AATT
+14	14	1052	AATT
+15	15	386	AATT
+16	16	393	AATT
+17	17	98	AATT
+18	18	104	AATT
+19	19	105	AATT
+20	20	34	AATT
+21	21	12	AATT
+22	22	20	AATT
+25	25	18	AATT
+26	26	25	AATT
+27	27	7	AATT
+29	29	7	AATT
+35	35	12	AATT
+12	12	1406	ACAG
+13	13	964	ACAG
+14	14	300	ACAG
+15	15	130	ACAG
+16	16	102	ACAG
+17	17	49	ACAG
+18	18	30	ACAG
+19	19	88	ACAG
+20	20	5	ACAG
+23	23	5	ACAG
+12	12	4868	ACAT
+12	15	4	ACAT
+13	13	3216	ACAT
+14	14	957	ACAT
+15	15	1052	ACAT
+16	16	588	ACAT
+17	17	422	ACAT
+18	18	239	ACAT
+19	19	238	ACAT
+19	15	1	ACAT
+20	20	25	ACAT
+21	21	79	ACAT
+22	22	20	ACAT
+23	23	38	ACAT
+27	27	42	ACAT
+29	29	18	ACAT
+31	31	5	ACAT
+32	32	5	ACAT
+35	35	6	ACAT
+36	36	9	ACAT
+41	41	14	ACAT
+44	44	8	ACAT
+44	40	1	ACAT
+50	50	12	ACAT
+12	12	833	ACCC
+13	13	345	ACCC
+14	14	190	ACCC
+15	15	60	ACCC
+16	16	12	ACCC
+17	17	15	ACCC
+19	19	8	ACCG
+12	12	416	ACCT
+13	13	123	ACCT
+14	14	140	ACCT
+15	15	69	ACCT
+16	16	41	ACCT
+17	17	45	ACCT
+19	19	18	ACCT
+20	20	27	ACCT
+21	21	19	ACCT
+22	22	6	ACCT
+27	27	13	ACCT
+28	28	7	ACCT
+29	29	9	ACCT
+30	30	7	ACCT
+34	34	6	ACCT
+45	45	5	ACCT
+12	12	84	ACGC
+13	13	52	ACGC
+15	15	63	ACGC
+12	12	433	ACGG
+13	13	163	ACGG
+14	14	38	ACGG
+15	15	44	ACGG
+16	16	7	ACGG
+17	17	11	ACGG
+19	19	6	ACGG
+25	25	10	ACGG
+12	12	1119	ACGT
+13	13	509	ACGT
+14	14	338	ACGT
+15	15	16	ACGT
+16	16	66	ACGT
+17	17	7	ACGT
+19	19	27	ACGT
+12	12	2211	ACTC
+13	13	685	ACTC
+14	14	188	ACTC
+15	15	151	ACTC
+16	16	91	ACTC
+18	18	17	ACTC
+19	19	24	ACTC
+20	20	23	ACTC
+21	21	13	ACTC
+23	23	19	ACTC
+45	45	8	ACTC
+12	12	161	ACTG
+13	13	69	ACTG
+14	14	7	ACTG
+15	15	14	ACTG
+16	16	15	ACTG
+12	12	3118	AGAT
+13	13	1216	AGAT
+14	14	1084	AGAT
+15	15	869	AGAT
+16	16	508	AGAT
+17	17	322	AGAT
+18	18	159	AGAT
+19	19	258	AGAT
+20	20	63	AGAT
+21	21	84	AGAT
+22	22	69	AGAT
+22	14	6	AGAT
+23	23	112	AGAT
+24	24	107	AGAT
+25	25	36	AGAT
+26	26	113	AGAT
+27	27	42	AGAT
+28	28	58	AGAT
+29	29	37	AGAT
+30	30	16	AGAT
+31	31	32	AGAT
+32	32	24	AGAT
+33	33	10	AGAT
+34	34	43	AGAT
+35	35	6	AGAT
+36	36	13	AGAT
+36	32	1	AGAT
+37	37	35	AGAT
+38	38	34	AGAT
+39	39	20	AGAT
+39	35	2	AGAT
+40	40	27	AGAT
+41	41	29	AGAT
+42	42	30	AGAT
+43	43	87	AGAT
+44	44	67	AGAT
+45	45	20	AGAT
+46	46	15	AGAT
+47	47	28	AGAT
+48	48	26	AGAT
+49	49	13	AGAT
+50	50	11	AGAT
+52	52	5	AGAT
+54	54	6	AGAT
+12	12	236	AGCC
+13	13	109	AGCC
+14	14	17	AGCC
+15	15	14	AGCC
+16	16	8	AGCC
+18	18	12	AGCC
+21	21	18	AGCC
+23	23	13	AGCC
+12	12	23	AGCG
+13	13	19	AGCG
+18	18	9	AGCG
+12	12	272	AGCT
+13	13	89	AGCT
+14	14	108	AGCT
+15	15	49	AGCT
+16	16	19	AGCT
+17	17	19	AGCT
+18	18	19	AGCT
+19	19	44	AGCT
+22	22	12	AGCT
+27	27	16	AGCT
+12	12	87	AGGC
+13	13	19	AGGC
+14	14	16	AGGC
+18	18	7	AGGC
+12	12	3610	AGGG
+13	13	1980	AGGG
+14	14	1095	AGGG
+15	15	624	AGGG
+16	16	159	AGGG
+17	17	59	AGGG
+18	18	43	AGGG
+19	19	60	AGGG
+20	20	49	AGGG
+21	21	12	AGGG
+23	23	10	AGGG
+12	12	531	ATCC
+13	13	323	ATCC
+14	14	221	ATCC
+15	15	58	ATCC
+16	16	78	ATCC
+17	17	38	ATCC
+18	18	12	ATCC
+19	19	19	ATCC
+20	20	17	ATCC
+21	21	44	ATCC
+22	22	12	ATCC
+23	23	39	ATCC
+24	24	11	ATCC
+25	25	12	ATCC
+27	27	10	ATCC
+32	32	6	ATCC
+39	39	8	ATCC
+40	40	6	ATCC
+48	48	7	ATCC
+12	12	272	ATCG
+13	13	89	ATCG
+14	14	108	ATCG
+15	15	49	ATCG
+16	16	19	ATCG
+17	17	19	ATCG
+18	18	19	ATCG
+19	19	44	ATCG
+22	22	12	ATCG
+27	27	16	ATCG
+12	12	1119	ATGC
+13	13	509	ATGC
+14	14	338	ATGC
+15	15	16	ATGC
+16	16	66	ATGC
+17	17	7	ATGC
+19	19	27	ATGC
+12	12	13	CCCG
+12	12	178	AGTC
+13	13	77	AGTC
+14	14	13	AGTC
+15	15	12	AGTC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/combineprob_out.txt	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,7 @@
+read_depth	allele	heterozygous_prob	motif
+2	10_11	0.485943568663	A
+2	11_12	0.472130683091	A
+2	9_10	0.494635026326	A
+3	10_11	0.71878954705	A
+3	11_12	0.688571908761	A
+3	9_10	0.73801798345	A
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatcompat_in.txt	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,3 @@
+15	64416346	64416378	AT	32	16	18	22	61	TA	0	ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1	TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC	CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@?	ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1	15	64416324	64416346	64416346	64416378	64416378	64416439	32	ATATATATATATATATATATATATATATATAT
+17	52191125	52191133	GA	8	4	8	26	67	AC	0	ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1	CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG	CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC	ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1	17	52191099	52191125	52191125	52191133	52191133	52191200	8	ACACACAC
+17	52191125	52191133	AC	8	4	8	26	67	AG	0	ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1	CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG	CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC	ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1	17	52191099	52191125	52191125	52191133	52191133	52191200	8	ACACACAC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatcompat_out.txt	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,3 @@
+15	64416346	64416378	AT	32	16	18	22	61	TA	0	ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1	TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC	CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@?	ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1	15	64416324	64416346	64416346	64416378	64416378	64416439	32	ATATATATATATATATATATATATATATATAT
+17	52191125	52191133	GA	8	4	8	26	67	AC	0	ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1	CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG	CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC	ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1	17	52191099	52191125	52191125	52191133	52191133	52191200	8	ACACACAC
+17	52191125	52191133	AC	8	4	8	26	67	AG	0	ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1	CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG	CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC	ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1	17	52191099	52191125	52191125	52191133	52191133	52191200	8	ACACACAC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatellite_flanking_L.fastq	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,4 @@
+@SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCT
++SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatellite_flanking_R.fastq	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,4 @@
+@SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+TTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG
++SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+GGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatpurity_in.txt	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,3 @@
+15	64416346	64416378	AT	32	16	18	22	61	AT	0	ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1	TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC	CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@?	ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1	15	64416324	64416346	64416346	64416378	64416378	64416439	32	ATATATATATATATATATATATATATATATAT
+15	64416346	64416378	AT	32	16	18	22	61	AT	0	ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1	TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC	CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@?	ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1	15	64416324	64416346	64416346	64416378	64416378	64416439	32	ATATATATATATATATATTATATATATATAT
+17	52191125	52191133	AC	8	4	8	26	67	AC	0	ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1	CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG	CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC	ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1	17	52191099	52191125	52191125	52191133	52191133	52191200	8	ACACACAC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatpurity_out.txt	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,2 @@
+15	64416346	64416378	AT	32	16	18	22	61	AT	0	ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1	TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC	CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@?	ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1	15	64416324	64416346	64416346	64416378	64416378	64416439	32	ATATATATATATATATATATATATATATATAT
+17	52191125	52191133	AC	8	4	8	26	67	AC	0	ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1	CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG	CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC	ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1	17	52191099	52191125	52191125	52191133	52191133	52191200	8	ACACACAC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/nice1tab.py	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,6 @@
+import sys  # Script purpose: re-delimit a whitespace-separated file with tabs; sys.argv gives the input path
+fd=open(sys.argv[1])  # open the file named by the first command-line argument (never explicitly closed)
+lines=fd.readlines()  # read the whole file into memory as a list of lines
+for line in lines:
+    temp=line.strip().split()  # drop surrounding whitespace, then split on any run of whitespace
+    print '\t'.join(temp)  # Python 2 print statement: write the fields joined by single tabs to stdout
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/probvalueforhetero_in.txt	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,9 @@
+chr	9,10	A	hetero	-1.27220836321	10	10	9
+chr	10,11	A	hetero	-0.939119957032	11	11	10
+chr	11,12	A	hetero	-0.720375026792	12	12	11
+chr	9,9,10	A	hetero	-1.6841441619	9	9	10
+chr	9,10,10	A	hetero	-0.97233405327	10	10	9
+chr	10,10,11	A	hetero	-1.29451118958	10	10	11
+chr	10,11,11	A	hetero	-0.641022011041	11	11	10
+chr	11,11,12	A	hetero	-1.01921634129	11	11	12
+chr	11,12,12	A	hetero	-0.425116661902	12	12	11
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/probvalueforhetero_out.txt	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,9 @@
+chr	9,10	A	hetero	-1.27220836321	10	10	9	0.247317513163	2	0.494635026326	2
+chr	10,11	A	hetero	-0.939119957032	11	11	10	0.242971784331	2	0.485943568663	2
+chr	11,12	A	hetero	-0.720375026792	12	12	11	0.236065341545	2	0.472130683091	2
+chr	9,9,10	A	hetero	-1.6841441619	9	9	10	0.124528157268	3	0.373584471803	3
+chr	9,10,10	A	hetero	-0.97233405327	10	10	9	0.121477837216	3	0.364433511647	3
+chr	10,10,11	A	hetero	-1.29451118958	10	10	11	0.122575544751	3	0.367726634253	3
+chr	10,11,11	A	hetero	-0.641022011041	11	11	10	0.117020970932	3	0.351062912797	3
+chr	11,11,12	A	hetero	-1.01921634129	11	11	12	0.11865253007	3	0.35595759021	3
+chr	11,12,12	A	hetero	-0.425116661902	12	12	11	0.110871439517	3	0.332614318551	3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/profilegenerator_in.txt	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,6 @@
+9	9	100000
+10	10	91456
+10	9	1259
+11	11	39657
+11	10	1211
+11	12	514
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/profilegenerator_out.txt	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,30 @@
+chr	9,9	A
+chr	9,10	A
+chr	9,11	A
+chr	9,12	A
+chr	10,10	A
+chr	10,11	A
+chr	10,12	A
+chr	11,11	A
+chr	11,12	A
+chr	12,12	A
+chr	9,9,9	A
+chr	9,9,10	A
+chr	9,9,11	A
+chr	9,9,12	A
+chr	9,10,10	A
+chr	9,10,11	A
+chr	9,10,12	A
+chr	9,11,11	A
+chr	9,11,12	A
+chr	9,12,12	A
+chr	10,10,10	A
+chr	10,10,11	A
+chr	10,10,12	A
+chr	10,11,11	A
+chr	10,11,12	A
+chr	10,12,12	A
+chr	11,11,11	A
+chr	11,11,12	A
+chr	11,12,12	A
+chr	12,12,12	A
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/readdepth2seqdepth.out	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,2 @@
+repeat_length	read_length	informative_read_depth	=locus_specific_sequencing_depth	=genome_wide_sequencing_depth
+10	100	10	20	26
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samplePESAM_2_profile_C.txt	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,5 @@
+M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1	shifted	540	713	713	719	719	759	6	GGGGGG
+M01368:22:000000000-A4T24:1:1101:10015:2849_1:N:0:2_1_per1_2	shifted	4007	4082	4082	4088	4088	4258	6	TTTTTT
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1	shifted	1849	1930	1930	1936	1936	2100	6	CCCCCC
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2	shifted	1849	2025	2025	2030	2030	2100	5	GGGGG
+M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1	shifted	1428	1517	1517	1522	1522	1543	5	AAAAA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sampleTRgenotypingcorrection	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,2 @@
+chr1	14,13,13,13	A	hetero	-0.429451855856	13	13	14
+chr1	5,6,6,6,6,7,7,8,8	A	hetero	-14.8744881854	7	6	8
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sampleTRprofile_C.txt	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,2 @@
+chr1	14,13,13,13	A
+chr1	5,6,6,6,6,7,7,8,8	A
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samplefq.snoope	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,1 @@
+6	40	54	G	0	SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1	TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCTggggggTTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG	GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG?FFDFGGGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samplefq.snoope.new	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,1 @@
+6	40	54	G	0	SRR345592.75000006_HS2000-192_107:1:63:5822:176818_1_per1_1	TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCTggggggTTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG	GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG?FFDFGGGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sampleprofilegenerator_in	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,6 @@
+9	9	100000
+10	10	91456
+10	9	1259
+11	11	39657
+11	10	1211
+11	12	514
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sampleprofilegenerator_out	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,30 @@
+chr	9,9	A
+chr	9,10	A
+chr	9,11	A
+chr	9,12	A
+chr	10,10	A
+chr	10,11	A
+chr	10,12	A
+chr	11,11	A
+chr	11,12	A
+chr	12,12	A
+chr	9,9,9	A
+chr	9,9,10	A
+chr	9,9,11	A
+chr	9,9,12	A
+chr	9,10,10	A
+chr	9,10,11	A
+chr	9,10,12	A
+chr	9,11,11	A
+chr	9,11,12	A
+chr	9,12,12	A
+chr	10,10,10	A
+chr	10,10,11	A
+chr	10,10,12	A
+chr	10,11,11	A
+chr	10,11,12	A
+chr	10,12,12	A
+chr	11,11,11	A
+chr	11,11,12	A
+chr	11,12,12	A
+chr	12,12,12	A
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samplesortedPESAM_C.sam	Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,10 @@
+M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1	113	shifted	720	37	40M	=	541	-46	TTCGTGCACACAGCCCAGCTTGGAGCGAACGACCTACACC	HHFG@IIHHHHHIHHFHHGFGGGGDBDDEDDDBBB?????	XT:A:U	NM:i:0	SM:i:37	AM:i:37	X0:i:1	X1:i:0	XM:i:0	XO:i:0	XG:i:0	MD:Z:40
+M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1	177	shifted	541	37	173M	=	720	46	CTTCTAGTGTAGCCGTAGTTAGGCCACCACTTCAAGAACTCTGTAGCACCGCCTACATACCTCGCTCTGCTAATCCTGTTACCAGTGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAAC	::GECC:*:)D<GEGGGECCCEC?00E?::CCCCEEECC:C*GEC4'.>ACGGEC:CC?>><DCE?C:EC?GECE?:CCECGEEC*GEECEC:GEEGE?GGECC:ECA2CC*CCC8DEGGEGC=CGECEAEGEEDGGEDEGD=EBGGGFDHHHHHHHHEEHHHHHIIHFIIHH	XT:A:U	NM:i:0	SM:i:37	AM:i:37	X0:i:1	X1:i:0	XM:i:0	XO:i:0	XG:i:0	MD:Z:173
+M01368:22:000000000-A4T24:1:1101:10015:2849_1:N:0:2_1_per1_2	113	shifted	4089	37	170M	=	4008	-176	GCACAACATGGGGGATCATGTAACTCGCCTTGATCGTTGGGAACCGGAGCTGAATGAAGCCATACCAAACGACGAGCGTGACACCACGATGCCTGTAGCAATGGCAACAACGTTGCGCAAACTATTAACTGGCGAACTACTTACTCTAGCTTCCCGGCAACAATTAATAG	GECGGGGGGGGGGGGEGEGGGGD>2GEGGGGGEEGGGGGGGGGGGGGEEECEGEAGGEEGEB>=GGFGEAGHHHEHHHFHFF?ED;HFIHHIIIIHIIHHHHIHHHHIHHHHHHHHIIIIHIHHHHIHHHHHIIHHIIHHIIHIIIIIGGGGGGDDDDDDDDBBB????<	XT:A:U	NM:i:0	SM:i:37	AM:i:37	X0:i:1	X1:i:0	XM:i:0	XO:i:0	XG:i:0	MD:Z:170
+M01368:22:000000000-A4T24:1:1101:10015:2849_1:N:0:2_1_per1_2	177	shifted	4008	37	75M	=	4089	176	TGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCGC	CEGGEEEECC?:EEGECGGGGECGGGGEEGGEEGCCGEGGGGGGGGGGDGGGGGE>EEGGGGGGGGGGGAGGGGE	XT:A:U	NM:i:0	SM:i:37	AM:i:37	X0:i:1	X1:i:0	XM:i:0	XO:i:0	XG:i:0	MD:Z:75
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1	129	shifted	1937	37	164M	=	1850	-87	TCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGT	HHHHIHHHHHHHHHHHHHHHHHHHHHGGFGGGGGGGHGGGGGGGGGGGGEGGGGGGAEEGGGEGGGGGGEGEEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGECGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGCEGEGG	XT:A:U	NM:i:1	SM:i:37	AM:i:37	X0:i:1	X1:i:0	XM:i:1	XO:i:0	XG:i:0	MD:Z:138T25
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1	65	shifted	1850	37	81M	=	1937	87	CCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGA	?????BBBEEDBBDDDGGGGGGIIIIIIIIIIIIIHHHHHIIIIIIIIIIIIIIIIIIIIIIIIIIIHIHHHIIIIIIHGH	XT:A:U	NM:i:0	SM:i:37	AM:i:37	X0:i:1	X1:i:0	XM:i:0	XO:i:0	XG:i:0	MD:Z:81
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2	129	shifted	2031	37	70M	=	1850	-181	TAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGT	GGGGGGGGECGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGCEGEGG	XT:A:U	NM:i:1	SM:i:37	AM:i:37	X0:i:1	X1:i:0	XM:i:1	XO:i:0	XG:i:0	MD:Z:44T25
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2	65	shifted	1850	37	176M	=	2031	181	CCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTT	?????BBBEEDBBDDDGGGGGGIIIIIIIIIIIIIHHHHHIIIIIIIIIIIIIIIIIIIIIIIIIIIHIHHHIIIIIIHGHIIIHHHHHHHIHHHHHHHHHHHHHHHHHHHHHGGFGGGGGGGHGGGGGGGGGGGGEGGGGGGAEEGGGEGGGGGGEGEEGGGGGGGGGGGGGGGG	XT:A:U	NM:i:0	SM:i:37	AM:i:37	X0:i:1	X1:i:0	XM:i:0	XO:i:0	XG:i:0	MD:Z:176
+M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1	129	shifted	1523	37	21M	=	1429	-94	GTCTTTAACTCCACCATTAGC	GGGEGGEGGGGGCGGGGGEGG	XT:A:U	NM:i:0	SM:i:37	AM:i:37	X0:i:1	X1:i:0	XM:i:0	XO:i:0	XG:i:0	MD:Z:21
+M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1	65	shifted	1429	37	89M	=	1523	94	CTATGCATCCAACGCGTTGGGAGCTCTCCCATATGGTCGACCTGCAGGCGGCCGCGAATTCACTAGTGATTTCCAAGGACAAATCAGAG	?????BBBDDDDDDDDGGGFGGFEHIIIIIIIHIIIHIHHHHHIIHFHHHHHHHHHHHHHHHHHHHHGGGGGGGGGGGGGGGGGGEGEE	XT:A:U	NM:i:0	SM:i:37	AM:i:37	X0:i:1	X1:i:0	XM:i:0	XO:i:0	XG:i:0	MD:Z:89
Binary file test-data/shifted.2bit has changed