Repository 'microsatellite_ngs'
hg clone https://toolshed.g2.bx.psu.edu/repos/arkarachai-fungtammasan/microsatellite_ngs

Changeset 0:20ab85af9505 (2014-10-03)
Next changeset 1:f265e26ab550 (2014-10-24)
Commit message:
Uploaded
added:
.DS_Store
GenotypeTRcorrection.py
GenotypingSTR.xml
PEsortedSAM2readprofile.py
PEsortedSAM2readprofile.xml
changespacetounderscore_readname.py
combinedprobforallelecombination.py
combineprobforallelecombination.xml
fetchflank.xml
heteroprob.py
microsatcompat.py
microsatcompat.xml
microsatellite.py
microsatellite.xml
microsatpurity.py
microsatpurity.xml
pair_fetch_DNA_ff.py
probvalueforhetero.xml
profilegenerator.py
profilegenerator.xml
readdepth2sequencingdepth.xml
sequencingdepthconversion_G.py
space2underscore_readname.xml
test-data/.DS_Store
test-data/C_sample_fastq
test-data/C_sample_snoope
test-data/PCRinclude.allrate.bymajorallele
test-data/combineprob_out.txt
test-data/microsatcompat_in.txt
test-data/microsatcompat_out.txt
test-data/microsatellite_flanking_L.fastq
test-data/microsatellite_flanking_R.fastq
test-data/microsatpurity_in.txt
test-data/microsatpurity_out.txt
test-data/nice1tab.py
test-data/probvalueforhetero_in.txt
test-data/probvalueforhetero_out.txt
test-data/profilegenerator_in.txt
test-data/profilegenerator_out.txt
test-data/readdepth2seqdepth.out
test-data/samplePESAM_2_profile_C.txt
test-data/sampleTRgenotypingcorrection
test-data/sampleTRprofile_C.txt
test-data/samplefq.snoope
test-data/samplefq.snoope.new
test-data/sampleprofilegenerator_in
test-data/sampleprofilegenerator_out
test-data/samplesortedPESAM_C.sam
test-data/shifted.2bit
b
diff -r 000000000000 -r 20ab85af9505 .DS_Store
b
Binary file .DS_Store has changed
b
diff -r 000000000000 -r 20ab85af9505 GenotypeTRcorrection.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GenotypeTRcorrection.py Fri Oct 03 20:54:30 2014 -0400
[
b'@@ -0,0 +1,250 @@\n+### import libraries ###\n+import sys\n+import collections, math\n+import heapq\n+from galaxy import eggs\n+\n+\n+    \n+\n+\n+### basic function ###\n+def stop_err(msg):\n+    sys.stderr.write(msg)\n+    sys.exit()\n+        \n+def averagelist(a,b,expectedlevelofminor):\n+    product=[]\n+    for i in range(len(a)):\n+        product.append((1-expectedlevelofminor)*a[i]+expectedlevelofminor*b[i])\n+  \n+    return product\n+        \n+def complement_base(read):\n+    collect=\'\'\n+    for i in read:\n+        if i.upper()==\'A\':\n+            collect+=\'T\'\n+        elif i.upper()==\'T\':\n+            collect+=\'A\'\n+        elif i.upper()==\'C\':\n+            collect+=\'G\'\n+        elif i.upper()==\'G\':\n+            collect+=\'C\'\n+    return collect\n+def makeallpossible(read):\n+    collect=[]\n+    for i in range(len(read)):\n+        tmp= read[i:]+read[:i]\n+        collect.append(tmp)\n+        collect.append(complement_base(tmp))\n+    return collect\n+\n+def motifsimplify(base):\n+    \'\'\'str--> str\n+    \'\'\'\n+    motiflength=len(base)\n+    temp=list(set(ALLMOTIF[motiflength]).intersection(set(makeallpossible(base))))\n+    \n+    return temp[0]\n+\n+def majorallele(seq):\n+    binseq=list(set(seq))  \n+    binseq.sort(reverse=True)   # highly mutate mode\n+    #binseq.sort()              # majority mode\n+    storeform=\'\'\n+    storevalue=0\n+    for i in binseq:\n+        if seq.count(i)>storevalue:\n+            storeform=i\n+            storevalue=seq.count(i)\n+            \n+    return int(storeform)\n+\n+### decide global parameter ###\n+COORDINATECOLUMN=1\n+ALLELECOLUMN=2\n+MOTIFCOLUMN=3\n+  ##(0.01-0.5)\n+MINIMUMMUTABLE=1.2*(1.0/(10**8))  #http://www.ncbi.nlm.nih.gov/pubmed/22914163 Kong et al 2012\n+\n+\n+## Fixed global variable\n+inputname=sys.argv[1]\n+errorprofile=sys.argv[2]\n+Genotypingcorrected=sys.argv[3]\n+EXPECTEDLEVELOFMINOR=float(sys.argv[4])\n+if EXPECTEDLEVELOFMINOR >0.5:\n+\ttry:\n+\t\texpected_contribution_of_minor_allele=int(\'expected_contribution_of_minor_allele\')\n+\texcept Exception, eee:\n+\t\tprint eee\n+\t\tstop_err("Expected contribution of minor allele must be at least 0 and not more than 0.5")\n+ALLREPEATTYPE=[1,2,3,4]\n+ALLREPEATTYPENAME=[\'mono\',\'di\',\'tri\',\'tetra\']\n+monomotif=[\'A\',\'C\']\n+dimotif=[\'AC\',\'AG\',\'AT\',\'CG\']\n+trimotif=[\'AAC\',\'AAG\',\'AAT\',\'ACC\',\'ACG\',\'ACT\',\'AGC\',\'AGG\',\'ATC\',\'CCG\']\n+tetramotif=[\'AAAC\',\'AAAG\',\'AAAT\',\'AACC\',\'AACG\',\'AACT\',\'AAGC\',\'AAGG\',\'AAGT\',\'AATC\',\'AATG\',\'AATT\',\\\n+\'ACAG\',\'ACAT\',\'ACCC\',\'ACCG\',\'ACCT\',\'ACGC\',\'ACGG\',\'ACGT\',\'ACTC\',\'ACTG\',\'AGAT\',\'AGCC\',\'AGCG\',\'AGCT\',\\\n+\'AGGC\',\'AGGG\',\'ATCC\',\'ATCG\',\'ATGC\',\'CCCG\',\'CCGG\',\'AGTC\']\n+ALLMOTIF={1:monomotif,2:dimotif,3:trimotif,4:tetramotif}\n+monorange=range(5,60)\n+dirange=range(6,60)\n+trirange=range(9,60)\n+tetrarange=range(12,80)\n+ALLRANGE={1:monorange,2:dirange,3:trirange,4:tetrarange}\n+\n+#########################################\n+######## Prob calculation sector ########\n+#########################################\n+def multinomial_prob(majorallele,STRlength,motif,probdatabase):\n+    \'\'\'int,int,str,dict-->int\n+    ### get prob for each STRlength to be generated from major allele\n+    \'\'\'\n+    #print (majorallele,STRlength,motif)\n+    prob=probdatabase[len(motif)][motif][majorallele][STRlength]\n+    return prob\n+\n+################################################\n+######## error model database sector ###########\n+################################################\n+\n+## structure generator\n+errormodeldatabase={1:{},2:{},3:{},4:{}}\n+sumbymajoralleledatabase={1:{},2:{},3:{},4:{}}\n+for repeattype in ALLREPEATTYPE:\n+    for motif in ALLMOTIF[repeattype]:\n+        errormodeldatabase[repeattype][motif]={}\n+        sumbymajoralleledatabase[repeattype][motif]={}\n+        for motifsize1 in ALLRANGE[repeattype]:\n+            errormodeldatabase[repeattype][motif][motifsize1]={}\n+            sumbymajoralleledatabase[repeattype][motif][motifsize1]=0\n+            for motifsize2 in ALLRANGE[repeattype]:\n+                errormodeldatabase[repeattype][motif][motifsize1][motifsize2]=MINIMUMMUTABLE\n+\n+#print errormodeldatab'..b'\n+\n+#########################################\n+######## input reading sector ###########\n+#########################################\n+fdout=open(Genotypingcorrected,\'w\')\n+\n+fd = open(inputname)\n+\n+lines=fd.xreadlines()\n+for line in lines:\n+    i_read=[]\n+    i2_read=[]\n+    temp=line.strip().split(\'\\t\')\n+    i_coordinate=temp[COORDINATECOLUMN-1]\n+    i_motif=motifsimplify(temp[MOTIFCOLUMN-1])\n+    i_read=temp[ALLELECOLUMN-1].split(\',\')\n+    i_read=map(int,i_read)\n+    coverage=len(i_read)\n+\n+### Evaluate 1 major allele ###    \n+    i_all_allele=list(set(i_read))\n+    i_major_allele=majorallele(i_read)\n+    f_majorallele=i_read.count(i_major_allele)\n+### Evaluate 2 major allele ### \n+    if len(i_all_allele)>1:\n+        i2_read=filter(lambda a: a != i_major_allele, i_read)\n+        i_major2_allele=majorallele(i2_read)\n+        f_majorallele2=i_read.count(i_major2_allele)\n+        ### Evaluate 3 major allele ### \n+        if len(i_all_allele)>2:\n+            i3_read=filter(lambda a: a != i_major2_allele, i2_read)\n+            i_major3_allele=majorallele(i3_read)\n+            f_majorallele3=i_read.count(i_major3_allele)\n+        ### No 3 major allele ### \n+        elif len(i_all_allele)==2:\n+            i_major3_allele=i_major2_allele\n+    ### No 2 major allele ### \n+    elif len(i_all_allele)==1:\n+        #i_major2_allele=majorallele(i_read)\n+        i_major2_allele=i_major_allele+len(i_motif)\n+        i_major3_allele=i_major2_allele\n+        #print line.strip()+\'\\t\'+\'\\t\'.join([\'homo\',\'only\',str(i_major_allele),str(i_major_allele),\'NA\'])\n+        #continue\n+    else:\n+        print("no allele is reading")\n+        sys.exit()\n+    \n+## scope filter\n+\n+#########################################\n+######## prob calculation option ########\n+#########################################\n+    homozygous_collector=0\n+    heterozygous_collector=0\n+\n+      \n+    alist=[multinomial_prob(i_major_allele,x,i_motif,errormodeldatabase)for x in i_read]\n+    blist=[multinomial_prob(i_major2_allele,x,i_motif,errormodeldatabase)for x in i_read]\n+    clist=[multinomial_prob(i_major3_allele,x,i_motif,errormodeldatabase)for x in i_read]\n+    \n+    ablist=averagelist(alist,blist,EXPECTEDLEVELOFMINOR)\n+    bclist=averagelist(blist,clist,EXPECTEDLEVELOFMINOR)\n+    aclist=averagelist(alist,clist,EXPECTEDLEVELOFMINOR)\n+    \n+    #print alist,blist,clist\n+    majora=sum([math.log(i,10) for i in alist])\n+    majorb=sum([math.log(i,10) for i in blist])    \n+    majorc=sum([math.log(i,10) for i in clist])\n+    homozygous_collector=max(majora,majorb,majorc)\n+    \n+    homomajor1=max([(majora,i_major_allele),(majorb,i_major2_allele),(majorc,i_major3_allele)])[1]\n+    homomajordict={i_major_allele:majora,i_major2_allele:majorb,i_major3_allele:majorc}\n+    \n+    majorab=sum([math.log(i,10) for i in ablist])\n+    majorbc=sum([math.log(i,10) for i in bclist])    \n+    majorac=sum([math.log(i,10) for i in aclist])\n+    heterozygous_collector=max(majorab,majorbc,majorac)\n+    bothheteromajor=max([(majorab,(i_major_allele,i_major2_allele)),(majorbc,(i_major2_allele,i_major3_allele)),(majorac,(i_major_allele,i_major3_allele))])[1]\n+    ##heteromajor1=max(bothheteromajor)\n+    ##heteromajor2=min(bothheteromajor)\n+    pre_heteromajor1=bothheteromajor[0]\n+    pre_heteromajor2=bothheteromajor[1]\n+    heteromajor1=max((homomajordict[pre_heteromajor1],pre_heteromajor1),(homomajordict[pre_heteromajor2],pre_heteromajor2))[1]\n+    heteromajor2=min((homomajordict[pre_heteromajor1],pre_heteromajor1),(homomajordict[pre_heteromajor2],pre_heteromajor2))[1]\n+    \n+    logratio_homo=homozygous_collector-heterozygous_collector\n+    \n+    if logratio_homo>0:\n+        fdout.writelines(line.strip()+\'\\t\'+\'\\t\'.join([\'homo\',str(logratio_homo),str(homomajor1),str(heteromajor1),str(heteromajor2)])+\'\\n\')\n+    elif logratio_homo<0:\n+        fdout.writelines(line.strip()+\'\\t\'+\'\\t\'.join([\'hetero\',str(logratio_homo),str(homomajor1),str(heteromajor1),str(heteromajor2)])+\'\\n\')\n+fd.close()\n+fdout.close()  \n'
b
diff -r 000000000000 -r 20ab85af9505 GenotypingSTR.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GenotypingSTR.xml Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,72 @@
+<tool id="GenotypeSTR" name="Correct genotype for microsatellite errors" version="2.0.0">
+  <description> during sequencing and library prep </description>
+  <command interpreter="python2.7">GenotypeTRcorrection.py  $microsat_raw $microsat_error_profile $microsat_corrected  $expectedminorallele </command>
+
+  <inputs>
+    <param name="microsat_raw" type="data" label="Select microsatellite length profile that need to refine genotyping" />
+    <param name="microsat_error_profile" type="data" label="Select microsatellite error profile that correspond to this dataset" />
+ <param name="expectedminorallele" type="float" value="0.5" label="Expected contribution of minor allele when present (0.5 for genotyping)" />
+
+  </inputs>
+  <outputs>
+    <data name="microsat_corrected" format="tabular" />
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="microsat_raw" value="sampleTRprofile_C.txt"/>
+      <param name="microsat_error_profile" value="PCRinclude.allrate.bymajorallele"/>
+      <param name="expectedminorallele" value="0.5"/>
+      <output name="microsat_corrected" file="sampleTRgenotypingcorrection"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+- This tool will correct for microsatellite sequencing and library preparation errors using error rates estimated from hemizygous male X chromosome or any rates provided by user. The read profile for each locus will be processed independently. 
+- First, this tool will find three most common read lengths from input read length profile. If the read profile has only one length of TR, the length of one motif longer than the observed length will be used as the second most common read length. 
+- Second, it will calculate probability of three forms of homozygous and use the form which give the highest probability. The same goes for heterozygous. 
+- Third, this tools will calculate log based 10 of (the probability of homozygous/the probability of heterozygous). If this value is more than 0, it will predict this locus to homozygous. If this value is less than 0, it will predict this locus to heterozygous. If this value is 0, read profile at this locus will be discard. 
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**

+**Input**
+
+- The input files need to contain at least three columns. 
+- Column 1 = location of microsatellite locus. 
+- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format). 
+- Column 3 = motif of microsatellite in this locus. The input file can contain more than three column. 
+
+**Output**
+
+The output will be contain original three (or more) column as the input. However, it will also have these following columns. 
+
+- Additional column 1 = homozygous/heterozygous label.
+- Additional column 2 = log based 10 of (the probability of homozygous/the probability of heterozygous)
+- Additional column 3 = Allele for most probable homozygous form.
+- Additional column 4 = Allele 1 for most probable heterozygous form.
+- Additional column 5 = Allele 2 for most probable heterozygous form.
+
+**Example**
+
+- Suppose that we sequence one locus of microsatellite with NGS. This locus has **A** motif and the following length (bp) profile. ::
+
+ chr1_100_106 5, 6, 6, 6, 6, 7, 7, 8, 8 A
+
+- We want to figure out if this locus is a homolozygous or heterozygous and the corresponding allele(s). Therefore, we use this tool to refine genotype.
+- This tool will calculate the probability of homozygous A6A6, A7A7, and A8A8 to generate observed length profile. Among this A7A7 has the highest probability. Therefore, we use this form as the representative for homozygous.
+- Then, this tool will calculate the probability of heterozygous A6A7, A7A8, and A6A8 to generate observed length profile. Among this A6A8 has the highest probability. Therefore, we use this form as the representative for heterozygous.    
+- The A6A7 has higher probability than A7A7. Therefore, the program will report that this locus is a heterozygous locus. ::
+
+ chr1 5,6,6,6,6,7,7,8,8 A hetero -14.8744881854 7 6 8
+
+
+</help>
+</tool>
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 PEsortedSAM2readprofile.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/PEsortedSAM2readprofile.py Fri Oct 03 20:54:30 2014 -0400
[
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+import sys
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import bx.seq.twobit
+
+##output columns: read_name chr prefix_start    prefix_end  TR_start    TR_end  suffix_start    suffix_end  TR_length   TR_sequence
+
+samf = open(sys.argv[1],'r') #assumes sam file is sorted by readname
+seq_path = sys.argv[2] #Path to the reference genome in 2bit format
+
+##maxTRlength=int(sys.argv[4])
+##maxoriginalreadlength=int(sys.argv[5])
+maxTRlength=int(sys.argv[3])
+maxoriginalreadlength=int(sys.argv[4])
+outfile=sys.argv[5]
+fout = open(outfile,'w')
+
+twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) )
+
+skipped=0
+while True:
+    read = samf.readline().strip()
+    if not(read): #EOF reached
+        break
+    if read[0] == "@":
+        #print read
+        continue
+    mate = samf.readline().strip()
+    if not(mate): #EOF reached
+        break
+    read_elems = read.split()
+    mate_elems = mate.split()
+    read_name = read_elems[0].strip()
+    mate_name = mate_elems[0].strip()
+    while True:
+        if read_name == mate_name:
+            break
+        elif read_name != mate_name:
+            #print >>sys.stderr, "Input SAM file doesn't seem to be sorted by readname. Please sort and retry."
+            #break
+            skipped += 1
+            read = mate
+            read_elems = mate_elems
+            mate = samf.readline().strip()
+            read_name = read_elems[0].strip()
+            mate_name = mate_elems[0].strip()
+            if not(mate): #EOF reached
+                break
+            mate_elems = mate.split()
+    #extract XT:A tag
+    #for e in  read_elems:
+    #    if e.startswith('XT:A'):
+    #        read_xt = e
+    #for e in  mate_elems:
+    #    if e.startswith('XT:A'):
+    #        mate_xt = e
+    #if 'XT:A:U' not in read_elems or 'XT:A:U' not in mate_elems:   #both read and it's mate need to be mapped uniquely
+    #    continue
+    read_chr = read_elems[2]
+    read_start = int(read_elems[3])
+    read_cigar = read_elems[5]
+    if len(read_cigar.split('M')) != 2:     #we want perfect matches only..cigar= <someInt>M
+        continue
+    read_len = int(read_cigar.split('M')[0])
+    mate_chr = mate_elems[2]
+    mate_start = int(mate_elems[3])
+    mate_cigar = mate_elems[5]
+    if len(mate_cigar.split('M')) != 2:     #we want perfect matches only..cigar= <someInt>M
+        continue
+    mate_len = int(mate_cigar.split('M')[0])
+    if read_chr != mate_chr:            # check that they were mapped to the same chromosome
+        continue
+    if abs(read_start - mate_start) > (maxoriginalreadlength+maxTRlength):
+        continue
+    if read_start < mate_start:
+        pre_s = read_start-1
+        pre_e = read_start-1+read_len
+        tr_s = read_start-1+read_len
+        tr_e = mate_start-1
+        suf_s = mate_start-1
+        suf_e = mate_start-1+mate_len
+    else:
+        pre_s = mate_start-1
+        pre_e = mate_start-1+mate_len
+        tr_s = mate_start-1+mate_len
+        tr_e = read_start-1
+        suf_s = read_start-1
+        suf_e = read_start-1+read_len
+    tr_len = abs(tr_e - tr_s)
+    if tr_len > maxTRlength:
+        continue
+    if pre_e >= suf_s:  #overlapping prefix and suffix
+        continue
+    tr_ref_seq = twobitfile[read_chr][tr_s:tr_e]
+    ##print >>fout, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq)
+    fout.writelines('\t'.join(map(str,[read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq]))+'\n')
+
+print  "Skipped %d unpaired reads" %(skipped)
b
diff -r 000000000000 -r 20ab85af9505 PEsortedSAM2readprofile.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/PEsortedSAM2readprofile.xml Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,62 @@
+<tool id="PEsortedSAM2readprofile" name="Combine mapped flaked bases" version="1.0.0">
+  <description> from SAM file sorted by readname  </description>
+  <command interpreter="python2.7">PEsortedSAM2readprofile.py  $flankedbasesSAM $twobitref $maxTRlength $maxoriginalreadlength $output </command>
+
+  <inputs>
+    <param name="flankedbasesSAM" type="data" format="sam" label="Select sorted SAM file (by readname) of flaked bases" />
+    <param name="twobitref" type="data" label="Select twobit file reference genome" />
+ <param name="maxTRlength" type="integer" value="100" label="Maximum expected microsatellite length (bp)" />
+ <param name="maxoriginalreadlength" type="integer" value="101" label="Maxinum original read length" />
+
+  </inputs>
+  <outputs>
+    <data name="output" format="tabular" />
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="flankedbasesSAM" value="samplesortedPESAM_C.sam"/>
+      <param name="twobitref" value="shifted.2bit"/>
+      <param name="maxTRlength" value="100"/>
+      <param name="maxoriginalreadlength" value="250"/>
+      <output name="output" file="samplePESAM_2_profile_C.txt"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+- This tool will take SAM file sorted by read name, remove unpaired reads, report microsatellites sequences in the reference genome that correspond to the space between paired end reads. Coordinate of start and stop for left and right flanking regions of microsatellites and microsatellite itself as inferred from paired end reads will also be reported.
+- These microsatellites in reference can be used to filter out reads that do not contain microsatellites that concur with microsatellites in reference where the reads mapped to.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**

+**Input**
+
+- Sorted SAM files by read name
+
+**Output**
+
+The output will combined two lines of input which are paired. The output format is as follow.
+
+- Column 1 = read name
+- Column 2 = chromosome 
+- Column 3 = left flanking region start
+- Column 4 = left flanking region stop
+- Column 5 = microsatellite start
+- Column 6 = microsatellite stop
+- Column 7 = right flanking region start
+- Column 8 = right flanking region stop
+- Column 9 = microsatellite length in reference
+- Column 10= microsatellite sequence in reference
+
+
+
+</help>
+</tool>
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 changespacetounderscore_readname.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/changespacetounderscore_readname.py Fri Oct 03 20:54:30 2014 -0400
[
@@ -0,0 +1,15 @@
+import sys
+fd=open(sys.argv[1])
+output=open(sys.argv[2],'w')
+columntochange=int(sys.argv[3])-1  # default is 6-1=5
+lines=fd.xreadlines()
+for line in lines:
+ temp=line.strip().split('\t')
+ temp=filter(None,temp)
+ temp2=temp[columntochange].replace(' ','_')
+ product=temp[:columntochange]
+ product.append(temp2)
+ product.extend(temp[columntochange+1:])
+ output.writelines('\t'.join(product)+'\n')
+fd.close()
+output.close()
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 combinedprobforallelecombination.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/combinedprobforallelecombination.py Fri Oct 03 20:54:30 2014 -0400
[
@@ -0,0 +1,41 @@
+import sys
+import collections
+import math
+SAMPLINGCOL=11
+ALLELE1COL=7
+ALLELE2COL=8
+SIGNCOL=4
+readprofileCOL=2
+motifCOL=3
+filaname=sys.argv[1]
+fd=open(filaname)
+lines=fd.readlines()
+binomialcombine=collections.defaultdict(list)
+for line in lines:
+    temp=line.strip().split('\t')
+    allelelist=[]
+    allelelist.append(int(temp[ALLELE1COL-1]))
+    allelelist.append(int(temp[ALLELE2COL-1]))
+    allelelist.sort()
+    #allelelist=map(str,allelelist)
+    alleleave=str(allelelist[0])+'_'+str(allelelist[1])
+    #alleleave=str(sum(allelelist)/2.0)
+    ##alleleave=str(allelelist[0])+'_'+str(allelelist[1])
+    totalcov=len(temp[readprofileCOL-1].split(','))
+    motif=temp[motifCOL-1]
+    samplingvalue=float(temp[SAMPLINGCOL-1])
+    SIGN=1 
+    binomialcombine[(totalcov,alleleave,motif)].append(SIGN*samplingvalue)
+allkeys= binomialcombine.keys()
+allkeys.sort()
+##print allkeys
+print 'read_depth'+'\t'+'allele'+'\t'+'heterozygous_prob'+'\t'+'motif'
+for key in allkeys:
+    ##templist=[str(key[0]),key[1],str(sum(binomialcombine[key])),key[2],str(map(str,(binomialcombine[key])))]
+    templist=[str(key[0]),key[1],str(sum(binomialcombine[key])),key[2]]
+
+    print '\t'.join(templist)
+#print allkeys#,binomialcombine
+    
+    
+        
b
diff -r 000000000000 -r 20ab85af9505 combineprobforallelecombination.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/combineprobforallelecombination.xml Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,67 @@
+<tool id="combineproballelecom" name="Combine probability to generate read profile " version="2.0.0">
+  <description>from the same allele combination</description>
+  <command interpreter="python2.7">combinedprobforallelecombination.py  $input > $output </command>
+
+  <inputs>
+    <param name="input" type="data" label="Select microsatellite length profile" />

+  </inputs>
+  <outputs>
+    <data name="output" format="tabular" />
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="input" value="probvalueforhetero_out.txt"/>
+      <output name="output" file="combineprob_out.txt"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+- This tool will combine probability that the allele combination can generated any read profile in the input. This is the last step to calculate probability to detect heterozygous for each allele combination and each depth.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**

+**Input**
+
+The input format is the same as output from **Evaluate the probability of the allele combination to generate read profile** tool.
+
+- Column 1 = location of microsatellite locus. 
+- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format). 
+- Column 3 = motif of microsatellite in this locus. The input file can contain more than three column. 
+- Column 4 = homozygous/heterozygous label.
+- Column 5 = log based 10 of (the probability of homozygous/the probability of heterozygous)
+- Column 6 = Allele for most probable homozygous form.
+- Column 7 = Allele 1 for most probable heterozygous form.
+- Column 8 = Allele 2 for most probable heterozygous form.
+- Column 9 = Probability of the allele combination to generate given read profile.
+- Column 10 = Number of possible rearrangement of given read profile.
+- Column 11 = Probability of the allele combination to generate read profile with any rearrangement (Product of column 9 and column 10)
+- Column 12 = Read depth
+
+Only column 2,3,4,7,8,11 were used in calculation. 
+
+**Output**
+
+
+The output will contain the following header and column

+- Line 1 header: read_depth allele heterozygous_prob motif
+- Column 1 = read depth
+- Column 2 = allele combination
+- Column 3 = probability to detect heterozygous of that allele combination
+- Column 4 = motif
+
+
+
+
+</help>
+</tool>
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 fetchflank.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fetchflank.xml Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,73 @@
+<tool id="fetchflank" name="Fetch flanking bases" version="1.0.0">
+  <description> of microsatellites and output as two fastq files in forward-forward orientation</description>
+  <command interpreter="python">pair_fetch_DNA_ff.py  $microsat_in_read $Leftflanking $Rightflanking $qualitycutoff $lengthofbasetocheckquality  </command>
+
+  <inputs>
+    <param name="microsat_in_read" type="data" label="Select data of microsatellites in reads" />
+    <param name="qualitycutoff" type="integer" value="20" label="Minimum quality score (Phred+33) for microsatellites and flanking regions" />
+    <param name="lengthofbasetocheckquality" type="integer" value="20" label="Length of flanking regions that require quality screening" />        
+  </inputs>
+  <outputs>
+    <data format="fastq" name="Leftflanking" />
+    <data format="fastq" name="Rightflanking" />
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="microsat_in_read" value="samplefq.snoope"/>
+      <param name="qualitycutoff" value="20"/>
+      <param name="lengthofbasetocheckquality" value="20"/>
+      <output name="Leftflanking" file="microsatellite_flanking_L.fastq"/>
+      <output name="Rightflanking" file="microsatellite_flanking_R.fastq"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool will fetch flanking regions around microsatellites, screen for quality score at microsatellites and adjacent flanking regions, and output two fastq files containing flanking regions in forward-forward direction.
+
+- This tool assumes that the quality score is Phred+33, such as Sanger fastq.
+- Reads that have either left or right flanking regions shorter than the length of flanking regions that require quality screening will be removed.
+
+**Citation**
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**

+**Input**
+
+The input files need to be in the same format as output from **microsatellite detection program**. This format contains **length of repeat**, **length of left flanking region**, **length of right flanking region**, **repeat motif**, **hamming (editing) distance**, **read name**, **read sequence**, **read quality score**
+
+**Output**
+
+The output will be the two fastq files. The first file contains left flank regions. The second file contains right flanking regions.
+
+**Example**
+
+- Suppose we detected the microsatellites from short reads ::
+
+ 6 40 54 G 0 SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCTggggggTTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG?FFDFGGGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=99(/=5'6=4:CCC*AA
+    
+
+- We want to get fastq files of flanking regions around microsatellite with quality score at least 20 on Phred +33  
+  
+- Then the program will report these two fastq files ::
+
+ @SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+ TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCT
+ +SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+ GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG
+
+
+ @SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+ TTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG
+ +SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+ GGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=99(/=5'6=4:CCC*AA
+  
+
+
+</help>
+</tool>
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 heteroprob.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/heteroprob.py Fri Oct 03 20:54:30 2014 -0400
[
@@ -0,0 +1,199 @@
+### import libraries ###
+import sys
+import collections, math
+import heapq
+import itertools
+
+
+
+### basic function ###
+def permuterepeat(n,rlist):
+    f = math.factorial
+    nfac=f(n)
+    rfaclist=[f(i) for i in rlist]
+    for rfac in rfaclist:
+        nfac=nfac/rfac
+    return nfac
+
+def nCr(n,r):
+    f = math.factorial
+    return f(n) / f(r) / f(n-r)
+    
+def averagelist(a,b,expectedlevelofminor):
+    product=[]
+    for i in range(len(a)):
+        product.append((1-expectedlevelofminor)*a[i]+expectedlevelofminor*b[i])
+  
+    return product
+        
+def complement_base(read):
+    collect=''
+    for i in read:
+        if i.upper()=='A':
+            collect+='T'
+        elif i.upper()=='T':
+            collect+='A'
+        elif i.upper()=='C':
+            collect+='G'
+        elif i.upper()=='G':
+            collect+='C'
+    return collect
+def makeallpossible(read):
+    collect=[]
+    for i in range(len(read)):
+        tmp= read[i:]+read[:i]
+        collect.append(tmp)
+        collect.append(complement_base(tmp))
+    return collect
+
+def motifsimplify(base):
+    '''str--> str
+    '''
+    motiflength=len(base)
+    temp=list(set(ALLMOTIF[motiflength]).intersection(set(makeallpossible(base))))
+    
+    return temp[0]
+
+def majorallele(seq):
+    binseq=list(set(seq))  
+    binseq.sort(reverse=True)   # highly mutate mode
+    #binseq.sort()              # majority mode
+    storeform=''
+    storevalue=0
+    for i in binseq:
+        if seq.count(i)>storevalue:
+            storeform=i
+            storevalue=seq.count(i)
+            
+    return int(storeform)
+
+### decide global parameter ###
+COORDINATECOLUMN=1
+ALLELECOLUMN=2
+MOTIFCOLUMN=3
+inputname=sys.argv[1]
+errorprofile=sys.argv[2]
+EXPECTEDLEVELOFMINOR=float(sys.argv[3])
+if EXPECTEDLEVELOFMINOR >0.5:
+ try:
+ errorexpectcontribution=int('a')
+ except Exception, eee:
+ print eee
+ stop_err("Expected contribution of minor allele must be at least 0 and not more than 0.5")
+MINIMUMMUTABLE=0 ###1.2*(1.0/(10**8))  #http://www.ncbi.nlm.nih.gov/pubmed/22914163 Kong et al 2012
+
+
+## Fixed global variable
+ALLREPEATTYPE=[1,2,3,4]
+ALLREPEATTYPENAME=['mono','di','tri','tetra']
+monomotif=['A','C']
+dimotif=['AC','AG','AT','CG']
+trimotif=['AAC','AAG','AAT','ACC','ACG','ACT','AGC','AGG','ATC','CCG']
+tetramotif=['AAAC','AAAG','AAAT','AACC','AACG','AACT','AAGC','AAGG','AAGT','AATC','AATG','AATT',\
+'ACAG','ACAT','ACCC','ACCG','ACCT','ACGC','ACGG','ACGT','ACTC','ACTG','AGAT','AGCC','AGCG','AGCT',\
+'AGGC','AGGG','ATCC','ATCG','ATGC','CCCG','CCGG','AGTC']
+ALLMOTIF={1:monomotif,2:dimotif,3:trimotif,4:tetramotif}
+monorange=range(5,60)
+dirange=range(6,60)
+trirange=range(9,60)
+tetrarange=range(12,80)
+ALLRANGE={1:monorange,2:dirange,3:trirange,4:tetrarange}
+
+#########################################
+######## Prob calculation sector ########
+#########################################
+def multinomial_prob(majorallele,STRlength,motif,probdatabase):
+    '''int,int,str,dict-->int
+    ### get prob for each STRlength to be generated from major allele
+    '''
+    #print (majorallele,STRlength,motif)
+    prob=probdatabase[len(motif)][motif][majorallele][STRlength]
+    return prob
+
+################################################
+######## error model database sector ###########
+################################################
+
+## structure generator
+errormodeldatabase={1:{},2:{},3:{},4:{}}
+sumbymajoralleledatabase={1:{},2:{},3:{},4:{}}
+for repeattype in ALLREPEATTYPE:
+    for motif in ALLMOTIF[repeattype]:
+        errormodeldatabase[repeattype][motif]={}
+        sumbymajoralleledatabase[repeattype][motif]={}
+        for motifsize1 in ALLRANGE[repeattype]:
+            errormodeldatabase[repeattype][motif][motifsize1]={}
+            sumbymajoralleledatabase[repeattype][motif][motifsize1]=0
+            for motifsize2 in ALLRANGE[repeattype]:
+                errormodeldatabase[repeattype][motif][motifsize1][motifsize2]=MINIMUMMUTABLE
+#print errormodeldatabase
+## read database
+
+## get read count for each major allele
+fd=open(errorprofile)
+lines=fd.readlines()
+for line in lines:
+    temp=line.strip().split('\t')
+    t_major=int(temp[0])
+    t_count=int(temp[2])
+    motif=temp[3]
+    sumbymajoralleledatabase[len(motif)][motif][t_major]+=t_count
+fd.close()
+##print sumbymajoralleledatabase
+
+## get probability
+fd=open(errorprofile)
+lines=fd.readlines()
+for line in lines:
+    temp=line.strip().split('\t')
+    t_major=int(temp[0])
+    t_read=int(temp[1])
+    t_count=int(temp[2])
+    motif=temp[3]
+    if sumbymajoralleledatabase[len(motif)][motif][t_major]>0:
+        errormodeldatabase[len(motif)][motif][t_major][t_read]=t_count/(sumbymajoralleledatabase[len(motif)][motif][t_major]*1.0)
+        #errormodeldatabase[repeattype][motif][t_major][t_read]=math.log(t_count/(sumbymajorallele[t_major]*1.0))
+        
+    #else:
+    #    errormodeldatabase[repeattype][motif][t_major][t_read]=0
+fd.close()
+#print errormodeldatabase    
+#print math.log(100,10)
+#########################################
+######## input reading sector ###########
+#########################################
+
+
+
+fd = open(inputname)
+##fd=open('sampleinput_C.txt')
+lines=fd.xreadlines()
+for line in lines:
+    i_read=[]
+    i2_read=[]
+    temp=line.strip().split('\t')
+    i_coordinate=temp[COORDINATECOLUMN-1]
+    i_motif=motifsimplify(temp[MOTIFCOLUMN-1])
+    i_read=temp[ALLELECOLUMN-1].split(',')
+    i_read=map(int,i_read)
+    depth=len(i_read)
+    heteromajor1=int(temp[6])
+    heteromajor2=int(temp[7])
+
+### calculate the change to detect combination (using error profile)
+    heterozygous_collector=0  
+    alist=[multinomial_prob(heteromajor1,x,i_motif,errormodeldatabase)for x in i_read]
+    blist=[multinomial_prob(heteromajor2,x,i_motif,errormodeldatabase)for x in i_read]
+      
+    ablist=averagelist(alist,blist,EXPECTEDLEVELOFMINOR)
+       
+    if 0 in ablist:
+        continue
+    heterozygous_collector=reduce(lambda y, z: y*z,ablist )
+
+### prob of combination (using multinomial distribution)
+    frequency_distribution=[len(list(group)) for key, group in itertools.groupby(i_read)]
+    ## print frequency_distribution
+    expandbypermutation=permuterepeat(depth,frequency_distribution)
+
+    print line.strip()+'\t'+str(heterozygous_collector)+'\t'+str(expandbypermutation)+'\t'+str(expandbypermutation*heterozygous_collector)+'\t'+str(depth)
b
diff -r 000000000000 -r 20ab85af9505 microsatcompat.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatcompat.py Fri Oct 03 20:54:30 2014 -0400
[
@@ -0,0 +1,36 @@
+import sys
+# remove all read that have unmatch microsat
+# check only one line at a time
+def complement_base(read):
+    collect=''
+    for i in read:
+        if i.upper()=='A':
+            collect+='T'
+        elif i.upper()=='T':
+            collect+='A'
+        elif i.upper()=='C':
+            collect+='G'
+        elif i.upper()=='G':
+            collect+='C'
+    return collect
+   
+def makeallpossible(read):
+    collect=[]
+    for i in range(len(read)):
+        tmp= read[i:]+read[:i]
+        collect.append(tmp)
+        collect.append(complement_base(tmp))
+    return collect
+
+
+fd=open(sys.argv[1])
+lines=fd.xreadlines()
+firstcolumn=int(sys.argv[2])-1 #4
+secondcolumn=int(sys.argv[3])-1 # 10
+for line in lines:
+    temp=line.strip().split('\t')
+    temp=filter(None,temp)
+    micro1=temp[firstcolumn]
+    micro2=temp[secondcolumn]
+    if micro1 in makeallpossible(micro2):
+        print line.strip()
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 microsatcompat.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatcompat.xml Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,76 @@
+<tool id="microsatcompat" name="Check microsatellites motif compatibility" version="1.0.0">
+  <description> </description>
+  <command interpreter="python">microsatcompat.py $input $column1 $column2 > $output </command>
+
+  <inputs>
+    <param name="input" type="data" label="Select input" />
+    <param name="column1" type="integer" value="4" label="First column number" />
+    <param name="column2" type="integer" value="10" label="Second column number" />
+  </inputs>
+  <outputs>
+    <data format="tabular" name="output" />
+    
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="input" value="microsatcompat_in.txt"/>
+      <param name="column1" value="4"/>      
+      <param name="column2" value="10"/>
+      <output name="output" file="microsatcompat_out.txt"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool is used to select only the input lines which have compatible microsatellite motifs between two columns. Compatible here is defined as the microsatellites motif that are complementary or have the same sequence when change starting point of motif. For example, **A** is the same as **T**. Also, **AGG** is the same as **GAG**.
+
+For TRFM pipeline (profiling microsatellites in short read data), this tool can be used to make sure that the microsatellites in the reads have the same motif as the microsatellites in the reference at the corresponding mapped location. 
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**

+**Input**
+
+The input files can be any tab delimited file. 
+
+If this tool is used in TRFM microsatellite profiling, it should contains:
+
+- Column 1 = microsatellite location in reference chromosome
+- Column 2 = microsatellite location in reference start
+- Column 3 = microsatellite location in reference stop
+- Column 4 = microsatellite location in reference motif
+- Column 5 = microsatellite location in reference length
+- Column 6 = microsatellite location in reference motif size
+- Column 7 = length of microsatellites (bp)
+- Column 8 = length of left flanking regions (bp)
+- Column 9 = length of right flanking regions (bp)
+- Column 10 = repeat motif (bp)
+- Column 11 = hamming distance 
+- Column 12 = read name
+- Column 13 = read sequence with soft masking of microsatellites
+- Column 14 = read quality (the same Phred score scale as input)
+- Column 15 = read name (The same as column 12)
+- Column 16 = chromosome 
+- Column 17 = left flanking region start
+- Column 18 = left flanking region stop
+- Column 19 = microsatellite start as infer from pair-end
+- Column 20 = microsatellite stop as infer from pair-end
+- Column 21 = right flanking region start
+- Column 22 = right flanking region stop
+- Column 23 = microsatellite length in reference
+- Column 24 = microsatellite sequence in reference
+
+**Output**
+
+The same as input format.
+
+
+</help>
+</tool>
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 microsatellite.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatellite.py Fri Oct 03 20:54:30 2014 -0400
[
b'@@ -0,0 +1,1271 @@\n+#!/usr/bin/env python\r\n+"""\r\n+Snoop thru a fasta file looking for microsatellite repeats of given periods\r\n+Output format: length_of_repeat left_flank_length right_flank_length  repeat_motif  hamming_distance  read_name read_sequence read_quality  (additional columns)\r\n+\r\n+If --r option turned on, output format will have additional columns behind:\r\n+read_name read_chr  pre_s pre_e tr_s  tr_e  suf_s suf_e tr_len  tr_ref_seq\r\n+\r\n+pre_s           where the read start\r\n+pre_e           the last position before microsatellite\r\n+tr_s            where microsatellite start\r\n+tr_e            where microsatellite end\r\n+suf_s           first base after microsatellite\r\n+tr_ref_seq      reference sequence corresponding to microsatellite\r\n+\r\n+* output positions are 0 based\r\n+\r\n+:Author: Chen Sun (cxs1031@cse.psu.edu); Bob Harris (rsharris@bx.psu.edu)\r\n+          \r\n+modifing log:\r\n+\r\n+09/27/2013\r\n+replace function dense_intervals with function non_negative_intervals, which do not need to import such file.\r\n+\r\n+10/18/2013\r\n+modify function find_repeat_element to get a quick speed, under the condition that hamming_distance = 0, which means do not allowed any mutation/indel\r\n+\r\n+02/25/2014\r\n+add function that can deal with mapped reads\r\n+with additional output\r\n+\r\n+02/28/2014\r\n+modify the 0-based end point, as in 0-base area, it is half-open [ )\r\n+so the 0-based site, should always be added by 1\r\n+\r\n+03/05/2014\r\n+deal with multi-fasta\r\n+"""\r\n+from sys          import argv,stdin,stderr,exit\r\n+from string       import maketrans\r\n+from md5          import new as md5_new\r\n+import re\r\n+#from pyfracluster import dense_intervals\r\n+\r\n+def usage(s=None):\r\n+    message = """\r\n+usage: microsat_snoop [fasta_file] [options]\r\n+  <fasta_file>                Name of file to read sequences from;  if absent,\r\n+                              sequences are read from stdin\r\n+  --fasta                     Input file is in fasta format\r\n+                              (this is the default)\r\n+  --fastq                     Input file is in fastq format\r\n+                              (default is fasta unless filename is .fastq)\r\n+  --fastq:noquals             Input file is in fastq format, but discard quals\r\n+  --sam                       Input file is SAM file \r\n+  --r                         Indicate additional output information, if indicated,\r\n+                              --ref option is mendatory\r\n+  --ref=<filepath>            Reference file (absolute) path\r\n+  --period=<length>           (mandatory,cumulative) repeat length(s) to be\r\n+                              searched for\r\n+                              <length> is expected to be small, less than 10\r\n+                              <length> can also be a comma-separated list, or\r\n+                              a range <low>..<high>\r\n+  --rate=<fraction>           control the candidate repeat interval detector;\r\n+                              it will consider intervals with at least\r\n+                              <fraction> of matches when shifted by the period;\r\n+                              <fraction> is between 0 and 1 and can be either a\r\n+                              real number or <n>/<d>\r\n+                              (default is 6/7)\r\n+  --minlength=<length>        minimum length of intervals reported, in bp\r\n+                              (default is 20)\r\n+  --progress=<count>          how often to report the sequence we\'re searching\r\n+                              (default is no progress report)\r\n+  --allowduplicates           process all input sequences\r\n+                              (this is the default)\r\n+  --noduplicates              ignore any input sequence that\'s the same as an\r\n+                              earlier sequence\r\n+  --nonearduplicates          ignore any input sequence that has the same first\r\n+                              100 bp as an earlier sequence\r\n+  --nonearduplicate=<length>  ignore any input sequence that has the same first\r\n+                 '..b'Note that we convert to upper case,\r\n+#    and convert any letter other than ACGT to N.\r\n+\r\n+def fastq_sequences(f):\r\n+    lineNum = 0\r\n+    for line in f:\r\n+        lineNum += 1\r\n+        line = line.strip()\r\n+\r\n+        if (lineNum % 4 == 1):\r\n+            assert (line.startswith("@")), \\\r\n+                   "bad read name at line %d" % lineNum\r\n+            seqName = line[1:]\r\n+            continue\r\n+\r\n+        if (lineNum % 4 == 2):\r\n+            seqNucs = line\r\n+            continue\r\n+\r\n+        if (lineNum % 4 == 3):\r\n+            assert (line.startswith("+")), \\\r\n+                   "can\'t understand line %d:\\n%s" % (lineNum,line)\r\n+            continue\r\n+\r\n+        quals = line\r\n+        assert (len(quals) == len(seqNucs)), \\\r\n+               "length mismatch read vs. qualities at line %d" % lineNum\r\n+        yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap),quals)\r\n+\r\n+    assert (lineNum % 4 == 0), \\\r\n+           "incomplete read at end of file"\r\n+\r\n+def sam_sequences(f):\r\n+    lineNum = 0\r\n+    for line in f:\r\n+        lineNum += 1\r\n+        line = line.strip()\r\n+\r\n+        if line.startswith("@"):\r\n+            continue\r\n+\r\n+        columns = line.split("\\t")\r\n+        seqName = columns[0]\r\n+        refName = columns[2]\r\n+        pre_s = int(columns[3]) - 1\r\n+        cigar = columns[5]\r\n+        seqNucs = columns[9]\r\n+        \r\n+        yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar)\r\n+\r\n+# sequence_name--\r\n+#    Extract the sequence name from a fasta header.\r\n+#    $$$ this may need to be improved $$$\r\n+\r\n+def sequence_name(s):\r\n+    s = s[1:].strip()\r\n+    if (s == ""): return ""\r\n+    else:         return s.split()[0]\r\n+\r\n+\r\n+# nucleotide_runs--\r\n+#    Yield (start,end) for all runs of valid nucleotides in a sequence.\r\n+\r\n+def nucleotide_runs(s):\r\n+    runs  = []\r\n+    start = None\r\n+    for (ix,nuc) in enumerate(s):\r\n+        if (nuc in "ACGT"):\r\n+            if (start == None):\r\n+                start = ix\r\n+        else:\r\n+            if (start != None):\r\n+                yield (start,ix)\r\n+                start = None\r\n+\r\n+    if (start != None): yield (start,len(s))\r\n+\r\n+\r\n+# contains_repeat--\r\n+#    Determine whether a short sequence contains a repeated element, such as a\r\n+#    6-mer containing a repeated 2-mer (ACACAC) or 3-mer (ACTACT).  The repeat\r\n+#    must cover the entire sequence, without mismatches.\r\n+\r\n+def contains_repeat(kmer):\r\n+    kmerLength = len(kmer)\r\n+    hasRepeat = False\r\n+    rptLen = 1\r\n+    while (not hasRepeat) and (2 * rptLen <= kmerLength):\r\n+        if (kmerLength % rptLen != 0):\r\n+            rptLen += 1\r\n+            continue\r\n+        isRepeat = True\r\n+        for i in xrange(rptLen,kmerLength,rptLen):\r\n+            if (kmer[i:i+rptLen] != kmer[:rptLen]):\r\n+                isRepeat = False\r\n+                break\r\n+        if (isRepeat):\r\n+            hasRepeat = True\r\n+            break\r\n+        rptLen += 1\r\n+    return hasRepeat\r\n+\r\n+\r\n+# hash108--\r\n+#    Return a 108-bit hash "value" of a string\r\n+\r\n+def hash108(s):\r\n+    m = md5_new()\r\n+    m.update(s)\r\n+    return m.hexdigest()[:27]\r\n+\r\n+\r\n+# float_or_fraction--\r\n+#    Convert a string to a number, allowing fractions\r\n+\r\n+def float_or_fraction(s):\r\n+    if ("/" in s):\r\n+        (numer,denom) = s.split("/",1)\r\n+        return float(numer)/float(denom)\r\n+    else:\r\n+        return float(s)\r\n+\r\n+\r\n+# int_with_unit--\r\n+#    Parse a string as an integer, allowing unit suffixes\r\n+\r\n+def int_with_unit(s):\r\n+    if (s.endswith("K")):\r\n+        multiplier = 1000\r\n+        s = s[:-1]\r\n+    elif (s.endswith("M")):\r\n+        multiplier = 1000 * 1000\r\n+        s = s[:-1]\r\n+    elif (s.endswith("G")):\r\n+        multiplier = 1000 * 1000 * 1000\r\n+        s = s[:-1]\r\n+    else:\r\n+        multiplier = 1\r\n+\r\n+    try:               return               int(s)   * multiplier\r\n+    except ValueError: return int(math.ceil(float(s) * multiplier))\r\n+\r\n+\r\n+if __name__ == "__main__": main()\r\n+\r\n'
b
diff -r 000000000000 -r 20ab85af9505 microsatellite.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatellite.xml Fri Oct 03 20:54:30 2014 -0400
b
b'@@ -0,0 +1,178 @@\n+\xef\xbb\xbf<tool id="microsatellite" name="Microsatellite detection" version="1.0.0">\n+\t<description>for short read, reference, and mapped data</description>\n+\t<command interpreter="python2.7"> microsatellite.py\n+\t"${filePath}"\n+\t#if $inputFileSource.inputFileType == "fasta"\n+\t\t--fasta\n+    #elif $inputFileSource.inputFileType == "fastq"\n+\t\t--fastq\n+    #elif $inputFileSource.inputFileType == "fastq_noquals"\n+\t\t--fastq:noquals\n+\t#elif $inputFileSource.inputFileType == "sam"\n+\t\t--sam\n+    #end if\n+\t\n+\t#if $inputFileSource.inputFileType == "sam"\n+\t\t#if $inputFileSource.referenceFileSource.requireReference\n+\t\t\t--r --ref="${inputFileSource.referenceFileSource.referencePath}"\n+\t\t#end if\n+    #end if\n+\t\n+\t--period="${period}"\n+\t\n+\t#if $partialmotifs == "true"\n+\t\t--partialmotifs\n+    #end if\n+\t\n+\t--minlength="${minlength}"\n+\n+\n+\t--prefix="${prefix}"\n+\t--suffix="${surfix}"\n+\t\n+\t--hamming="${hammingThreshold}"\n+\t\n+\t#if $multipleruns\n+\t\t--multipleruns\n+        #end if\n+\n+\t#if $flankSetting.noflankdisplay\n+\t\t--noflankdisplay\n+\t#else\n+\t\t--flankdisplay=${flankSetting.flankdisplay}\n+\t#end if\n+\t&gt; $stdout\n+\t</command>\n+\t\n+  <inputs>\n+\t<param name="filePath" label="Select input file" type="data"/>\n+\t<conditional name="inputFileSource">\n+\t\t<param name="inputFileType" type="select" label="Select input file type">\n+\t\t\t<option value="fasta">Fasta File</option>\n+\t\t\t<option value="fastq">Fastq File</option>\n+\t\t\t<option value="fastq_noquals">Fastq File without Quality Information</option>\n+\t\t\t<option value="sam">SAM File</option>\n+\t\t</param>\n+\t\t<when value="sam">\n+\t\t    <conditional name="referenceFileSource">\n+\t\t\t\t<param name="requireReference" label="Do you want to extract correspond microsatellites in reference for comparison?" type="boolean">\n+\t\t\t\t</param>\n+\t\t\t\t<when value="true">\n+\t\t\t\t\t<param name="referencePath" label="Select reference file" type="data"/>\n+\t\t\t\t</when>\n+\t\t\t</conditional>\n+\t\t</when>\n+\t</conditional>\n+\t\n+\t<param name="period" label="Motif size of microsatellites of interest (e.g. Mononucleotide microsatellite =1) (must be less than 10)" type="integer" size="2" value="1"/>\n+  <param name="partialmotifs" label="Consider microsatellites with a partial motif?" type="boolean" checked="True"/>\n+\t<param name="minlength" label="Minimal length (bp) of microsatellite sequence reported" type="integer" size="2" value="5"/>\n+\t\n+\n+\t<param name="prefix" label="Do not report candidate repeat intervals that have left flanking region less than (bp):" type="integer" size="4" value="20"/>\n+\t<param name="surfix" label="Do not report candidate repeat intervals that have left flanking region less than (bp):" type="integer" size="4" value="20"/>\n+\n+\t\n+\t<param name="hammingThreshold" label="Hamming threshold of microsatellite, If greater than 0,  interrupted microsatellites will also be reported" type="integer" size="2" value="0"/>\n+\t<param name="multipleruns" label="Consider all candidate intervals in a sequence. If not check, only the longest one will be considered" type="boolean" checked="True"> </param>\n+\t<conditional name="flankSetting">\n+        \t<param name="noflankdisplay" label="Show the entire flanking regions" type="boolean" checked="True"/>\n+\t\t<when value="false">\n+\t\t\t<param name="flankdisplay" label="Limit length (bp) of flanking regions shown" type="integer" size="4" value="5"/>\n+\t\t</when>\n+\t</conditional>\n+    \n+  </inputs>\n+  <outputs>\n+    <data name="stdout" format="tabular"/>\n+  </outputs>\n+  <tests>\n+    <!-- Test data with valid values -->\n+    <test>\n+      <param name="filePath" value="C_sample_fastq"/>\n+\t  <param name="period" value="1"/>\n+      <param name="partialmotifs" value="true" />\n+\t  <param name="minlength" value="3" />\n+\t  <param name="prefix" value="5"/>\n+\t  <param name="surfix" value="5"/>\n+\t  <param name="hammingThreshold"  value="0"/>\n+\t  <param name="multipleruns" value="true"> </param>\n+      <output name="microsatellite" file="C_sample_snoope"/>\n+    </test>\n+    \n+  </tests>\n+  <help>\n+\n+\n+.. class:'..b'w size sequences, with a step size of k. If a base at a given position matches one k positions earlier it was marked with a plus, if corresponding sites had different bases it was marked with a minus. The first k position is blank.\n+\n+2) Since we do not allow mutations in reported TR, consecutive \xe2\x80\x9c+\xe2\x80\x9d signal sequence means that a k-mer TR is present in this sample. \n+\n+3) Report k-mer TRs if the length is larger than a threshold provided by the user.\n+\n+If hamming distance is set to integer more than zero, the program will concern both uninterrupted and interrupted microsatellites. The process works as follows:\n+\n+(1) Identify intervals that are highly correlated with the interval shifted by \xe2\x80\x98k\xe2\x80\x99 (the repeat period).  These intervals are called "runs" or "candidates". The allowed level of correlation is 6/7. Depending on whether we want to look for more than one microsat, we either find the longest such run (simple algorithm) or many runs (more complicated algorithm). The following steps are then performed on each run.\n+\n+(2) Find the most likely repeat motif in the run.  This is done by counting all kmers (of length P) and choosing the most frequent.  If that kmer is itself covered by a sub-repeat we discard this run.  The idea is that we can ignore a 6-mer like ACGACG because we will find it when we are looking for 3-mers.\n+\n+(3) Once we identify the most likely repeat motif, we then modify the interval, adjusting start and end to find the interval that has the fewest mismatches vs. a sequence of the motif repeated (hamming distance). \n+\n+(4) At this point we have a valid microsat interval (in the eyes of the program). It is subjected to some filtering stages (hamming distance or too close to an end), and if it satisfies those conditions, it\'s reported to the user\n+    \n+For more option, the script to run this program can be downloaded and run with python independently from Galaxy. There are more option for the script mode. Help page is build-in inside the script.\n+\n+**Citation**\n+\n+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**\n+This tool is developed by Chen Sun (cxs1031@cse.psu.edu) and Bob Harris (rsharris@bx.psu.edu)\n+ \n+**Input**\n+\n+- The input files can be fastq, fasta, fastq without quality score, and SAM format.\n+\n+**Output**\n+\n+For fastq, the output will contain the following columns:\n+\n+- Column 1 = length of microsatellites (bp)\n+- Column 2 = length of left flanking regions (bp)\n+- Column 3 = length of right flanking regions (bp)\n+- Column 4 = repeat motif (bp)\n+- Column 5 = hamming distance \n+- Column 6 = read name\n+- Column 7 = read sequence with soft masking of microsatellites\n+- Column 8 = read quality (the same Phred score scale as input)\n+\n+For fasta, fastq without quality score and sam format, column 8 will be replaced with dot(.).\n+\n+If the users have mapped file (SAM) and would like to profile microsatellites from premapped data instead of using flank-based mapping approach, they can select SAM format input and specify that they want correspond microsatellites in reference for comparison. The output will be as follow:\n+\n+- Column 1 = length of microsatellites (bp)\n+- Column 2 = length of left flanking regions (bp)\n+- Column 3 = length of right flanking regions (bp)\n+- Column 4 = repeat motif (bp)\n+- Column 5 = hamming distance \n+- Column 6 = read name\n+- Column 7 = read sequence with soft masking of microsatellites\n+- Column 8 = read quality (the same Phred score scale as input)\n+- Column 9 = read name (The same as column 6)\n+- Column 10 = chromosome \n+- Column 11 = left flanking region start\n+- Column 12 = left flanking region stop\n+- Column 13 = microsatellite start as infer from pair-end\n+- Column 14 = microsatellite stop as infer from pair-end\n+- Column 15 = right flanking region start\n+- Column 16 = right flanking region stop\n+- Column 17 = microsatellite length in reference\n+- Column 18 = microsatellite sequence in reference\n+\n+</help>\n+</tool>\n'
b
diff -r 000000000000 -r 20ab85af9505 microsatpurity.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatpurity.py Fri Oct 03 20:54:30 2014 -0400
[
@@ -0,0 +1,24 @@
+import sys
+# remove all read that have impure microsat
+# check only one line at a time
+
+
+fd=open(sys.argv[1])
+lines=fd.xreadlines()
+##motifIx=int(sys.argv[2])
+period=int(sys.argv[2])
+tr_ref_seqIx=int(sys.argv[3])-1
+##output=(sys.argv[4])
+##fout=open(output,'w')
+for line in lines:
+    temp=line.strip().split('\t')
+    temp=filter(None,temp)
+    #motif=temp[motifIx]
+    tr_ref_seq=temp[tr_ref_seqIx]
+    ##period=len(motif)
+    cand_motif=tr_ref_seq[:period]
+    len_microsat=len(tr_ref_seq)
+    expand_microsat_cand=cand_motif*(len_microsat/period) + cand_motif[:(len_microsat%period)]
+    if tr_ref_seq == expand_microsat_cand:
+     print line.strip()
+        ##print line.strip() >> fout
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 microsatpurity.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatpurity.xml Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,79 @@
+<tool id="microsatpurity" name="Select uninterrupted microsatellites" version="1.0.0">
+  <description> of a specific column</description>
+  <command interpreter="python">microsatpurity.py $input $period $column_n > $output </command>
+
+  <inputs>
+    <param name="input" type="data" label="Select input" />
+    <param name="period" type="integer" label="motif size" value="1"/>
+    <param name="column_n" type="integer" value="0" label="Select column that contains microsatellites of interest (0 = last column)" />
+  </inputs>
+  <outputs>
+    <data format="tabular" name="output" />
+    
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="input" value="microsatpurity_in.txt"/>
+      <param name="period" value="2"/>      
+      <param name="column_n" value="0"/>
+      <output name="output" file="microsatpurity_out.txt"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool is used to select only the uninterrupted microsatellites. Interrupted microsatellites (e.g. ATATATATAATATAT) or sequences of microsatellites with non-microsatellite parts (e.g. ATATATATATG) will be removed.
+
+For TRFM pipeline (profiling microsatellites in short read data), this tool can be used to avoid the cases that flanking bases were misread as microsatellite. Thus, the read profile will only reflect the variation of TR length from expansion/contraction.
+For example, suppose that the sequence around microsatellite is AGCGACGaaaaaaGCGATCA. If we observe read with sequence AGCGACGaaaaaaaaaaGCGATCA, we can indicate that this is microsatellite expansion. However, if we observe AGCGACGaaaaaaaCGATCA, this is more like a substitution of G to A. These incidents can be removed with this tool.
+You can use the tool **combine mapped flaked bases** to get the microsatellites in reference that correspond to sequence between mapped reads. If the user map these reads around the uninterrupted microsatelites in reference, the corresponding sequences between these pairs should be the uninterrupted microsatellites regardless of expansion/contraction of microsatellites in short read data. However, if the substitution of flanking base or if the fluorescent signal from the previous run make it look like substitution, the corresponding sequences in reference in between the pairs will not be uninterrupted microsatellites. Thus this tool can remove those cases and keep only microsatellite expansion/contraction.
+
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**

+**Input**
+
+The input files can be any tab delimited file. 
+
+If this tool is used in TRFM microsatellite profiling, it should contains:
+
+- Column 1 = microsatellite location in reference chromosome
+- Column 2 = microsatellite location in reference start
+- Column 3 = microsatellite location in reference stop
+- Column 4 = microsatellite location in reference motif
+- Column 5 = microsatellite location in reference length
+- Column 6 = microsatellite location in reference motif size
+- Column 7 = length of microsatellites (bp)
+- Column 8 = length of left flanking regions (bp)
+- Column 9 = length of right flanking regions (bp)
+- Column 10 = repeat motif (bp)
+- Column 11 = hamming distance 
+- Column 12 = read name
+- Column 13 = read sequence with soft masking of microsatellites
+- Column 14 = read quality (the same Phred score scale as input)
+- Column 15 = read name (The same as column 12)
+- Column 16 = chromosome 
+- Column 17 = left flanking region start
+- Column 18 = left flanking region stop
+- Column 19 = microsatellite start as infer from pair-end
+- Column 20 = microsatellite stop as infer from pair-end
+- Column 21 = right flanking region start
+- Column 22 = right flanking region stop
+- Column 23 = microsatellite length in reference
+- Column 24 = microsatellite sequence in reference
+
+**Output**
+
+The same as input format.
+
+
+</help>
+</tool>
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 pair_fetch_DNA_ff.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pair_fetch_DNA_ff.py Fri Oct 03 20:54:30 2014 -0400
[
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+# pair_fetch_DNA_ff.py
+# Function: filter microsat and flanking region by quality score;
+# remove read with any base that has lower quality score than "quality_require" within "flanking_base" and convert from snoope to fastq
+# Note that require flanking length need to be screen by Bob snoope script first
+
+# Author: Arkarachai Fungtammasan
+# Version 1.0.0 (15 July 2012)
+# Input format: length_of_repeat[0]   left_flank_length[1] right_flank_length[2] repeat_motif[3] hamming_distance[4] read_name[5] read_sequence[6] read_quality[7]
+# Output format: two fastq file. First file contain left flank. Second file contain right flank.
+# Command: python pair_fetch_DNA_ff.py input.txt
+
+import sys
+from galaxy import eggs
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+    
+# read file name
+
+
+
+filename=sys.argv[1]
+L_filename=sys.argv[2]
+R_filename=sys.argv[3]
+quality_require=sys.argv[4]
+flanking_base=sys.argv[5]
+try:
+ quality_require=int(quality_require)
+ flanking_base=int(flanking_base)
+except Exception, eee:
+ print eee
+ stop_err("Quality score cutoff and Length of flanking regions that require quality screening must be integer")
+
+fd=open(filename)
+fdd1=open(L_filename,'w')
+fdd2=open(R_filename,'w')
+lines=fd.xreadlines()
+for line in lines:
+    temp=line.strip().split('\t')
+    temp=filter(None,temp)
+    #get index
+    left_flank=(0,int(temp[1]))
+    microsat=(int(temp[1]),int(temp[1])+int(temp[0]))
+    right_flank=(int(temp[1])+int(temp[0]),int(temp[1])+int(temp[0])+int(temp[2]))
+    flag=0
+    #filter length of left and right flank
+    if (right_flank[1]-right_flank[0])<flanking_base:
+     continue
+    if (left_flank[1]-left_flank[0])<flanking_base:
+     continue
+    #filter quality score
+    for i in temp[7][microsat[0]-flanking_base:microsat[1]+flanking_base]:
+        if ord(i)<(quality_require+33):
+            flag=1
+        else:
+            flag=flag
+    #print out to seperated files
+    if flag ==0:
+        newname= temp[5]##+'_'+temp[3]+'_'+temp[0]
+        fdd1.writelines('@'+newname+'\n')
+        fdd2.writelines('@'+newname+'\n')
+        fdd1.writelines(temp[6][left_flank[0]:left_flank[1]]+'\n')
+        fdd2.writelines(temp[6][right_flank[0]:right_flank[1]]+'\n')
+        fdd1.writelines('+'+newname+'\n')
+        fdd2.writelines('+'+newname+'\n')
+        fdd1.writelines(temp[7][left_flank[0]:left_flank[1]]+'\n')
+        fdd2.writelines(temp[7][right_flank[0]:right_flank[1]]+'\n')
+
+fd.close()
+fdd1.close()
+fdd2.close()
+
+
b
diff -r 000000000000 -r 20ab85af9505 probvalueforhetero.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/probvalueforhetero.xml Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,66 @@
+<tool id="heteroprob" name="Evaluate the probability of the allele combination to generate read profile" version="2.0.0">
+  <description></description>
+  <command interpreter="python2.7">heteroprob.py  $microsat_raw $microsat_error_profile  $expectedminorallele > $microsat_corrected </command>
+
+  <inputs>
+    <param name="microsat_raw" type="data" label="Select microsatellite length profile and allele combination file" />
+    <param name="microsat_error_profile" type="data" label="Select microsatellite error profile that correspond to this dataset" />
+ <param name="expectedminorallele" type="float" value="0.5" label="Expected contribution of minor allele when present (0.5 for genotyping)" />
+
+  </inputs>
+  <outputs>
+    <data name="microsat_corrected" format="tabular" />
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="microsat_raw" value="probvalueforhetero_in.txt"/>
+      <param name="microsat_error_profile" value="PCRinclude.allrate.bymajorallele"/>
+      <param name="expectedminorallele" value="0.5"/>
+      <output name="microsat_corrected" file="probvalueforhetero_out.txt"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+- This tool will calculate the probability that the allele combination can generated the given read profile. This tool is part of the pipeline to estimate minimum read depth.
+- The calculation of probability is very similar to the tool **Correct genotype for microsatellite errors**. However, this tool will restrict the calculation to only the allele combination indicated in input. Also, when it encounter allele combination that cannot be generated from error profile, the total probability will be zero instead of using base substitution rate. 
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**

+**Input**
+
+The input format is the same as output from **Correct genotype for microsatellite errors** tool.
+
+- Column 1 = location of microsatellite locus. 
+- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format). 
+- Column 3 = motif of microsatellite in this locus. The input file can contain more than three column. 
+- Column 4 = homozygous/heterozygous label.
+- Column 5 = log based 10 of (the probability of homozygous/the probability of heterozygous)
+- Column 6 = Allele for most probable homozygous form.
+- Column 7 = Allele 1 for most probable heterozygous form.
+- Column 8 = Allele 2 for most probable heterozygous form.
+
+Only column 2,3,7,8 were used in calculation. 
+
+**Output**
+
+
+The output will be contain original eight column from the input. However, it will also add these following columns. 
+- Column 9 = Probability of the allele combination to generate given read profile.
+- Column 10 = Number of possible rearrangement of given read profile.
+- Column 11 = Probability of the allele combination to generate read profile with any rearrangement (Product of column 9 and column 10)
+- Column 12 = Read depth
+
+
+
+
+</help>
+</tool>
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 profilegenerator.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/profilegenerator.py Fri Oct 03 20:54:30 2014 -0400
[
@@ -0,0 +1,66 @@
+import collections
+import itertools
+import sys
+
+filename=sys.argv[1]
+MOTIF=sys.argv[2]
+MOTIFSIZE=len(MOTIF)
+MaxDEPTH=int(sys.argv[3])
+MINIMUMPROB=float(sys.argv[4])##1.0/(10**4)
+MININUMCOUNT=1
+fd=open(filename)
+lines=fd.readlines()
+countbymajorallele=collections.defaultdict(list)
+for line in lines:
+    temp=line.strip().split('\t')
+    t_major=int(temp[0])
+    t_count=int(temp[2])
+    countbymajorallele[t_major].append(t_count)
+fd.close()
+sumbymajorallele=collections.defaultdict(int)
+for t_majorallele in countbymajorallele.keys():
+    sumbymajorallele[t_majorallele]=sum(countbymajorallele[t_majorallele])
+
+fd=open(filename)
+##fd=open('PCRinclude.mono.A.bymajorallele')
+lines=fd.readlines()
+allmajor=collections.defaultdict(list)
+for line in lines:
+    temp=line.strip().split()
+    if int(temp[0])%MOTIFSIZE==0:
+        if (int(temp[2])/(sumbymajorallele[int(temp[0])]*1.0))>=MINIMUMPROB:
+            if int(temp[2])>=MININUMCOUNT:
+                allmajor[int(temp[0])].append(int(temp[1]))
+##print allmajor
+allkey=allmajor.keys()
+allkey.sort()
+#print allkey
+keycount=0
+combinelist_collection=[]
+for dummycount in range(len(allkey)-1):
+    pair1,pair2=allkey[keycount],allkey[keycount+1]
+    pair1list=allmajor[pair1]
+    pair2list=allmajor[pair2]
+    #print pair1list,pair2list
+    pair1list.extend(pair2list)
+    combinelist=list(set(pair1list))
+    combinelist.sort()
+    ##print combinelist
+    combinelist_collection.append(tuple(combinelist))
+    keycount+=1
+combinelist_collection=list(set(combinelist_collection))
+newcombinelist_collection=combinelist_collection[:]
+#combinelist_collection=set(combinelist_collection)
+for smallset1 in combinelist_collection:
+    for smallset2 in combinelist_collection:
+        if set(smallset1).issubset(set(smallset2)) and smallset1 != smallset2:
+            newcombinelist_collection.remove(smallset1)
+            break
+##print combinelist_collection
+    
+for depth in range(2,MaxDEPTH+1):
+    for member_list in newcombinelist_collection:
+        for member in itertools.combinations_with_replacement(member_list,depth):
+            print 'chr'+'\t'+','.join(map(str,member))+'\t'+MOTIF
+                
+    
b
diff -r 000000000000 -r 20ab85af9505 profilegenerator.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/profilegenerator.xml Fri Oct 03 20:54:30 2014 -0400
[
@@ -0,0 +1,110 @@
+<tool id="Profilegenerator" name="Generate all possible combination of read profile" version="2.0.0">
+  <description> of the consecutive allele from given error profile </description>
+  <command interpreter="python2.7">profilegenerator.py  $error_profile $MOTIF $Maxdepth $minprob > $output </command>
+
+  <inputs>
+    <param name="error_profile" type="data" label="Select error profile" />
+    <param name="MOTIF" type="text" value="A" label="Type in a motif of interest (e.g. AGC)" />
+ <param name="Maxdepth" type="integer" value="30" label="Maximum read depth of interest" />
+ <param name="minprob" type="float" value="0.00000001" label="Minimum error rate to be considered" />
+
+  </inputs>
+  <outputs>
+    <data name="output" format="tabular" />
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="error_profile" value="sampleprofilegenerator_in"/>
+      <param name="MOTIF" value="A"/>
+      <param name="Maxdepth" value="3"/>
+      <param name="minprob" file="0.00000001"/>
+      <output name="output" file="sampleprofilegenerator_out"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool will generate all possible combination of observed read profile of the consecutive alleles from given error profile. The range of observed read length can be filtered to contain only those that are frequently occur using "Minimum error rate to be considered" parameter.
+
+This problem will collect the lists of valid (pass "Minimum error rate to be considered" threshold) observed length profiles from combination of consecutive allele lengths. The lists that are equivalent or the subset of the other lists will be removed. For each depth and each list, length profile were generated from combination with replacement which compatible with python 2.7. There could be redundant error profiles generated from different lists if more than one combination of allele is generated due to overlap range of observed microsatellite lengths. The user need to remove them which can be done easily using **sort | uniq** command in unix.
+
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**

+**Input**
+
+- The error profile needs to contain these three columns. 
+- Column 1 = Correct microsatellite length 
+- Column 2 = Observed microsatellite length 
+- Column 3 = Number of observation
+
+**Output**
+
+- Column 1 = Place holder for location of microsatellite locus. (just "chr")
+- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format). 
+- Column 3 = motif of microsatellite in this locus. 

+**Example**
+
+- Suppose that we provide the following read profile ::
+
+ 9 9 100000
+ 10 10 91456
+ 10 9 1259
+ 11 11 39657
+ 11 10 1211
+ 11 12 514
+
+
+- Using default minimum probability to be consider and motif = A, all observed read lengths are valid. The program will generated lists of observed length profiles from consecutive allele length. ::
+
+ 9:10 = [9,10]
+ 10:11 = [9,10,11,12]
+
+- Lists that are subsets of other lists will be removed. Thus, [9,10] will not be considered. 
+
+- Then the program will generate all combination with replacement for each depth from each list. Using **maximum read depth =3**, we will ge the following output. ::
+
+
+ chr 9,9 A
+ chr 9,10 A
+ chr 9,11 A
+ chr 9,12 A
+ chr 10,10 A
+ chr 10,11 A
+ chr 10,12 A
+ chr 11,11 A
+ chr 11,12 A
+ chr 12,12 A
+ chr 9,9,9 A
+ chr 9,9,10 A
+ chr 9,9,11 A
+ chr 9,9,12 A
+ chr 9,10,10 A
+ chr 9,10,11 A
+ chr 9,10,12 A
+ chr 9,11,11 A
+ chr 9,11,12 A
+ chr 9,12,12 A
+ chr 10,10,10 A
+ chr 10,10,11 A
+ chr 10,10,12 A
+ chr 10,11,11 A
+ chr 10,11,12 A
+ chr 10,12,12 A
+ chr 11,11,11 A
+ chr 11,11,12 A
+ chr 11,12,12 A
+ chr 12,12,12 A
+
+
+</help>
+</tool>
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 readdepth2sequencingdepth.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/readdepth2sequencingdepth.xml Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,57 @@
+<tool id="readdepth2seqdepth" name="Convert informative read depth to sequencing depth" version="1.0.0">
+  <description>for flank-based mapping of microsatellites</description>
+  <command interpreter="python2.7">sequencingdepthconversion_G.py $repeatlength $flanksize $readlength $infodepth $probprediction > $output </command>
+
+  <inputs>
+    <param name="repeatlength" type="integer" value="10" label="Repeat length (bp)" />
+    <param name="flanksize" type="integer" value="20" label="Required flank bases on each side in mapping" />
+    <param name="readlength" type="integer" value="100" label="Read length (treat all read as single end read)" />
+    <param name="infodepth" type="integer" value="5" label="Required read depth" />
+    <param name="probprediction" type="float" value="0.9" label="Proportion of genome that need certain level of read depth" />
+  </inputs>
+  <outputs>
+    <data format="input" name="output" />
+    
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+ <param name="repeatlength" value="10"/>
+     <param name="flanksize" value="20" />
+     <param name="readlength" value="100" />
+     <param name="infodepth" value="5" />
+ <param name="probprediction"  value="0.9" />
+ <output name="output" file="readdepth2seqdepth.out"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool is used to convert informative read depth (specified by user) to sequencing depth when the microsatellites is mapped using TRFM pipeline.
+The locus specific sequencing depth is the sequencing depth that will make a certain loci have certain read depth based on uniform mapped of read. It is calculated as: ::
+
+ yrequired = ( X * L ) / (L - (2F+r-1))
+
+Where X = read depth, L = read length, F = the number of flanked bases required on each flanking regions, r = the expected repeat length of microsatellite of interest.
+
+The genome wide sequencing depth is the sequencing depth that will make certain percentage of genome (e.g. 90 percent or 95 percent) to have certain locus specific sequencing depth. It's calculated using numerical guessing to find smallest lambda that: ::
+
+  0.90 (or other proportion specified by user) &lt; = P(Y=0) + P(Y=1) + …+ P(Y=yrequired-1)  
+  
+  P(Y=y) = (lambda^(y) * e ^(-lambda)) /y!
+
+ y = specific level of sequencing depth. Lambda = genome wide sequencing depth
+
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**

+
+</help>
+</tool>
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 sequencingdepthconversion_G.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sequencingdepthconversion_G.py Fri Oct 03 20:54:30 2014 -0400
[
@@ -0,0 +1,54 @@
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+        
+def info2require(X,L,F,r):
+    '''infodepth,readlength,flanksize,repeatlength
+    '''
+    return int(math.ceil((X*L*1.0)/(L-(1*((2*F)+r-1)))))
+    
+def poissondef(meancov,specificcov):
+    nominator=1.0*(meancov**specificcov)*(math.e**(-1*meancov))
+    denominator=math.factorial(specificcov)
+    return nominator/denominator
+
+def require2recommend(needprob,mindepth):
+    i=mindepth
+    reverseneedprob=1-needprob
+    sumprob=1
+    while sumprob>reverseneedprob: #mean cov
+        sumprob=0
+        for j in range(0,mindepth): #specific cov
+            sumprob+=poissondef(i,j)
+        i+=1
+        
+    return i-1
+
+import sys,math
+
+repeatlength=int(sys.argv[1])
+flanksize=int(sys.argv[2])#20
+readlength=int(sys.argv[3])#100
+infodepth=int(sys.argv[4])#5
+probdetection=float(sys.argv[5])#0.90
+
+if probdetection >1:
+    try:
+        probvalue=int('probvalue')
+    except Exception, eee:
+        print eee
+        stop_err("Proportion of genome to have certain locus specific must be between 0 and 1")
+
+print 'repeat_length'+'\t'+'read_length'+'\t'+'informative_read_depth''\t'+'=locus_specific_sequencing_depth'+'\t'+'=genome_wide_sequencing_depth'
+t_requiredepth=info2require(infodepth,readlength,flanksize,repeatlength)
+t_recomendseq=require2recommend(probdetection,t_requiredepth)
+preplotlist=[repeatlength,readlength,infodepth,t_requiredepth,t_recomendseq]
+plotlist=map(str,preplotlist)
+print '\t'.join(plotlist)
+
+#print info2require(infodepth,readlength,flanksize,repeatlength)
+#print poissondef(10,3)
+#print require2recommend(0.90,80)
+#informative_read_depth
+#required_seq_depth
+#recommend_seq_depth
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 space2underscore_readname.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/space2underscore_readname.xml Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,47 @@
+<tool id="space2underscore_readname" name="Read name modifier" version="1.0.0">
+  <description>--change space to underscore of a specific column</description>
+  <command interpreter="python">changespacetounderscore_readname.py  $input $output $column_n </command>
+
+  <inputs>
+    <param name="input" type="data" label="Select input" />
+    <param name="column_n" type="integer" value="6" label="Select column to modify" />
+  </inputs>
+  <outputs>
+    <data format="tabular" name="output" />
+    
+  </outputs>
+  <tests>
+    <!-- Test data with valid values -->
+    <test>
+      <param name="input" value="samplefq.snoope"/>
+      <param name="column_n" value="6"/>
+      <output name="output" file="samplefq.snoope.new"/>
+    </test>
+    
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool is used to change space to underscore. For TRFM pipeline (profiling microsatellites in short read data), this tool is used to change space in read name to underscore to prevent the downstream tools which might recognize incorrect column number due to space in read name. If the input do not have space in read name, this step can be skipped.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**

+**Input**
+
+The input files can be any tab delimited file. 
+
+If this tool is used in TRFM microsatellite profiling, it should be in the same format as output from **microsatellite detection program**. This format contains **length of repeat**, **length of left flanking region**, **length of right flanking region**, **repeat motif**, **hamming (editing) distance**, **read name**, **read sequence**, **read quality score**
+
+**Output**
+
+The same as input format.
+
+
+</help>
+</tool>
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 test-data/.DS_Store
b
Binary file test-data/.DS_Store has changed
b
diff -r 000000000000 -r 20ab85af9505 test-data/C_sample_fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/C_sample_fastq Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,8 @@
+@IL2_40_2_1_735_755
+ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTAAAGTGCTGAAATAACAT
++
+IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5*
+@IL2_40_2_1_919_700
+ATAAGGAAAAAAAAAAAAAAAACCAGGTCTTTTTTTTTTTTTTTTTGTTAT
++
+IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$&
b
diff -r 000000000000 -r 20ab85af9505 test-data/C_sample_snoope
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/C_sample_snoope Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,4 @@
+3 33 15 A 0 IL2_40_2_1_735_755_1_per1_2 ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTaaaGTGCTGAAATAACAT IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5*
+3 42 6 A 0 IL2_40_2_1_735_755_1_per1_3 ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTAAAGTGCTGaaaTAACAT IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5*
+16 6 29 A 0 IL2_40_2_1_919_700_1_per1_1 ATAAGGaaaaaaaaaaaaaaaaCCAGGTCTTTTTTTTTTTTTTTTTGTTAT IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$&
+17 29 5 T 0 IL2_40_2_1_919_700_1_per1_2 ATAAGGAAAAAAAAAAAAAAAACCAGGTCtttttttttttttttttGTTAT IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$&
b
diff -r 000000000000 -r 20ab85af9505 test-data/PCRinclude.allrate.bymajorallele
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/PCRinclude.allrate.bymajorallele Fri Oct 03 20:54:30 2014 -0400
b
b'@@ -0,0 +1,997 @@\n+10\t10\t91456\tA\n+10\t9\t1259\tA\n+10\t11\t605\tA\n+10\t8\t16\tA\n+10\t12\t8\tA\n+10\t7\t2\tA\n+11\t11\t39657\tA\n+11\t10\t1211\tA\n+11\t12\t514\tA\n+11\t9\t54\tA\n+11\t13\t9\tA\n+11\t8\t3\tA\n+11\t14\t1\tA\n+12\t12\t18850\tA\n+12\t11\t986\tA\n+12\t13\t417\tA\n+12\t10\t73\tA\n+12\t14\t8\tA\n+12\t9\t1\tA\n+12\t8\t1\tA\n+13\t13\t10201\tA\n+13\t12\t885\tA\n+13\t14\t320\tA\n+13\t11\t83\tA\n+13\t15\t12\tA\n+13\t10\t8\tA\n+14\t14\t3649\tA\n+14\t13\t409\tA\n+14\t15\t151\tA\n+14\t12\t62\tA\n+14\t11\t6\tA\n+14\t16\t5\tA\n+14\t10\t1\tA\n+15\t15\t847\tA\n+15\t14\t140\tA\n+15\t16\t60\tA\n+15\t13\t20\tA\n+15\t17\t4\tA\n+15\t12\t3\tA\n+16\t16\t182\tA\n+16\t15\t60\tA\n+16\t17\t14\tA\n+16\t14\t12\tA\n+16\t13\t1\tA\n+16\t12\t1\tA\n+16\t18\t1\tA\n+17\t17\t11\tA\n+17\t16\t5\tA\n+17\t15\t2\tA\n+17\t18\t1\tA\n+18\t18\t4\tA\n+18\t17\t2\tA\n+5\t5\t10047169\tA\n+5\t6\t44\tA\n+6\t6\t2808071\tA\n+6\t5\t195\tA\n+6\t7\t69\tA\n+7\t7\t1097174\tA\n+7\t6\t313\tA\n+7\t8\t83\tA\n+7\t5\t6\tA\n+8\t8\t369496\tA\n+8\t7\t387\tA\n+8\t9\t248\tA\n+8\t6\t3\tA\n+8\t10\t2\tA\n+9\t9\t184958\tA\n+9\t8\t707\tA\n+9\t10\t486\tA\n+9\t7\t5\tA\n+9\t11\t4\tA\n+10\t10\t46\tC\n+10\t9\t3\tC\n+5\t5\t1354993\tC\n+5\t6\t7\tC\n+6\t6\t193431\tC\n+6\t5\t14\tC\n+6\t7\t2\tC\n+7\t7\t22171\tC\n+7\t6\t4\tC\n+8\t8\t2966\tC\n+8\t9\t3\tC\n+8\t7\t3\tC\n+9\t9\t638\tC\n+9\t8\t8\tC\n+9\t7\t1\tC\n+10\t10\t21211\tAC\n+10\t8\t3\tAC\n+10\t12\t1\tAC\n+11\t11\t15048\tAC\n+11\t9\t10\tAC\n+12\t12\t6043\tAC\n+12\t10\t15\tAC\n+12\t14\t1\tAC\n+13\t13\t5070\tAC\n+13\t11\t40\tAC\n+13\t15\t1\tAC\n+14\t14\t3093\tAC\n+14\t12\t44\tAC\n+14\t10\t1\tAC\n+15\t15\t2848\tAC\n+15\t13\t31\tAC\n+15\t17\t1\tAC\n+16\t16\t1273\tAC\n+16\t14\t30\tAC\n+16\t12\t2\tAC\n+17\t17\t1297\tAC\n+17\t15\t27\tAC\n+18\t18\t1269\tAC\n+18\t16\t43\tAC\n+18\t20\t2\tAC\n+18\t14\t1\tAC\n+19\t19\t679\tAC\n+19\t17\t17\tAC\n+19\t21\t1\tAC\n+20\t20\t645\tAC\n+20\t18\t34\tAC\n+20\t22\t2\tAC\n+20\t16\t1\tAC\n+21\t21\t723\tAC\n+21\t19\t28\tAC\n+21\t17\t1\tAC\n+21\t23\t1\tAC\n+22\t22\t499\tAC\n+22\t20\t29\tAC\n+22\t18\t3\tAC\n+23\t23\t540\tAC\n+23\t21\t30\tAC\n+23\t19\t2\tAC\n+23\t25\t1\tAC\n+24\t24\t385\tAC\n+24\t22\t38\tAC\n+24\t26\t2\tAC\n+24\t20\t1\tAC\n+25\t25\t407\tAC\n+25\t23\t22\tAC\n+25\t27\t2\tAC\n+25\t21\t1\tAC\n+26\t26\t257\tAC\n+26\t24\t30\tAC\n+26\t22\t3\tAC\n+26\t28\t1\tAC\n+26\t20\t1\tAC\n+27\t27\t339\tAC\n+27\t25\t28\tAC\n+27\t23\t3\tAC\n+27\t29\t2\tAC\n+28\t28\t202\tAC\n+28\t26\t17\tAC\n+28\t30\t6\tAC\n+29\t29\t277\tAC\n+29\t27\t29\tAC\n+29\t31\t6\tAC\n+29\t25\t3\tAC\n+30\t30\t117\tAC\n+30\t28\t12\tAC\n+30\t32\t3\tAC\n+30\t18\t1\tAC\n+31\t31\t144\tAC\n+31\t29\t18\tAC\n+31\t27\t4\tAC\n+31\t33\t2\tAC\n+32\t32\t101\tAC\n+32\t30\t23\tAC\n+32\t28\t2\tAC\n+32\t34\t2\tAC\n+32\t26\t1\tAC\n+33\t33\t106\tAC\n+33\t31\t15\tAC\n+33\t35\t3\tAC\n+33\t29\t1\tAC\n+34\t34\t33\tAC\n+34\t32\t7\tAC\n+35\t35\t21\tAC\n+35\t33\t4\tAC\n+35\t31\t1\tAC\n+36\t36\t12\tAC\n+36\t34\t1\tAC\n+37\t37\t10\tAC\n+37\t35\t3\tAC\n+37\t31\t1\tAC\n+37\t39\t1\tAC\n+38\t38\t4\tAC\n+38\t36\t1\tAC\n+6\t6\t1521439\tAC\n+7\t7\t513952\tAC\n+8\t8\t134603\tAC\n+8\t6\t2\tAC\n+9\t9\t60741\tAC\n+9\t7\t3\tAC\n+9\t11\t1\tAC\n+10\t10\t21772\tAG\n+10\t8\t3\tAG\n+10\t12\t1\tAG\n+11\t11\t13880\tAG\n+11\t9\t10\tAG\n+11\t13\t1\tAG\n+12\t12\t5628\tAG\n+12\t10\t13\tAG\n+12\t14\t4\tAG\n+13\t13\t4494\tAG\n+13\t11\t17\tAG\n+14\t14\t1898\tAG\n+14\t12\t15\tAG\n+15\t15\t2427\tAG\n+15\t13\t18\tAG\n+16\t16\t1076\tAG\n+16\t14\t24\tAG\n+16\t12\t1\tAG\n+17\t17\t874\tAG\n+17\t15\t12\tAG\n+17\t19\t1\tAG\n+17\t13\t1\tAG\n+18\t18\t536\tAG\n+18\t16\t20\tAG\n+18\t14\t1\tAG\n+19\t19\t563\tAG\n+19\t17\t25\tAG\n+20\t20\t201\tAG\n+20\t18\t14\tAG\n+21\t21\t260\tAG\n+21\t19\t10\tAG\n+22\t22\t83\tAG\n+22\t20\t5\tAG\n+23\t23\t147\tAG\n+23\t21\t5\tAG\n+23\t25\t1\tAG\n+24\t24\t99\tAG\n+24\t22\t4\tAG\n+24\t18\t1\tAG\n+25\t25\t62\tAG\n+25\t23\t3\tAG\n+25\t27\t1\tAG\n+26\t26\t38\tAG\n+26\t24\t8\tAG\n+27\t27\t24\tAG\n+27\t25\t3\tAG\n+27\t23\t1\tAG\n+28\t28\t14\tAG\n+28\t26\t2\tAG\n+29\t29\t12\tAG\n+29\t27\t5\tAG\n+29\t31\t1\tAG\n+30\t30\t7\tAG\n+30\t28\t2\tAG\n+31\t31\t7\tAG\n+31\t27\t3\tAG\n+31\t23\t1\tAG\n+32\t32\t4\tAG\n+32\t28\t1\tAG\n+6\t6\t1880822\tAG\n+7\t7\t684837\tAG\n+7\t9\t1\tAG\n+8\t8\t183381\tAG\n+9\t9\t75547\tAG\n+9\t7\t6\tAG\n+9\t11\t1\tAG\n+10\t10\t18179\tAT\n+10\t8\t7\tAT\n+10\t12\t4\tAT\n+11\t11\t8969\tAT\n+11\t9\t5\tAT\n+11\t13\t2\tAT\n+12\t12\t4888\tAT\n+12\t10\t8\tAT\n+12\t14\t2\tAT\n+13\t13\t2785\tAT\n+13\t11\t17\tAT\n+13\t15\t1\tAT\n+14\t14\t2310\tAT\n+14\t12\t40\tAT\n+14\t16\t4\tAT\n+14\t10\t2\tAT\n+15\t15\t1461\tAT\n+15\t13\t33\tAT\n+15\t11\t1\tAT\n+15\t17\t1\tAT\n+16\t16\t879\tAT\n+16\t14\t42\tAT\n+16\t18\t2\tAT\n+16\t12\t1\tAT\n+17\t17\t599\tAT\n+17\t15\t38\tAT\n+17\t19\t2\tAT\n+17\t13\t1\tAT\n+18\t18\t367\tAT\n+18\t16\t29\tAT\n+18\t20\t7\tAT\n+18\t14\t1\tAT\n+19\t19\t223\tAT\n+19\t17\t34\tAT\n+19\t21\t3\tAT\n+20\t20\t97\tAT\n+20\t18\t14\tAT\n+20\t16\t2\tAT\n+20\t22\t1\tAT\n+21\t21\t60\tAT\n+21\t19\t18\tAT\n+21\t17\t1\tAT\n+22\t22\t53\tAT\n+22\t20\t15\tAT\n+22\t24\t5\tAT\n+22\t18\t3\tAT\n+23\t23\t11\tAT\n+23\t21\t1\tAT\n+24\t24\t7\tAT\n+24\t20\t2\tAT\n+24\t22\t2\tAT\n+6\t6\t1671932\tAT\n+6\t8\t1\tAT\n+7\t7\t595145\tAT\n+8\t8\t195533\tAT\n+8\t10\t5\tAT\n+8\t6\t2\tAT\n+9\t9\t52576\tAT\n+9\t7\t3\tAT\n+10\t10\t17\tCG\n+11\t11\t17\tCG\n'..b'5\tAAGT\n+34\t34\t9\tAAGT\n+35\t35\t6\tAAGT\n+12\t12\t594\tAATC\n+13\t13\t205\tAATC\n+14\t14\t88\tAATC\n+15\t15\t112\tAATC\n+16\t16\t20\tAATC\n+17\t17\t81\tAATC\n+18\t18\t23\tAATC\n+21\t21\t13\tAATC\n+22\t22\t8\tAATC\n+24\t24\t19\tAATC\n+26\t26\t7\tAATC\n+28\t28\t9\tAATC\n+33\t33\t6\tAATC\n+12\t12\t2293\tAATG\n+13\t13\t1226\tAATG\n+14\t14\t678\tAATG\n+15\t15\t455\tAATG\n+16\t16\t222\tAATG\n+17\t17\t211\tAATG\n+18\t18\t104\tAATG\n+19\t19\t79\tAATG\n+20\t20\t40\tAATG\n+21\t21\t33\tAATG\n+22\t22\t73\tAATG\n+23\t23\t24\tAATG\n+24\t24\t16\tAATG\n+25\t25\t18\tAATG\n+26\t26\t15\tAATG\n+27\t27\t22\tAATG\n+27\t23\t1\tAATG\n+28\t28\t5\tAATG\n+32\t32\t17\tAATG\n+33\t33\t16\tAATG\n+12\t12\t2633\tAATT\n+13\t13\t1086\tAATT\n+14\t14\t1052\tAATT\n+15\t15\t386\tAATT\n+16\t16\t393\tAATT\n+17\t17\t98\tAATT\n+18\t18\t104\tAATT\n+19\t19\t105\tAATT\n+20\t20\t34\tAATT\n+21\t21\t12\tAATT\n+22\t22\t20\tAATT\n+25\t25\t18\tAATT\n+26\t26\t25\tAATT\n+27\t27\t7\tAATT\n+29\t29\t7\tAATT\n+35\t35\t12\tAATT\n+12\t12\t1406\tACAG\n+13\t13\t964\tACAG\n+14\t14\t300\tACAG\n+15\t15\t130\tACAG\n+16\t16\t102\tACAG\n+17\t17\t49\tACAG\n+18\t18\t30\tACAG\n+19\t19\t88\tACAG\n+20\t20\t5\tACAG\n+23\t23\t5\tACAG\n+12\t12\t4868\tACAT\n+12\t15\t4\tACAT\n+13\t13\t3216\tACAT\n+14\t14\t957\tACAT\n+15\t15\t1052\tACAT\n+16\t16\t588\tACAT\n+17\t17\t422\tACAT\n+18\t18\t239\tACAT\n+19\t19\t238\tACAT\n+19\t15\t1\tACAT\n+20\t20\t25\tACAT\n+21\t21\t79\tACAT\n+22\t22\t20\tACAT\n+23\t23\t38\tACAT\n+27\t27\t42\tACAT\n+29\t29\t18\tACAT\n+31\t31\t5\tACAT\n+32\t32\t5\tACAT\n+35\t35\t6\tACAT\n+36\t36\t9\tACAT\n+41\t41\t14\tACAT\n+44\t44\t8\tACAT\n+44\t40\t1\tACAT\n+50\t50\t12\tACAT\n+12\t12\t833\tACCC\n+13\t13\t345\tACCC\n+14\t14\t190\tACCC\n+15\t15\t60\tACCC\n+16\t16\t12\tACCC\n+17\t17\t15\tACCC\n+19\t19\t8\tACCG\n+12\t12\t416\tACCT\n+13\t13\t123\tACCT\n+14\t14\t140\tACCT\n+15\t15\t69\tACCT\n+16\t16\t41\tACCT\n+17\t17\t45\tACCT\n+19\t19\t18\tACCT\n+20\t20\t27\tACCT\n+21\t21\t19\tACCT\n+22\t22\t6\tACCT\n+27\t27\t13\tACCT\n+28\t28\t7\tACCT\n+29\t29\t9\tACCT\n+30\t30\t7\tACCT\n+34\t34\t6\tACCT\n+45\t45\t5\tACCT\n+12\t12\t84\tACGC\n+13\t13\t52\tACGC\n+15\t15\t63\tACGC\n+12\t12\t433\tACGG\n+13\t13\t163\tACGG\n+14\t14\t38\tACGG\n+15\t15\t44\tACGG\n+16\t16\t7\tACGG\n+17\t17\t11\tACGG\n+19\t19\t6\tACGG\n+25\t25\t10\tACGG\n+12\t12\t1119\tACGT\n+13\t13\t509\tACGT\n+14\t14\t338\tACGT\n+15\t15\t16\tACGT\n+16\t16\t66\tACGT\n+17\t17\t7\tACGT\n+19\t19\t27\tACGT\n+12\t12\t2211\tACTC\n+13\t13\t685\tACTC\n+14\t14\t188\tACTC\n+15\t15\t151\tACTC\n+16\t16\t91\tACTC\n+18\t18\t17\tACTC\n+19\t19\t24\tACTC\n+20\t20\t23\tACTC\n+21\t21\t13\tACTC\n+23\t23\t19\tACTC\n+45\t45\t8\tACTC\n+12\t12\t161\tACTG\n+13\t13\t69\tACTG\n+14\t14\t7\tACTG\n+15\t15\t14\tACTG\n+16\t16\t15\tACTG\n+12\t12\t3118\tAGAT\n+13\t13\t1216\tAGAT\n+14\t14\t1084\tAGAT\n+15\t15\t869\tAGAT\n+16\t16\t508\tAGAT\n+17\t17\t322\tAGAT\n+18\t18\t159\tAGAT\n+19\t19\t258\tAGAT\n+20\t20\t63\tAGAT\n+21\t21\t84\tAGAT\n+22\t22\t69\tAGAT\n+22\t14\t6\tAGAT\n+23\t23\t112\tAGAT\n+24\t24\t107\tAGAT\n+25\t25\t36\tAGAT\n+26\t26\t113\tAGAT\n+27\t27\t42\tAGAT\n+28\t28\t58\tAGAT\n+29\t29\t37\tAGAT\n+30\t30\t16\tAGAT\n+31\t31\t32\tAGAT\n+32\t32\t24\tAGAT\n+33\t33\t10\tAGAT\n+34\t34\t43\tAGAT\n+35\t35\t6\tAGAT\n+36\t36\t13\tAGAT\n+36\t32\t1\tAGAT\n+37\t37\t35\tAGAT\n+38\t38\t34\tAGAT\n+39\t39\t20\tAGAT\n+39\t35\t2\tAGAT\n+40\t40\t27\tAGAT\n+41\t41\t29\tAGAT\n+42\t42\t30\tAGAT\n+43\t43\t87\tAGAT\n+44\t44\t67\tAGAT\n+45\t45\t20\tAGAT\n+46\t46\t15\tAGAT\n+47\t47\t28\tAGAT\n+48\t48\t26\tAGAT\n+49\t49\t13\tAGAT\n+50\t50\t11\tAGAT\n+52\t52\t5\tAGAT\n+54\t54\t6\tAGAT\n+12\t12\t236\tAGCC\n+13\t13\t109\tAGCC\n+14\t14\t17\tAGCC\n+15\t15\t14\tAGCC\n+16\t16\t8\tAGCC\n+18\t18\t12\tAGCC\n+21\t21\t18\tAGCC\n+23\t23\t13\tAGCC\n+12\t12\t23\tAGCG\n+13\t13\t19\tAGCG\n+18\t18\t9\tAGCG\n+12\t12\t272\tAGCT\n+13\t13\t89\tAGCT\n+14\t14\t108\tAGCT\n+15\t15\t49\tAGCT\n+16\t16\t19\tAGCT\n+17\t17\t19\tAGCT\n+18\t18\t19\tAGCT\n+19\t19\t44\tAGCT\n+22\t22\t12\tAGCT\n+27\t27\t16\tAGCT\n+12\t12\t87\tAGGC\n+13\t13\t19\tAGGC\n+14\t14\t16\tAGGC\n+18\t18\t7\tAGGC\n+12\t12\t3610\tAGGG\n+13\t13\t1980\tAGGG\n+14\t14\t1095\tAGGG\n+15\t15\t624\tAGGG\n+16\t16\t159\tAGGG\n+17\t17\t59\tAGGG\n+18\t18\t43\tAGGG\n+19\t19\t60\tAGGG\n+20\t20\t49\tAGGG\n+21\t21\t12\tAGGG\n+23\t23\t10\tAGGG\n+12\t12\t531\tATCC\n+13\t13\t323\tATCC\n+14\t14\t221\tATCC\n+15\t15\t58\tATCC\n+16\t16\t78\tATCC\n+17\t17\t38\tATCC\n+18\t18\t12\tATCC\n+19\t19\t19\tATCC\n+20\t20\t17\tATCC\n+21\t21\t44\tATCC\n+22\t22\t12\tATCC\n+23\t23\t39\tATCC\n+24\t24\t11\tATCC\n+25\t25\t12\tATCC\n+27\t27\t10\tATCC\n+32\t32\t6\tATCC\n+39\t39\t8\tATCC\n+40\t40\t6\tATCC\n+48\t48\t7\tATCC\n+12\t12\t272\tATCG\n+13\t13\t89\tATCG\n+14\t14\t108\tATCG\n+15\t15\t49\tATCG\n+16\t16\t19\tATCG\n+17\t17\t19\tATCG\n+18\t18\t19\tATCG\n+19\t19\t44\tATCG\n+22\t22\t12\tATCG\n+27\t27\t16\tATCG\n+12\t12\t1119\tATGC\n+13\t13\t509\tATGC\n+14\t14\t338\tATGC\n+15\t15\t16\tATGC\n+16\t16\t66\tATGC\n+17\t17\t7\tATGC\n+19\t19\t27\tATGC\n+12\t12\t13\tCCCG\n+12\t12\t178\tAGTC\n+13\t13\t77\tAGTC\n+14\t14\t13\tAGTC\n+15\t15\t12\tAGTC\n'
b
diff -r 000000000000 -r 20ab85af9505 test-data/combineprob_out.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/combineprob_out.txt Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,7 @@
+read_depth allele heterozygous_prob motif
+2 10_11 0.485943568663 A
+2 11_12 0.472130683091 A
+2 9_10 0.494635026326 A
+3 10_11 0.71878954705 A
+3 11_12 0.688571908761 A
+3 9_10 0.73801798345 A
b
diff -r 000000000000 -r 20ab85af9505 test-data/microsatcompat_in.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatcompat_in.txt Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,3 @@
+15 64416346 64416378 AT 32 16 18 22 61 TA 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT
+17 52191125 52191133 GA 8 4 8 26 67 AC 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
+17 52191125 52191133 AC 8 4 8 26 67 AG 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
b
diff -r 000000000000 -r 20ab85af9505 test-data/microsatcompat_out.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatcompat_out.txt Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,3 @@
+15 64416346 64416378 AT 32 16 18 22 61 TA 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT
+17 52191125 52191133 GA 8 4 8 26 67 AC 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
+17 52191125 52191133 AC 8 4 8 26 67 AG 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
b
diff -r 000000000000 -r 20ab85af9505 test-data/microsatellite_flanking_L.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatellite_flanking_L.fastq Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,4 @@
+@SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCT
++SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG
b
diff -r 000000000000 -r 20ab85af9505 test-data/microsatellite_flanking_R.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatellite_flanking_R.fastq Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,4 @@
+@SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+TTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG
++SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+GGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
b
diff -r 000000000000 -r 20ab85af9505 test-data/microsatpurity_in.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatpurity_in.txt Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,3 @@
+15 64416346 64416378 AT 32 16 18 22 61 AT 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT
+15 64416346 64416378 AT 32 16 18 22 61 AT 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATTATATATATATAT
+17 52191125 52191133 AC 8 4 8 26 67 AC 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
b
diff -r 000000000000 -r 20ab85af9505 test-data/microsatpurity_out.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatpurity_out.txt Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,2 @@
+15 64416346 64416378 AT 32 16 18 22 61 AT 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT
+17 52191125 52191133 AC 8 4 8 26 67 AC 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
b
diff -r 000000000000 -r 20ab85af9505 test-data/nice1tab.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/nice1tab.py Fri Oct 03 20:54:30 2014 -0400
[
@@ -0,0 +1,6 @@
+import sys
+fd=open(sys.argv[1])
+lines=fd.readlines()
+for line in lines:
+    temp=line.strip().split()
+    print '\t'.join(temp)
\ No newline at end of file
b
diff -r 000000000000 -r 20ab85af9505 test-data/probvalueforhetero_in.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/probvalueforhetero_in.txt Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,9 @@
+chr 9,10 A hetero -1.27220836321 10 10 9
+chr 10,11 A hetero -0.939119957032 11 11 10
+chr 11,12 A hetero -0.720375026792 12 12 11
+chr 9,9,10 A hetero -1.6841441619 9 9 10
+chr 9,10,10 A hetero -0.97233405327 10 10 9
+chr 10,10,11 A hetero -1.29451118958 10 10 11
+chr 10,11,11 A hetero -0.641022011041 11 11 10
+chr 11,11,12 A hetero -1.01921634129 11 11 12
+chr 11,12,12 A hetero -0.425116661902 12 12 11
b
diff -r 000000000000 -r 20ab85af9505 test-data/probvalueforhetero_out.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/probvalueforhetero_out.txt Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,9 @@
+chr 9,10 A hetero -1.27220836321 10 10 9 0.247317513163 2 0.494635026326 2
+chr 10,11 A hetero -0.939119957032 11 11 10 0.242971784331 2 0.485943568663 2
+chr 11,12 A hetero -0.720375026792 12 12 11 0.236065341545 2 0.472130683091 2
+chr 9,9,10 A hetero -1.6841441619 9 9 10 0.124528157268 3 0.373584471803 3
+chr 9,10,10 A hetero -0.97233405327 10 10 9 0.121477837216 3 0.364433511647 3
+chr 10,10,11 A hetero -1.29451118958 10 10 11 0.122575544751 3 0.367726634253 3
+chr 10,11,11 A hetero -0.641022011041 11 11 10 0.117020970932 3 0.351062912797 3
+chr 11,11,12 A hetero -1.01921634129 11 11 12 0.11865253007 3 0.35595759021 3
+chr 11,12,12 A hetero -0.425116661902 12 12 11 0.110871439517 3 0.332614318551 3
b
diff -r 000000000000 -r 20ab85af9505 test-data/profilegenerator_in.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/profilegenerator_in.txt Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,6 @@
+9 9 100000
+10 10 91456
+10 9 1259
+11 11 39657
+11 10 1211
+11 12 514
b
diff -r 000000000000 -r 20ab85af9505 test-data/profilegenerator_out.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/profilegenerator_out.txt Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,30 @@
+chr 9,9 A
+chr 9,10 A
+chr 9,11 A
+chr 9,12 A
+chr 10,10 A
+chr 10,11 A
+chr 10,12 A
+chr 11,11 A
+chr 11,12 A
+chr 12,12 A
+chr 9,9,9 A
+chr 9,9,10 A
+chr 9,9,11 A
+chr 9,9,12 A
+chr 9,10,10 A
+chr 9,10,11 A
+chr 9,10,12 A
+chr 9,11,11 A
+chr 9,11,12 A
+chr 9,12,12 A
+chr 10,10,10 A
+chr 10,10,11 A
+chr 10,10,12 A
+chr 10,11,11 A
+chr 10,11,12 A
+chr 10,12,12 A
+chr 11,11,11 A
+chr 11,11,12 A
+chr 11,12,12 A
+chr 12,12,12 A
b
diff -r 000000000000 -r 20ab85af9505 test-data/readdepth2seqdepth.out
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/readdepth2seqdepth.out Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,2 @@
+repeat_length read_length informative_read_depth =locus_specific_sequencing_depth =genome_wide_sequencing_depth
+10 100 10 20 26
b
diff -r 000000000000 -r 20ab85af9505 test-data/samplePESAM_2_profile_C.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samplePESAM_2_profile_C.txt Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,5 @@
+M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1 shifted 540 713 713 719 719 759 6 GGGGGG
+M01368:22:000000000-A4T24:1:1101:10015:2849_1:N:0:2_1_per1_2 shifted 4007 4082 4082 4088 4088 4258 6 TTTTTT
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1 shifted 1849 1930 1930 1936 1936 2100 6 CCCCCC
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2 shifted 1849 2025 2025 2030 2030 2100 5 GGGGG
+M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1 shifted 1428 1517 1517 1522 1522 1543 5 AAAAA
b
diff -r 000000000000 -r 20ab85af9505 test-data/sampleTRgenotypingcorrection
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sampleTRgenotypingcorrection Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,2 @@
+chr1 14,13,13,13 A hetero -0.429451855856 13 13 14
+chr1 5,6,6,6,6,7,7,8,8 A hetero -14.8744881854 7 6 8
b
diff -r 000000000000 -r 20ab85af9505 test-data/sampleTRprofile_C.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sampleTRprofile_C.txt Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,2 @@
+chr1 14,13,13,13 A
+chr1 5,6,6,6,6,7,7,8,8 A
b
diff -r 000000000000 -r 20ab85af9505 test-data/samplefq.snoope
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samplefq.snoope Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,1 @@
+6 40 54 G 0 SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCTggggggTTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG?FFDFGGGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
b
diff -r 000000000000 -r 20ab85af9505 test-data/samplefq.snoope.new
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samplefq.snoope.new Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,1 @@
+6 40 54 G 0 SRR345592.75000006_HS2000-192_107:1:63:5822:176818_1_per1_1 TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCTggggggTTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG?FFDFGGGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
b
diff -r 000000000000 -r 20ab85af9505 test-data/sampleprofilegenerator_in
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sampleprofilegenerator_in Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,6 @@
+9 9 100000
+10 10 91456
+10 9 1259
+11 11 39657
+11 10 1211
+11 12 514
b
diff -r 000000000000 -r 20ab85af9505 test-data/sampleprofilegenerator_out
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sampleprofilegenerator_out Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,30 @@
+chr 9,9 A
+chr 9,10 A
+chr 9,11 A
+chr 9,12 A
+chr 10,10 A
+chr 10,11 A
+chr 10,12 A
+chr 11,11 A
+chr 11,12 A
+chr 12,12 A
+chr 9,9,9 A
+chr 9,9,10 A
+chr 9,9,11 A
+chr 9,9,12 A
+chr 9,10,10 A
+chr 9,10,11 A
+chr 9,10,12 A
+chr 9,11,11 A
+chr 9,11,12 A
+chr 9,12,12 A
+chr 10,10,10 A
+chr 10,10,11 A
+chr 10,10,12 A
+chr 10,11,11 A
+chr 10,11,12 A
+chr 10,12,12 A
+chr 11,11,11 A
+chr 11,11,12 A
+chr 11,12,12 A
+chr 12,12,12 A
b
diff -r 000000000000 -r 20ab85af9505 test-data/samplesortedPESAM_C.sam
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samplesortedPESAM_C.sam Fri Oct 03 20:54:30 2014 -0400
b
@@ -0,0 +1,10 @@
+M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1 113 shifted 720 37 40M = 541 -46 TTCGTGCACACAGCCCAGCTTGGAGCGAACGACCTACACC HHFG@IIHHHHHIHHFHHGFGGGGDBDDEDDDBBB????? XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:40
+M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1 177 shifted 541 37 173M = 720 46 CTTCTAGTGTAGCCGTAGTTAGGCCACCACTTCAAGAACTCTGTAGCACCGCCTACATACCTCGCTCTGCTAATCCTGTTACCAGTGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAAC ::GECC:*:)D<GEGGGECCCEC?00E?::CCCCEEECC:C*GEC4'.>ACGGEC:CC?>><DCE?C:EC?GECE?:CCECGEEC*GEECEC:GEEGE?GGECC:ECA2CC*CCC8DEGGEGC=CGECEAEGEEDGGEDEGD=EBGGGFDHHHHHHHHEEHHHHHIIHFIIHH XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:173
+M01368:22:000000000-A4T24:1:1101:10015:2849_1:N:0:2_1_per1_2 113 shifted 4089 37 170M = 4008 -176 GCACAACATGGGGGATCATGTAACTCGCCTTGATCGTTGGGAACCGGAGCTGAATGAAGCCATACCAAACGACGAGCGTGACACCACGATGCCTGTAGCAATGGCAACAACGTTGCGCAAACTATTAACTGGCGAACTACTTACTCTAGCTTCCCGGCAACAATTAATAG GECGGGGGGGGGGGGEGEGGGGD>2GEGGGGGEEGGGGGGGGGGGGGEEECEGEAGGEEGEB>=GGFGEAGHHHEHHHFHFF?ED;HFIHHIIIIHIIHHHHIHHHHIHHHHHHHHIIIIHIHHHHIHHHHHIIHHIIHHIIHIIIIIGGGGGGDDDDDDDDBBB????< XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:170
+M01368:22:000000000-A4T24:1:1101:10015:2849_1:N:0:2_1_per1_2 177 shifted 4008 37 75M = 4089 176 TGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCGC CEGGEEEECC?:EEGECGGGGECGGGGEEGGEEGCCGEGGGGGGGGGGDGGGGGE>EEGGGGGGGGGGGAGGGGE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:75
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1 129 shifted 1937 37 164M = 1850 -87 TCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGT HHHHIHHHHHHHHHHHHHHHHHHHHHGGFGGGGGGGHGGGGGGGGGGGGEGGGGGGAEEGGGEGGGGGGEGEEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGECGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGCEGEGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:138T25
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1 65 shifted 1850 37 81M = 1937 87 CCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGA ?????BBBEEDBBDDDGGGGGGIIIIIIIIIIIIIHHHHHIIIIIIIIIIIIIIIIIIIIIIIIIIIHIHHHIIIIIIHGH XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:81
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2 129 shifted 2031 37 70M = 1850 -181 TAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGT GGGGGGGGECGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGCEGEGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:44T25
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2 65 shifted 1850 37 176M = 2031 181 CCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTT ?????BBBEEDBBDDDGGGGGGIIIIIIIIIIIIIHHHHHIIIIIIIIIIIIIIIIIIIIIIIIIIIHIHHHIIIIIIHGHIIIHHHHHHHIHHHHHHHHHHHHHHHHHHHHHGGFGGGGGGGHGGGGGGGGGGGGEGGGGGGAEEGGGEGGGGGGEGEEGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:176
+M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1 129 shifted 1523 37 21M = 1429 -94 GTCTTTAACTCCACCATTAGC GGGEGGEGGGGGCGGGGGEGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:21
+M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1 65 shifted 1429 37 89M = 1523 94 CTATGCATCCAACGCGTTGGGAGCTCTCCCATATGGTCGACCTGCAGGCGGCCGCGAATTCACTAGTGATTTCCAAGGACAAATCAGAG ?????BBBDDDDDDDDGGGFGGFEHIIIIIIIHIIIHIHHHHHIIHFHHHHHHHHHHHHHHHHHHHHGGGGGGGGGGGGGGGGGGEGEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:89
b
diff -r 000000000000 -r 20ab85af9505 test-data/shifted.2bit
b
Binary file test-data/shifted.2bit has changed