Repository 'pcr_markers'
hg clone https://toolshed.g2.bx.psu.edu/repos/john-mccallum/pcr_markers

Changeset 0:21053f7f9ed1 (2012-06-14)
Next changeset 1:a0689dc29b7f (2012-07-31)
Commit message:
First upload of PCR Marker tools
added:
CAPS2gff.sh
CAPS2gff.xml
GVF_Features_Extracter.xml
convert_gsMapper_gff3.xml
design_primers.py
design_primers.xml
find_CAPS.py
find_CAPS.xml
gsmapper2gff.sh
parse_primersearch.pl
parse_primersearch.xml
patman.xml
patman2gff.xml
uniq.xml
vcf2gvf.sh
vcf2gvf.xml
b
diff -r 000000000000 -r 21053f7f9ed1 CAPS2gff.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/CAPS2gff.sh Thu Jun 14 19:29:26 2012 -0400
[
@@ -0,0 +1,21 @@
+#!/bin/sh
+##convert output of CAPS detection tool to GFF3
+#Copyright 2012 John McCallum
+#New Zealand Institute for Plant and Food Research
+
+#This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+inputfile=$1
+outputfile=$2
+awk -F '\t' 'split($4,ID,":") {print ID[1], "FINDCAPS",ID[3],ID[4],ID[4],".",".",".","ID="$1";Enzyme="$5";Phase="$6}' $inputfile > $outputfile
b
diff -r 000000000000 -r 21053f7f9ed1 CAPS2gff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/CAPS2gff.xml Thu Jun 14 19:29:26 2012 -0400
b
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<tool id="CAPS2gff" name="CAPS Output to GFF3">
+  <description>convert output of CAPS detection to gvf/gff3</description>
+  <command interpreter="bash">CAPS2gff.sh $inputFile  $outputfile</command>
+  <inputs>
+    <param format="tabular" name="inputFile" type="data" label="Input CAPS File" help="tabular CAPS from find_CAPs tool" />
+  </inputs>
+  <outputs>
+<data format="gff3" name="outputfile" />
+
+  </outputs>
+<help>
+This tool provides a simple conversion from CAPS 3 column output to GFF3
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: 
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, 
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+
+
+
+</help>
+</tool>
+
b
diff -r 000000000000 -r 21053f7f9ed1 GVF_Features_Extracter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GVF_Features_Extracter.xml Thu Jun 14 19:29:26 2012 -0400
b
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<tool id="GVF Features  Extracter_1" name="GVF Feature ID Extracter">
+  <description>Extract unique feature IDs from a GVF/GFF3 file </description>
+  <command >awk '!/^#/ {print $1":"$2":"$3":"$4}'   $inputgff3File | sort | uniq >  $column_outputfile</command>
+  <inputs>
+    <param format="gff3" name="inputgff3File" type="data" label="GVF/Gff3 File"/>
+  </inputs>
+  <outputs>
+     <data format="tabular"  name="column_outputfile" />
+  </outputs>
+<help>
+
+.. class:: infomark
+
+**TIP**
+
+This tool extracts a unique feature identifier from  GVF/GFF3 format, primarily to enable downstream analysis of SNPs or marker design
+
+
+----
+
+**Example**
+
+*input GFF3*
+
+::
+
+ ##gff-version 3
+ 1119927 gsMapper        SNP     434     434     .       .       .       ID=1119927:gsMapper:SNP:434;Reference_seq=G;Variant_seq=A;Total_reads=10;Variant_freq=100;Enzyme=AluI;Phase=variant
+ 1120709 gsMapper        SNP     361     361     .       .       .       ID=1120709:gsMapper:SNP:361;Reference_seq=T;Variant_seq=C;Total_reads=22;Variant_freq=68;Enzyme=TaqI;Phase=variant
+ 1120709 gsMapper        SNP     704     704     .       .       .       ID=1120709:gsMapper:SNP:704;Reference_seq=A;Variant_seq=G;Total_reads=20;Variant_freq=90;Enzyme=RsaI;Phase=variant
+
+*output tabular text*
+
+::
+
+ 1119927:gsMapper:SNP:434
+ 1120709:gsMapper:SNP:361
+ 1120709:gsMapper:SNP:704
+
+ -----------------------------------------------------

+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: 
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, 
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+</help>
+
+</tool>
b
diff -r 000000000000 -r 21053f7f9ed1 convert_gsMapper_gff3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_gsMapper_gff3.xml Thu Jun 14 19:29:26 2012 -0400
b
@@ -0,0 +1,70 @@
+<?xml version="1.0"?>
+<tool id="gsMapper_to_gff3_1" name="Convert gsMapper to gff3">
+  <description>Convert Roche gsMapper to GFF3</description>
+  <command interpreter="bash">gsmapper2gff.sh  $inputGsFile  $outputfile</command>
+  <inputs>
+    <param format="txt" name="inputGsFile" type="data" label="Roche gsMapper 454HCDiffs.txt or 454AllDiffs.txt file"/>
+
+  </inputs>
+  <outputs>
+     <data format="gff3" name="outputfile" />
+  </outputs>
+<help>
+.. class:: infomark
+
+**TIP**
+
+This tool parses Roche gsMapper read mapping output into GVF/GFF3 format
+
+
+----
+
+**Example**
+
+*input*
+
+::
+
+ >Reference      Start   End     Ref     Var     Total   Var     Ref     Var     Coding  Region  Known   # Fwd   # Rev   # Fwd   # Rev
+ >Accno           Pos    Pos     Nuc     Nuc     Depth   Freq    AA      AA      Frame   Name    SNP's   w/ var  w/ var  Total   Total
+ ______________________________
+
+ >1118212        673     673     A       C       7       100%                                            6       1       6       1

+ Reads with Difference:
+ 1118212                     648+ GTTGGTCCACTATTACTCTCAGATT-ATTTCATAACAATAATGG----A-TAC-AA 696
+                                                           **
+ FX289JP01DVQR7               53- GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGGGCTGA-TACTA  1
+ FX289JP02IJT2O  (7)          82+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 130
+ FX289JP01B8R7V               84+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATA-TGG----A-TAC-AA 131
+ FX289JP02FX58L               68+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----AC-AC-AA 116
+ FX289JP02JXAX7  (7)          67+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 115
+ FX289JP02JOOQZ  (2)          69+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 117
+ FX289JP02GPHPX               45+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 93
+                                                          **
+
+
+*output*
+
+::
+
+ ##gff-version 3
+ 1118212 gsMapper SNP 673 673 . . . ID=1118212:gsMapper:SNP:673;Reference_seq=A;Variant_seq=C;Total_reads=7;Variant_freq=100;
+ 1118212 gsMapper SNP 730 730 . . . ID=1118212:gsMapper:SNP:730;Reference_seq=A;Variant_seq=G;Total_reads=13;Variant_freq=31;
+ 1118212 gsMapper SNP 782 782 . . . ID=1118212:gsMapper:SNP:782;Reference_seq=T;Variant_seq=C;Total_reads=13;Variant_freq=92;
+ 1118212 gsMapper SNP 1319 1319 . . . ID=1118212:gsMapper:SNP:1319;Reference_seq=G;Variant_seq=A;Total_reads=7;Variant_freq=100;
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: 
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, 
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+

+</help>
+</tool>
b
diff -r 000000000000 -r 21053f7f9ed1 design_primers.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/design_primers.py Thu Jun 14 19:29:26 2012 -0400
[
b'@@ -0,0 +1,153 @@\n+#!/usr/local/bin/python2.6\n+##design primers to features in multiple sequences\n+##usage: python  design_primers.py <fasta-file> <gff file> <file of target IDs> <prod_min_size> <prod_max_size>\n+\n+\n+#Copyright 2012 John McCallum & Leshi Chen\n+#New Zealand Institute for Plant and Food Research\n+#This program is free software: you can redistribute it and/or modify\n+#     it under the terms of the GNU General Public License as published by\n+#    the Free Software Foundation, either version 3 of the License, or\n+#    (at your option) any later version.\n+#\n+#    This program is distributed in the hope that it will be useful,\n+#    but WITHOUT ANY WARRANTY; without even the implied warranty of\n+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n+#    GNU General Public License for more details.\n+#\n+#    You should have received a copy of the GNU General Public License\n+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.\n+\n+\n+import os\n+import StringIO\n+import re\n+import tempfile\n+import subprocess\n+import copy\n+import sys\n+from BCBio import GFF\n+from BCBio.GFF import GFFExaminer\n+from Bio import SeqIO\n+from Bio.Emboss.Applications import Primer3Commandline\n+from Bio.Emboss import Primer3\n+\n+\n+in_file = sys.argv[1]\n+gff_file = sys.argv[2] \n+target_file =  sys.argv[3]\n+prod_min_size = int(sys.argv[4])\n+prod_max_size = int(sys.argv[5])\n+\n+max_tm_diff=1                                        ##\n+opt_GC_percent=50                                    ##\n+maxpolyx=4                                           ##\n+optimum_length=20\n+##target is specified in start, end format \n+productsizerange = str(prod_min_size) + "," + str(prod_max_size)\n+#open input files\n+in_seq_handle = open(in_file)\n+in_gff_handle = open(gff_file)\n+in_target_handle=open(target_file)\n+#read  target feature IDs into list\n+targets=in_target_handle.readlines()\n+in_target_handle.close()\n+##and create a hit list of sequences from this\n+target_seq_id_list = list(set([line.split(":")[0] for line in targets]))\n+##create iterator returning sequence records\n+for myrec in SeqIO.parse(in_seq_handle, "fasta"):\n+    #check if this sequence is included in the target list\n+    if myrec.id in target_seq_id_list:\n+        ##create sequence dictionary so we can add in gff annotations\n+        seq_dict = {myrec.id : myrec}\n+        ##just limit to gff annotations for this sequence\n+        limit_info = dict(gff_id = [ myrec.id ])\n+        ##rewind gff filehandle\n+        in_gff_handle.seek(0)\n+        ##read annotations into sequence dictionary for this sequence record only \n+        annotations = [r for r in GFF.parse(in_gff_handle, base_dict=seq_dict,limit_info=limit_info)]\n+        ##if there are any annotations, then  proceed. \n+        if annotations:\n+            rec=annotations[0]\n+            ##iterate over list of target IDs\n+            for t in targets:\n+                target_ID = t.strip(\'\\n\')\n+                target_annotations = [f for f in rec.features if f.id == target_ID ]\n+                if target_annotations:\n+                    mytarget = target_annotations[0]\n+                    #create temporary files\n+                    tempfastaFile = tempfile.mktemp()\n+                    tempproutfile = tempfile.mktemp()\n+                    #just consider slice of sequence in a window of +/- prod_max_size  bp\n+                    ##from feature UNLESS feature is close to end\n+                    ##Note that slice is zero-based\n+                    featLocation = mytarget.location.start.position \n+                    if featLocation > prod_max_size:\n+                        slice_start = featLocation - prod_max_size\n+                        featPosition = prod_max_size  \n+                    else:\n+                        slice_start = 0\n+                        featPosition = featLocation\n+                    if (len(rec) - featLocation) < prod_max_size:\n+                        slice_end = len(rec)\n+                   '..b'   matching_feature = [f for f in targetRec.features if f.id == mytarget.id]\n+                    if matching_feature:\n+                        target_feat = matching_feature[0]\n+                        if target_feat.location.start.position == 0:\n+                            target_feat.location.start.position = 1\n+                        #we get the mask features by removing the target...all features are masked as just using snp and indels\n+                        ##a smarter filter could be added \n+                        ##note use of list copy to avoid possible side-effects\n+                        exclude_feat = list(targetRec.features)\n+                        exclude_feat.remove(target_feat)\n+                        ##print\'targetRec.features\',  targetRec.features ##for debug\n+                        mask_str=map(lambda f: str(f.location.start.position+1) + "," + str(f.location.end.position + 1) ,exclude_feat)\n+                        #mask_str=map(lambda f: str(f.location).strip(\'[]\'),exclude_feat)\n+                        p3_exclude_str = str(mask_str).replace(\'\\\', \\\'\',\':\')\n+                        p3_target = str(target_feat.location.start.position +1)  + "," + str(target_feat.location.end.position +1)\n+                        #write sequence record into template file as  fasta\n+                        t_output_handle = open(tempfastaFile, "w")\n+                        SeqIO.write([targetRec], t_output_handle, "fasta")\n+                        t_output_handle.close()\n+                        #create Primer3Commandline() for emboss tool\n+                        primer_cl = Primer3Commandline()\n+                        #set the emboss tool to suppress  output as this will make Galaxy  think it is error message although it is a message to state run success\n+                        primer_cl.set_parameter("-auto",\'1\')\n+                        #pass  sequence file to emboss\n+                        primer_cl.set_parameter("-sequence",tempfastaFile)\n+                        #add target location\n+                        primer_cl.set_parameter("-target", p3_target)\n+                        ##mask off other features...dumb masking of everything at present, beware\n+                        if (p3_exclude_str != ""):\n+                            primer_cl.set_parameter("-excludedregion", p3_exclude_str)\n+                        #add temporary output file to get the result\n+                        primer_cl.set_parameter("-outfile", tempproutfile)\n+                        #specify maximum different of tm\n+                        primer_cl.set_parameter("-maxdifftm",max_tm_diff )\n+                        #other useful parameters\n+                        primer_cl.set_parameter("-ogcpercent", opt_GC_percent)\n+                        primer_cl.set_parameter("-opolyxmax", maxpolyx)  \n+                        primer_cl.set_parameter("-osize", optimum_length)\n+                        #set product size range\n+                        primer_cl.set_parameter("-prange", productsizerange)\n+                        #using python subprocess method to run emboss command line programe with the parameters given\n+                        fnull = open(os.devnull, \'w\')\n+                        result=subprocess.check_call(str(primer_cl),shell=True ,stdout = fnull, stderr = fnull)\n+                        #read temporary outputfile\n+                        handle = open(tempproutfile)\n+                        record = Primer3.read(handle)\n+                        ##just return first set, if there is one\n+                        if len(record.primers) > 0:\n+                            primer= record.primers[0]\n+                            outputstr=[mytarget.id, primer.forward_seq,primer.reverse_seq,primer.size]\n+                        else:\n+                            outputstr=[mytarget.id,"NONE","NONE","NONE"]\n+                        print(\'\\t\'.join(map(str,outputstr)))\n+\n+                        \n+in_gff_handle.close()\n+in_seq_handle.close()\n'
b
diff -r 000000000000 -r 21053f7f9ed1 design_primers.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/design_primers.xml Thu Jun 14 19:29:26 2012 -0400
b
@@ -0,0 +1,79 @@
+<?xml version="1.0"?>
+<tool id="Genetic_Marker_Design_2" name="Design primers to features">
+  <description>Design PCR Primers to Features</description>
+  <command interpreter="python">design_primers.py $inputfastaFile $inputSNPfile $inputTargetfile $min_size $max_size >  $primer_outputfile  </command>
+  <inputs>
+    <param format="fasta" name="inputfastaFile" type="data" label="Multifasta Source file" />
+    <param format="gff3" name="inputSNPfile" type="data" label="annotation file(Gff3)" />
+    <param format="txt" name="inputTargetfile" type="data" optional="false" label="Target file" help="Format:Sequence id:source:type:start, such as 1174806:gsMapper:SNP:292" ></param>
+<param name="min_size" size="20" type="text" value="75" label="Minimum Product Size Range" />
+     <param name="max_size" size="20" type="text" value="100" label="Maximum Product Size Range" />
+  </inputs>
+  <outputs>
+     <data format="tabular"  name="primer_outputfile" />
+  </outputs>
+<help>
+.. class:: infomark
+
+**TIP**
+
+This tool designs primer pairs to flank features
+
+It takes 
+
+* an input reference sequence file
+* a gff3 file containing feature information
+* a single column file containing list of features  
+
+----
+
+**Example**
+
+--input GFF
+
+::
+
+ PGSC0003DMB000000001    samtools        SNP     6345    6346    4.84    .       .       ID=PGSC0003DMB000000001:SAMTOOLS:SNP:6345;Variant_seq=C;Reference_seq=T;Total_reads=2
+ PGSC0003DMB000000001    samtools        SNP     6453    6454    18      .       .       ID=PGSC0003DMB000000001:SAMTOOLS:SNP:6453;Variant_seq=T;Reference_seq=G;Total_reads=8
+ PGSC0003DMB000000001    samtools        SNP     7255    7256    149     .       .       ID=PGSC0003DMB000000001:SAMTOOLS:SNP:7255;Variant_seq=G;Reference_seq=T;Total_reads=14
+ PGSC0003DMB000000001    samtools        SNP     7371    7372    86.8    .       .       ID=PGSC0003DMB000000001:SAMTOOLS:SNP:7371;Variant_seq=C;Reference_seq=T;Total_reads=9
+ PGSC0003DMB000000001    samtools        SNP     8288    8289    10.7    .       .       ID=PGSC0003DMB000000001:SAMTOOLS:SNP:8288;Variant_seq=A;Reference_seq=G;Total_reads=5
+
+
+--input features
+
+::
+
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1012901
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1021771
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1025761
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1026717
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1026834
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1029542
+
+
+--output columnar data
+
+::
+
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1012901 AGAGGTCGGCTCTCTAGTAGCA GGGGATCCACTAACTATGTCACTT 86
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1021771 CCTATGCGAGAAAGGGACAC GCCCTTCCATGTTGTACGAG 100
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1025761 TGTGAGTAACTTAGTGTCCTACGTCAA CACTCAATGAGCCAAAGCAA 92
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1026717 TTCCTAAGTCATGGGAAAGCA AGTTCATCCAAGGCAAGCAT 76
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1026834 AATGAAGTGACTGGGGAGGA TGCTGGTCGAAGCTTTCTTT 98
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1029542 TAACCAGAAAGTCCGGATGG TTCTGAAGTCAAGTGGGGAGA 75
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: 
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, 
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+</help>
+
+</tool>
b
diff -r 000000000000 -r 21053f7f9ed1 find_CAPS.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/find_CAPS.py Thu Jun 14 19:29:26 2012 -0400
[
@@ -0,0 +1,93 @@
+#!/usr/bin/python2.6
+##find snps that condition CAPS
+##usage  find_CAPS.py <reference file>  <gff file>
+
+
+#Copyright 2012 John McCallum & Leshi Chen
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+
+import sys
+
+from Bio import SeqIO
+from BCBio import GFF
+from Bio.Restriction import *
+
+###This list is limited to economical enzymes performing well in PCR buffer
+rest_batch = RestrictionBatch(
+    [AluI, ApaI, BamHI, BbrPI, BfrI, ClaI, DdeI, DpnII, DraI, EcoRI,
+     HaeIII, HindII, HinfI, HpaI, PvuII, RsaI, SacI, Sau3AI, SmaI, TaqI])
+
+in_file=sys.argv[1]
+gff_file=sys.argv[2]
+
+in_seq_handle = open(in_file)
+in_gff_handle=open(gff_file)
+
+##use iterator
+for myrec in SeqIO.parse(in_seq_handle, "fasta"):
+    ##create single-entry dictionary to accept gff annotations from parser
+    seq_dict = {myrec.id:myrec}
+
+    ##note that this filters out only SNP features
+    limit_info = dict(gff_id = [myrec.id] ,gff_type = ['SNP'])
+    in_gff_handle.seek(0)
+
+    ##parse annotations into 
+    annotations = [r for r 
+                   in GFF.parse(in_gff_handle, 
+                                base_dict=seq_dict, 
+                                limit_info=limit_info)]
+
+    ##if there are any for this sequence, proceed
+    if annotations:
+        rec=annotations[0]
+        for feat in rec.features:
+            fstart=feat.location.start.position
+            fend=feat.location.end.position
+
+            if   20 < fstart < len(rec) - 20:
+                #just work with +/- 20 bp, ignoring SNPS within this 
+                #distance from ends
+                fseq=rec.seq[fstart-20:fstart+20]
+                ref_seq = rec.seq[fstart-20:fstart+20]
+                variant_seq = ref_seq.tomutable()
+
+                #mutate the variant
+                variant_seq[20]= feat.qualifiers['Variant_seq'][0]
+                variant_seq = variant_seq.toseq()
+
+                #digest the sequences
+                ref_cuts =  rest_batch.search(ref_seq)
+                var_cuts =  rest_batch.search(variant_seq)
+
+                #print 
+                for enz in ref_cuts:
+                    kr = set(ref_cuts[enz])
+                    km = set(var_cuts[enz])
+                    outputstr=[rec.id, fstart +1,fend+1,feat.id,enz]
+                    if len(kr) > len(km):
+                        outputstr.append("reference")
+                        print('\t'.join(map(str,outputstr)))
+                    elif len(kr) < len(km):
+                        outputstr.append("variant")
+                        print('\t'.join(map(str,outputstr)))
+
+in_gff_handle.close()
+in_seq_handle.close()                                                                              
+
+
+
b
diff -r 000000000000 -r 21053f7f9ed1 find_CAPS.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/find_CAPS.xml Thu Jun 14 19:29:26 2012 -0400
b
@@ -0,0 +1,81 @@
+<?xml version="1.0"?>
+<tool id="CAPS_Marker_Design_2" name="CAPS Detection">
+  <description>identify SNPs that condition restriction polymorphisms</description>
+  <command interpreter="python">find_CAPS.py $inputFasta $inputSNPGff3File >  $outputfile</command>
+  <inputs>
+    <param format="gff3" name="inputSNPGff3File" type="data" label="GFF3 file containing SNP data"/>
+    <param format="fasta" name="inputFasta" type="data" label="fasta source file" />
+  </inputs>
+  <outputs>
+     <data format="interval" name="outputfile" />
+  </outputs>
+<help>
+.. class:: infomark
+
+**TIP**
+
+This tool identifies SNPs that condition restriction polymorphisms.
+
+Currently it utilizes a fixed list of robust enzymes known to work well in PCR buffers
+
+ AluI,ApaI,BamHI,BbrPI,BfrI,ClaI,DpnI,DraI,EcoRI,HaeIII,HindII,HinfI,HpaI,PvuII,RsaI,SacI,Sau3AI,SmaI,TaqI
+
+It produces a tabular output in interval format
+
+record ID,  start, stop, feature ID,enzyme, phase (ie whether it cuts reference or variant sequence)
+
+
+
+
+
+**Example**
+
+
+
+*input GFF*
+
+::
+
+ JR843866 gsmapper SNP 63 63 . . . ID=JR843866:gsmapperSNP:63;Reference_seq=T;Variant_seq=C;Total_reads=22;Variant_reads=20
+ JR843866 gsmapper SNP 146 146 . . . ID=JR843866:gsmapperSNP:146;Reference_seq=T;Variant_seq=C;Total_reads=26;Variant_reads=10
+ JR843866 gsmapper SNP 258 258 . . . ID=JR843866:gsmapperSNP:258;Reference_seq=T;Variant_seq=G;Total_reads=4;Variant_reads=3
+ JR848320 gsmapper SNP 157 157 . . . ID=JR848320:gsmapperSNP:157;Reference_seq=C;Variant_seq=T;Total_reads=10;Variant_reads=10
+ JR848554 gsmapper SNP 54 54 . . . ID=JR848554:gsmapperSNP:54;Reference_seq=T;Variant_seq=G;Total_reads=5;Variant_reads=5
+ JR848554 gsmapper SNP 74 74 . . . ID=JR848554:gsmapperSNP:74;Reference_seq=C;Variant_seq=T;Total_reads=7;Variant_reads=7
+ JR848554 gsmapper SNP 123 123 . . . ID=JR848554:gsmapperSNP:123;Reference_seq=T;Variant_seq=A;Total_reads=11;Variant_reads=11
+ JR848554 gsmapper SNP 147 147 . . . ID=JR848554:gsmapperSNP:147;Reference_seq=T;Variant_seq=C;Total_reads=13;Variant_reads=13
+ JR848554 gsmapper SNP 161 161 . . . ID=JR848554:gsmapperSNP:161;Reference_seq=C;Variant_seq=T;Total_reads=13;Variant_reads=13
+
+
+
+*output columnar data*
+
+::
+
+ JR843866 63 64 JR843866:gsmapperSNP:63 HaeIII variant
+ JR848320 157 158 JR848320:gsmapperSNP:157 TaqI variant
+ JR848320 157 158 JR848320:gsmapperSNP:157 HinfI variant
+ JR848554 162 163 JR848554:gsmapperSNP:162 TaqI variant
+ JR848554 162 163 JR848554:gsmapperSNP:162 ClaI variant
+ JR848554 306 307 JR848554:gsmapperSNP:306 TaqI variant
+ JR848554 652 653 JR848554:gsmapperSNP:652 TaqI variant
+
+
+-------------------------------------------------------------------------------
+
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: 
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, 
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+
+
+
+</help>
+</tool>
b
diff -r 000000000000 -r 21053f7f9ed1 gsmapper2gff.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gsmapper2gff.sh Thu Jun 14 19:29:26 2012 -0400
b
@@ -0,0 +1,50 @@
+#!/bin/sh
+##convert gsMapper output into gff3/GVF format
+
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+infile=$1
+outfile=$2
+
+awk '
+BEGIN {OFS="\t"}
+/^>/  && sub(/%/,"",$7)  {
+  ID=substr($1,2)
+  if  (length($4) > 1 || match($4,"-") || length($5) > 1 || match($5,"-")) 
+    type="indel"
+    else
+      type="SNP"
+start=$2
+end=$3
+Col9_ID=ID ":gsmapper:" type ":"start
+
+Reference_seq=$4
+Variant_seq=$5
+Total_reads=$6
+Variant_reads=Total_reads * $7 /100 - (Total_reads * $7 % 100)/100 
+
+
+
+ print ID,"gsmapper",type,start,end,".",".",".","ID="Col9_ID";Reference_seq="Reference_seq";Variant_seq="Variant_seq";Total_reads="Total_reads";Variant_reads="Variant_reads
+}' "$infile" > "$outfile" 
+
+
+
+
+
+
+
+
b
diff -r 000000000000 -r 21053f7f9ed1 parse_primersearch.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/parse_primersearch.pl Thu Jun 14 19:29:26 2012 -0400
[
@@ -0,0 +1,37 @@
+#!/usr/bin/perl
+#parse_primersearch.pl
+#reformat EMBOSS primersearch output into columnar Galaxy interval format 
+
+#Copyright 2012 John McCallum 
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+open (IN, "<$ARGV[0]");
+open (OUT, ">$ARGV[1]");
+
+#print OUT  "primerset_id","\t","sequence_id","\t","hit_start","\","mismatches","\t","amplimer_size",\n";
+
+
+
+while (<IN>) {
+         /^Primer name (\S+)/  && ($name = $1);  # get primer set name
+         # Modified to cope with unnamed sequence input 28/7/05
+        /Sequence: (\S+)/ && print OUT  $name,"\t",$1;
+        /Sequence:(\s{4,})/ && print OUT $name,"\t","unnamed_seq";
+        /hits forward strand at (\d+) with (\d) mismatches/ && ($start = $1) &&  print OUT  "\t",$2,"\t",$start,;
+        /Amplimer length: (\S+)/ && ($amp_length = $1) &&  print OUT "\t",$start + $amp_length,"\t",$1,"\n";
+        }
+
+close( IN );
+close( OUT );
b
diff -r 000000000000 -r 21053f7f9ed1 parse_primersearch.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/parse_primersearch.xml Thu Jun 14 19:29:26 2012 -0400
b
@@ -0,0 +1,68 @@
+<?xml version="1.0"?>
+<tool id="parse_primersearch_1" name="Parse Primer search">
+  <description>Parse EMBOSS primer search output to tabular </description>
+  <command interpreter="perl">parse_primersearch.pl $inputPrimersearchFile $outputfile </command>
+  <inputs>
+    <param format="primersearch" name="inputPrimersearchFile" type="data" label="Primersearch  output file"/>
+  </inputs>
+  <outputs>
+
+     <data format="tabular" name="outputfile" />
+  </outputs>
+<help>
+
+.. class:: infomark
+
+**TIP**
+
+This tool parses EMBOSS primersearch_ output into columnar format suitable for use as interval 
+
+
+
+Columns are:
+
+1. Primer set ID
+2. Hit ID
+3. Number of mismatches
+4. Amplimer start
+5. Amplimer end
+6. Amplicon length
+
+
+----
+
+**Example**
+
+*output*
+
+::
+
+ ACP032  isotig07062     0       214     363     149
+ ACP223  isotig04647     0       362     574     212
+ ACP224  isotig04647     0       303     519     216
+ ACP225  isotig04647     0       153     355     202
+ ACP363  isotig10393     0       93      193     100
+ ACP394  isotig00271     0       894     986     92
+ ACP394  isotig00273     0       805     897     92
+ ACP440  isotig05277     0       506     601     95
+ ACP615  isotig00271     0       894     978     84
+ ACP615  isotig00273     0       805     889     84
+ AJK295  isotig06005     0       182     651     469
+
+
+.. _primersearch: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/primersearch.html
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: 
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, 
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+</help>
+</tool>
b
diff -r 000000000000 -r 21053f7f9ed1 patman.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/patman.xml Thu Jun 14 19:29:26 2012 -0400
b
@@ -0,0 +1,51 @@
+<?xml version="1.0"?>
+<tool id="patman_1" name="search for patterns in DNA using PatMaN">
+  <description>search for approximate patterns in DNA libraries</description>
+  <command>patman -a -e $edits -g $gaps -D $inputfastaFile -P $inputPatfile | sort | uniq > $patman_outputfile  </command>
+  <inputs>
+  <param  name="edits" type="integer" label="max N mismatches and or gaps" value="0" size="1" />
+  <param name="gaps" type="integer" label="max N gaps" value="0" size="1"/>
+    <param format="fasta" name="inputfastaFile" type="data" label="Multifasta target file" />
+    <param format="fasta" name="inputPatfile" type="data" label="Pattern fasta" />
+
+  </inputs>
+  <outputs>
+     <data format="tabular"  name="patman_outputfile" />
+  </outputs>
+<help>
+
+
+
+This is a Galaxy wrapper for PatMaN: a DNA pattern matcher for short sequences
+
+* Website https://bioinf.eva.mpg.de/patman/
+* PubMed Citation_ 
+
+*Inputs*
+
+* Patterns in fasta format (create from tabular using tabular-to-fasta tool)
+* Multifasta file of target sequences
+
+*Output*
+
+* 6 Column tabular interval data
+* Columns Chrom, Pattern Name, Start, End, strand, N mismatches
+
+:: 
+
+ isotig05934 ACP818 368 389 + 0
+ isotig05934 ACP859 377 396 + 0
+ isotig06765 ACP822 448 468 + 0
+ isotig07088 ACP825 49 75 + 0
+ isotig07652 ACP830 199 218 + 0
+ isotig07652 ACP831 257 276 + 0
+ isotig10333 ACP837 474 497 + 0
+ isotig10393 ACP838 10 34 + 0

+
+
+.. _Citation: http://www.ncbi.nlm.nih.gov/pubmed/18467344?dopt=Abstract
+
+
+</help>
+</tool>
b
diff -r 000000000000 -r 21053f7f9ed1 patman2gff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/patman2gff.xml Thu Jun 14 19:29:26 2012 -0400
b
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<tool id="patman2gff" name="convert patman to GFF3">
+  <description>convert output of patman pattern detection to gvf/gff3</description>
+  <command>awk 'OFS="\t" {print $1,"patman","primer_binding_site",$3,$4,$6,$5,".","name="$2}' $inputFile  >  $outputfile</command>
+  <inputs>
+    <param format="tabular" name="inputFile" type="data" label="Input patman File" help="tabular output from patman tool" />
+  </inputs>
+  <outputs>
+<data format="gff3" name="outputfile" />
+
+  </outputs>
+<help>
+This tool provides a simple conversion from patman column output to GFF3
+
+-------------------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: 
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, 
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+</help>
+</tool>
b
diff -r 000000000000 -r 21053f7f9ed1 uniq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/uniq.xml Thu Jun 14 19:29:26 2012 -0400
b
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<tool id="uni_Mask_1" name="Unique">
+  <description>Return unique lines</description>
+  <command >cat $inputFile | sort | uniq >  $outputfile</command>
+  <inputs>
+    <param format="txt" name="inputFile" type="data" label="Input  File" help="Any text or tabular file" />
+  </inputs>
+  <outputs>
+     <data format="txt"  name="outputfile" />
+  </outputs>
+
+</tool>
b
diff -r 000000000000 -r 21053f7f9ed1 vcf2gvf.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf2gvf.sh Thu Jun 14 19:29:26 2012 -0400
b
@@ -0,0 +1,46 @@
+#!/bin/sh
+##convert vcf to gvf
+##NOTE This is a very simple basic parser for a complex format.
+
+##usage vcf2gvf.sh <vcf file> <outputfile>
+
+#Copyright 2012 John McCallum & Leshi Chen
+#New Zealand Institute for Plant and Food Research
+
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+
+inputfile=$1
+outputfile=$2
+
+echo  "##gvf-version 1.05" > $outputfile
+
+awk '
+BEGIN {OFS="\t"}
+
+##get feature type 
+{if (index($8,"INDEL")== 1) {type="INDEL"} else {type="SNP"} }
+##get feature length
+{if (type=="SNP") 
+    {feat_length=1}
+    else {feat_length=length($4)} 
+}
+{end=($2+feat_length)}
+
+!/^#/  { print $1 ,"SAMTOOLS",type,$2,end,$6,".",".","ID="$1":SAMTOOLS:"type":"$2";Variant_seq="$5";Reference_seq="$4";"$8}
+
+END {print ""} 
+' "$inputfile" > "$outputfile"
\ No newline at end of file
b
diff -r 000000000000 -r 21053f7f9ed1 vcf2gvf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf2gvf.xml Thu Jun 14 19:29:26 2012 -0400
b
@@ -0,0 +1,55 @@
+<?xml version="1.0"?>
+<tool id="vcf2gvf_1" name="VCF to GFF3">
+  <description>convert vcf to gvf/gff3</description>
+  <command interpreter="bash">vcf2gvf.sh  $inputFile  $outputfile</command>
+  <inputs>
+    <param format="vcf" name="inputFile" type="data" label="Input vcf File" help="vcf file from mpileup" />
+  </inputs>
+  <outputs>
+<data format="gff3" name="outputfile" /> 
+
+  </outputs>
+<help>
+This tool provides a simple conversion from vcf to gvf.
+
+Be sure to read the documentation to determine if it meets your requirements.
+
+* vcf documentation at http://samtools.sourceforge.net/samtools.shtml#6
+* GVF/GFF3 at http://www.sequenceontology.org/resources/gvf.html
+
+
+
+**input**
+
+::
+
+ PGSC0003DMB000000010 2042429 . C A 44.6 . DP=10;VDB=0.0118;AF1=0.8295;AC1=7;DP4=2,1,3,4;MQ=20;FQ=8.78;PV4=1,5.2e-10,1,1 GT:PL:DP:GQ 0/1:14,0,42:5:23 1/1:27,6,0:2:9 1/1:15,3,0:1:7 1/1:30,6,0:2:9
+ PGSC0003DMB000000038 1756646 . G A 3.69 . DP=15;VDB=0.0166;AF1=0.495;AC1=4;DP4=3,7,2,2;MQ=20;FQ=5.6;PV4=0.58,3.8e-09,1,0.31 GT:PL:DP:GQ 0/1:20,3,0:1:6 0/1:9,0,67:7:8 0/0:0,15,82:5:17 0/1:16,3,0:1:5
+ PGSC0003DMB000000064 1916664 . T C 8.12 . DP=4;VDB=0.0151;AF1=1;AC1=8;DP4=0,0,0,3;MQ=20;FQ=-29.5 GT:PL:DP:GQ 1/1:14,3,0:1:5 1/1:0,0,0:0:3 1/1:13,3,0:1:5 1/1:15,3,0:1:5
+
+
+**output**
+
+
+::
+
+ PGSC0003DMB000000010 samtools SNP 2042429 2042430 44.6 . . ID=PGSC0003DMB000000010:SAMTOOLS:SNP:2042429;Variant_seq=A;Reference_seq=C;DP=10;VDB=0.0118;AF1=0.8295;AC1=7;DP4=2,1,3,4;MQ=20;FQ=8.78;PV4=1,5.2e-10,1,1
+ PGSC0003DMB000000038 samtools SNP 1756646 1756647 3.69 . . ID=PGSC0003DMB000000038:SAMTOOLS:SNP:1756646;Variant_seq=A;Reference_seq=G;DP=15;VDB=0.0166;AF1=0.495;AC1=4;DP4=3,7,2,2;MQ=20;FQ=5.6;PV4=0.58,3.8e-09,1,0.31
+ PGSC0003DMB000000064 samtools SNP 1916664 1916665 8.12 . . ID=PGSC0003DMB000000064:SAMTOOLS:SNP:1916664;Variant_seq=C;Reference_seq=T;DP=4;VDB=0.0151;AF1=1;AC1=8;DP4=0,0,0,3;MQ=20;FQ=-29.5
+
+
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: 
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, 
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+</help>
+</tool>