Mercurial > repos > john-mccallum > pcr_markers

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CAPS2gff.sh	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,21 @@
+#!/bin/sh
+##convert output of CAPS detection tool to GFF3
+#Copyright 2012 John McCallum
+#New Zealand Institute for Plant and Food Research
+
+#This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+inputfile=$1
+outputfile=$2
+awk -F '\t' 'split($4,ID,":") {print ID[1], "FINDCAPS",ID[3],ID[4],ID[4],".",".",".","ID="$1";Enzyme="$5";Phase="$6}' $inputfile > $outputfile
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CAPS2gff.xml	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<tool id="CAPS2gff" name="CAPS Output to GFF3">
+  <description>convert output of CAPS detection to gvf/gff3</description>
+  <command interpreter="bash">CAPS2gff.sh $inputFile  $outputfile</command>
+  <inputs>
+    <param format="tabular" name="inputFile" type="data" label="Input CAPS File" help="tabular CAPS from find_CAPs tool" />
+  </inputs>
+  <outputs>
+<data format="gff3" name="outputfile" />
+
+  </outputs>
+<help>
+This tool provides a simple conversion from CAPS 3 column output to GFF3
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+
+
+
+</help>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GVF_Features_Extracter.xml	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<tool id="GVF Features  Extracter_1" name="GVF Feature ID Extracter">
+  <description>Extract unique feature IDs from a GVF/GFF3 file </description>
+  <command >awk '!/^#/ {print $1":"$2":"$3":"$4}'   $inputgff3File | sort | uniq >  $column_outputfile</command>
+  <inputs>
+    <param format="gff3" name="inputgff3File" type="data" label="GVF/Gff3 File"/>
+  </inputs>
+  <outputs>
+     <data format="tabular"  name="column_outputfile" />
+  </outputs>
+<help>
+
+.. class:: infomark
+
+**TIP**
+
+This tool extracts a unique feature identifier from  GVF/GFF3 format, primarily to enable downstream analysis of SNPs or marker design
+
+
+----
+
+**Example**
+
+*input GFF3*
+
+::
+
+ ##gff-version 3
+ 1119927 gsMapper        SNP     434     434     .       .       .       ID=1119927:gsMapper:SNP:434;Reference_seq=G;Variant_seq=A;Total_reads=10;Variant_freq=100;Enzyme=AluI;Phase=variant
+ 1120709 gsMapper        SNP     361     361     .       .       .       ID=1120709:gsMapper:SNP:361;Reference_seq=T;Variant_seq=C;Total_reads=22;Variant_freq=68;Enzyme=TaqI;Phase=variant
+ 1120709 gsMapper        SNP     704     704     .       .       .       ID=1120709:gsMapper:SNP:704;Reference_seq=A;Variant_seq=G;Total_reads=20;Variant_freq=90;Enzyme=RsaI;Phase=variant
+
+*output tabular text*
+
+::
+
+ 1119927:gsMapper:SNP:434
+ 1120709:gsMapper:SNP:361
+ 1120709:gsMapper:SNP:704
+
+ -----------------------------------------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+</help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_gsMapper_gff3.xml	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,70 @@
+<?xml version="1.0"?>
+<tool id="gsMapper_to_gff3_1" name="Convert gsMapper to gff3">
+  <description>Convert Roche gsMapper to GFF3</description>
+  <command interpreter="bash">gsmapper2gff.sh  $inputGsFile  $outputfile</command>
+  <inputs>
+    <param format="txt" name="inputGsFile" type="data" label="Roche gsMapper 454HCDiffs.txt or 454AllDiffs.txt file"/>
+
+  </inputs>
+  <outputs>
+     <data format="gff3" name="outputfile" />
+  </outputs>
+<help>
+.. class:: infomark
+
+**TIP**
+
+This tool parses Roche gsMapper read mapping output into GVF/GFF3 format
+
+
+----
+
+**Example**
+
+*input*
+
+::
+
+ >Reference      Start   End     Ref     Var     Total   Var     Ref     Var     Coding  Region  Known   # Fwd   # Rev   # Fwd   # Rev
+ >Accno           Pos    Pos     Nuc     Nuc     Depth   Freq    AA      AA      Frame   Name    SNP's   w/ var  w/ var  Total   Total
+ ______________________________
+
+ >1118212        673     673     A       C       7       100%                                            6       1       6       1
+
+ Reads with Difference:
+ 1118212                     648+ GTTGGTCCACTATTACTCTCAGATT-ATTTCATAACAATAATGG----A-TAC-AA 696
+                                                           **
+ FX289JP01DVQR7               53- GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGGGCTGA-TACTA  1
+ FX289JP02IJT2O  (7)          82+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 130
+ FX289JP01B8R7V               84+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATA-TGG----A-TAC-AA 131
+ FX289JP02FX58L               68+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----AC-AC-AA 116
+ FX289JP02JXAX7  (7)          67+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 115
+ FX289JP02JOOQZ  (2)          69+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 117
+ FX289JP02GPHPX               45+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 93
+                                                          **
+
+
+*output*
+
+::
+
+ ##gff-version 3
+ 1118212 gsMapper SNP 673 673 . . . ID=1118212:gsMapper:SNP:673;Reference_seq=A;Variant_seq=C;Total_reads=7;Variant_freq=100;
+ 1118212 gsMapper SNP 730 730 . . . ID=1118212:gsMapper:SNP:730;Reference_seq=A;Variant_seq=G;Total_reads=13;Variant_freq=31;
+ 1118212 gsMapper SNP 782 782 . . . ID=1118212:gsMapper:SNP:782;Reference_seq=T;Variant_seq=C;Total_reads=13;Variant_freq=92;
+ 1118212 gsMapper SNP 1319 1319 . . . ID=1118212:gsMapper:SNP:1319;Reference_seq=G;Variant_seq=A;Total_reads=7;Variant_freq=100;
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/design_primers.py	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,153 @@
+#!/usr/local/bin/python2.6
+##design primers to features in multiple sequences
+##usage: python  design_primers.py <fasta-file> <gff file> <file of target IDs> <prod_min_size> <prod_max_size>
+
+
+#Copyright 2012 John McCallum & Leshi Chen
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+import os
+import StringIO
+import re
+import tempfile
+import subprocess
+import copy
+import sys
+from BCBio import GFF
+from BCBio.GFF import GFFExaminer
+from Bio import SeqIO
+from Bio.Emboss.Applications import Primer3Commandline
+from Bio.Emboss import Primer3
+
+
+in_file = sys.argv[1]
+gff_file = sys.argv[2]
+target_file =  sys.argv[3]
+prod_min_size = int(sys.argv[4])
+prod_max_size = int(sys.argv[5])
+
+max_tm_diff=1                                        ##
+opt_GC_percent=50                                    ##
+maxpolyx=4                                           ##
+optimum_length=20
+##target is specified in start, end format
+productsizerange = str(prod_min_size) + "," + str(prod_max_size)
+#open input files
+in_seq_handle = open(in_file)
+in_gff_handle = open(gff_file)
+in_target_handle=open(target_file)
+#read  target feature IDs into list
+targets=in_target_handle.readlines()
+in_target_handle.close()
+##and create a hit list of sequences from this
+target_seq_id_list = list(set([line.split(":")[0] for line in targets]))
+##create iterator returning sequence records
+for myrec in SeqIO.parse(in_seq_handle, "fasta"):
+    #check if this sequence is included in the target list
+    if myrec.id in target_seq_id_list:
+        ##create sequence dictionary so we can add in gff annotations
+        seq_dict = {myrec.id : myrec}
+        ##just limit to gff annotations for this sequence
+        limit_info = dict(gff_id = [ myrec.id ])
+        ##rewind gff filehandle
+        in_gff_handle.seek(0)
+        ##read annotations into sequence dictionary for this sequence record only
+        annotations = [r for r in GFF.parse(in_gff_handle, base_dict=seq_dict,limit_info=limit_info)]
+        ##if there are any annotations, then  proceed.
+        if annotations:
+            rec=annotations[0]
+            ##iterate over list of target IDs
+            for t in targets:
+                target_ID = t.strip('\n')
+                target_annotations = [f for f in rec.features if f.id == target_ID ]
+                if target_annotations:
+                    mytarget = target_annotations[0]
+                    #create temporary files
+                    tempfastaFile = tempfile.mktemp()
+                    tempproutfile = tempfile.mktemp()
+                    #just consider slice of sequence in a window of +/- prod_max_size  bp
+                    ##from feature UNLESS feature is close to end
+                    ##Note that slice is zero-based
+                    featLocation = mytarget.location.start.position
+                    if featLocation > prod_max_size:
+                        slice_start = featLocation - prod_max_size
+                        featPosition = prod_max_size
+                    else:
+                        slice_start = 0
+                        featPosition = featLocation
+                    if (len(rec) - featLocation) < prod_max_size:
+                        slice_end = len(rec)
+                    else:
+                        slice_end = featLocation + prod_max_size
+                    ###grab slice of sequence fom this window.
+                    targetRec = rec[slice_start:slice_end]
+                    matching_feature = [f for f in targetRec.features if f.id == mytarget.id]
+                    if matching_feature:
+                        target_feat = matching_feature[0]
+                        if target_feat.location.start.position == 0:
+                            target_feat.location.start.position = 1
+                        #we get the mask features by removing the target...all features are masked as just using snp and indels
+                        ##a smarter filter could be added
+                        ##note use of list copy to avoid possible side-effects
+                        exclude_feat = list(targetRec.features)
+                        exclude_feat.remove(target_feat)
+                        ##print'targetRec.features',  targetRec.features ##for debug
+                        mask_str=map(lambda f: str(f.location.start.position+1) + "," + str(f.location.end.position + 1) ,exclude_feat)
+                        #mask_str=map(lambda f: str(f.location).strip('[]'),exclude_feat)
+                        p3_exclude_str = str(mask_str).replace('\', \'',':')
+                        p3_target = str(target_feat.location.start.position +1)  + "," + str(target_feat.location.end.position +1)
+                        #write sequence record into template file as  fasta
+                        t_output_handle = open(tempfastaFile, "w")
+                        SeqIO.write([targetRec], t_output_handle, "fasta")
+                        t_output_handle.close()
+                        #create Primer3Commandline() for emboss tool
+                        primer_cl = Primer3Commandline()
+                        #set the emboss tool to suppress  output as this will make Galaxy  think it is error message although it is a message to state run success
+                        primer_cl.set_parameter("-auto",'1')
+                        #pass  sequence file to emboss
+                        primer_cl.set_parameter("-sequence",tempfastaFile)
+                        #add target location
+                        primer_cl.set_parameter("-target", p3_target)
+                        ##mask off other features...dumb masking of everything at present, beware
+                        if (p3_exclude_str != ""):
+                            primer_cl.set_parameter("-excludedregion", p3_exclude_str)
+                        #add temporary output file to get the result
+                        primer_cl.set_parameter("-outfile", tempproutfile)
+                        #specify maximum different of tm
+                        primer_cl.set_parameter("-maxdifftm",max_tm_diff )
+                        #other useful parameters
+                        primer_cl.set_parameter("-ogcpercent", opt_GC_percent)
+                        primer_cl.set_parameter("-opolyxmax", maxpolyx)
+                        primer_cl.set_parameter("-osize", optimum_length)
+                        #set product size range
+                        primer_cl.set_parameter("-prange", productsizerange)
+                        #using python subprocess method to run emboss command line programe with the parameters given
+                        fnull = open(os.devnull, 'w')
+                        result=subprocess.check_call(str(primer_cl),shell=True ,stdout = fnull, stderr = fnull)
+                        #read temporary outputfile
+                        handle = open(tempproutfile)
+                        record = Primer3.read(handle)
+                        ##just return first set, if there is one
+                        if len(record.primers) > 0:
+                            primer= record.primers[0]
+                            outputstr=[mytarget.id, primer.forward_seq,primer.reverse_seq,primer.size]
+                        else:
+                            outputstr=[mytarget.id,"NONE","NONE","NONE"]
+                        print('\t'.join(map(str,outputstr)))
+
+
+in_gff_handle.close()
+in_seq_handle.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/design_primers.xml	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,79 @@
+<?xml version="1.0"?>
+<tool id="Genetic_Marker_Design_2" name="Design primers to features">
+  <description>Design PCR Primers to Features</description>
+  <command interpreter="python">design_primers.py $inputfastaFile $inputSNPfile $inputTargetfile $min_size $max_size >  $primer_outputfile  </command>
+  <inputs>
+    <param format="fasta" name="inputfastaFile" type="data" label="Multifasta Source file" />
+    <param format="gff3" name="inputSNPfile" type="data" label="annotation file(Gff3)" />
+    <param format="txt" name="inputTargetfile" type="data" optional="false" label="Target file" help="Format:Sequence id:source:type:start, such as 1174806:gsMapper:SNP:292" ></param>
+<param name="min_size" size="20" type="text" value="75" label="Minimum Product Size Range" />
+     <param name="max_size" size="20" type="text" value="100" label="Maximum Product Size Range" />
+  </inputs>
+  <outputs>
+     <data format="tabular"  name="primer_outputfile" />
+  </outputs>
+<help>
+.. class:: infomark
+
+**TIP**
+
+This tool designs primer pairs to flank features
+
+It takes
+
+* an input reference sequence file
+* a gff3 file containing feature information
+* a single column file containing list of features
+
+----
+
+**Example**
+
+--input GFF
+
+::
+
+ PGSC0003DMB000000001    samtools        SNP     6345    6346    4.84    .       .       ID=PGSC0003DMB000000001:SAMTOOLS:SNP:6345;Variant_seq=C;Reference_seq=T;Total_reads=2
+ PGSC0003DMB000000001    samtools        SNP     6453    6454    18      .       .       ID=PGSC0003DMB000000001:SAMTOOLS:SNP:6453;Variant_seq=T;Reference_seq=G;Total_reads=8
+ PGSC0003DMB000000001    samtools        SNP     7255    7256    149     .       .       ID=PGSC0003DMB000000001:SAMTOOLS:SNP:7255;Variant_seq=G;Reference_seq=T;Total_reads=14
+ PGSC0003DMB000000001    samtools        SNP     7371    7372    86.8    .       .       ID=PGSC0003DMB000000001:SAMTOOLS:SNP:7371;Variant_seq=C;Reference_seq=T;Total_reads=9
+ PGSC0003DMB000000001    samtools        SNP     8288    8289    10.7    .       .       ID=PGSC0003DMB000000001:SAMTOOLS:SNP:8288;Variant_seq=A;Reference_seq=G;Total_reads=5
+
+
+--input features
+
+::
+
+	PGSC0003DMB000000001:SAMTOOLS:SNP:1012901
+	PGSC0003DMB000000001:SAMTOOLS:SNP:1021771
+	PGSC0003DMB000000001:SAMTOOLS:SNP:1025761
+	PGSC0003DMB000000001:SAMTOOLS:SNP:1026717
+	PGSC0003DMB000000001:SAMTOOLS:SNP:1026834
+	PGSC0003DMB000000001:SAMTOOLS:SNP:1029542
+
+
+--output columnar data
+
+::
+
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1012901 AGAGGTCGGCTCTCTAGTAGCA GGGGATCCACTAACTATGTCACTT 86
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1021771 CCTATGCGAGAAAGGGACAC GCCCTTCCATGTTGTACGAG 100
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1025761 TGTGAGTAACTTAGTGTCCTACGTCAA CACTCAATGAGCCAAAGCAA 92
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1026717 TTCCTAAGTCATGGGAAAGCA AGTTCATCCAAGGCAAGCAT 76
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1026834 AATGAAGTGACTGGGGAGGA TGCTGGTCGAAGCTTTCTTT 98
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1029542 TAACCAGAAAGTCCGGATGG TTCTGAAGTCAAGTGGGGAGA 75
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+</help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/find_CAPS.py	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,93 @@
+#!/usr/bin/python2.6
+##find snps that condition CAPS
+##usage  find_CAPS.py <reference file>  <gff file>
+
+
+#Copyright 2012 John McCallum & Leshi Chen
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+
+import sys
+
+from Bio import SeqIO
+from BCBio import GFF
+from Bio.Restriction import *
+
+###This list is limited to economical enzymes performing well in PCR buffer
+rest_batch = RestrictionBatch(
+    [AluI, ApaI, BamHI, BbrPI, BfrI, ClaI, DdeI, DpnII, DraI, EcoRI,
+     HaeIII, HindII, HinfI, HpaI, PvuII, RsaI, SacI, Sau3AI, SmaI, TaqI])
+
+in_file=sys.argv[1]
+gff_file=sys.argv[2]
+
+in_seq_handle = open(in_file)
+in_gff_handle=open(gff_file)
+
+##use iterator
+for myrec in SeqIO.parse(in_seq_handle, "fasta"):
+    ##create single-entry dictionary to accept gff annotations from parser
+    seq_dict = {myrec.id:myrec}
+
+    ##note that this filters out only SNP features
+    limit_info = dict(gff_id = [myrec.id] ,gff_type = ['SNP'])
+    in_gff_handle.seek(0)
+
+    ##parse annotations into
+    annotations = [r for r
+                   in GFF.parse(in_gff_handle,
+                                base_dict=seq_dict,
+                                limit_info=limit_info)]
+
+    ##if there are any for this sequence, proceed
+    if annotations:
+        rec=annotations[0]
+        for feat in rec.features:
+            fstart=feat.location.start.position
+            fend=feat.location.end.position
+
+            if   20 < fstart < len(rec) - 20:
+                #just work with +/- 20 bp, ignoring SNPS within this
+                #distance from ends
+                fseq=rec.seq[fstart-20:fstart+20]
+                ref_seq = rec.seq[fstart-20:fstart+20]
+                variant_seq = ref_seq.tomutable()
+
+                #mutate the variant
+                variant_seq[20]= feat.qualifiers['Variant_seq'][0]
+                variant_seq = variant_seq.toseq()
+
+                #digest the sequences
+                ref_cuts =  rest_batch.search(ref_seq)
+                var_cuts =  rest_batch.search(variant_seq)
+
+                #print
+                for enz in ref_cuts:
+                    kr = set(ref_cuts[enz])
+                    km = set(var_cuts[enz])
+                    outputstr=[rec.id, fstart +1,fend+1,feat.id,enz]
+                    if len(kr) > len(km):
+                        outputstr.append("reference")
+                        print('\t'.join(map(str,outputstr)))
+                    elif len(kr) < len(km):
+                        outputstr.append("variant")
+                        print('\t'.join(map(str,outputstr)))
+
+in_gff_handle.close()
+in_seq_handle.close()
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/find_CAPS.xml	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,81 @@
+<?xml version="1.0"?>
+<tool id="CAPS_Marker_Design_2" name="CAPS Detection">
+  <description>identify SNPs that condition restriction polymorphisms</description>
+  <command interpreter="python">find_CAPS.py $inputFasta $inputSNPGff3File >  $outputfile</command>
+  <inputs>
+    <param format="gff3" name="inputSNPGff3File" type="data" label="GFF3 file containing SNP data"/>
+    <param format="fasta" name="inputFasta" type="data" label="fasta source file" />
+  </inputs>
+  <outputs>
+     <data format="interval" name="outputfile" />
+  </outputs>
+<help>
+.. class:: infomark
+
+**TIP**
+
+This tool identifies SNPs that condition restriction polymorphisms.
+
+Currently it utilizes a fixed list of robust enzymes known to work well in PCR buffers
+
+ AluI,ApaI,BamHI,BbrPI,BfrI,ClaI,DpnI,DraI,EcoRI,HaeIII,HindII,HinfI,HpaI,PvuII,RsaI,SacI,Sau3AI,SmaI,TaqI
+
+It produces a tabular output in interval format
+
+record ID,  start, stop, feature ID,enzyme, phase (ie whether it cuts reference or variant sequence)
+
+
+
+
+
+**Example**
+
+
+
+*input GFF*
+
+::
+
+	JR843866	gsmapper	SNP	63	63	.	.	.	ID=JR843866:gsmapperSNP:63;Reference_seq=T;Variant_seq=C;Total_reads=22;Variant_reads=20
+	JR843866	gsmapper	SNP	146	146	.	.	.	ID=JR843866:gsmapperSNP:146;Reference_seq=T;Variant_seq=C;Total_reads=26;Variant_reads=10
+	JR843866	gsmapper	SNP	258	258	.	.	.	ID=JR843866:gsmapperSNP:258;Reference_seq=T;Variant_seq=G;Total_reads=4;Variant_reads=3
+	JR848320	gsmapper	SNP	157	157	.	.	.	ID=JR848320:gsmapperSNP:157;Reference_seq=C;Variant_seq=T;Total_reads=10;Variant_reads=10
+	JR848554	gsmapper	SNP	54	54	.	.	.	ID=JR848554:gsmapperSNP:54;Reference_seq=T;Variant_seq=G;Total_reads=5;Variant_reads=5
+	JR848554	gsmapper	SNP	74	74	.	.	.	ID=JR848554:gsmapperSNP:74;Reference_seq=C;Variant_seq=T;Total_reads=7;Variant_reads=7
+	JR848554	gsmapper	SNP	123	123	.	.	.	ID=JR848554:gsmapperSNP:123;Reference_seq=T;Variant_seq=A;Total_reads=11;Variant_reads=11
+	JR848554	gsmapper	SNP	147	147	.	.	.	ID=JR848554:gsmapperSNP:147;Reference_seq=T;Variant_seq=C;Total_reads=13;Variant_reads=13
+	JR848554	gsmapper	SNP	161	161	.	.	.	ID=JR848554:gsmapperSNP:161;Reference_seq=C;Variant_seq=T;Total_reads=13;Variant_reads=13
+
+
+
+*output columnar data*
+
+::
+
+	JR843866	63	64	JR843866:gsmapperSNP:63		HaeIII	variant
+	JR848320	157	158	JR848320:gsmapperSNP:157	TaqI	variant
+	JR848320	157	158	JR848320:gsmapperSNP:157	HinfI	variant
+	JR848554	162	163	JR848554:gsmapperSNP:162	TaqI	variant
+	JR848554	162	163	JR848554:gsmapperSNP:162	ClaI	variant
+	JR848554	306	307	JR848554:gsmapperSNP:306	TaqI	variant
+	JR848554	652	653	JR848554:gsmapperSNP:652	TaqI	variant
+
+
+-------------------------------------------------------------------------------
+
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+
+
+
+</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gsmapper2gff.sh	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,50 @@
+#!/bin/sh
+##convert gsMapper output into gff3/GVF format
+
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+infile=$1
+outfile=$2
+
+awk '
+BEGIN {OFS="\t"}
+/^>/  && sub(/%/,"",$7)  {
+  ID=substr($1,2)
+  if  (length($4) > 1 || match($4,"-") || length($5) > 1 || match($5,"-"))
+    type="indel"
+    else
+      type="SNP"
+start=$2
+end=$3
+Col9_ID=ID ":gsmapper:" type ":"start
+
+Reference_seq=$4
+Variant_seq=$5
+Total_reads=$6
+Variant_reads=Total_reads * $7 /100 - (Total_reads * $7 % 100)/100
+
+
+
+ print ID,"gsmapper",type,start,end,".",".",".","ID="Col9_ID";Reference_seq="Reference_seq";Variant_seq="Variant_seq";Total_reads="Total_reads";Variant_reads="Variant_reads
+}' "$infile" > "$outfile"
+
+
+
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/parse_primersearch.pl	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,37 @@
+#!/usr/bin/perl
+#parse_primersearch.pl
+#reformat EMBOSS primersearch output into columnar Galaxy interval format
+
+#Copyright 2012 John McCallum
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+open (IN, "<$ARGV[0]");
+open (OUT, ">$ARGV[1]");
+
+#print OUT  "primerset_id","\t","sequence_id","\t","hit_start","\","mismatches","\t","amplimer_size",\n";
+
+
+
+while (<IN>) {
+         /^Primer name (\S+)/  && ($name = $1);  # get primer set name
+         # Modified to cope with unnamed sequence input 28/7/05
+        /Sequence: (\S+)/ && print OUT  $name,"\t",$1;
+        /Sequence:(\s{4,})/ && print OUT $name,"\t","unnamed_seq";
+        /hits forward strand at (\d+) with (\d) mismatches/ && ($start = $1) &&  print OUT  "\t",$2,"\t",$start,;
+        /Amplimer length: (\S+)/ && ($amp_length = $1) &&  print OUT "\t",$start + $amp_length,"\t",$1,"\n";
+        }
+
+close( IN );
+close( OUT );
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/parse_primersearch.xml	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,68 @@
+<?xml version="1.0"?>
+<tool id="parse_primersearch_1" name="Parse Primer search">
+  <description>Parse EMBOSS primer search output to tabular </description>
+  <command interpreter="perl">parse_primersearch.pl $inputPrimersearchFile $outputfile </command>
+  <inputs>
+    <param format="primersearch" name="inputPrimersearchFile" type="data" label="Primersearch  output file"/>
+  </inputs>
+  <outputs>
+
+     <data format="tabular" name="outputfile" />
+  </outputs>
+<help>
+
+.. class:: infomark
+
+**TIP**
+
+This tool parses EMBOSS primersearch_ output into columnar format suitable for use as interval
+
+
+
+Columns are:
+
+1. Primer set ID
+2. Hit ID
+3. Number of mismatches
+4. Amplimer start
+5. Amplimer end
+6. Amplicon length
+
+
+----
+
+**Example**
+
+*output*
+
+::
+
+ ACP032  isotig07062     0       214     363     149
+ ACP223  isotig04647     0       362     574     212
+ ACP224  isotig04647     0       303     519     216
+ ACP225  isotig04647     0       153     355     202
+ ACP363  isotig10393     0       93      193     100
+ ACP394  isotig00271     0       894     986     92
+ ACP394  isotig00273     0       805     897     92
+ ACP440  isotig05277     0       506     601     95
+ ACP615  isotig00271     0       894     978     84
+ ACP615  isotig00273     0       805     889     84
+ AJK295  isotig06005     0       182     651     469
+
+
+.. _primersearch: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/primersearch.html
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/patman.xml	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,51 @@
+<?xml version="1.0"?>
+<tool id="patman_1" name="search for patterns in DNA using PatMaN">
+  <description>search for approximate patterns in DNA libraries</description>
+  <command>patman -a -e $edits -g $gaps -D $inputfastaFile -P $inputPatfile | sort | uniq > $patman_outputfile  </command>
+  <inputs>
+  <param  name="edits" type="integer" label="max N mismatches and or gaps" value="0" size="1" />
+  <param name="gaps" type="integer" label="max N gaps" value="0" size="1"/>
+    <param format="fasta" name="inputfastaFile" type="data" label="Multifasta target file" />
+    <param format="fasta" name="inputPatfile" type="data" label="Pattern fasta" />
+
+  </inputs>
+  <outputs>
+     <data format="tabular"  name="patman_outputfile" />
+  </outputs>
+<help>
+
+
+
+This is a Galaxy wrapper for PatMaN: a DNA pattern matcher for short sequences
+
+* Website https://bioinf.eva.mpg.de/patman/
+* PubMed Citation_
+
+*Inputs*
+
+* Patterns in fasta format (create from tabular using tabular-to-fasta tool)
+* Multifasta file of target sequences
+
+*Output*
+
+* 6 Column tabular interval data
+* Columns Chrom, Pattern Name, Start, End, strand, N mismatches
+
+::
+
+ isotig05934	ACP818	368	389	+	0
+ isotig05934	ACP859	377	396	+	0
+ isotig06765	ACP822	448	468	+	0
+ isotig07088	ACP825	49	75	+	0
+ isotig07652	ACP830	199	218	+	0
+ isotig07652	ACP831	257	276	+	0
+ isotig10333	ACP837	474	497	+	0
+ isotig10393	ACP838	10	34	+	0
+
+
+
+.. _Citation: http://www.ncbi.nlm.nih.gov/pubmed/18467344?dopt=Abstract
+
+
+</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/patman2gff.xml	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<tool id="patman2gff" name="convert patman to GFF3">
+  <description>convert output of patman pattern detection to gvf/gff3</description>
+  <command>awk 'OFS="\t" {print $1,"patman","primer_binding_site",$3,$4,$6,$5,".","name="$2}' $inputFile  >  $outputfile</command>
+  <inputs>
+    <param format="tabular" name="inputFile" type="data" label="Input patman File" help="tabular output from patman tool" />
+  </inputs>
+  <outputs>
+<data format="gff3" name="outputfile" />
+
+  </outputs>
+<help>
+This tool provides a simple conversion from patman column output to GFF3
+
+-------------------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/uniq.xml	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<tool id="uni_Mask_1" name="Unique">
+  <description>Return unique lines</description>
+  <command >cat $inputFile | sort | uniq >  $outputfile</command>
+  <inputs>
+    <param format="txt" name="inputFile" type="data" label="Input  File" help="Any text or tabular file" />
+  </inputs>
+  <outputs>
+     <data format="txt"  name="outputfile" />
+  </outputs>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf2gvf.sh	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,46 @@
+#!/bin/sh
+##convert vcf to gvf
+##NOTE This is a very simple basic parser for a complex format.
+
+##usage vcf2gvf.sh <vcf file> <outputfile>
+
+#Copyright 2012 John McCallum & Leshi Chen
+#New Zealand Institute for Plant and Food Research
+
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+
+inputfile=$1
+outputfile=$2
+
+echo  "##gvf-version 1.05" > $outputfile
+
+awk '
+BEGIN {OFS="\t"}
+
+##get feature type
+{if (index($8,"INDEL")== 1) {type="INDEL"} else {type="SNP"} }
+##get feature length
+{if (type=="SNP")
+    {feat_length=1}
+    else {feat_length=length($4)}
+}
+{end=($2+feat_length)}
+
+!/^#/  { print $1 ,"SAMTOOLS",type,$2,end,$6,".",".","ID="$1":SAMTOOLS:"type":"$2";Variant_seq="$5";Reference_seq="$4";"$8}
+
+END {print ""}
+' "$inputfile" > "$outputfile"
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf2gvf.xml	Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,55 @@
+<?xml version="1.0"?>
+<tool id="vcf2gvf_1" name="VCF to GFF3">
+  <description>convert vcf to gvf/gff3</description>
+  <command interpreter="bash">vcf2gvf.sh  $inputFile  $outputfile</command>
+  <inputs>
+    <param format="vcf" name="inputFile" type="data" label="Input vcf File" help="vcf file from mpileup" />
+  </inputs>
+  <outputs>
+<data format="gff3" name="outputfile" />
+
+  </outputs>
+<help>
+This tool provides a simple conversion from vcf to gvf.
+
+Be sure to read the documentation to determine if it meets your requirements.
+
+* vcf documentation at http://samtools.sourceforge.net/samtools.shtml#6
+* GVF/GFF3 at http://www.sequenceontology.org/resources/gvf.html
+
+
+
+**input**
+
+::
+
+	PGSC0003DMB000000010	2042429	.	C	A	44.6	.	DP=10;VDB=0.0118;AF1=0.8295;AC1=7;DP4=2,1,3,4;MQ=20;FQ=8.78;PV4=1,5.2e-10,1,1	GT:PL:DP:GQ	0/1:14,0,42:5:23	1/1:27,6,0:2:9	1/1:15,3,0:1:7	1/1:30,6,0:2:9
+	PGSC0003DMB000000038	1756646	.	G	A	3.69	.	DP=15;VDB=0.0166;AF1=0.495;AC1=4;DP4=3,7,2,2;MQ=20;FQ=5.6;PV4=0.58,3.8e-09,1,0.31	GT:PL:DP:GQ	0/1:20,3,0:1:6	0/1:9,0,67:7:8	0/0:0,15,82:5:17	0/1:16,3,0:1:5
+	PGSC0003DMB000000064	1916664	.	T	C	8.12	.	DP=4;VDB=0.0151;AF1=1;AC1=8;DP4=0,0,0,3;MQ=20;FQ=-29.5	GT:PL:DP:GQ	1/1:14,3,0:1:5	1/1:0,0,0:0:3	1/1:13,3,0:1:5	1/1:15,3,0:1:5
+
+
+**output**
+
+
+::
+
+	PGSC0003DMB000000010	samtools	SNP	2042429	2042430	44.6	.	.	ID=PGSC0003DMB000000010:SAMTOOLS:SNP:2042429;Variant_seq=A;Reference_seq=C;DP=10;VDB=0.0118;AF1=0.8295;AC1=7;DP4=2,1,3,4;MQ=20;FQ=8.78;PV4=1,5.2e-10,1,1
+	PGSC0003DMB000000038	samtools	SNP	1756646	1756647	3.69	.	.	ID=PGSC0003DMB000000038:SAMTOOLS:SNP:1756646;Variant_seq=A;Reference_seq=G;DP=15;VDB=0.0166;AF1=0.495;AC1=4;DP4=3,7,2,2;MQ=20;FQ=5.6;PV4=0.58,3.8e-09,1,0.31
+	PGSC0003DMB000000064	samtools	SNP	1916664	1916665	8.12	.	.	ID=PGSC0003DMB000000064:SAMTOOLS:SNP:1916664;Variant_seq=C;Reference_seq=T;DP=4;VDB=0.0151;AF1=1;AC1=8;DP4=0,0,0,3;MQ=20;FQ=-29.5
+
+
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+</help>
+</tool>