Galaxy |

Changeset 0:fadef644b886 (2015-05-22)

Next changeset 1:b2125910c8fd (2015-05-22)

Commit message:
Uploaded

added:
2bit.loc.sample
multiplicom_primer_trimming.pl
multiplicom_primer_trimming.xml
tool_data_table_conf.xml.sample
tool_dependencies.xml

diff -r 000000000000 -r fadef644b886 2bit.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/2bit.loc.sample Fri May 22 08:27:03 2015 -0400

@@ -0,0 +1,19 @@
+#This is a sample file distributed with Galaxy that enables the Multiplicom Primer Trimmer
+#tool to locate reference genome files in 2bit format. You will
+#need to supply these files and then create a 2bit.loc file
+#similar to this one (store it in this directory) that points to
+#the 2bit files themselves (full path to each file). The 2bit.loc
+#file has this format (white space characters are TAB characters):
+#
+#<unique_build_id> <display_name> <2bit_path>
+#
+#for example:
+#
+#hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/twobit/hg19.2bit
+#
+#then your /depot/data2/galaxy/twobit/ directory
+#would need to contain the following 2bit files:
+#
+#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 hg19.2bit
+#
+

diff -r 000000000000 -r fadef644b886 multiplicom_primer_trimming.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/multiplicom_primer_trimming.pl Fri May 22 08:27:03 2015 -0400

[

b'@@ -0,0 +1,537 @@\n+#!/usr/bin/perl\n+\n+## needs twoBitToFa v285, or other versions supporting the -bed option.\n+\n+# load modules\n+use Getopt::Std;\n+\n+##########\n+## opts ##\n+##########\n+## input files\n+# i : input fastq 1\n+# I : input fastq 2 (if paired)\n+# b : bed file with amplicon positions\n+# r : read length (default 250)\n+# o : output fastq1\n+# O : output fastq2\n+# F : failed readpairs\n+# R : short report\n+# t : 2bit file location\n+# w : working directory (defaults to tmp)\n+\n+getopts(\'i:I:b:r:o:O:F:R:t:\', \\%opts) ;\n+ \n+# check input values\n+if (!exists($opts{\'i\'}) || !-e $opts{\'i\'}) {\n+\tdie(\'Fastq File not found\');\n+}\n+if (!exists($opts{\'I\'}) || !-e $opts{\'I\'}) {\n+\tdie(\'FastQ for paired end reads not found\');\n+}\n+if (!exists($opts{\'o\'})) {\n+\tdie(\'No output file specified for forward reads\');\n+}\n+if (!exists($opts{\'O\'})) {\n+\tdie(\'No output file specified for reverse reads\');\n+}\n+if (!exists($opts{\'F\'})) {\n+\tdie(\'No output file specified for failed pairs\');\n+}\n+\n+if (!exists($opts{\'b\'}) || !-e $opts{\'b\'}) {\n+\tdie(\'BED-File not found\');\n+}\n+\n+#if (exists($opts{\'m\'})) {\n+#\t$minmap = $opts{\'m\'};\n+#}\n+#else {\n+#\t$minmap = 3;\n+#}\n+if (exists($opts{\'r\'})) {\n+\t$readl = $opts{\'r\'};\n+}\n+else {\n+\tprint "Assuming default read length of 2x250bp\\n";\n+\t$readl = 250;\n+}\n+if (exists($opts{\'t\'}) && -e $opts{\'t\'}) {\n+\t$tobit = $opts{\'t\'};\n+}\n+else {\n+\tdie("2BIT reference not found");\n+}\n+\n+my $tbtf = `which twoBitToFa`;\n+chomp($tbtf) ;\n+if ($tbtf eq \'\') {\n+\tif (-e "/opt/software/bin/twoBitToFa") {\n+\t\t$tbtf = "/opt/software/bin/twoBitToFa";\n+\t}\n+\telse {\n+\t\tdie("Could not find a twoBitToFa executable.\\n");\n+\t}\n+}\n+\n+\n+# make output directory in (tmp) working dir\n+if (exists($opts{\'w\'}) && -d $opts{\'w\'}) {\n+\tour $wd = $opts{\'w\'};\n+}\n+else {\n+\tour $wd = "/tmp/Trim.".int(rand(1000));\n+\twhile (-d $wd) {\n+\t\t$wd = 
"/tmp/Trim.".int(rand(1000));\n+\t}\n+\tsystem("mkdir $wd");\n+}\n+#print "Using wd : $wd\\n";\n+\n+\n+## build sequence hash.\n+my %alen = %flen = %rlen = ();\n+open BED, $opts{\'b\'};\n+open OUT, ">$wd/bedfile.zero.bed";\n+my $minf = 999;\n+my $minr = 999;\n+while (<BED>) {\n+\tif ($_ !~ m/^(chr.{1,2}\\t)(\\d+)(\\t.*)/) {\n+\t\tnext;\n+\t}\n+\tchomp($_);\n+\tmy @p = split(/\\t/,$_);\n+\tmy $fl = $p[6] - $p[1]; # p6 holds first non-primer position\n+\tmy $rl = $p[2] - $p[7]; # p7 hold last non-primer position\n+\t## lengths \n+\t$alen{"$p[0]:$p[1]-$p[2]"} = $p[7] - $p[6] + 1;\n+\t$flen{"F:$p[0]:$p[1]-$p[2]"} = $fl;\n+\tif ($fl < $minf) {\n+\t\t$minf = $fl;\n+\t}\n+\tif ($rl < $minr) {\n+\t\t$minr = $rl;\n+\t}\n+\t$rlen{"R:$p[0]:$p[1]-$p[2]"} = $rl;\n+\tprint OUT "$p[0]\\t".($p[1]-1)."\\t".($p[6]-1)."\\tF:$p[0]:$p[1]-$p[2]\\n";\n+\tprint OUT "$p[0]\\t".$p[7]."\\t".$p[2]."\\tR:$p[0]:$p[1]-$p[2]\\n";\n+\n+}\n+close BED;\n+close OUT;\n+\n+system("cd $wd && $tbtf -noMask -bed=bedfile.zero.bed $tobit amplicons.zero.fa");\n+\n+open IN, "$wd/amplicons.zero.fa";\n+my %fseq = %rseq = ();\n+my %rmm = %fmm = ();\n+my @nts = (\'A\',\'C\',\'T\',\'G\');\n+\n+while(<IN>) {\n+\tmy $pr = $_;\n+\tmy $seq = <IN>;\n+\tchomp($pr);\n+\tchomp($seq);\n+\t$pr = substr($pr,1);\n+\tif (substr($pr,0,1) eq \'F\') {\n+\t\t$fseq{$pr} = $seq;\n+\t\tfor ($i = 0; $i < 10; $i++) {\n+\t\t\tforeach(@nts) {\n+\t\t\t\tmy $mut = substr($fseq{$pr},0,$i).$_.substr($rseq{$pr},$i+1,9-$i);\n+\t\t\t\t$fmm{$pr}{$mut} = $fseq{$pr};\n+\t\t\t}\n+\t\t}\n+\t}\n+\telse {\n+\t\t$rseq{$pr} = rc($seq);\n+\t\tfor ($i = 0; $i< 10;$i++){\n+\t\t\tforeach(@nts) {\n+\t\t\t\tmy $mut = substr($rseq{$pr},0,$i).$_.substr($rseq{$pr},$i+1,9-$i);\n+\t\t\t\t$rmm{$pr}{$mut} = $rseq{$pr};\n+\t\t\t}\n+\t\t}\n+\n+\t}\n+}\n+close IN;\n+\n+###############################\n+## generate smallest overlap F##\n+###############################\n+$ntf = $minf;\n+BUILDMIN:\n+my %fmin = ();\n+my %fpairs = ();\n+foreach( keys(%fseq)) {\n+\tmy $sub 
= substr($fseq{$_},0,$ntf);\n+\t## clash => increase nt.\n+\tif (exists($fmin{$sub})) {\n+\t\t## check if not identical (yes, this is possible...) (same start + same length.)\n+\t\t$_ =~ m/F:chr(.{1,2}):(\\d+)-(\\d+)/;\n+\t\tmy $cchr = $1;\n+\t\tmy $cstart = $2;\n+\t\tmy $cl = $flen{$_};\n+\t\tmy @prev = split(/\\|/,$fmin{$sub});\n+\t\tmy $pprim = $prev[0];\n+\t\t$pprim =~ m/F:chr(.{1'..b'en{substr($rmin{$rmmhash{$rseed}},2)});\n+\t\t\t\t}\n+\n+\t\t\t\t$forok++;\n+\t\t\t\t$foundboth++;\n+\t\t\t\tlast;\n+\t\t\t}\n+\t\t\telse {\n+\t\t\t\tif ($forl < $flen{$_}) {\n+\t\t\t\t\t$forl = $flen{$_};\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\t\tif ($forok == 0) {\n+\t\t\t## trim by max length of should be forwards.\n+\t\t\t$s1 = substr($s1,$forl);\n+\t\t\t$q1 = substr($q1,$forl);\n+\t\t\t# statistics\n+\t\t\t$bptrimmed += $forl; \n+\t\t\tif ($readl > $forl + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {\n+\t\t\t\t# trim to alength (incorrect if indels !)\n+\t\t\t\t# statistics\n+\t\t\t\t$toolongf++;\n+\t\t\t\t$bptrimmed += length($s1) - $alen{substr($rmin{$rmmhash{$rseed}},2)};\n+\t\t\t\t$s1 = substr($s1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});\n+\t\t\t\t$q1 = substr($q1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});\n+\t\t\t}\n+\n+\t\t}\n+\t\t$nrfound++;\n+\t}\n+\telse {\n+\t\t## trim forward 5\'\n+\t\t$s1 = substr($s1,$flen{$fmin{$mmhash{$fseed}}});\n+\t\t$q1 = substr($q1,$flen{$fmin{$mmhash{$fseed}}});\n+\t\t# statistics\n+\t\t$bptrimmed += $flen{$fmin{$mmhash{$fseed}}};\n+\n+\t\tif ($readl > $flen{$fmin{$mmhash{$fseed}}} + $alen{substr($fmin{$mmhash{$fseed}},2)}) {\n+\t\t\t# trim to alength (incorrect if indels !)\n+\t\t\t# statistics\n+\t\t\t$toolongf++;\n+\t\t\t$bptrimmed += length($s1) - $alen{substr($fmin{$mmhash{$fseed}},2)};\n+\n+\t\t\t$s1 = substr($s1,0,$alen{substr($fmin{$mmhash{$fseed}},2)});\n+\t\t\t$q1 = substr($q1,0,$alen{substr($fmin{$mmhash{$fseed}},2)});\n+\t\t}\n+\t\t$byf++;\n+\t\t## trim reverse 5\' \n+\t\tmy @rps = 
split(/\\|/,$fpairs{$fmin{$mmhash{$fseed}}});\n+\t\t$revok = 0;\n+\t\tmy $revl = 0;\n+\t\tforeach(@rps) {\n+\t\t\tif (exists($rmm{$_}{substr($s2,0,10)})) {\n+\t\t\t\t$s2 = substr($s2,$rlen{$_});\n+\t\t\t\t$q2 = substr($q2,$rlen{$_});\n+\t\t\t\t# statistics\n+\t\t\t\t$bptrimmed += $rlen{$_};\n+\t\t\t\tif ($readl > $rlen{$_} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {\n+\t\t\t\t\t# trim to alength (incorrect if indels !)\n+\t\t\t\t\t# statistics\n+\t\t\t\t\t$toolongr++;\n+\t\t\t\t\t$bptrimmed += length($s2) - $alen{substr($fmin{$mmhash{$fseed}},2)};\n+\n+\t\t\t\t\t$s2 = substr($s2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});\n+\t\t\t\t\t$q2 = substr($q2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});\n+\t\t\t\t}\n+\n+\t\t\t\t$revok++;\n+\t\t\t\t$foundboth++;\n+\t\t\t\tlast;\n+\t\t\t}\n+\t\t\telse {\n+\t\t\t\tif ($revl < $rlen{$_}) {\n+\t\t\t\t\t$revl = $rlen{$_};\n+\t\t\t\t}\t\n+\t\t\t}\n+\t\t}\n+\t\tif ($revok == 0) {\n+\t\t\t# trim by max length of should be reverses.\n+\t\t\t$s2 = substr($s2,$revl);\n+\t\t\t$q2 = substr($q2,$revl);\n+\t\t\t# statistics\n+\t\t\t$bptrimmed += $revl;\n+\t\t\tif ($readl > $revl + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {\n+\t\t\t\t# trim to alength (incorrect if indels !)\n+\t\t\t\t# statistics\n+\t\t\t\t$toolongr++;\n+\t\t\t\t$bptrimmed += length($s2) - $alen{substr($fmin{$mmhash{$fseed}},2)};\n+\t\t\t\t$s2 = substr($s2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});\n+\t\t\t\t$q2 = substr($q2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});\n+\t\t\t}\n+\n+\t\t}\n+\t\t$nrfound++;\n+\t}\n+\t$outf .= "$r1$s1\\n+\\n$q1\\n";\n+\t$outr .= "$r2$s2\\n+\\n$q2\\n";\n+\t$count++;\n+\tif ($count > 100000) {\n+\t\tprint OUTF $outf;\n+\t\tprint OUTR $outr; \n+\t\n+\t\t$outf = "\\n";\n+\t\t$outr = "\\n";\n+\t\t$count = 0;\n+\t}\n+\n+\t\n+}\n+chomp($outf);\n+chomp($outr);\n+chomp($failout);\n+print OUTF $outf;\n+print OUTR $outr; \n+print FAIL $failout;\n+close IN;\n+close INR;\n+close OUTF;\n+close OUTR;\n+close FAIL;\n+open REPORT, ">$opts{\'R\'}" or die 
("Could not open report file");\n+print REPORT "Results: \\n";\n+print REPORT "########\\n";\n+print REPORT " Read pairs without match: $nrmissed\\n";\n+print REPORT " Read pairs with a valid match: $nrfound\\n";\n+print REPORT " Initial match on Forward: $byf\\n";\n+print REPORT " Initial match on Reverse: $byr\\n";\n+print REPORT " Both F and R Matched: $foundboth\\n";\n+print REPORT " Forward reads trimmed to amplicon length: $toolongf\\n";\n+print REPORT " Reverse reads trimmed to amplicon length: $toolongr\\n";\t\n+print REPORT " Total basepairs in fastq files: $totalbp\\n";\n+print REPORT " Total basepairs trimmed: $bptrimmed (".sprintf("%.2f",($bptrimmed/$totalbp)*100)."%)\\n"; \n+close REPORT;\n+\n+\n+\n+\n+\n+\n+if (!exists($opts{\'w\'})) {\n+\t## clean up\n+\tsystem("rm -Rf $wd");\n+}\n+\n+\n+sub rc {\n+\tmy $seq = shift;\n+\t$seq =~ tr/ACGT/TGCA/;\n+\t$seq = reverse($seq);\n+\treturn $seq;\n+\n+}\n'

diff -r 000000000000 -r fadef644b886 multiplicom_primer_trimming.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/multiplicom_primer_trimming.xml Fri May 22 08:27:03 2015 -0400

@@ -0,0 +1,45 @@
+<tool id="multiplicom_primer_trimming" name="Multiplicom Primer Trimmer" version="0.0.1">
+  <description></description>
+  <requirements> <!-- the wrapped Perl script shells out to twoBitToFa and needs a build supporting the -bed option -->
+    <requirement type="package" version='latest'>twoBitToFa</requirement>
+  </requirements>
+  <command interpreter="perl">
+ multiplicom_primer_trimming.pl
+ ## input files
+ -i "$inputf"
+ -I "$inputr"
+ -b "$mastr"
+ ## read length
+ -r $readlength
+ ## output files
+ -o "$outf"
+ -O "$outr"
+ -F "$failed"
+ -R "$report"
+ ## reference genome
+ -t "${indexes.fields.path}"
+
+  </command>
+  <inputs> <!-- paired FASTQ files, the MASTR BED design, the per-read length and the 2bit reference -->
+ <param name='inputf' type='data' format='fastq,fastqsanger' label='Forward Sequences' help='Forward Reads in fastq format' />
+ <param name='inputr' type='data' format='fastq,fastqsanger' label='Reverse Sequences' help='Reverse Reads in fastq format' />
+ <param name='mastr' type='data' format='bed' label='MASTR file' help='Design file of the Multiplicom MASTR assay' />
+ <param name='readlength' type='integer' value='250' label='Read Length' help='Applied Readlength, per read' />
+ <param name="indexes" type="select" label="Reference Genome" help="Select the correct genome build" >
+ <options from_data_table="2bit" > <!-- must match the table name declared in tool_data_table_conf.xml.sample -->
+ <filter type="sort_by" column="2" />
+ <validator type="no_options" message="No indexes are available" />
+ </options>
+ </param>
+   </inputs>
+   <outputs> <!-- trimmed forward/reverse reads, the failed pairs and a plain-text run report -->
+ <data format_source="inputf" name="outf" label="${tool.name} on ${on_string}: Forward Reads"/>
+     <data format_source="inputr" name="outr" label="${tool.name} on ${on_string}: Reverse Reads"/>
+     <data format_source="inputf" name="failed" label="${tool.name} on ${on_string}: Failed Pairs"/>
+     <data format="txt" name="report" label="${tool.name} on ${on_string}: Runtime output"/>
+   </outputs>
+   <help>
+ This tool scans paired FASTQ files for the presence of Multiplicom MASTR PCR primers. If found, primers are clipped.
+   </help>
+</tool>
+

diff -r 000000000000 -r fadef644b886 tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Fri May 22 08:27:03 2015 -0400

@@ -0,0 +1,9 @@
+
+<tables>
+    <!-- 2bit reference genomes; column order follows 2bit.loc: unique build id (value), display name (name), path to the .2bit file -->
+    <table name="2bit" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="2bit.loc" />
+    </table>
+
+</tables>

diff -r 000000000000 -r fadef644b886 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Fri May 22 08:27:03 2015 -0400

@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="twoBitToFa" version="latest">
+        <repository changeset_revision="22493869d464" name="package_twobittofa" owner="geert-vandeweyer" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <!-- The twoBitToFa package above is taken from the package_twobittofa repository at the pinned changeset_revision. -->
+
+</tool_dependency>