Mercurial > repos > mcharles > rapsodyn
changeset 5:26d2c851d48e draft
Uploaded
author | mcharles |
---|---|
date | Mon, 16 Jun 2014 06:18:07 -0400 |
parents | 53eff7c08ad3 |
children | 1bf299f77ce6 |
files | filtersamunique/filtersamunique.pl filtersamunique/filtersamunique.xml mpileupfilter/mpileupfilter.pl mpileupfilter/mpileupfilter.xml mpileupfilteronblastxml/mpileupfilteronblastxml.pl mpileupfilteronblastxml/mpileupfilteronblastxml.xml |
diffstat | 6 files changed, 763 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filtersamunique/filtersamunique.pl Mon Jun 16 06:18:07 2014 -0400 @@ -0,0 +1,20 @@ +#!/usr/bin/perl +use strict; +use warnings; + +open(IN, $ARGV[0]) or die ("Can't open $ARGV[0]\n"); +while (my $line=<IN>){ + if ($line =~ /^\@/){ + #Header conservation + print $line; + } + else { + #Optionnal flag verification + if (($line =~ /XT\:A\:U/)&&($line =~ /X0\:i\:1/)&&($line =~ /X1\:i\:0\s/)){ + print $line; + } + } +} + + +close (IN); \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filtersamunique/filtersamunique.xml Mon Jun 16 06:18:07 2014 -0400 @@ -0,0 +1,18 @@ +<tool id="filtersamunique" name="filtersamunique" version="0.01"> +<description>Filter SAM file for uniquelly match reads</description> +<command interpreter="perl"> + filtersamunique.pl $input_sam_file > $output_file +</command> +<inputs> +<param name="input_sam_file" type="data" format="sam" label="Select a suitable input SAM file from your history"/> +</inputs> +<outputs> + <data name="output_file" format="sam" label="${tool.name} on ${on_string}"/> +</outputs> + +<help> + + + +</help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mpileupfilter/mpileupfilter.pl Mon Jun 16 06:18:07 2014 -0400 @@ -0,0 +1,394 @@ +#!/usr/bin/perl +use strict; +use Getopt::Long; + +# +# Filter a pileup file on forward/reverse presence and %read having the variant +# The error code +# 1 : multiple variant type detected insertion/deletion/mutation +# 1i : inconsistency in insertion +# 1d : inconsistency in deletion +# 1m : inconsistency in mutation +# 2 : insufficient depth +# 3 : insufficient variant frequency +# 4 : variant position not covered by forward and reverse reads +# 5 : variant with other variant in neighbourhood +# 6 : too much depth +# 8 : parsing error (couldn't parse the mpileup line correctly) +# 9 : parsing error (couldn't parse the readbase string correctly) + + +my $inputfile; +my $logfile; +my $MIN_DISTANCE=0; +my $MIN_VARIANTFREQUENCY=0; +my $MIN_FORWARDREVERSE=0; +my $MIN_DEPTH=0; +my $MAX_DEPTH=500; +my $VERBOSE=0; +my $ONLY_UNFILTERED_VARIANT="OFF"; + +if ($#ARGV<0){ + print "\n"; + print "perl 020_FilterPileupv6 -input_file <mpileup_file> [OPTION]\n"; + print "-input_file \tinputfile in mpileup format\n"; + print "-log_file \tlogfile containing discarded mpileup lines and the errorcode associated\n"; + print "-min_depth \tminimum depth required [1]\n"; + print "-max_depth \tmaximim depth (position with more coverage will be discarded) [100]\n"; + print "-min_frequency \tminimum variant frequency (0->1) [1] (default 1 => 100% reads show the variant at this position)\n"; + print "-min_distance \tminimum distance between variant [0]\n"; + print "-min_forward_and_reverse \tminimum number of reads in forward and reverse covering the variant required [0]\n"; + print "\n"; + exit(0); +} + +GetOptions ( +"input_file=s" => \$inputfile, +"log_file=s" => \$logfile, +"min_depth=i" => \$MIN_DEPTH, +"max_depth=i" => \$MAX_DEPTH, +"min_frequency=f" => \$MIN_VARIANTFREQUENCY, +"min_distance=i" => \$MIN_DISTANCE, +"min_forward_and_reverse=i" => \$MIN_FORWARDREVERSE, +"variant_only=s" => \$ONLY_UNFILTERED_VARIANT, +"v=i" => \$VERBOSE +) or die("Error in command line arguments\n"); + + +open(IF, $inputfile) or die("Can't open $inputfile\n"); + +my @tbl_line; +my @tbl_variant_position; +my @tbl_variant_chr; +my @tbl_variant_refbase; +my @tbl_variant_coverage; +my @tbl_variant_readbase_string; +my @tbl_variant_quality_string; + +#Extraction des variants +my $nb_line=0; +while (my $line=<IF>){ + $nb_line++; + if (($nb_line % 1000000 == 0)&&($VERBOSE==1)){ + print "$nb_line\n"; + } + my $error_code=0; + if ($line=~/(.*?)\s+(\d+)\s+([ATGCN])\s+(\d+)\s+(.*?)\s+(.*?)$/){ + my $current_chromosome = $1; + my $current_position = $2; + my $current_refbase = $3; + my $current_coverage = $4; + my $current_readbase_string = $5; + my $current_quality_string = $6; + + #Suppression of mPileUp special character + $current_readbase_string =~ s/\$//g; #the read start at this position + $current_readbase_string =~ s/\^.//g; #the read end at this position followed by quality char + + if ($current_readbase_string =~ /[ATGCNatgcn\d]/){ + push(@tbl_line,$line); + push(@tbl_variant_chr,$current_chromosome); + push(@tbl_variant_position,$current_position); + push(@tbl_variant_refbase,$current_refbase); + push(@tbl_variant_coverage,$current_coverage); + push(@tbl_variant_readbase_string,$current_readbase_string); + push(@tbl_variant_quality_string,$current_quality_string); + if ($ONLY_UNFILTERED_VARIANT eq "ON"){ + print $line; + } + + } + else { + #Position with no variant + } + + } + else { + #Error Parsing + print STDERR "$line #8"; + } +} +close(IF); + +if ($ONLY_UNFILTERED_VARIANT eq "ON"){ + exit(0); +} + +####Checking the distance between variant and other filter + +if ($logfile){ + open(LF,">$logfile") or die ("Cant't open $logfile\n"); +} + +for (my $i=0;$i<=$#tbl_line;$i++){ + # print "ligne : $tbl_line[$i]\n"; + + my $error_code=0; + if ($i==0){ + #Comparing $i and $i+1 for neighbourhood filter; + if ($#tbl_line>0){ + if (($tbl_variant_chr[$i+1] eq $tbl_variant_chr[$i])&&($tbl_variant_position[$i]+$MIN_DISTANCE>=$tbl_variant_position[$i+1])){ + $error_code=5; + chomp($tbl_line[$i]); + if ($logfile){ + print LF "$tbl_line[$i]\tcode:$error_code\n"; + } + next; + } + } + + #Additionnal filters + $error_code = check_error($tbl_variant_chr[$i],$tbl_variant_position[$i],$tbl_variant_refbase[$i],$tbl_variant_coverage[$i],$tbl_variant_readbase_string[$i]); + + } + else { + #Compairing $i and $i-1 for neighbourhood filter + if (($tbl_variant_chr[$i-1] eq $tbl_variant_chr[$i])&&($tbl_variant_position[$i-1]+$MIN_DISTANCE>=$tbl_variant_position[$i])){ + $error_code=5; + chomp($tbl_line[$i]); + if ($logfile){ + print LF "$tbl_line[$i]\tcode:$error_code\n"; + } + next; + } + else { + #Additionnal filters + $error_code = check_error($tbl_variant_chr[$i],$tbl_variant_position[$i],$tbl_variant_refbase[$i],$tbl_variant_coverage[$i],$tbl_variant_readbase_string[$i]); + } + } + if ($error_code == 0){ + print $tbl_line[$i]; + } + else { + chomp($tbl_line[$i]); + if ($logfile){ + print LF "$tbl_line[$i]\tcode:$error_code\n"; + } + } +} + +if ($logfile){ + close (LF); +} + +sub check_error{ + my $current_chromosome = shift; + my $current_position = shift; + my $current_refbase = shift; + my $current_coverage = shift; + my $current_readbase_string = shift; + + # print "test : $current_readbase_string\n"; + + + + #Extraction of insertions + + ################################################################## + # my @IN = $current_readbase_string =~ m/\+[0-9]+[ACGTNacgtn]+/g; + # my @DEL = $current_readbase_string =~ m/\-[0-9]+[ACGTNacgtn]+/g; + # print "IN : @IN\n"; + # print "DEL :@DEL\n"; + #$current_readbase_string=~s/[\+\-][0-9]+[ACGTNacgtn]+//g; + ################################################################## + #!!! marche pas : exemple .+1Ct. correspond a . / +1C / t /. mais le match de l'expression vire +1Ct + ################################################################## + + # => parcours de boucle + my @readbase = split(//,$current_readbase_string); + my $cleaned_readbase_string=""; + my @IN; + my @DEL; + my $current_IN=""; + my $current_DEL=""; + my $current_size=0; + + for (my $i=0;$i<=$#readbase;$i++){ + if ($readbase[$i] eq "+"){ + #Ouverture de IN + $current_IN="+"; + + #Recuperation de la taille + my $sub = substr $current_readbase_string,$i; + if ($sub=~/^\+(\d+)/){ + $current_size = $1; + } + my $remaining_size = $current_size; + while (($remaining_size>0)&&($i<=$#readbase)){ + $i++; + $current_IN.=$readbase[$i]; + if ($readbase[$i]=~ /[ATGCNatgcn]/){ + $remaining_size--; + } + } + push(@IN,$current_IN); + } + elsif ($readbase[$i] eq "-"){ + #Ouverture de DEL + $current_DEL="-"; + + #Recuperation de la taille + my $sub = substr $current_readbase_string,$i; + if ($sub=~/^\-(\d+)/){ + $current_size = $1; + } + my $remaining_size = $current_size; + while (($remaining_size>0)&&($i<=$#readbase)){ + $i++; + $current_DEL.=$readbase[$i]; + if ($readbase[$i]=~ /[ATGCNatgcn]/){ + $remaining_size--; + } + } + push(@DEL,$current_DEL); + + } + else { + #Ajout a la string + $cleaned_readbase_string .= $readbase[$i]; + } + } + + + # print "IN : @IN\n"; + # print "DEL :@DEL\n"; + # print "$cleaned_readbase_string\n"; + + my @current_readbase_array = split(//,$cleaned_readbase_string); + + #Filtering : error detection + + if ($#current_readbase_array+1 != $current_coverage){ + return 9; + #parsing error (couldn't parse the readbase string correctly) + } + elsif ($current_coverage<$MIN_DEPTH){ + return 2; + # 2 : insufficient depth + } + elsif ($current_coverage>$MAX_DEPTH){ + return 6; + # 6 : too much depth + } + else { + if ($#IN>=0){ + if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){ + return 1; + # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation) + } + else { + ########## TEST de coherence des insertions ################ + # for (my $i=0;$i<=$#IN;$i++){ + # if (uc($IN[0]) ne uc($IN[$i])){ + # print uc($IN[0]),"\n"; + # print uc($IN[$i]),"\n"; + # return "1i"; + # } + # } + ########################################################### + + if($#IN+1 < $current_coverage*$MIN_VARIANTFREQUENCY){ + return 3; + # 3 : insufficient variant frequency + } + } + } + elsif ($#DEL>=0){ + if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){ + return 1; + # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation) + } + else { + ########## TEST de coherence des deletions ################ + # for (my $i=0;$i<=$#DEL;$i++){ + # if (uc($DEL[0]) ne uc($DEL[$i])){ + # print uc($DEL[0]),"\n"; + # print uc($DEL[$i]),"\n"; + # return "1d"; + # } + # } + ########################################################### + + if($#DEL+1 < $current_coverage*$MIN_VARIANTFREQUENCY){ + return 3; + # 3 : insufficient variant frequency + } + } + } + else { + my $nbA=0; + $nbA++ while ($current_readbase_string =~ m/A/g); + my $nbC=0; + $nbC++ while ($current_readbase_string =~ m/C/g); + my $nbT=0; + $nbT++ while ($current_readbase_string =~ m/T/g); + my $nbG=0; + $nbG++ while ($current_readbase_string =~ m/G/g); + my $nbN=0; + $nbN++ while ($current_readbase_string =~ m/N/g); + my $nba=0; + $nba++ while ($current_readbase_string =~ m/a/g); + my $nbc=0; + $nbc++ while ($current_readbase_string =~ m/c/g); + my $nbt=0; + $nbt++ while ($current_readbase_string =~ m/t/g); + my $nbg=0; + $nbg++ while ($current_readbase_string =~ m/g/g); + my $nbn=0; + $nbn++ while ($current_readbase_string =~ m/n/g); + + if (($nbA+$nba>0)&&($nbT+$nbt+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){ + return "1m"; + } + if (($nbT+$nbt>0)&&($nbA+$nba+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){ + return "1m"; + } + if (($nbG+$nbg>0)&&($nbA+$nba+$nbT+$nbt+$nbC+$nbc+$nbN+$nbn>0)){ + return "1m"; + } + if (($nbC+$nbc>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbN+$nbn>0)){ + return "1m"; + } + if (($nbN+$nbn>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbC+$nbc>0)){ + return "1m"; + } + + if ($nbA+$nba >= $current_coverage*$MIN_VARIANTFREQUENCY){ + if (($nbA<$MIN_FORWARDREVERSE)||($nba<$MIN_FORWARDREVERSE)){ + return 4; + # 4 : variant position not covered by forward and reverse reads + } + } + elsif ($nbT+$nbt >= $current_coverage*$MIN_VARIANTFREQUENCY){ + if (($nbT<$MIN_FORWARDREVERSE)||($nbt<$MIN_FORWARDREVERSE)){ + return 4; + # 4 : variant position not covered by forward and reverse reads + } + } + elsif ($nbG+$nbg >= $current_coverage*$MIN_VARIANTFREQUENCY){ + if (($nbG<$MIN_FORWARDREVERSE)||($nbg<$MIN_FORWARDREVERSE)){ + return 4; + # 4 : variant position not covered by forward and reverse reads + } + } + elsif ($nbC+$nbc >= $current_coverage*$MIN_VARIANTFREQUENCY){ + if (($nbC<$MIN_FORWARDREVERSE)||($nbc<$MIN_FORWARDREVERSE)){ + return 4; + # 4 : variant position not covered by forward and reverse reads + } + } + elsif ($nbN+$nbn >= $current_coverage*$MIN_VARIANTFREQUENCY){ + if (($nbN<$MIN_FORWARDREVERSE)||($nbn<$MIN_FORWARDREVERSE)){ + return 4; + # 4 : variant position not covered by forward and reverse reads + } + } + else { + return 3; + # 3 : insufficient variant frequency + } + } + } + + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mpileupfilter/mpileupfilter.xml Mon Jun 16 06:18:07 2014 -0400 @@ -0,0 +1,23 @@ +<tool id="mpileupfilter" name="mpileupfilter" version="0.05"> +<description>Filter mpileup file entry</description> +<command interpreter="perl"> + mpileupfilter.pl -input_file $input_file -min_depth $min_depth -min_frequency $min_frequency -min_distance $min_distance -min_forward_and_reverse $min_forward_and_reverse -max_depth $max_depth > $output_file +</command> +<inputs> +<param name="input_file" type="data" format="pileup" label="Select a suitable input file from your history"/> +<param name="min_depth" type="integer" value="2" label="Minimum depth at variant position "/> +<param name="max_depth" type="integer" value="100" label="Maximum depth at variant position "/> +<param name="min_frequency" type="float" value="1" label="Minimum variant frequency (between 0-1 : 0.5 for 50%) "/> +<param name="min_forward_and_reverse" type="integer" value="0" label="Minimum variant coverage by forward and reverse reads"/> +<param name="min_distance" type="integer" value="50" label="Minimum physical distance between variant"/> +</inputs> +<outputs> + <data name="output_file" format="pileup" label="${tool.name} on ${on_string}"/> +</outputs> + +<help> + + + +</help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mpileupfilteronblastxml/mpileupfilteronblastxml.pl Mon Jun 16 06:18:07 2014 -0400 @@ -0,0 +1,287 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Getopt::Long; + +my $input_variant_file; +my $input_blastxml_file; +my $window_length = 50; +my $nb_mismatch_max = 2; + +GetOptions ( +"input_variant_file=s" => \$input_variant_file, +"input_blastxml_file=s" => \$input_blastxml_file, +"window_length=i" => \$window_length, +"nb_mismatch_max=i" => \$nb_mismatch_max +) or die("Error in command line arguments\n"); + + +open(INB, $input_blastxml_file) or die ("Can't open $input_blastxml_file\n"); + + +my $iteration_stop="</Iteration>"; +my $hit_stop="</Hit>"; +my $hsp_stop="</Hsp>"; +my $query_flag_start="<Iteration_query-def>"; +my $query_flag_stop="</Iteration_query-def>"; +my $subject_flag_start="<Hit_def>"; +my $subject_flag_stop="</Hit_def>"; +my $query_HSP_flag_from_start="<Hsp_query-from>"; +my $query_HSP_flag_from_stop="</Hsp_query-from>"; +my $query_HSP_flag_to_start="<Hsp_query-to>"; +my $query_HSP_flag_to_stop="</Hsp_query-to>"; +my $subject_HSP_flag_from_start="<Hsp_hit-from>"; +my $subject_HSP_flag_from_stop="</Hsp_hit-from>"; +my $subject_HSP_flag_to_start="<Hsp_hit-to>"; +my $subject_HSP_flag_to_stop="</Hsp_hit-to>"; +my $query_HSP_flag_seq_start="<Hsp_qseq>"; +my $query_HSP_flag_seq_stop="</Hsp_qseq>"; +my $subject_HSP_flag_seq_start="<Hsp_hseq>"; +my $subject_HSP_flag_seq_stop="</Hsp_hseq>"; +my $HSP_midline_flag_start="<Hsp_midline>"; +my $HSP_midline_flag_stop="</Hsp_midline>"; + +my %hash; + +my $compt=0; +my $current_query=""; +my $current_subject=""; +my $current_HSP_query_from; +my $current_HSP_query_to; +my $current_HSP_subject_from; +my $current_HSP_subject_to; +my $current_HSP_midline; +my $current_HSP_qseq; +my $current_HSP_hseq; +my $current_query_short; + +while (my $ligne = <INB>) { + + if ($ligne=~/$subject_flag_start(.*?)$subject_flag_stop/){ + $current_subject=$1; + #print "--",$1,"\n"; + } + + if ($ligne=~/$query_flag_start(.*?)$query_flag_stop/){ + $current_query=$1; + $current_query_short=$current_query; + if ($current_query =~ /^(.*?_\d+)\_/){ + $current_query_short=$1; + } + if (!$hash{$current_query_short}){ + $hash{$current_query_short}=0; + } + + + #print "--",$1,"\n"; + } + if ($ligne=~/$query_HSP_flag_from_start(.*?)$query_HSP_flag_from_stop/){ + $current_HSP_query_from=$1; + #print "--",$1,"..."; + } + if ($ligne=~/$query_HSP_flag_to_start(.*?)$query_HSP_flag_to_stop/){ + $current_HSP_query_to=$1; + #print $1,"\n"; + } + if ($ligne=~/$subject_HSP_flag_from_start(.*?)$subject_HSP_flag_from_stop/){ + $current_HSP_subject_from=$1; + #print "--",$1,"..."; + } + if ($ligne=~/$subject_HSP_flag_to_start(.*?)$subject_HSP_flag_to_stop/){ + $current_HSP_subject_to=$1; + #print $1,"\n"; + } + if ($ligne=~/$query_HSP_flag_seq_start(.*?)$query_HSP_flag_seq_stop/){ + $current_HSP_qseq=$1; + #print "--",$1,"\n"; + } + if ($ligne=~/$subject_HSP_flag_seq_start(.*?)$subject_HSP_flag_seq_stop/){ + $current_HSP_hseq=$1; + #print "--",$1,"\n"; + } + if ($ligne=~/$HSP_midline_flag_start(.*?)$HSP_midline_flag_stop/){ + $current_HSP_midline=$1; + #print "--",$1,"\n"; + } + + if ($ligne=~/$hsp_stop/){ + if ($current_HSP_query_from){ + #print "\ntest1\n"; + #print "Query : $current_query\n"; + #print "Subject : $current_subject\n"; + #print "$current_HSP_query_from ... $current_HSP_query_to\n"; + #print "$current_HSP_subject_from ... $current_HSP_subject_to\n"; + for (my $i=1;$i<$current_HSP_query_from;$i++){ + $current_HSP_qseq = "N".$current_HSP_qseq; + $current_HSP_midline = " ".$current_HSP_midline; + $current_HSP_hseq = "N".$current_HSP_hseq; + } + for (my $i=$current_HSP_query_to+1;$i<=$window_length*2+1;$i++){ + $current_HSP_qseq .= "N"; + $current_HSP_midline .= " "; + $current_HSP_hseq .= "N"; + } + + my @qseq = split(//,$current_HSP_qseq); + my @midline = split(//,$current_HSP_midline); + my @hseq = split(//,$current_HSP_hseq); + + my $comptbase=0; + my $compt5p=0; + my $compt3p=0; + for (my $i=0;$i<=$#qseq;$i++){ + if ($qseq[$i] ne "-"){ + $comptbase++; # Va de 1 -> $window_length *2 +1 + } + if ($midline[$i] eq " "){ + if ($comptbase<=$window_length){ #1 -> $window_length + $compt5p++; + } + elsif ($comptbase>=$window_length+2){ #$window_length+2 -> $window_length *2 + 1; + $compt3p++; + } + else { #+1-$window_length*2+1 + + } + } + } + if (($compt3p<=$nb_mismatch_max)||($compt5p<=$nb_mismatch_max)){ + $hash{$current_query_short}++; + } + + #print "$current_HSP_qseq\n"; + #print "$current_HSP_midline\n"; + #print "$current_HSP_hseq\n"; + #print "$compt5p // $compt3p\n"; + #print $hash{$current_query_short},"\n"; + + } + + undef $current_HSP_query_from; + undef $current_HSP_query_to; + undef $current_HSP_subject_from; + undef $current_HSP_subject_to; + $current_HSP_midline=""; + $current_HSP_qseq=""; + $current_HSP_hseq=""; + } + + if ($ligne=~/$iteration_stop/){ + if ($current_HSP_query_from){ + #print "\ntest2\n"; + #print "Query : $current_query\n"; + #print "Subject : $current_subject\n"; + #print "$current_HSP_query_from ... $current_HSP_query_to\n"; + #print "$current_HSP_subject_from ... $current_HSP_subject_to\n"; + for (my $i=1;$i<$current_HSP_query_from;$i++){ + $current_HSP_qseq = "N".$current_HSP_qseq; + $current_HSP_midline = " ".$current_HSP_midline; + $current_HSP_hseq = "N".$current_HSP_hseq; + } + for (my $i=$current_HSP_query_to+1;$i<=$window_length*2+1;$i++){ + $current_HSP_qseq .= "N"; + $current_HSP_midline .= " "; + $current_HSP_hseq .= "N"; + } + + my @qseq = split(//,$current_HSP_qseq); + my @midline = split(//,$current_HSP_midline); + my @hseq = split(//,$current_HSP_hseq); + + my $comptbase=0; + my $compt5p=0; + my $compt3p=0; + for (my $i=0;$i<=$#qseq;$i++){ + if ($qseq[$i] ne "-"){ + $comptbase++; # Va de 1 -> $window_length *2 +1 + } + if ($midline[$i] eq " "){ + if ($comptbase<=$window_length){ #1 -> $window_length + $compt5p++; + } + elsif ($comptbase>=$window_length+2){ #$window_length+2 -> $window_length *2 + 1; + $compt3p++; + } + else { #+1-$window_length*2+1 + + } + } + } + if (($compt3p<=$nb_mismatch_max)||($compt5p<=$nb_mismatch_max)){ + $hash{$current_query_short}++; + } + + #print "$current_HSP_qseq\n"; + #print "$current_HSP_midline\n"; + #print "$current_HSP_hseq\n"; + #print "$compt5p // $compt3p\n"; + #print $hash{$current_query_short},"\n"; + } + $current_query=""; + $current_query_short=""; + $current_subject=""; + undef $current_HSP_query_from; + undef $current_HSP_query_to; + undef $current_HSP_subject_from; + undef $current_HSP_subject_to; + $current_HSP_midline=""; + $current_HSP_qseq=""; + $current_HSP_hseq=""; + } + +} + +close (INB); + +# foreach my $key (sort trinombre keys %hash){ + # print $key," ",$hash{$key},"\n"; + +# } +# exit(0); + +open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n"); + +while (my $ligne = <INV>) { + my @champs = split (/\s+/,$ligne); + my $header = $champs[0]."_".$champs[1]; + + if ($hash{$header}){ + if ($hash{$header}==1){ + print "$ligne"; + } + else { + #print $hash{$header}," $ligne"; + } + } + else { + print STDERR "No blast result for ",$header,"\n"; + } + + +} + +close(INV); + + + +sub trinombre { + my $chra=$a; + my $posa=0; + my $chrb=$b; + my $posb=0; + + if ($a =~/(.*?)\_(\d+)/){ + $chra=$1; + $posa=$2; + } + if ($b =~/(.*?)\_(\d+)/){ + $chrb=$1; + $posb=$2; + } + + + $chra cmp $chrb + || + $posa <=> $posb; +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mpileupfilteronblastxml/mpileupfilteronblastxml.xml Mon Jun 16 06:18:07 2014 -0400 @@ -0,0 +1,21 @@ +<tool id="mpileupfilteronblastxml" name="mpileupfilteronblastxml" version="0.03"> +<description>Filter mpileup with blast results</description> +<command interpreter="perl"> + mpileupfilteronblastxml.pl -input_variant_file $input_variant_file -input_blastxml_file $input_blastxml_file -window_length $window_length -nb_mismatch_max $nb_mismatch_max > $output_file +</command> +<inputs> +<param name="input_variant_file" type="data" format="pileup" label="Select a suitable input VARIANT file from your history"/> +<param name="input_blastxml_file" type="data" format="xml" label="Select a suitable input BLASTXML file from your history"/> +<param name="window_length" type="integer" value="50" label="Number of bases extracted before and after the variant position"/> +<param name="nb_mismatch_max" type="integer" value="3" label="Threshold for mismatch filter"/> +</inputs> +<outputs> + <data name="output_file" format="fasta" label="${tool.name} on ${on_string}"/> +</outputs> + +<help> + + + +</help> +</tool>