Mercurial > repos > mcharles > rapsodyn
changeset 23:edddaa8ab855 draft
Uploaded
author | mcharles |
---|---|
date | Thu, 21 Aug 2014 08:39:15 -0400 |
parents | afaf2e8aedcc |
children | e8e6b962c1f2 |
files | rapsodyn/extractseq.pl rapsodyn/extractseq.xml |
diffstat | 2 files changed, 152 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rapsodyn/extractseq.pl Thu Aug 21 08:39:15 2014 -0400 @@ -0,0 +1,132 @@ +#!/usr/bin/perl +#V1.10 + +use strict; +use warnings; +use Getopt::Long; + +my $input_variant_file; +my $input_assembly_file; +my $WINDOWS_LENGTH = 50; + +GetOptions ( +"input_variant_file=s" => \$input_variant_file, +"input_assembly_file=s" => \$input_assembly_file, +"window_length=i" => \$WINDOWS_LENGTH +) or die("Error in command line arguments\n"); + +open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n"); +open(INA, $input_assembly_file) or die ("Can't open $input_assembly_file\n"); + +my @variant_list; + + +### Retrieving the assembly +my %genome; + +my $current_header=""; +my $current_seq=""; +while (my $ligne = <INA>){ + if ($ligne =~ /^\>(.*?)\s*$/){ + if ($current_header){ + $genome{$current_header} = $current_seq; + } + $current_header=$1; + $current_seq = ""; + } + else { + if ($ligne=~/^([ATGCNXatgcnx]+)\s*$/){ + $current_seq .= $1; + } + else { + print STDERR "Erreur Parsing n°2\n$ligne\n"; + } + } +} +#TRAITEMENT DU DERNIER +if ($current_header){ + $genome{$current_header} = $current_seq; + undef($current_seq); +} +close (INA); + + +### Retrieving the variant +while (my $ligne=<INV>){ + if ($ligne !~ /^\s*$/){ + my %variant; + my @fields = split (/\s+/,$ligne); + $variant{"ref"}=$fields[0]; + $variant{"position"}=$fields[1]; + $variant{"baseref"}=$fields[2]; + $variant{"depth"}=$fields[3]; + $variant{"pileup"}=$fields[4]; + + + my $start = &max($variant{"position"} - $WINDOWS_LENGTH,1); + my $stop = &min ($variant{"position"} + $WINDOWS_LENGTH,length($genome{$variant{"ref"}})); + my $length = $stop-$start+1; + + #print $variant{"position"}," / ",length($genome{$variant{"ref"}})," / ","$start / $stop / $length \n"; + + $variant{"SEQ"} = substr $genome{$variant{"ref"}},$start-1,$length; + + my $pileup = $variant{"pileup"}; + $pileup =~ s/\$//g; #the read start at this position + $pileup =~ s/\^.//g; #the read end at this position + my $descriptor = $variant{"position"}."_".$variant{"depth"}."_"; + if ($pileup=~/\+([0-9]+)([ACGTNacgtn]+)/){ + $descriptor .="I".$1."_".$2; + } + elsif ($pileup=~/\-([0-9]+)([ACGTNacgtn]+)/){ + $descriptor .="D".$1."_".$2; + } + elsif ($pileup=~/([ACGTNacgtn])/){ + $descriptor.="M1"."_".$1; + } + else { + $descriptor.="?_?"; + } + $variant{"desc"}=$descriptor; + + print ">",$variant{"ref"},"_",$descriptor,"\n",$variant{"SEQ"},"\n"; + + + + #print ">",$variant{"ref"},"_",$variant{"position"},"_",$variant{"depth"},"\n",$variant{"SEQ"},"\n"; + + push(@variant_list,\%variant); + } +} +close (INV); + + + + + + + + + +#*********** +sub min{ + my $first = shift; + my $second = shift; + if ($first <= $second){ + return $first; + } + else { + return $second; + } +} + +sub max { + my $first = shift; + my $second = shift; + if ($first >= $second){ + return $first; + } + else { + return $second; + } +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rapsodyn/extractseq.xml Thu Aug 21 08:39:15 2014 -0400 @@ -0,0 +1,20 @@ +<tool id="extractseq" name="extractseq" version="0.03"> +<description>Extract Sequence around variant position</description> +<command interpreter="perl"> + extractseq.pl -input_variant_file $input_variant_file -input_assembly_file $input_assembly_file -window_length $window_length > $output_file +</command> +<inputs> +<param name="input_variant_file" type="data" format="pileup" label="Select a suitable input VARIANT file from your history"/> +<param name="input_assembly_file" type="data" format="fasta" label="Select a suitable input ASSEMBLY file from your history"/> +<param name="window_length" type="integer" value="50" label="Number of bases extracted before and after the variant position"/> +</inputs> +<outputs> + <data name="output_file" format="fasta" label="${tool.name} on ${on_string}"/> +</outputs> + +<help> + + + +</help> +</tool>