# HG changeset patch # User mcharles # Date 1408624755 14400 # Node ID edddaa8ab855f8ee6c66bdb3befba12c5301f52f # Parent afaf2e8aedcc323a8b8e11888d451662ffd9b7f9 Uploaded diff -r afaf2e8aedcc -r edddaa8ab855 rapsodyn/extractseq.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rapsodyn/extractseq.pl Thu Aug 21 08:39:15 2014 -0400 @@ -0,0 +1,132 @@ +#!/usr/bin/perl +#V1.10 + +use strict; +use warnings; +use Getopt::Long; + +my $input_variant_file; +my $input_assembly_file; +my $WINDOWS_LENGTH = 50; + +GetOptions ( +"input_variant_file=s" => \$input_variant_file, +"input_assembly_file=s" => \$input_assembly_file, +"window_length=i" => \$WINDOWS_LENGTH +) or die("Error in command line arguments\n"); + +open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n"); +open(INA, $input_assembly_file) or die ("Can't open $input_assembly_file\n"); + +my @variant_list; + + +### Retrieving the assembly +my %genome; + +my $current_header=""; +my $current_seq=""; +while (my $ligne = ){ + if ($ligne =~ /^\>(.*?)\s*$/){ + if ($current_header){ + $genome{$current_header} = $current_seq; + } + $current_header=$1; + $current_seq = ""; + } + else { + if ($ligne=~/^([ATGCNXatgcnx]+)\s*$/){ + $current_seq .= $1; + } + else { + print STDERR "Erreur Parsing n°2\n$ligne\n"; + } + } +} +#TRAITEMENT DU DERNIER +if ($current_header){ + $genome{$current_header} = $current_seq; + undef($current_seq); +} +close (INA); + + +### Retrieving the variant +while (my $ligne=){ + if ($ligne !~ /^\s*$/){ + my %variant; + my @fields = split (/\s+/,$ligne); + $variant{"ref"}=$fields[0]; + $variant{"position"}=$fields[1]; + $variant{"baseref"}=$fields[2]; + $variant{"depth"}=$fields[3]; + $variant{"pileup"}=$fields[4]; + + + my $start = &max($variant{"position"} - $WINDOWS_LENGTH,1); + my $stop = &min ($variant{"position"} + $WINDOWS_LENGTH,length($genome{$variant{"ref"}})); + my $length = $stop-$start+1; + + #print $variant{"position"}," / ",length($genome{$variant{"ref"}})," / ","$start / $stop / $length \n"; + + $variant{"SEQ"} = substr $genome{$variant{"ref"}},$start-1,$length; + + my $pileup = $variant{"pileup"}; + $pileup =~ s/\$//g; #the read start at this position + $pileup =~ s/\^.//g; #the read end at this position + my $descriptor = $variant{"position"}."_".$variant{"depth"}."_"; + if ($pileup=~/\+([0-9]+)([ACGTNacgtn]+)/){ + $descriptor .="I".$1."_".$2; + } + elsif ($pileup=~/\-([0-9]+)([ACGTNacgtn]+)/){ + $descriptor .="D".$1."_".$2; + } + elsif ($pileup=~/([ACGTNacgtn])/){ + $descriptor.="M1"."_".$1; + } + else { + $descriptor.="?_?"; + } + $variant{"desc"}=$descriptor; + + print ">",$variant{"ref"},"_",$descriptor,"\n",$variant{"SEQ"},"\n"; + + + + #print ">",$variant{"ref"},"_",$variant{"position"},"_",$variant{"depth"},"\n",$variant{"SEQ"},"\n"; + + push(@variant_list,\%variant); + } +} +close (INV); + + + + + + + + + +#*********** +sub min{ + my $first = shift; + my $second = shift; + if ($first <= $second){ + return $first; + } + else { + return $second; + } +} + +sub max { + my $first = shift; + my $second = shift; + if ($first >= $second){ + return $first; + } + else { + return $second; + } +} \ No newline at end of file diff -r afaf2e8aedcc -r edddaa8ab855 rapsodyn/extractseq.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rapsodyn/extractseq.xml Thu Aug 21 08:39:15 2014 -0400 @@ -0,0 +1,20 @@ + +Extract Sequence around variant position + + extractseq.pl -input_variant_file $input_variant_file -input_assembly_file $input_assembly_file -window_length $window_length > $output_file + + + + + + + + + + + + + + + +