Mercurial > repos > mcharles > rapsosnp
diff rapsodyn/PrepareFastqLight.pl @ 0:442a7c88b886 draft
Uploaded
author | mcharles |
---|---|
date | Wed, 10 Sep 2014 09:18:15 -0400 |
parents | |
children | 9074a5104cdd |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rapsodyn/PrepareFastqLight.pl Wed Sep 10 09:18:15 2014 -0400 @@ -0,0 +1,420 @@ +#!/usr/bin/perl +use strict; +use warnings; + +my $read1 = $ARGV[0]; +my $read2 = $ARGV[1]; + +my $output1 = $ARGV[2]; +my $output2 = $ARGV[3]; + +my $TYPE = $ARGV[4]; +my $MIN_LENGTH = $ARGV[5]; +my $MIN_QUALITY = $ARGV[6]; + +my $VERBOSE = $ARGV[7]; + +if (!$VERBOSE){ + $VERBOSE ="OFF"; +} + +open(READ1, $read1) or die ("Can't open $read1\n"); +open(READ2, $read2) or die ("Can't open $read2\n"); +open(OUT1, ">$output1") or die ("Can't open $output1\n"); +open(OUT2, ">$output2") or die ("Can't open $output2\n"); + + +my $error1=0; +my $error2=0; +my $error3=0; +my $error4=0; +my $error5=0; +my $error6=0; +my $error7=0; +my $error8=0; +my $error9=0; +my $error10=0; + +while (my $ligne1_r1 =<READ1>){ + my $ligne2_r1 =<READ1>; + my $ligne3_r1 =<READ1>; + my $ligne4_r1 =<READ1>; + my $ligne1_r2 =<READ2>; + my $ligne2_r2 =<READ2>; + my $ligne3_r2 =<READ2>; + my $ligne4_r2 =<READ2>; + +#@ 1 sec + if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){ + if ($VERBOSE eq "ON"){ + print "Error in file format"; + if ($ligne1_r1){print $ligne1_r1;} + if ($ligne2_r1){print $ligne2_r1;} + if ($ligne3_r1){print $ligne3_r1;} + if ($ligne4_r1){print $ligne4_r1;} + if ($ligne1_r2){print $ligne1_r2;} + if ($ligne2_r2){print $ligne2_r2;} + if ($ligne3_r2){print $ligne3_r2;} + if ($ligne4_r2){print $ligne4_r2;} + print "\n"; + } + $error1++; + } + elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){ + if ($VERBOSE eq "ON"){ + print "Error in header : format\n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error2++; + } +#@ 1 - 2 sec + else { + + my $length_seq1 = length($ligne2_r1); + my $length_qual1 =length($ligne4_r1); + my $seq1; + my $qual1; + + my $length_seq2 = length($ligne2_r2); + my $length_qual2 =length($ligne4_r2); + my $seq2; + my $qual2; + my $header1=""; + my $header2=""; + my $repheader1=""; + my $repheader2=""; + + if ($ligne1_r1 =~/^\@(.*?)\#/){ + $header1 = $1; + } + + if ($ligne3_r1 =~/^\+(.*?)\#/){ + $repheader1 = $1; + } + + if ($ligne1_r2 =~/^\@(.*?)\#/){ + $header2 = $1; + } + + if ($ligne3_r2 =~/^\+(.*?)\#/){ + $repheader2 = $1; + } +#@ 2 sec + + ### Verification de la coherence sequence /qualité @ 1 sec + if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){ + if ($VERBOSE eq "ON"){ + print "Error in header : empty\n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error3++; + } + elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){ + if ($VERBOSE eq "ON"){ + print "Error in header refgsd : empty\n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error3++; + } + elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){ + if ($VERBOSE eq "ON"){ + print "Error in header : different\n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error4++; + } + elsif (($TYPE eq "sanger")&&($header1 ne $header2)){ + if ($VERBOSE eq "ON"){ + print "Error in header : different\n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error4++; + } + elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){ + if ($VERBOSE eq "ON"){ + print "Error in seq/qual length\n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error5++; + } +#@ 1 - 2 sec + else { + ### Parsing sequence & qualité + if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){ + $seq1 = $1; + } + if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){ + $seq2 = $1; + } + if ($ligne4_r1 =~ /^(.*)\s*$/i){ + $qual1 = $1; + } + if ($ligne4_r2 =~ /^(.*)\s*$/i){ + $qual2 = $1; + } +#@ 2 sec + ### Verification du parsing et de la coherence sequence /qualité (n°2) + if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){ + if ($VERBOSE eq "ON"){ + print "Error parsing seq / quality \n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error6++; + } + elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){ + if ($VERBOSE eq "ON"){ + print "Error in seq/qual length after parsing\n"; + print $ligne1_r1; + print $ligne2_r1; + print $ligne3_r1; + print $ligne4_r1; + print $ligne1_r2; + print $ligne2_r2; + print $ligne3_r2; + print $ligne4_r2; + print "\n"; + } + $error7++; + } +#@ <1 sec + else { + my $fastq_lines_r1=""; + my $fastq_lines_r2=""; + $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1); + if ($fastq_lines_r1){ + $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2); + } + if ($fastq_lines_r2){ + print OUT1 $fastq_lines_r1; + print OUT2 $fastq_lines_r2; + } + } + } + + # print OUT1 $ligne1_r1; + # print OUT1 $ligne2_r1; + # print OUT1 $ligne3_r1; + # print OUT1 $ligne4_r1; + # print OUT2 $ligne1_r2; + # print OUT2 $ligne2_r2; + # print OUT2 $ligne3_r2; + # print OUT2 $ligne4_r2; + +#@ 7 sec + } +} + + + +close (READ1); +close (READ2); +close (OUT1); +close (OUT2); + + + + +sub grooming_and_trimming{ + my $header = shift; + my $seq = shift; + my $quality = shift; + my $quality_converted=""; + + my $startnoN = 0; + my $stopnoN = length($quality)-1; + + #print "SEQ :\n$seq\n"; + + my $chercheN = $seq; + my @bad_position; + my $current_index = index($chercheN,"N"); + my $abs_index = $current_index; + while ($current_index >=0){ + push (@bad_position,$abs_index); + + if ($current_index<length($seq)){ + $chercheN = substr($chercheN,$current_index+1); + $current_index = index($chercheN,"N"); + $abs_index = $current_index + $bad_position[$#bad_position]+1; + } + else { + last; + } + } + + + if ($#bad_position>=0){ + my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)}; + $startnoN = $coord{"start"}; + $stopnoN = $coord{"stop"}; + } + my $lengthnoN = $stopnoN - $startnoN + 1; + my $seqnoN = substr($seq,$startnoN,$lengthnoN); + #print "$seqnoN\n"; + + if ($lengthnoN >= $MIN_LENGTH){ + my $startTrim = $startnoN; + my $stopTrim = $stopnoN; + + my $quality_converted=""; + my @bad_position; + + my @q = split(//,$quality); + #print "QUALITY\n"; + #print "$quality\n"; + for (my $i=0;$i<=$stopnoN;$i++){ + my $chr = $q[$i]; + my $num = ord($q[$i]); + if ($TYPE eq "illumina"){ + $num = $num -64+33; + $quality_converted .= chr($num); + } + + if ($num <$MIN_QUALITY + 64 - 33 ){ + push(@bad_position,$i+$startnoN); + } + } + if ($quality_converted){$quality = $quality_converted;} + #print "$quality\n"; + + + + if ($#bad_position>=0){ + # for (my $i=0;$i<=$#bad_position;$i++){ + # print $bad_position[$i]."\t"; + # } + # print "\n"; + my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)}; + $startTrim = $coord{"start"}; + $stopTrim = $coord{"stop"}; + #print "$startTrim .. $stopTrim\n"; + + } + my $lengthTrim = $stopTrim - $startTrim +1; + + + my $fastq_lines=""; + + if ($lengthTrim >= $MIN_LENGTH){ + $fastq_lines .= $header; + $fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n"; + $fastq_lines .= "+\n"; + $fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n"; + return $fastq_lines; + } + else { + return ""; + } + + + + } + else { + return ""; + } + + + # my @s = split(//,$seq); + # my $sanger_quality=""; + + + + + # return $sanger_quality; +} + +sub extract_longer_string_coordinates_from_bad_position{ + my $start=shift; + my $stop =shift; + my $refbad = shift; + my @bad_position = @$refbad; + my %coord; + + my $current_start = $start; + my $current_stop = $bad_position[0]-1; + if ($current_stop < $start){$current_stop = $start;} + + + #debut -> premier N + my $current_length = $current_stop - $current_start +1; + my $test_length; + + #entre les N + for (my $i=1;$i<=$#bad_position;$i++){ + $test_length = $bad_position[$i]+1-$bad_position[$i-1]-1; + if ( $test_length > $current_length){ + $current_start = $bad_position[$i-1]+1; + $current_stop = $bad_position[$i]-1; + $current_length = $current_stop - $current_start +1; + } + } + + #dernier N -> fin + $test_length = $stop-$bad_position[$#bad_position]+1; + if ( $test_length > $current_length){ + $current_start = $bad_position[$#bad_position]+1; + if ($current_start > $stop){$current_start=$stop;} + $current_stop = $stop; + } + $coord{"start"}=$current_start; + $coord{"stop"}= $current_stop; + $coord{"lenght"}=$current_stop-$current_start+1; + + return \%coord; +}