diff rapsodyn/PrepareFastqLight.pl @ 0:442a7c88b886 draft

Uploaded
author mcharles
date Wed, 10 Sep 2014 09:18:15 -0400
parents
children 9074a5104cdd
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rapsodyn/PrepareFastqLight.pl	Wed Sep 10 09:18:15 2014 -0400
@@ -0,0 +1,420 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+my $read1 = $ARGV[0];
+my $read2 = $ARGV[1];
+
+my $output1 = $ARGV[2];
+my $output2 = $ARGV[3];
+
+my $TYPE = $ARGV[4];
+my $MIN_LENGTH = $ARGV[5];
+my $MIN_QUALITY = $ARGV[6];
+
+my $VERBOSE = $ARGV[7];
+
+if (!$VERBOSE){
+	$VERBOSE ="OFF";
+}
+
+open(READ1, $read1) or die ("Can't open $read1\n");
+open(READ2, $read2) or die ("Can't open $read2\n");
+open(OUT1, ">$output1") or die ("Can't open $output1\n");
+open(OUT2, ">$output2") or die ("Can't open $output2\n");
+
+
+my $error1=0;
+my $error2=0;
+my $error3=0;
+my $error4=0;
+my $error5=0;
+my $error6=0;
+my $error7=0;
+my $error8=0;
+my $error9=0;
+my $error10=0;
+
+while (my $ligne1_r1 =<READ1>){
+	my $ligne2_r1 =<READ1>;
+	my $ligne3_r1 =<READ1>;
+	my $ligne4_r1 =<READ1>;
+	my $ligne1_r2 =<READ2>;
+	my $ligne2_r2 =<READ2>;
+	my $ligne3_r2 =<READ2>;
+	my $ligne4_r2 =<READ2>;
+	
+#@ 1 sec
+	if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){
+		if ($VERBOSE eq "ON"){
+			print "Error in file format";
+			if ($ligne1_r1){print $ligne1_r1;}
+			if ($ligne2_r1){print $ligne2_r1;}
+			if ($ligne3_r1){print $ligne3_r1;}
+			if ($ligne4_r1){print $ligne4_r1;}
+			if ($ligne1_r2){print $ligne1_r2;}
+			if ($ligne2_r2){print $ligne2_r2;}
+			if ($ligne3_r2){print $ligne3_r2;}
+			if ($ligne4_r2){print $ligne4_r2;}
+			print "\n";
+		}
+		$error1++;
+	}
+	elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){
+		if ($VERBOSE eq "ON"){
+			print "Error in header : format\n";
+			print $ligne1_r1;
+			print $ligne2_r1;
+			print $ligne3_r1;
+			print $ligne4_r1;
+			print $ligne1_r2;
+			print $ligne2_r2;
+			print $ligne3_r2;
+			print $ligne4_r2;
+			print "\n";
+		}
+		$error2++;
+	}
+#@ 1 - 2 sec
+	else {
+		
+		my $length_seq1 = length($ligne2_r1);
+		my $length_qual1 =length($ligne4_r1);
+		my $seq1;
+		my $qual1;
+	
+		my $length_seq2 = length($ligne2_r2);
+		my $length_qual2 =length($ligne4_r2);
+		my $seq2;
+		my $qual2;
+		my $header1="";
+		my $header2="";
+		my $repheader1="";
+		my $repheader2="";
+		
+		if ($ligne1_r1 =~/^\@(.*?)\#/){
+			$header1 = $1;
+		}
+		
+		if ($ligne3_r1 =~/^\+(.*?)\#/){
+                        $repheader1 = $1;
+               	}
+
+		if ($ligne1_r2 =~/^\@(.*?)\#/){
+                        $header2 = $1;
+               	}
+
+		if ($ligne3_r2 =~/^\+(.*?)\#/){
+                        $repheader2 = $1;
+               	}
+#@ 2 sec
+		
+		### Verification de la coherence sequence /qualité   @ 1 sec
+		if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){
+			if ($VERBOSE eq "ON"){
+				print "Error in header : empty\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error3++;
+		}
+		elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){
+			if ($VERBOSE eq "ON"){
+				print "Error in header refgsd : empty\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error3++;
+		}
+		elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){
+			if ($VERBOSE eq "ON"){
+				print "Error in header : different\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error4++;
+		}
+		elsif (($TYPE eq "sanger")&&($header1 ne $header2)){
+			if ($VERBOSE eq "ON"){
+				print "Error in header : different\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error4++;
+		}
+		elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){
+			if ($VERBOSE eq "ON"){
+				print "Error in seq/qual length\n";
+				print $ligne1_r1;
+				print $ligne2_r1;
+				print $ligne3_r1;
+				print $ligne4_r1;
+				print $ligne1_r2;
+				print $ligne2_r2;
+				print $ligne3_r2;
+				print $ligne4_r2;
+				print "\n";
+			}
+			$error5++;
+		}
+#@ 1 - 2 sec	
+		else {
+			### Parsing sequence & qualité
+			if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){
+				$seq1 = $1;
+			}
+			if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){
+				$seq2 = $1;
+			}
+			if ($ligne4_r1 =~ /^(.*)\s*$/i){
+				$qual1 = $1;
+			}
+			if ($ligne4_r2 =~ /^(.*)\s*$/i){
+				$qual2 = $1;
+			}
+#@ 2 sec			
+			### Verification du parsing et de la coherence sequence /qualité (n°2)
+			if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){
+				if ($VERBOSE eq "ON"){
+					print "Error parsing seq / quality \n";
+					print $ligne1_r1;
+					print $ligne2_r1;
+					print $ligne3_r1;
+					print $ligne4_r1;
+					print $ligne1_r2;
+					print $ligne2_r2;
+					print $ligne3_r2;
+					print $ligne4_r2;
+					print "\n";
+				}
+				$error6++;
+			}
+			elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){
+				if ($VERBOSE eq "ON"){
+					print "Error in seq/qual length after parsing\n";
+					print $ligne1_r1;
+					print $ligne2_r1;
+					print $ligne3_r1;
+					print $ligne4_r1;
+					print $ligne1_r2;
+					print $ligne2_r2;
+					print $ligne3_r2;
+					print $ligne4_r2;
+					print "\n";
+				}
+				$error7++;
+			}
+#@ <1 sec
+			else {
+				my $fastq_lines_r1="";
+				my $fastq_lines_r2="";
+				$fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1);
+				if ($fastq_lines_r1){
+					$fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2);
+				}
+				if ($fastq_lines_r2){
+					print OUT1 $fastq_lines_r1;
+					print OUT2 $fastq_lines_r2;
+				}
+			}
+		}
+	
+		# print OUT1 $ligne1_r1;
+		# print OUT1 $ligne2_r1;
+		# print OUT1 $ligne3_r1;
+		# print OUT1 $ligne4_r1;
+		# print OUT2 $ligne1_r2;
+		# print OUT2 $ligne2_r2;
+		# print OUT2 $ligne3_r2;
+		# print OUT2 $ligne4_r2;
+		
+#@ 7 sec
+	}
+}
+
+
+
+close (READ1);
+close (READ2);
+close (OUT1);
+close (OUT2);
+
+
+
+
+sub grooming_and_trimming{
+	my $header = shift;
+	my $seq = shift;
+	my $quality = shift;
+	my $quality_converted="";
+	
+	my $startnoN = 0;
+	my $stopnoN = length($quality)-1;
+	
+	#print "SEQ :\n$seq\n";
+	
+	my $chercheN = $seq;
+	my @bad_position;
+	my $current_index = index($chercheN,"N");
+	my $abs_index = $current_index;
+	while ($current_index >=0){
+		push (@bad_position,$abs_index);
+
+		if ($current_index<length($seq)){
+			$chercheN = substr($chercheN,$current_index+1);
+			$current_index = index($chercheN,"N");
+			$abs_index = $current_index + $bad_position[$#bad_position]+1;
+		}
+		else {
+			last;
+		}
+	}
+	
+	
+	if ($#bad_position>=0){
+		my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)};
+		$startnoN = $coord{"start"};
+		$stopnoN = $coord{"stop"};	
+	}
+	my $lengthnoN = $stopnoN - $startnoN + 1;
+	my $seqnoN = substr($seq,$startnoN,$lengthnoN);
+	#print "$seqnoN\n";
+	
+	if ($lengthnoN >= $MIN_LENGTH){
+		my $startTrim = $startnoN;
+		my $stopTrim = $stopnoN;
+		
+		my $quality_converted="";
+		my @bad_position;
+		
+		my @q = split(//,$quality);
+		#print "QUALITY\n";
+		#print "$quality\n";
+		for (my $i=0;$i<=$stopnoN;$i++){
+			my $chr = $q[$i];
+			my $num = ord($q[$i]);
+			if ($TYPE eq "illumina"){
+				$num = $num -64+33;
+				$quality_converted .= chr($num);
+			}
+			
+			if ($num <$MIN_QUALITY + 64 - 33 ){
+				push(@bad_position,$i+$startnoN);
+			}
+		}
+		if ($quality_converted){$quality = $quality_converted;}
+		#print "$quality\n";
+		
+		
+		
+		if ($#bad_position>=0){
+			# for (my $i=0;$i<=$#bad_position;$i++){
+				# print $bad_position[$i]."\t";
+			# }
+			# print "\n";
+			my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)};
+			$startTrim = $coord{"start"};
+			$stopTrim = $coord{"stop"};
+			#print "$startTrim .. $stopTrim\n";
+			
+		}
+		my $lengthTrim = $stopTrim - $startTrim +1;
+		
+		
+		my $fastq_lines="";
+		
+		if ($lengthTrim >= $MIN_LENGTH){
+			$fastq_lines .= $header;
+			$fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n";
+			$fastq_lines .= "+\n";
+			$fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n";
+			return $fastq_lines;
+		}
+		else {
+			return "";
+		}
+		
+		
+		
+	}
+	else {
+		return "";
+	}
+	
+	
+	# my @s = split(//,$seq);
+	# my $sanger_quality="";
+
+	
+	
+
+	# return $sanger_quality;
+}
+
+sub extract_longer_string_coordinates_from_bad_position{
+	my $start=shift;
+	my $stop =shift;
+	my $refbad = shift;
+	my @bad_position = @$refbad;
+	my %coord;
+	
+	my $current_start = $start;
+	my $current_stop = $bad_position[0]-1;
+	if ($current_stop < $start){$current_stop = $start;}
+	
+	
+	#debut -> premier N
+	my $current_length = $current_stop - $current_start +1;
+	my $test_length;
+	
+	#entre les N
+	for (my $i=1;$i<=$#bad_position;$i++){
+		$test_length = $bad_position[$i]+1-$bad_position[$i-1]-1;
+		if ( $test_length >  $current_length){
+			$current_start = $bad_position[$i-1]+1;
+			$current_stop = $bad_position[$i]-1;
+			$current_length = $current_stop - $current_start +1;
+		}
+	}
+	
+	#dernier N -> fin
+	$test_length = $stop-$bad_position[$#bad_position]+1;
+	if ( $test_length >  $current_length){
+		$current_start = $bad_position[$#bad_position]+1;
+		if ($current_start > $stop){$current_start=$stop;}
+		$current_stop = $stop;
+	}	
+	$coord{"start"}=$current_start;
+	$coord{"stop"}= $current_stop;
+	$coord{"lenght"}=$current_stop-$current_start+1;
+	
+	return \%coord;
+}