view rapsodyn/PrepareFastqLight.pl @ 5:b0cbb9d21aa9 draft

Uploaded
author mcharles
date Mon, 22 Sep 2014 10:19:53 -0400
parents 9074a5104cdd
children 3f7b0788a1c4
line wrap: on
line source

#!/usr/bin/perl
use strict;
use warnings;

my $read1 = $ARGV[0];
my $read2 = $ARGV[1];

my $output1 = $ARGV[2];
my $output2 = $ARGV[3];

my $TYPE = $ARGV[4];
my $MIN_LENGTH = $ARGV[5];
my $MIN_QUALITY = $ARGV[6];

my $VERBOSE = $ARGV[7];

if (!$VERBOSE){
	$VERBOSE ="OFF";
}

open(READ1, $read1) or die ("Can't open $read1\n");
open(READ2, $read2) or die ("Can't open $read2\n");
open(OUT1, ">$output1") or die ("Can't open $output1\n");
open(OUT2, ">$output2") or die ("Can't open $output2\n");


my $error1=0;
my $error2=0;
my $error3=0;
my $error4=0;
my $error5=0;
my $error6=0;
my $error7=0;
my $error8=0;
my $error9=0;
my $error10=0;

while (my $ligne1_r1 =<READ1>){
	my $ligne2_r1 =<READ1>;
	my $ligne3_r1 =<READ1>;
	my $ligne4_r1 =<READ1>;
	my $ligne1_r2 =<READ2>;
	my $ligne2_r2 =<READ2>;
	my $ligne3_r2 =<READ2>;
	my $ligne4_r2 =<READ2>;
	
#@ 1 sec
	if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){
		if ($VERBOSE eq "ON"){
			print "Error in file format";
			if ($ligne1_r1){print $ligne1_r1;}
			if ($ligne2_r1){print $ligne2_r1;}
			if ($ligne3_r1){print $ligne3_r1;}
			if ($ligne4_r1){print $ligne4_r1;}
			if ($ligne1_r2){print $ligne1_r2;}
			if ($ligne2_r2){print $ligne2_r2;}
			if ($ligne3_r2){print $ligne3_r2;}
			if ($ligne4_r2){print $ligne4_r2;}
			print "\n";
		}
		$error1++;
	}
	elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){
		if ($VERBOSE eq "ON"){
			print "Error in header : format\n";
			print $ligne1_r1;
			print $ligne2_r1;
			print $ligne3_r1;
			print $ligne4_r1;
			print $ligne1_r2;
			print $ligne2_r2;
			print $ligne3_r2;
			print $ligne4_r2;
			print "\n";
		}
		$error2++;
	}
#@ 1 - 2 sec
	else {
		
		my $length_seq1 = length($ligne2_r1);
		my $length_qual1 =length($ligne4_r1);
		my $seq1;
		my $qual1;
	
		my $length_seq2 = length($ligne2_r2);
		my $length_qual2 =length($ligne4_r2);
		my $seq2;
		my $qual2;
		my $header1="";
		my $header2="";
		my $repheader1="";
		my $repheader2="";
		
		if ($ligne1_r1 =~/^\@(.*?)\#/){
			$header1 = $1;
		}
		
		if ($ligne3_r1 =~/^\+(.*?)\#/){
                        $repheader1 = $1;
               	}

		if ($ligne1_r2 =~/^\@(.*?)\#/){
                        $header2 = $1;
               	}

		if ($ligne3_r2 =~/^\+(.*?)\#/){
                        $repheader2 = $1;
               	}
#@ 2 sec
		
		### Verification de la coherence sequence /qualité   @ 1 sec
		if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){
			if ($VERBOSE eq "ON"){
				print "Error in header : empty\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error3++;
		}
		elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){
			if ($VERBOSE eq "ON"){
				print "Error in header refgsd : empty\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error3++;
		}
		elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){
			if ($VERBOSE eq "ON"){
				print "Error in header : different\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error4++;
		}
		elsif (($TYPE eq "sanger")&&($header1 ne $header2)){
			if ($VERBOSE eq "ON"){
				print "Error in header : different\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error4++;
		}
		elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){
			if ($VERBOSE eq "ON"){
				print "Error in seq/qual length\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error5++;
		}
#@ 1 - 2 sec	
		else {
			### Parsing sequence & qualité
			if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){
				$seq1 = $1;
			}
			if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){
				$seq2 = $1;
			}
			if ($ligne4_r1 =~ /^(.*)\s*$/i){
				$qual1 = $1;
			}
			if ($ligne4_r2 =~ /^(.*)\s*$/i){
				$qual2 = $1;
			}
#@ 2 sec			
			### Verification du parsing et de la coherence sequence /qualité (n°2)
			if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){
				if ($VERBOSE eq "ON"){
					print "Error parsing seq / quality \n";
					print $ligne1_r1;
					print $ligne2_r1;
					print $ligne3_r1;
					print $ligne4_r1;
					print $ligne1_r2;
					print $ligne2_r2;
					print $ligne3_r2;
					print $ligne4_r2;
					print "\n";
				}
				$error6++;
			}
			elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){
				if ($VERBOSE eq "ON"){
					print "Error in seq/qual length after parsing\n";
					print $ligne1_r1;
					print $ligne2_r1;
					print $ligne3_r1;
					print $ligne4_r1;
					print $ligne1_r2;
					print $ligne2_r2;
					print $ligne3_r2;
					print $ligne4_r2;
					print "\n";
				}
				$error7++;
			}
#@ <1 sec
			else {
				my $fastq_lines_r1="";
				my $fastq_lines_r2="";
				$fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1);
				if ($fastq_lines_r1){
					$fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2);
				}
				if ($fastq_lines_r2){
					print OUT1 $fastq_lines_r1;
					print OUT2 $fastq_lines_r2;
				}
			}
		}
	
		# print OUT1 $ligne1_r1;
		# print OUT1 $ligne2_r1;
		# print OUT1 $ligne3_r1;
		# print OUT1 $ligne4_r1;
		# print OUT2 $ligne1_r2;
		# print OUT2 $ligne2_r2;
		# print OUT2 $ligne3_r2;
		# print OUT2 $ligne4_r2;
		
#@ 7 sec
	}
}



close (READ1);
close (READ2);
close (OUT1);
close (OUT2);


sub grooming_and_trimming{
	my $header = shift;
	my $seq = shift;
	my $quality = shift;
	my $quality_converted="";
	my $quality_ori=$quality;
	
	my $lengthseq = length($seq);
	my $startTrim = 0;
	my $stopTrim = length($quality)-1;
	my $startnoN = $startTrim;
	my $stopnoN = $stopTrim;
	
	
	my $chercheN = $seq;
	my @bad_position_N;
	my @bad_position_Q;
	my $current_index = index($chercheN,"N");
	my $abs_index = $current_index;
	while ($current_index >=0){
		push (@bad_position_N,$abs_index);

		if ($current_index<length($seq)){
			$chercheN = substr($chercheN,$current_index+1);
			$current_index = index($chercheN,"N");
			$abs_index = $current_index + $bad_position_N[$#bad_position_N]+1;
		}
		else {
			last;
		}
	}
		
	my @q = split(//,$quality);
	for (my $i=0;$i<=$#q;$i++){
		my $chr = $q[$i];
		my $num = ord($q[$i]);
		if ($TYPE eq "illumina"){
			$num = $num - 31; # 31 comme la difference entre la plage sanger (33-> 93 / 0->60) et illumina (64->104 / 0->40)
			$quality_converted .= chr($num);
		}
		
		if ($num < $MIN_QUALITY + 33){ #33 comme le départ de la plage sanger
			push(@bad_position_Q,$i);
		}
	}
	if ($quality_converted){$quality = $quality_converted;}
	
	my @bad_position = (@bad_position_N, @bad_position_Q);
	
	if ($#bad_position>=0){
		@bad_position = sort {$a <=> $b} @bad_position;
		my %coord=%{&extract_longer_string_coordinates_from_bad_position(0,$stopTrim,\@bad_position)};
		$startTrim = $coord{"start"};
		$stopTrim = $coord{"stop"};
#print "$startTrim .. $stopTrim\n";
		
	}
	my $lengthTrim = $stopTrim - $startTrim +1;
	
	my $fastq_lines="";
	
#	if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){
#		print "HEAD:\t$header";	
#		print "SEQ:\n$seq\n";
#		print "$quality_ori\n";
#		print "$quality\n";
#		for (my $i=0;$i<=$#bad_position;$i++){
#			print $bad_position[$i]."(".$q[$bad_position[$i]]." : ".ord($q[$bad_position[$i]]).")"."\t";
#		}
#		print "\n";
#		print "$startTrim .. $stopTrim / $lengthTrim \n";
#		print $fastq_lines;
#		print "\n";
#	}

	if ($lengthTrim >= $MIN_LENGTH){
		$fastq_lines .= $header;
		$fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n";
		$fastq_lines .= "+\n";
		$fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n";
		return $fastq_lines;
		
	}
	else {
		#print "Insufficient length after trimming\n";
		return "";
	}
}

sub extract_longer_string_coordinates_from_bad_position{
	my $start=shift;
	my $stop =shift;
	my $refbad = shift;
	my @bad_position = @$refbad;
	my %coord;
	
	my $current_start = $start;
	my $current_stop = $bad_position[0]-1;
	if ($current_stop < $start){$current_stop = $start;}
	
	
	#debut -> premier N
	my $current_length = $current_stop - $current_start +1;
	my $test_length;
	
	#entre les N
	for (my $i=1;$i<=$#bad_position;$i++){
		$test_length = $bad_position[$i]+1-$bad_position[$i-1]-1;
		if ( $test_length >  $current_length){
			$current_start = $bad_position[$i-1]+1;
			$current_stop = $bad_position[$i]-1;
			$current_length = $current_stop - $current_start +1;
		}
	}
	
	#dernier N -> fin
	$test_length = $stop-$bad_position[$#bad_position]+1;
	if ( $test_length >  $current_length){
		$current_start = $bad_position[$#bad_position]+1;
		if ($current_start > $stop){$current_start=$stop;}
		$current_stop = $stop;
	}	
	$coord{"start"}=$current_start;
	$coord{"stop"}= $current_stop;
	$coord{"lenght"}=$current_stop-$current_start+1;
	
	return \%coord;
}