view rapsodyn/PrepareFastqLight.pl @ 10:0a6c1cfe4dc8 draft

Uploaded
author mcharles
date Mon, 19 Jan 2015 04:33:21 -0500
parents d857538d9fea
children 56d328bce3a7
line wrap: on
line source

#!/usr/bin/perl
#v1.1.0 manage empty files
#v1.0.4 bug correction, last read not considered
#v1.0.3 support rapsodyn header (.... 1:...  / .... 2:...)
#V1.0.2 added auto type detection
#V1.0.1 added log, option parameters
use strict;
use warnings;
use Getopt::Long;

my $read1_file;
my $read2_file;
my $log_file;
my $output1_file;
my $output2_file;

my $TYPE="sanger";
my $MIN_LENGTH=30;
my $MIN_QUALITY=30;

my $VERBOSE = "OFF";

GetOptions (
"read1_file=s" => \$read1_file,
"read2_file=s" => \$read2_file,
"log_file=s" => \$log_file,
"output1_file=s" => \$output1_file,
"output2_file=s" => \$output2_file,
"type=s" => \$TYPE,
"min_length=i" => \$MIN_LENGTH,
"min_quality=i" => \$MIN_QUALITY,
"verbose=s" => \$VERBOSE
) or die("Error in command line arguments\n");


my $nb_read1=0;
my $nb_base_read1=0;
my $nb_read2=0;
my $nb_base_read2=0;

my $nb_read1_t=0;
my $nb_base_read1_t=0;
my $nb_read2_t=0;
my $nb_base_read2_t=0;

my $nb_base_current_t=0;

open(READ1, $read1_file) or die ("Can't open $read1_file\n");
open(READ2, $read2_file) or die ("Can't open $read2_file\n");
open(OUT1, ">$output1_file") or die ("Can't open $output1_file\n");
open(OUT2, ">$output2_file") or die ("Can't open $output2_file\n");
open (LF,">$log_file") or die("Can't open $log_file\n");

if (( -z READ1)&&( -z READ2)){
	exit(0);
}
elsif (( -z READ1)||( -z READ2)){
	print STDERR "One empty File\n";
	exit(0);
}


my $error1=0;
my $error2=0;
my $error3=0;
my $error4=0;
my $error5=0;
my $error6=0;
my $error7=0;
my $error8=0;
my $error9=0;
my $error10=0;

my $auto_type="";
my %qual;
if ($TYPE eq "auto"){
	my $compt=0;
	open(DETECT, $read1_file) or die ("Can't open $read1_file\n"); 
	while (my $ligne1_r1 =<DETECT>){
		my $ligne2_r1 =<DETECT>;
		my $ligne3_r1 =<DETECT>;
		my $ligne4_r1 =<DETECT>;
		$compt++;
		if ($ligne4_r1 =~ /^(.*)\s*$/i){
			my $qual = $1;
			my @q = split(//,$qual);
			for (my $i=0;$i<=$#q;$i++){
				my $num = ord($q[$i]);
				if ($qual{$num}){
					$qual{$num}++;
				}
				else {
					$qual{$num} = 1;
				}
				#range sanger / illumina 1.8+ 	: 33->94
				#range illumina 1.3->1.7 	: 64->105
				if ($num > 94){$auto_type = "illumina";last;}
				if ($num < 64){$auto_type = "sanger";last;}
			}
		}
		else {
			print STDERR "Error in format detection : quality not recognized\n$ligne4_r1";
			exit(0);
		}
	
		if ($auto_type ne ""){
			last;
		}

	}
	close (DETECT);
	if ($auto_type eq ""){
		print STDERR "Error in format detection : type not recognized parsing read1\n";
		foreach my $key (sort {$a <=> $b} keys %qual){
			print "$key\t:\t",$qual{$key},"\n";
		}
		exit(0);
	}
	else {
		$TYPE = $auto_type;
	}
}



my $compt=0;
while (my $ligne1_r1 =<READ1>){
	my $ligne2_r1 =<READ1>;
	my $ligne3_r1 =<READ1>;
	my $ligne4_r1 =<READ1>;
	my $ligne1_r2 =<READ2>;
	my $ligne2_r2 =<READ2>;
	my $ligne3_r2 =<READ2>;
	my $ligne4_r2 =<READ2>;
	# chomp($ligne1_r1);
	# chomp($ligne2_r1);
	# chomp($ligne3_r1);
	# chomp($ligne4_r1);
	# chomp($ligne2_r1);
	
	$compt++;
	$nb_read1++;
	$nb_read2++;
	
#@ 1 sec
	if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){
		if ($VERBOSE eq "ON"){
			print "Error in file format";
			if ($ligne1_r1){print $ligne1_r1;}
			if ($ligne2_r1){print $ligne2_r1;}
			if ($ligne3_r1){print $ligne3_r1;}
			if ($ligne4_r1){print $ligne4_r1;}
			if ($ligne1_r2){print $ligne1_r2;}
			if ($ligne2_r2){print $ligne2_r2;}
			if ($ligne3_r2){print $ligne3_r2;}
			if ($ligne4_r2){print $ligne4_r2;}
			print "\n";
		}
		$error1++;
	}
	elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){
		if ($VERBOSE eq "ON"){
			print "Error in header : format\n";
			print $ligne1_r1;
			print $ligne2_r1;
			print $ligne3_r1;
			print $ligne4_r1;
			print $ligne1_r2;
			print $ligne2_r2;
			print $ligne3_r2;
			print $ligne4_r2;
			print "\n";
		}
		$error2++;
	}
#@ 1 - 2 sec
	else {
		
		my $length_seq1 = length(chomp($ligne2_r1));
		my $length_qual1 =length(chomp($ligne4_r1));
		my $seq1;
		my $qual1;
	
		my $length_seq2 = length(chomp($ligne2_r2));
		my $length_qual2 =length(chomp($ligne4_r2));
		my $seq2;
		my $qual2;
		my $header1="";
		my $header2="";
		my $repheader1="";
		my $repheader2="";

		
		if ($ligne1_r1 =~/^\@(.*?)[\s\/]/){
			$header1 = $1;
		}
		
		if ($ligne3_r1 =~/^\+(.*?)[\s\/]/){
                        $repheader1 = $1;
               	}

		if ($ligne1_r2 =~/^\@(.*?)[\s\/]/){
                        $header2 = $1;
               	}

		if ($ligne3_r2 =~/^\+(.*?)[\s\/]/){
                        $repheader2 = $1;
               	}
#@ 2 sec
		
		### Verification de la coherence sequence /qualité   @ 1 sec
		if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){
			if ($VERBOSE eq "ON"){
				print "Error in header : empty\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error3++;
		}
		elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){
			if ($VERBOSE eq "ON"){
				print "Error in header ref : empty\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error3++;
		}
		elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){
			if ($VERBOSE eq "ON"){
				print "Error in header : different\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error4++;
		}
		elsif (($TYPE eq "sanger")&&($header1 ne $header2)){
			if ($VERBOSE eq "ON"){
				print "Error in header : different\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error4++;
		}
		elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){
			if ($VERBOSE eq "ON"){
				print "Error in seq/qual length\n";
				print "$length_seq1 / $length_qual1 \t $length_seq2 / $length_qual2\n";
				print $ligne1_r1;
				print $ligne2_r1;
				print $ligne3_r1;
				print $ligne4_r1;
				print "\n";
				print $ligne1_r2;
				print $ligne2_r2;
				print $ligne3_r2;
				print $ligne4_r2;
				print "\n";
			}
			$error5++;
		}
#@ 1 - 2 sec	
		else {
			#print "TEST : $compt\n";
			### Parsing sequence & qualité
			if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){
				$seq1 = $1;
				$nb_base_read1 += length($seq1);
			}
			if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){
				$seq2 = $1;
				$nb_base_read2 += length($seq2);
			}
			if ($ligne4_r1 =~ /^(.*)\s*$/i){
				$qual1 = $1;
			}
			if ($ligne4_r2 =~ /^(.*)\s*$/i){
				$qual2 = $1;
			}
#@ 2 sec			
			### Verification du parsing et de la coherence sequence /qualité (n°2)
			if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){
				if ($VERBOSE eq "ON"){
					print "Error parsing seq / quality \n";
					print $ligne1_r1;
					print $ligne2_r1;
					print $ligne3_r1;
					print $ligne4_r1;
					print $ligne1_r2;
					print $ligne2_r2;
					print $ligne3_r2;
					print $ligne4_r2;
					print "\n";
				}
				$error6++;
			}
			elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){
				if ($VERBOSE eq "ON"){
					print "Error in seq/qual length after parsing\n";
					print $ligne1_r1;
					print $ligne2_r1;
					print $ligne3_r1;
					print $ligne4_r1;
					print $ligne1_r2;
					print $ligne2_r2;
					print $ligne3_r2;
					print $ligne4_r2;
					print "\n";
				}
				$error7++;
			}
#@ <1 sec
			else {
				my $fastq_lines_r1="";
				my $fastq_lines_r2="";
				my $nb_base_current_read1_t = 0;
				my $nb_base_current_read2_t = 0;
				
				$fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1);
				$nb_base_current_read1_t = $nb_base_current_t;
				if ($fastq_lines_r1){
					$fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2);
					$nb_base_current_read2_t = $nb_base_current_t;
				}
				if ($fastq_lines_r2){
					print OUT1 $fastq_lines_r1;
					print OUT2 $fastq_lines_r2;

					$nb_read1_t++;
					$nb_read2_t++;
					$nb_base_read1_t += $nb_base_current_read1_t;
					$nb_base_read2_t += $nb_base_current_read2_t;


				}
			}
		}
	

#@ 7 sec
	}
}

close (READ1);
close (READ2);
close (OUT1);
close (OUT2);

print LF "\n####\t Fastq preparation \n";
print LF "Fastq format : $TYPE\n";
print LF "## Before preparation\n";
print LF "#Read1 :\t$nb_read1\t#Base :\t$nb_base_read1\n";
print LF "#Read2 :\t$nb_read2\t#Base :\t$nb_base_read2\n";
print LF "## After preparation\n";
print LF "#Read1 :\t$nb_read1_t\t#Base :\t$nb_base_read1_t\n";
print LF "#Read2 :\t$nb_read2_t\t#Base :\t$nb_base_read2_t\n";
close (LF);
		

sub grooming_and_trimming{
	my $header = shift;
	my $seq = shift;
	my $quality = shift;
	my $quality_converted="";
	my $quality_ori=$quality;
	
	my $lengthseq = length($seq);
	my $startTrim = 0;
	my $stopTrim = length($quality)-1;
	my $startnoN = $startTrim;
	my $stopnoN = $stopTrim;
	
	
	my $chercheN = $seq;
	my @bad_position_N;
	my @bad_position_Q;
	my $current_index = index($chercheN,"N");
	my $abs_index = $current_index;
	while ($current_index >=0){
		push (@bad_position_N,$abs_index);

		if ($current_index<length($seq)){
			$chercheN = substr($chercheN,$current_index+1);
			$current_index = index($chercheN,"N");
			$abs_index = $current_index + $bad_position_N[$#bad_position_N]+1;
		}
		else {
			last;
		}
	}
		
	my @q = split(//,$quality);
	for (my $i=0;$i<=$#q;$i++){
		my $chr = $q[$i];
		my $num = ord($q[$i]);
		if ($TYPE eq "illumina"){
			$num = $num - 31; # 31 comme la difference entre la plage sanger (33-> 93 / 0->60) et illumina (64->104 / 0->40)
			$quality_converted .= chr($num);
		}
		
		if ($num < $MIN_QUALITY + 33){ #33 comme le départ de la plage sanger
			push(@bad_position_Q,$i);
		}
	}
	if ($quality_converted){$quality = $quality_converted;}
	
	my @bad_position = (@bad_position_N, @bad_position_Q);
	
	if ($#bad_position>=0){
		@bad_position = sort {$a <=> $b} @bad_position;
		my %coord=%{&extract_longer_string_coordinates_from_bad_position(0,$stopTrim,\@bad_position)};
		$startTrim = $coord{"start"};
		$stopTrim = $coord{"stop"};
#print "$startTrim .. $stopTrim\n";
		
	}
	my $lengthTrim = $stopTrim - $startTrim +1;

	#if ($stats_length{$lengthTrim}){
	#	$stats_length{$lengthTrim} = 1;		
	#}
	#else {
	#	$stats_length{$lengthTrim}++;
	#}
	my $fastq_lines="";
	
#	if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){
#		print "HEAD:\t$header";	
#		print "SEQ:\n$seq\n";
#		print "$quality_ori\n";
#		print "$quality\n";
#		for (my $i=0;$i<=$#bad_position;$i++){
#			print $bad_position[$i]."(".$q[$bad_position[$i]]." : ".ord($q[$bad_position[$i]]).")"."\t";
#		}
#		print "\n";
#		print "$startTrim .. $stopTrim / $lengthTrim \n";
#		print $fastq_lines;
#		print "\n";
#	}

	#for (my $i=$startTrim;$i<=$stopTrim;$i++){
	#	if ($stats_quality{ord($q{$i])}){
	#		$stats_quality{ord($q{$i])}=1;
	#	}
	#	else {
	#		$stats_quality{ord($q{$i])}++;
	#	}
	#}

	if ($lengthTrim >= $MIN_LENGTH){
		$fastq_lines .= $header;
		my $new_seq = substr($seq,$startTrim,$lengthTrim);
		$nb_base_current_t = length($new_seq);
		$fastq_lines .= $new_seq."\n";
		$fastq_lines .= "+\n";
		my $new_q = substr($quality,$startTrim,$lengthTrim);
		$fastq_lines .= $new_q."\n";
		return $fastq_lines;
		
	}
	else {
		#print "Insufficient length after trimming\n";
		return "";
	}
}

sub extract_longer_string_coordinates_from_bad_position{
	my $start=shift;
	my $stop =shift;
	my $refbad = shift;
	my @bad_position = @$refbad;
	my %coord;
	
	my $current_start = $start;
	my $current_stop = $bad_position[0]-1;
	if ($current_stop < $start){$current_stop = $start;}
	
	
	#debut -> premier N
	my $current_length = $current_stop - $current_start +1;
	my $test_length;
	
	#entre les N
	for (my $i=1;$i<=$#bad_position;$i++){
		$test_length = $bad_position[$i]+1-$bad_position[$i-1]-1;
		if ( $test_length >  $current_length){
			$current_start = $bad_position[$i-1]+1;
			$current_stop = $bad_position[$i]-1;
			$current_length = $current_stop - $current_start +1;
		}
	}
	
	#dernier N -> fin
	$test_length = $stop-$bad_position[$#bad_position]+1;
	if ( $test_length >  $current_length){
		$current_start = $bad_position[$#bad_position]+1;
		if ($current_start > $stop){$current_start=$stop;}
		$current_stop = $stop;
	}	
	$coord{"start"}=$current_start;
	$coord{"stop"}= $current_stop;
	$coord{"lenght"}=$current_stop-$current_start+1;
	
	return \%coord;
}