diff rapsodyn/PrepareFastqLight.pl @ 7:3f7b0788a1c4 draft

Uploaded
author mcharles
date Tue, 07 Oct 2014 10:34:34 -0400
parents b0cbb9d21aa9
children d857538d9fea
line wrap: on
line diff
--- a/rapsodyn/PrepareFastqLight.pl	Mon Sep 29 03:02:16 2014 -0400
+++ b/rapsodyn/PrepareFastqLight.pl	Tue Oct 07 10:34:34 2014 -0400
@@ -1,28 +1,51 @@
 #!/usr/bin/perl
+#V1.0.1 added log, option parameters
 use strict;
 use warnings;
-
-my $read1 = $ARGV[0];
-my $read2 = $ARGV[1];
+use Getopt::Long;
 
-my $output1 = $ARGV[2];
-my $output2 = $ARGV[3];
+my $read1_file;
+my $read2_file;
+my $log_file;
+my $output1_file;
+my $output2_file;
 
-my $TYPE = $ARGV[4];
-my $MIN_LENGTH = $ARGV[5];
-my $MIN_QUALITY = $ARGV[6];
+my $TYPE="sanger";
+my $MIN_LENGTH=30;
+my $MIN_QUALITY=30;
+
+my $VERBOSE = "OFF";
 
-my $VERBOSE = $ARGV[7];
+GetOptions (
+"read1_file=s" => \$read1_file,
+"read2_file=s" => \$read2_file,
+"log_file=s" => \$log_file,
+"output1_file=s" => \$output1_file,
+"output2_file=s" => \$output2_file,
+"type=s" => \$TYPE,
+"min_length=i" => \$MIN_LENGTH,
+"min_quality=i" => \$MIN_QUALITY,
+"verbose=s" => \$VERBOSE
+) or die("Error in command line arguments\n");
 
-if (!$VERBOSE){
-	$VERBOSE ="OFF";
-}
 
-open(READ1, $read1) or die ("Can't open $read1\n");
-open(READ2, $read2) or die ("Can't open $read2\n");
-open(OUT1, ">$output1") or die ("Can't open $output1\n");
-open(OUT2, ">$output2") or die ("Can't open $output2\n");
+my $nb_read1=0;
+my $nb_base_read1=0;
+my $nb_read2=0;
+my $nb_base_read2=0;
 
+my $nb_read1_t=0;
+my $nb_base_read1_t=0;
+my $nb_read2_t=0;
+my $nb_base_read2_t=0;
+
+my $nb_base_current_t=0;
+
+
+open(READ1, $read1_file) or die ("Can't open $read1_file\n");
+open(READ2, $read2_file) or die ("Can't open $read2_file\n");
+open(OUT1, ">$output1_file") or die ("Can't open $output1_file\n");
+open(OUT2, ">$output2_file") or die ("Can't open $output2_file\n");
 
 my $error1=0;
 my $error2=0;
@@ -43,6 +66,9 @@
 	my $ligne2_r2 =<READ2>;
 	my $ligne3_r2 =<READ2>;
 	my $ligne4_r2 =<READ2>;
+
+	$nb_read1++;
+	$nb_read2++;
 	
 #@ 1 sec
 	if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){
@@ -91,6 +117,7 @@
 		my $header2="";
 		my $repheader1="";
 		my $repheader2="";
+
 		
 		if ($ligne1_r1 =~/^\@(.*?)\#/){
 			$header1 = $1;
@@ -190,9 +217,11 @@
 			### Parsing sequence & qualité
 			if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){
 				$seq1 = $1;
+				$nb_base_read1 += length($seq1);
 			}
 			if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){
 				$seq2 = $1;
+				$nb_base_read2 += length($seq2);
 			}
 			if ($ligne4_r1 =~ /^(.*)\s*$/i){
 				$qual1 = $1;
@@ -236,37 +265,49 @@
 			else {
 				my $fastq_lines_r1="";
 				my $fastq_lines_r2="";
+				my $nb_base_current_read1_t = 0;
+				my $nb_base_current_read2_t = 0;
+				
 				$fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1);
+				$nb_base_current_read1_t = $nb_base_current_t;
 				if ($fastq_lines_r1){
 					$fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2);
+					$nb_base_current_read2_t = $nb_base_current_t;
 				}
 				if ($fastq_lines_r2){
 					print OUT1 $fastq_lines_r1;
 					print OUT2 $fastq_lines_r2;
+
+					$nb_read1_t++;
+					$nb_read2_t++;
+					$nb_base_read1_t += $nb_base_current_read1_t;
+					$nb_base_read2_t += $nb_base_current_read2_t;
+
+
 				}
 			}
 		}
 	
-		# print OUT1 $ligne1_r1;
-		# print OUT1 $ligne2_r1;
-		# print OUT1 $ligne3_r1;
-		# print OUT1 $ligne4_r1;
-		# print OUT2 $ligne1_r2;
-		# print OUT2 $ligne2_r2;
-		# print OUT2 $ligne3_r2;
-		# print OUT2 $ligne4_r2;
-		
+
 #@ 7 sec
 	}
 }
 
-
-
 close (READ1);
 close (READ2);
 close (OUT1);
 close (OUT2);
 
+open (LF,">$log_file") or die("Can't open $log_file\n");
+print LF "\n####\t Fastq preparation \n";
+print LF "## Before preparation\n";
+print LF "#Read1 :\t$nb_read1\t#Base :\t$nb_base_read1\n";
+print LF "#Read2 :\t$nb_read2\t#Base :\t$nb_base_read2\n";
+print LF "## After preparation\n";
+print LF "#Read1 :\t$nb_read1_t\t#Base :\t$nb_base_read1_t\n";
+print LF "#Read2 :\t$nb_read2_t\t#Base :\t$nb_base_read2_t\n";
+close (LF);
+		
 
 sub grooming_and_trimming{
 	my $header = shift;
@@ -326,7 +367,13 @@
 		
 	}
 	my $lengthTrim = $stopTrim - $startTrim +1;
-	
+
+	#if ($stats_length{$lengthTrim}){
+	#	$stats_length{$lengthTrim} = 1;		
+	#}
+	#else {
+	#	$stats_length{$lengthTrim}++;
+	#}
 	my $fastq_lines="";
 	
 #	if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){
@@ -343,11 +390,23 @@
 #		print "\n";
 #	}
 
+	#for (my $i=$startTrim;$i<=$stopTrim;$i++){
+	#	if ($stats_quality{ord($q{$i])}){
+	#		$stats_quality{ord($q{$i])}=1;
+	#	}
+	#	else {
+	#		$stats_quality{ord($q{$i])}++;
+	#	}
+	#}
+
 	if ($lengthTrim >= $MIN_LENGTH){
 		$fastq_lines .= $header;
-		$fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n";
+		my $new_seq = substr($seq,$startTrim,$lengthTrim);
+		$nb_base_current_t = length($new_seq);
+		$fastq_lines .= $new_seq."\n";
 		$fastq_lines .= "+\n";
-		$fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n";
+		my $new_q = substr($quality,$startTrim,$lengthTrim);
+		$fastq_lines .= $new_q."\n";
 		return $fastq_lines;
 		
 	}