changeset 0:c450731486c8 draft

Uploaded
author geert-vandeweyer
date Thu, 13 Feb 2014 08:24:33 -0500
parents
children 3990d6b37e2d
files FastQ_QualConverter.pl
diffstat 1 files changed, 138 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/FastQ_QualConverter.pl	Thu Feb 13 08:24:33 2014 -0500
@@ -0,0 +1,138 @@
+#!/usr/bin/perl
+
+# load modules
+use Getopt::Std;
+use File::Basename;
+
+##########
+## opts ##
+##########
+## input files
+# i : path to (i)nput fastq file
+## output files
+# o : (o)utput file
+## optional options
+# f : (f)ormat of input fastq file, default : detect automatically.
+getopts('i:o:f:', \%opts) ;
+
+## check existence of input file
+if (!exists($opts{'i'}) || !-e $opts{'i'}) {
+	die('input file not found');
+}
+
+
+
+############################
+## Construct Quality hash ##
+############################
+my $qstring = '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~';
+my @asci = ('') x 33; 
+my @qvalues = split('',$qstring);
+push(@asci, @qvalues);
+
+#############################
+## INPUT FILE SCORE FORMAT ##
+#############################
+my $informat = '';
+my $validate = 0;
+if (exists($opts{'f'}) && $opts{'f'} ne 'Auto') {
+	$informat = $opts{'f'};
+}
+else {
+	$validate = 1; ## track during conversion to make sure.
+	my %tracker = ();
+	my $sanger = '!"#$%&\'()*+,-./0123456789:'; # items unique to sanger.
+	## scan input file until clear.
+	open IN, $opts{'i'};
+	$try = 0;
+	while ($try < 15000) {
+		$try ++;
+		my $name = <IN>;
+		my $seq = <IN>;
+		my $dump = <IN>;
+		my $qual = <IN>;
+		chomp($qual);
+		for ($i = 0; $i < length($qual); $i++) {
+			my $char = quotemeta(substr($qual,$i,1));
+			if ($sanger =~ m/$char/) {
+				$informat = 'sanger';
+				$try = 50000;
+				print "Item ($char) found in :\n $qual\n";
+				last;
+			}
+			$tracker{$char} = 1;
+		}
+	}
+	close IN;
+	if ($informat eq '' ) {
+		## no sanger (or ALL reads should be completely above qual=26)
+		## check solexa items
+		foreach (qw/;<=>?/) {
+			if (exists($tracker{$_})) {
+				$informat = 'solexa';
+				last;
+			}
+		}
+	}
+	if ($informat eq '') {
+		## both illumina 1.3 and 1.5 are phred+64. 
+		$informat = 'illumina';
+	}
+}
+
+
+print "Input format detected : $informat\n";
+
+## SET CONVERSION RULE to sanger
+my @conv_table;
+if ($informat eq 'solexa') {
+	## advanced conversion table
+	foreach (@qvalues) {
+		my $Q = 10 * log(1 + 10 ** (ord($_) - 64) / 10.0) / log(10); # solexa => phred	
+		$conv_table[ord($_)] = $asci[$Q + 33];  # phred + 33 = sanger
+	}
+}
+elsif ($informat ne 'sanger') {
+	## easy conversion table
+	foreach (@qvalues) {
+		$conv_table[ord($_)] = $asci[ord($_) - 31];  # base64 - 31 = sanger
+	}
+}
+else {
+	foreach (@qvalues) {
+		$conv_table[ord($_)] = $asci[ord($_)];  # sanger = sanger
+	}
+}
+
+if ($informat eq 'sanger') {
+	system("cp -f ".$opts{'i'}." ".$opts{'o'});
+	exit;
+}
+######################
+## CONVERT THE FILE ##
+######################
+open IN, $opts{'i'};
+open OUT, ">".$opts{'o'};
+my $counter = 0;
+my $outstring = '';
+while ($readname = <IN>) {
+	$counter++;
+	my $seq = <IN>;
+	my $readnamebis = <IN>;
+	my $qstring = <IN>;
+	chomp($qstring);
+	my $outqual = '';
+	foreach $ascval (unpack("C*",$qstring)) {	# => returns the asci value of each item in string
+		$outqual .= $conv_table[$ascval];
+	}
+	$outstring .= "$readname$seq$readnamebis$outqual\n";
+	if ($counter > 500000) { # print per 500K reads.
+		print OUT $outstring;
+		$outstring = '';
+		$counter = 0;
+	}
+
+}
+print OUT $outstring;
+close IN;
+close OUT;