view FastQ_QualConverter.pl @ 1:3990d6b37e2d draft default tip

Uploaded
author geert-vandeweyer
date Thu, 13 Feb 2014 08:24:43 -0500
parents c450731486c8
children
line wrap: on
line source

#!/usr/bin/perl

# load modules
use Getopt::Std;
use File::Basename;

##########
## opts ##
##########
## input files
# i : path to (i)nput fastq file
## output files
# o : (o)utput file
## optional options
# f : (f)ormat of input fastq file, default : detect automatically.
getopts('i:o:f:', \%opts) ;

## check existence of input file
if (!exists($opts{'i'}) || !-e $opts{'i'}) {
	die('input file not found');
}



############################
## Construct Quality hash ##
############################
my $qstring = '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~';
my @asci = ('') x 33; 
my @qvalues = split('',$qstring);
push(@asci, @qvalues);

#############################
## INPUT FILE SCORE FORMAT ##
#############################
my $informat = '';
my $validate = 0;
if (exists($opts{'f'}) && $opts{'f'} ne 'Auto') {
	$informat = $opts{'f'};
}
else {
	$validate = 1; ## track during conversion to make sure.
	my %tracker = ();
	my $sanger = '!"#$%&\'()*+,-./0123456789:'; # items unique to sanger.
	## scan input file until clear.
	open IN, $opts{'i'};
	$try = 0;
	while ($try < 15000) {
		$try ++;
		my $name = <IN>;
		my $seq = <IN>;
		my $dump = <IN>;
		my $qual = <IN>;
		chomp($qual);
		for ($i = 0; $i < length($qual); $i++) {
			my $char = quotemeta(substr($qual,$i,1));
			if ($sanger =~ m/$char/) {
				$informat = 'sanger';
				$try = 50000;
				print "Item ($char) found in :\n $qual\n";
				last;
			}
			$tracker{$char} = 1;
		}
	}
	close IN;
	if ($informat eq '' ) {
		## no sanger (or ALL reads should be completely above qual=26)
		## check solexa items
		foreach (qw/;<=>?/) {
			if (exists($tracker{$_})) {
				$informat = 'solexa';
				last;
			}
		}
	}
	if ($informat eq '') {
		## both illumina 1.3 and 1.5 are phred+64. 
		$informat = 'illumina';
	}
}


print "Input format detected : $informat\n";

## SET CONVERSION RULE to sanger
my @conv_table;
if ($informat eq 'solexa') {
	## advanced conversion table
	foreach (@qvalues) {
		my $Q = 10 * log(1 + 10 ** (ord($_) - 64) / 10.0) / log(10); # solexa => phred	
		$conv_table[ord($_)] = $asci[$Q + 33];  # phred + 33 = sanger
	}
}
elsif ($informat ne 'sanger') {
	## easy conversion table
	foreach (@qvalues) {
		$conv_table[ord($_)] = $asci[ord($_) - 31];  # base64 - 31 = sanger
	}
}
else {
	foreach (@qvalues) {
		$conv_table[ord($_)] = $asci[ord($_)];  # sanger = sanger
	}
}

if ($informat eq 'sanger') {
	system("cp -f ".$opts{'i'}." ".$opts{'o'});
	exit;
}
######################
## CONVERT THE FILE ##
######################
open IN, $opts{'i'};
open OUT, ">".$opts{'o'};
my $counter = 0;
my $outstring = '';
while ($readname = <IN>) {
	$counter++;
	my $seq = <IN>;
	my $readnamebis = <IN>;
	my $qstring = <IN>;
	chomp($qstring);
	my $outqual = '';
	foreach $ascval (unpack("C*",$qstring)) {	# => returns the asci value of each item in string
		$outqual .= $conv_table[$ascval];
	}
	$outstring .= "$readname$seq$readnamebis$outqual\n";
	if ($counter > 500000) { # print per 500K reads.
		print OUT $outstring;
		$outstring = '';
		$counter = 0;
	}

}
print OUT $outstring;
close IN;
close OUT;