Mercurial > repos > jjohnson > fastq_sync
diff pe_sync_2_files.pl @ 0:751f4938cf0d
Uploaded
author | jjohnson |
---|---|
date | Tue, 05 Feb 2013 15:23:18 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pe_sync_2_files.pl Tue Feb 05 15:23:18 2013 -0500 @@ -0,0 +1,148 @@ +#!/usr/bin/perl -w + +############################################### +# pe-sync-2-files.pl # this is a stripped-down version of pe-sync-directory.pl +# John Garbe +# February 2012 +# +# Paired-end read file synchronization script: checks if left and right +# reads are in the same order. +# This script checks the two files specificd on the command line. +# The "quick" option will not report the percent of out-of-sync reads +# for many failed files, but will run much faster. +# +# This script should run fine on linux systems or solaris +# (which is what the project space fileserver runs) +# +# This script can handle Casava 1.8.0 style read IDs, and pre 1.8.0 style ids. +# Other types of read ID formats will cause a terminal error. +# +############################################### + +die "Usage: pe-sync-directory.pl [quick] left_fastqfile right_fastqfile\n" unless ($#ARGV > 0); + +# determine if we're running on solaris in order to change system calls +$output = `perl -v`; +$solaris = 0; +$solaris = 1 if ($output =~ /solaris/); +if ($solaris) { +# print STDERR "Running on Solaris\n"; +} + +# handle "quick" option +$quick = 0; # 1 for quickcheck, 0 for thorough check +if (defined($ARGV[0])) { + if ($ARGV[0] eq "quick") { + $quick = 1; + shift @ARGV; + } +} +print STDERR "QUICK CHECK enabled\n" if ($quick); + +$filename_r1 = $ARGV[0]; +$filename_r2 = $ARGV[1]; + +# identify read id format type +if ($solaris) { + $readid = `head -1 $filename_r1`; +} else { + $readid = `head -n 1 $filename_r1`; +} +chomp $readid; +if ($readid =~ /^@\S+\/[12]$/) { # @ at the start of the line followed by non-whitespace, a /, a 1 or 2, the end of the line + $id_type = 1; + print STDERR "Casava 1.7 read id style\n"; # TESTING +} elsif ($readid =~ /^@\S+\W[12]\S+$/) { # @ at the start of the line followed by non-whitspace, a space, a 1 or 2, non-whitespace + $id_type = 2; + print STDERR "Casava 1.8 read id style\n"; # TESTING +} else { + print STDERR "Cannot determine read id style\n"; + print STDOUT "Unknwon id style: $readid\n"; + exit 1; +} + +chomp $filename_r1; +chomp $filename_r2; + +if ($quick) { + $failure = &quickcheck($filename_r1, $filename_r2); # quickie check + print STDERR "FAILED quickcheck\n" if ($failure); +} +# If we aren't doing the quick check, or the quick check passed, do the slow check +if (!($quick) || !($failure)) { + $failure = &percent($filename_r1, $filename_r2); # slow thorough check +} + +exit; + +######################## subs ######################### +# do a quick check to see if the files are bad (many bad files will pass this quick check) +sub quickcheck { + my ($file1, $file2) = @_; + if ($solaris) { + $result1 = `tail -10000l $file1 | head -1`; # grab the line 10,000 lines from the EOF + $result2 = `tail -10000l $file2 | head -1`; + } else { + $result1 = `tail -n 10000 $file1 | head -n 1`; + $result2 = `tail -n 10000 $file2 | head -n 1`; + } + # process the ids depending on id style + if ($id_type == 1) { # 1 or 2 at end of id + chomp $result1; + chomp $result2; + chop $result1; + chop $result2; + } else { # first word is a unique read id, second word is left/right-specific + ($result1) = split ' ', $result1; + ($result2) = split ' ', $result2; + } + if ($result1 eq $result2) { +# print "pass\n"; + return 0; + } else { +# print "fail\n"; + return 1; + } +} + + +# compare every read id from each file and determine what percent of reads are +# out of sync +sub percent { + my($file1, $file2) = @_; + + open R1, '<', $file1 or die "Couldn't open $file1 for reading:$!"; + open R2, '<', $file2 or die "Couldn't open $file2 for reading:$!"; + + my $readCount = 0; + my $failcount = 0; + while (!eof(R1) and !eof(R2)) { + $id1 = <R1>; + $id2 = <R2>; + next unless ($. % 4 == 1); # check every sequence (4th line in the file) + # process the ids depending on id style + if ($id_type == 1) { # 1 or 2 at end of id + chomp $id1; # newline + chop $id1; # last char of the id (should be the "1" or "2") + chomp $id2; + chop $id2; + } else { + ($id1) = split ' ', $id1; + ($id2) = split ' ', $id2; + } + $failcount++ unless ($id1 eq $id2); + $readCount++; + } + my $percent = $failcount / $readCount; + $prettypercent = $percent * 100; # make it percent + $prettypercent = sprintf("%.2f", $prettypercent); + if ($failcount > 0) { + print STDERR "FAILED full check\t$prettypercent% of reads are out-of-sync\n"; + $exitStatus=1; + } else { + print STDERR "PASSED full check\n"; + $exitStatus=0; + } + + return $exitStatus; +}