0
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 ###############################################
|
|
4 # pe-sync-2-files.pl # this is a stripped-down version of pe-sync-directory.pl
|
|
5 # John Garbe
|
|
6 # February 2012
|
|
7 #
|
|
8 # Paired-end read file synchronization script: checks if left and right
|
|
9 # reads are in the same order.
|
|
10 # This script checks the two files specificd on the command line.
|
|
11 # The "quick" option will not report the percent of out-of-sync reads
|
|
12 # for many failed files, but will run much faster.
|
|
13 #
|
|
14 # This script should run fine on linux systems or solaris
|
|
15 # (which is what the project space fileserver runs)
|
|
16 #
|
|
17 # This script can handle Casava 1.8.0 style read IDs, and pre 1.8.0 style ids.
|
|
18 # Other types of read ID formats will cause a terminal error.
|
|
19 #
|
|
20 ###############################################
|
|
21
|
|
22 die "Usage: pe-sync-directory.pl [quick] left_fastqfile right_fastqfile\n" unless ($#ARGV > 0);
|
|
23
|
|
24 # determine if we're running on solaris in order to change system calls
|
|
25 $output = `perl -v`;
|
|
26 $solaris = 0;
|
|
27 $solaris = 1 if ($output =~ /solaris/);
|
|
28 if ($solaris) {
|
|
29 # print STDERR "Running on Solaris\n";
|
|
30 }
|
|
31
|
|
32 # handle "quick" option
|
|
33 $quick = 0; # 1 for quickcheck, 0 for thorough check
|
|
34 if (defined($ARGV[0])) {
|
|
35 if ($ARGV[0] eq "quick") {
|
|
36 $quick = 1;
|
|
37 shift @ARGV;
|
|
38 }
|
|
39 }
|
|
40 print STDERR "QUICK CHECK enabled\n" if ($quick);
|
|
41
|
|
42 $filename_r1 = $ARGV[0];
|
|
43 $filename_r2 = $ARGV[1];
|
|
44
|
|
45 # identify read id format type
|
|
46 if ($solaris) {
|
|
47 $readid = `head -1 $filename_r1`;
|
|
48 } else {
|
|
49 $readid = `head -n 1 $filename_r1`;
|
|
50 }
|
|
51 chomp $readid;
|
|
52 if ($readid =~ /^@\S+\/[12]$/) { # @ at the start of the line followed by non-whitespace, a /, a 1 or 2, the end of the line
|
|
53 $id_type = 1;
|
|
54 print STDERR "Casava 1.7 read id style\n"; # TESTING
|
|
55 } elsif ($readid =~ /^@\S+\W[12]\S+$/) { # @ at the start of the line followed by non-whitspace, a space, a 1 or 2, non-whitespace
|
|
56 $id_type = 2;
|
|
57 print STDERR "Casava 1.8 read id style\n"; # TESTING
|
|
58 } else {
|
|
59 print STDERR "Cannot determine read id style\n";
|
|
60 print STDOUT "Unknwon id style: $readid\n";
|
|
61 exit 1;
|
|
62 }
|
|
63
|
|
64 chomp $filename_r1;
|
|
65 chomp $filename_r2;
|
|
66
|
|
67 if ($quick) {
|
|
68 $failure = &quickcheck($filename_r1, $filename_r2); # quickie check
|
|
69 print STDERR "FAILED quickcheck\n" if ($failure);
|
|
70 }
|
|
71 # If we aren't doing the quick check, or the quick check passed, do the slow check
|
|
72 if (!($quick) || !($failure)) {
|
|
73 $failure = &percent($filename_r1, $filename_r2); # slow thorough check
|
|
74 }
|
|
75
|
|
76 exit;
|
|
77
|
|
78 ######################## subs #########################
|
|
79 # do a quick check to see if the files are bad (many bad files will pass this quick check)
|
|
80 sub quickcheck {
|
|
81 my ($file1, $file2) = @_;
|
|
82 if ($solaris) {
|
|
83 $result1 = `tail -10000l $file1 | head -1`; # grab the line 10,000 lines from the EOF
|
|
84 $result2 = `tail -10000l $file2 | head -1`;
|
|
85 } else {
|
|
86 $result1 = `tail -n 10000 $file1 | head -n 1`;
|
|
87 $result2 = `tail -n 10000 $file2 | head -n 1`;
|
|
88 }
|
|
89 # process the ids depending on id style
|
|
90 if ($id_type == 1) { # 1 or 2 at end of id
|
|
91 chomp $result1;
|
|
92 chomp $result2;
|
|
93 chop $result1;
|
|
94 chop $result2;
|
|
95 } else { # first word is a unique read id, second word is left/right-specific
|
|
96 ($result1) = split ' ', $result1;
|
|
97 ($result2) = split ' ', $result2;
|
|
98 }
|
|
99 if ($result1 eq $result2) {
|
|
100 # print "pass\n";
|
|
101 return 0;
|
|
102 } else {
|
|
103 # print "fail\n";
|
|
104 return 1;
|
|
105 }
|
|
106 }
|
|
107
|
|
108
|
|
109 # compare every read id from each file and determine what percent of reads are
|
|
110 # out of sync
|
|
111 sub percent {
|
|
112 my($file1, $file2) = @_;
|
|
113
|
|
114 open R1, '<', $file1 or die "Couldn't open $file1 for reading:$!";
|
|
115 open R2, '<', $file2 or die "Couldn't open $file2 for reading:$!";
|
|
116
|
|
117 my $readCount = 0;
|
|
118 my $failcount = 0;
|
|
119 while (!eof(R1) and !eof(R2)) {
|
|
120 $id1 = <R1>;
|
|
121 $id2 = <R2>;
|
|
122 next unless ($. % 4 == 1); # check every sequence (4th line in the file)
|
|
123 # process the ids depending on id style
|
|
124 if ($id_type == 1) { # 1 or 2 at end of id
|
|
125 chomp $id1; # newline
|
|
126 chop $id1; # last char of the id (should be the "1" or "2")
|
|
127 chomp $id2;
|
|
128 chop $id2;
|
|
129 } else {
|
|
130 ($id1) = split ' ', $id1;
|
|
131 ($id2) = split ' ', $id2;
|
|
132 }
|
|
133 $failcount++ unless ($id1 eq $id2);
|
|
134 $readCount++;
|
|
135 }
|
|
136 my $percent = $failcount / $readCount;
|
|
137 $prettypercent = $percent * 100; # make it percent
|
|
138 $prettypercent = sprintf("%.2f", $prettypercent);
|
|
139 if ($failcount > 0) {
|
|
140 print STDERR "FAILED full check\t$prettypercent% of reads are out-of-sync\n";
|
|
141 $exitStatus=1;
|
|
142 } else {
|
|
143 print STDERR "PASSED full check\n";
|
|
144 $exitStatus=0;
|
|
145 }
|
|
146
|
|
147 return $exitStatus;
|
|
148 }
|