0
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 ###########################################################
|
|
4 # resync.pl
|
|
5 # John Garbe
|
|
6 # June 2012
|
|
7 #
|
|
8 # Resynchronize a pair of paired-end fastq files. Read in two unsynchronized
|
|
9 # fastq files and write out two synchronized fastq files. The
|
|
10 # synchronized files have properly paired reads, with singelton reads removed.
|
|
11 #
|
|
12 #
|
|
13 ###########################################################
|
|
14
|
|
15 die "USAGE: resync.pl sample1_R1.fastq sample1_R2.fastq [sample1_R1_synced.fastq] [sample1_R2_synced.fastq]\n" unless ($#ARGV >= 1 && $#ARGV <= 3);
|
|
16
|
|
17 # open up input files
|
|
18 open F1, "<$ARGV[0]" or die "cannot open $ARGV[0]\n";
|
|
19 open F2, "<$ARGV[1]" or die "cannot open $ARGV[1]\n";
|
|
20
|
|
21 # open up output files
|
|
22 if ($#ARGV > 1) {
|
|
23 open O1, ">$ARGV[2]" or die "cannot open $ARGV[2]\n";
|
|
24 } else {
|
|
25 open O1, ">$ARGV[0].out" or die "cannot open $ARGV[0].out\n";
|
|
26 }
|
|
27 if ($#ARGV == 3) {
|
|
28 open O2, ">$ARGV[3]" or die "cannot open $ARGV[3]\n";
|
|
29 } else {
|
|
30 open O2, ">$ARGV[1].out" or die "cannot open $ARGV[1].out\n";
|
|
31 }
|
|
32
|
|
33 $readid = `head -n 1 $ARGV[0]`;
|
|
34 chomp $readid;
|
|
35 if ($readid =~ /^@\S+\/[12]$/) { # @ at the start of the line followed by non-whitespace, a /, a 1 or 2, the end of the line
|
|
36 $id_type = 1;
|
|
37 print STDERR "Casava 1.7 read id style\n"; # TESTING
|
|
38 } elsif ($readid =~ /^@\S+\W[12]\S+$/) { # @ at the start of the line followed by non-whitspace, a space, a 1 or 2, non-whitespace
|
|
39 $id_type = 2;
|
|
40 print STDERR "Casava 1.8 read id style\n"; # TESTING
|
|
41 } else {
|
|
42 print STDERR "Cannot determine read id style\n";
|
|
43 print STDOUT "Unknwon id style: $readid\n";
|
|
44 exit 1;
|
|
45 }
|
|
46
|
|
47
|
|
48 # read in the first fastq file and store in a hash
|
|
49 $f1readcount = 0;
|
|
50 while ($f1line1 = <F1>) {
|
|
51 $f1readcount++;
|
|
52 if ($f1readcount % 1000000 == 0) { # print progress update
|
|
53 print STDERR "$f1readcount reads processed in first file\n";
|
|
54 }
|
|
55 $f1line2 = <F1>;
|
|
56 $f1line3 = <F1>;
|
|
57 $f1line4 = <F1>;
|
|
58
|
|
59 ($id, $junk) = split /\//, $f1line1;
|
|
60 $r1{$id} = $f1line1 . $f1line2 . $f1line3 . $f1line4;
|
|
61 }
|
|
62
|
|
63 # read in the second fastq file, printing out both read pairs if available, otherwise tossing the reads
|
|
64 $f2readcount = 0;
|
|
65 $f1missed = $f1readcount;
|
|
66 $f2missed = 0;
|
|
67 while ($f2line1 = <F2>) {
|
|
68 $f2readcount++;
|
|
69 if ($f2readcount % 1000000 == 0) { # print progress update
|
|
70 print STDERR "$f2readcount reads processed in second file\n";
|
|
71 }
|
|
72 $f2line2 = <F2>;
|
|
73 $f2line3 = <F2>;
|
|
74 $f2line4 = <F2>;
|
|
75
|
|
76 ($id, $junk) = split /\//, $f2line1;
|
|
77 if (defined($r1{$id})) {
|
|
78 $f1missed--;
|
|
79 print O1 "$r1{$id}";
|
|
80 print O2 "$f2line1";
|
|
81 print O2 "$f2line2";
|
|
82 print O2 "$f2line3";
|
|
83 print O2 "$f2line4";
|
|
84 } else {
|
|
85 $f2missed++;
|
|
86 }
|
|
87 }
|
|
88
|
|
89 print "$f1readcount reads in $ARGV[0], $f1missed without mate\n";
|
|
90 print "$f2readcount reads in $ARGV[1], $f2missed without mate\n";
|
|
91
|
|
92
|