Mercurial > repos > jjohnson > fastq_sync
comparison resync.pl @ 0:751f4938cf0d
Uploaded
author | jjohnson |
---|---|
date | Tue, 05 Feb 2013 15:23:18 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:751f4938cf0d |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 ########################################################### | |
4 # resync.pl | |
5 # John Garbe | |
6 # June 2012 | |
7 # | |
8 # Resynchronize a pair of paired-end fastq files. Read in two unsynchronized | |
9 # fastq files and write out two synchronized fastq files. The | |
10 # synchronized files have properly paired reads, with singelton reads removed. | |
11 # | |
12 # | |
13 ########################################################### | |
14 | |
15 die "USAGE: resync.pl sample1_R1.fastq sample1_R2.fastq [sample1_R1_synced.fastq] [sample1_R2_synced.fastq]\n" unless ($#ARGV >= 1 && $#ARGV <= 3); | |
16 | |
17 # open up input files | |
18 open F1, "<$ARGV[0]" or die "cannot open $ARGV[0]\n"; | |
19 open F2, "<$ARGV[1]" or die "cannot open $ARGV[1]\n"; | |
20 | |
21 # open up output files | |
22 if ($#ARGV > 1) { | |
23 open O1, ">$ARGV[2]" or die "cannot open $ARGV[2]\n"; | |
24 } else { | |
25 open O1, ">$ARGV[0].out" or die "cannot open $ARGV[0].out\n"; | |
26 } | |
27 if ($#ARGV == 3) { | |
28 open O2, ">$ARGV[3]" or die "cannot open $ARGV[3]\n"; | |
29 } else { | |
30 open O2, ">$ARGV[1].out" or die "cannot open $ARGV[1].out\n"; | |
31 } | |
32 | |
33 $readid = `head -n 1 $ARGV[0]`; | |
34 chomp $readid; | |
35 if ($readid =~ /^@\S+\/[12]$/) { # @ at the start of the line followed by non-whitespace, a /, a 1 or 2, the end of the line | |
36 $id_type = 1; | |
37 print STDERR "Casava 1.7 read id style\n"; # TESTING | |
38 } elsif ($readid =~ /^@\S+\W[12]\S+$/) { # @ at the start of the line followed by non-whitspace, a space, a 1 or 2, non-whitespace | |
39 $id_type = 2; | |
40 print STDERR "Casava 1.8 read id style\n"; # TESTING | |
41 } else { | |
42 print STDERR "Cannot determine read id style\n"; | |
43 print STDOUT "Unknwon id style: $readid\n"; | |
44 exit 1; | |
45 } | |
46 | |
47 | |
48 # read in the first fastq file and store in a hash | |
49 $f1readcount = 0; | |
50 while ($f1line1 = <F1>) { | |
51 $f1readcount++; | |
52 if ($f1readcount % 1000000 == 0) { # print progress update | |
53 print STDERR "$f1readcount reads processed in first file\n"; | |
54 } | |
55 $f1line2 = <F1>; | |
56 $f1line3 = <F1>; | |
57 $f1line4 = <F1>; | |
58 | |
59 ($id, $junk) = split /\//, $f1line1; | |
60 $r1{$id} = $f1line1 . $f1line2 . $f1line3 . $f1line4; | |
61 } | |
62 | |
63 # read in the second fastq file, printing out both read pairs if available, otherwise tossing the reads | |
64 $f2readcount = 0; | |
65 $f1missed = $f1readcount; | |
66 $f2missed = 0; | |
67 while ($f2line1 = <F2>) { | |
68 $f2readcount++; | |
69 if ($f2readcount % 1000000 == 0) { # print progress update | |
70 print STDERR "$f2readcount reads processed in second file\n"; | |
71 } | |
72 $f2line2 = <F2>; | |
73 $f2line3 = <F2>; | |
74 $f2line4 = <F2>; | |
75 | |
76 ($id, $junk) = split /\//, $f2line1; | |
77 if (defined($r1{$id})) { | |
78 $f1missed--; | |
79 print O1 "$r1{$id}"; | |
80 print O2 "$f2line1"; | |
81 print O2 "$f2line2"; | |
82 print O2 "$f2line3"; | |
83 print O2 "$f2line4"; | |
84 } else { | |
85 $f2missed++; | |
86 } | |
87 } | |
88 | |
89 print "$f1readcount reads in $ARGV[0], $f1missed without mate\n"; | |
90 print "$f2readcount reads in $ARGV[1], $f2missed without mate\n"; | |
91 | |
92 |