# HG changeset patch
# User jjohnson
# Date 1360095798 18000
# Node ID 751f4938cf0dac8458b1ac28d5f386b1fc1948f7
Uploaded
diff -r 000000000000 -r 751f4938cf0d README
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README Tue Feb 05 15:23:18 2013 -0500
@@ -0,0 +1,7 @@
+These tools check and fix synchronization of paired read fastq files.
+Those files may get out of synchronization due to quality filtering.
+
+These script can handle Casava 1.8.0 style read IDs, and pre 1.8.0 style ids.
+Other types of read ID formats may cause a terminal error.
+
+
diff -r 000000000000 -r 751f4938cf0d pe_sync.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pe_sync.xml Tue Feb 05 15:23:18 2013 -0500
@@ -0,0 +1,32 @@
+
+ The Paired-end synchronization check program determines if the reads in paired-end fastq files are in the proper order (synchronized).
+
+ pe_sync_2_files.pl $quickcheck $input1 $input2 2>&1 | tee $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Paired-end read file synchronization checks if left and right reads are in the same order.
+
+The "quick" option will not report the percent of out-of-sync reads
+for many failed files, but will run much faster.
+
+This script can handle Casava 1.8.0 style read IDs, and pre 1.8.0 style ids.
+Other types of read ID formats may cause a terminal error.
+
+
+
diff -r 000000000000 -r 751f4938cf0d pe_sync_2_files.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pe_sync_2_files.pl Tue Feb 05 15:23:18 2013 -0500
@@ -0,0 +1,148 @@
+#!/usr/bin/perl -w
+
+###############################################
+# pe-sync-2-files.pl # this is a stripped-down version of pe-sync-directory.pl
+# John Garbe
+# February 2012
+#
+# Paired-end read file synchronization script: checks if left and right
+# reads are in the same order.
+# This script checks the two files specificd on the command line.
+# The "quick" option will not report the percent of out-of-sync reads
+# for many failed files, but will run much faster.
+#
+# This script should run fine on linux systems or solaris
+# (which is what the project space fileserver runs)
+#
+# This script can handle Casava 1.8.0 style read IDs, and pre 1.8.0 style ids.
+# Other types of read ID formats will cause a terminal error.
+#
+###############################################
+
+die "Usage: pe-sync-directory.pl [quick] left_fastqfile right_fastqfile\n" unless ($#ARGV > 0);
+
+# determine if we're running on solaris in order to change system calls
+$output = `perl -v`;
+$solaris = 0;
+$solaris = 1 if ($output =~ /solaris/);
+if ($solaris) {
+# print STDERR "Running on Solaris\n";
+}
+
+# handle "quick" option
+$quick = 0; # 1 for quickcheck, 0 for thorough check
+if (defined($ARGV[0])) {
+ if ($ARGV[0] eq "quick") {
+ $quick = 1;
+ shift @ARGV;
+ }
+}
+print STDERR "QUICK CHECK enabled\n" if ($quick);
+
+$filename_r1 = $ARGV[0];
+$filename_r2 = $ARGV[1];
+
+# identify read id format type
+if ($solaris) {
+ $readid = `head -1 $filename_r1`;
+} else {
+ $readid = `head -n 1 $filename_r1`;
+}
+chomp $readid;
+if ($readid =~ /^@\S+\/[12]$/) { # @ at the start of the line followed by non-whitespace, a /, a 1 or 2, the end of the line
+ $id_type = 1;
+ print STDERR "Casava 1.7 read id style\n"; # TESTING
+} elsif ($readid =~ /^@\S+\W[12]\S+$/) { # @ at the start of the line followed by non-whitspace, a space, a 1 or 2, non-whitespace
+ $id_type = 2;
+ print STDERR "Casava 1.8 read id style\n"; # TESTING
+} else {
+ print STDERR "Cannot determine read id style\n";
+ print STDOUT "Unknwon id style: $readid\n";
+ exit 1;
+}
+
+chomp $filename_r1;
+chomp $filename_r2;
+
+if ($quick) {
+ $failure = &quickcheck($filename_r1, $filename_r2); # quickie check
+ print STDERR "FAILED quickcheck\n" if ($failure);
+}
+# If we aren't doing the quick check, or the quick check passed, do the slow check
+if (!($quick) || !($failure)) {
+ $failure = &percent($filename_r1, $filename_r2); # slow thorough check
+}
+
+exit;
+
+######################## subs #########################
+# do a quick check to see if the files are bad (many bad files will pass this quick check)
+sub quickcheck {
+ my ($file1, $file2) = @_;
+ if ($solaris) {
+ $result1 = `tail -10000l $file1 | head -1`; # grab the line 10,000 lines from the EOF
+ $result2 = `tail -10000l $file2 | head -1`;
+ } else {
+ $result1 = `tail -n 10000 $file1 | head -n 1`;
+ $result2 = `tail -n 10000 $file2 | head -n 1`;
+ }
+ # process the ids depending on id style
+ if ($id_type == 1) { # 1 or 2 at end of id
+ chomp $result1;
+ chomp $result2;
+ chop $result1;
+ chop $result2;
+ } else { # first word is a unique read id, second word is left/right-specific
+ ($result1) = split ' ', $result1;
+ ($result2) = split ' ', $result2;
+ }
+ if ($result1 eq $result2) {
+# print "pass\n";
+ return 0;
+ } else {
+# print "fail\n";
+ return 1;
+ }
+}
+
+
+# compare every read id from each file and determine what percent of reads are
+# out of sync
+sub percent {
+ my($file1, $file2) = @_;
+
+ open R1, '<', $file1 or die "Couldn't open $file1 for reading:$!";
+ open R2, '<', $file2 or die "Couldn't open $file2 for reading:$!";
+
+ my $readCount = 0;
+ my $failcount = 0;
+ while (!eof(R1) and !eof(R2)) {
+ $id1 = ;
+ $id2 = ;
+ next unless ($. % 4 == 1); # check every sequence (4th line in the file)
+ # process the ids depending on id style
+ if ($id_type == 1) { # 1 or 2 at end of id
+ chomp $id1; # newline
+ chop $id1; # last char of the id (should be the "1" or "2")
+ chomp $id2;
+ chop $id2;
+ } else {
+ ($id1) = split ' ', $id1;
+ ($id2) = split ' ', $id2;
+ }
+ $failcount++ unless ($id1 eq $id2);
+ $readCount++;
+ }
+ my $percent = $failcount / $readCount;
+ $prettypercent = $percent * 100; # make it percent
+ $prettypercent = sprintf("%.2f", $prettypercent);
+ if ($failcount > 0) {
+ print STDERR "FAILED full check\t$prettypercent% of reads are out-of-sync\n";
+ $exitStatus=1;
+ } else {
+ print STDERR "PASSED full check\n";
+ $exitStatus=0;
+ }
+
+ return $exitStatus;
+}
diff -r 000000000000 -r 751f4938cf0d resync.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/resync.pl Tue Feb 05 15:23:18 2013 -0500
@@ -0,0 +1,92 @@
+#!/usr/bin/perl -w
+
+###########################################################
+# resync.pl
+# John Garbe
+# June 2012
+#
+# Resynchronize a pair of paired-end fastq files. Read in two unsynchronized
+# fastq files and write out two synchronized fastq files. The
+# synchronized files have properly paired reads, with singelton reads removed.
+#
+#
+###########################################################
+
+die "USAGE: resync.pl sample1_R1.fastq sample1_R2.fastq [sample1_R1_synced.fastq] [sample1_R2_synced.fastq]\n" unless ($#ARGV >= 1 && $#ARGV <= 3);
+
+# open up input files
+open F1, "<$ARGV[0]" or die "cannot open $ARGV[0]\n";
+open F2, "<$ARGV[1]" or die "cannot open $ARGV[1]\n";
+
+# open up output files
+if ($#ARGV > 1) {
+ open O1, ">$ARGV[2]" or die "cannot open $ARGV[2]\n";
+} else {
+ open O1, ">$ARGV[0].out" or die "cannot open $ARGV[0].out\n";
+}
+if ($#ARGV == 3) {
+ open O2, ">$ARGV[3]" or die "cannot open $ARGV[3]\n";
+} else {
+ open O2, ">$ARGV[1].out" or die "cannot open $ARGV[1].out\n";
+}
+
+$readid = `head -n 1 $ARGV[0]`;
+chomp $readid;
+if ($readid =~ /^@\S+\/[12]$/) { # @ at the start of the line followed by non-whitespace, a /, a 1 or 2, the end of the line
+ $id_type = 1;
+ print STDERR "Casava 1.7 read id style\n"; # TESTING
+} elsif ($readid =~ /^@\S+\W[12]\S+$/) { # @ at the start of the line followed by non-whitspace, a space, a 1 or 2, non-whitespace
+ $id_type = 2;
+ print STDERR "Casava 1.8 read id style\n"; # TESTING
+} else {
+ print STDERR "Cannot determine read id style\n";
+ print STDOUT "Unknwon id style: $readid\n";
+ exit 1;
+}
+
+
+# read in the first fastq file and store in a hash
+$f1readcount = 0;
+while ($f1line1 = ) {
+ $f1readcount++;
+ if ($f1readcount % 1000000 == 0) { # print progress update
+ print STDERR "$f1readcount reads processed in first file\n";
+ }
+ $f1line2 = ;
+ $f1line3 = ;
+ $f1line4 = ;
+
+ ($id, $junk) = split /\//, $f1line1;
+ $r1{$id} = $f1line1 . $f1line2 . $f1line3 . $f1line4;
+}
+
+# read in the second fastq file, printing out both read pairs if available, otherwise tossing the reads
+$f2readcount = 0;
+$f1missed = $f1readcount;
+$f2missed = 0;
+while ($f2line1 = ) {
+ $f2readcount++;
+ if ($f2readcount % 1000000 == 0) { # print progress update
+ print STDERR "$f2readcount reads processed in second file\n";
+ }
+ $f2line2 = ;
+ $f2line3 = ;
+ $f2line4 = ;
+
+ ($id, $junk) = split /\//, $f2line1;
+ if (defined($r1{$id})) {
+ $f1missed--;
+ print O1 "$r1{$id}";
+ print O2 "$f2line1";
+ print O2 "$f2line2";
+ print O2 "$f2line3";
+ print O2 "$f2line4";
+ } else {
+ $f2missed++;
+ }
+}
+
+print "$f1readcount reads in $ARGV[0], $f1missed without mate\n";
+print "$f2readcount reads in $ARGV[1], $f2missed without mate\n";
+
+
diff -r 000000000000 -r 751f4938cf0d resync.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/resync.xml Tue Feb 05 15:23:18 2013 -0500
@@ -0,0 +1,32 @@
+
+ Resynchronize a pair of paired-end fastq files
+
+ resync.pl $input1 $input2 $output1 $output2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Resynchronize a pair of paired-end fastq files.
+
+Reads in two potentially unsynchronized fastq files and writes out two synchronized fastq files.
+
+This script can handle Casava 1.8.0 style read IDs, and pre 1.8.0 style ids.
+Other types of read ID formats may cause a terminal error.
+
+The synchronized files have properly paired reads, with singelton reads removed.
+
+
+
diff -r 000000000000 -r 751f4938cf0d test-data/reads1.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/reads1.fastqsanger Tue Feb 05 15:23:18 2013 -0500
@@ -0,0 +1,20 @@
+@HWUSI-EAS1737:7:1:4411:1170#CAGATC/1
+AGAAGGATATGGTGAAGAAATAGCCTGCACTCAGAATGGCCAGATGTACTTAAACAGGGACATTTGGAAACCTGCC
++
+IIGHIIIGIIIIGIIIIFHGIIIIIIIHIIHIIHIIIIHHIIHIIIGIHIIHIIIIIIIHIHHIIIIIIIIIHIII
+@HWUSI-EAS1737:7:1:16187:1196#CAGATC/1
+TACGACTACGTGCGTGAGTTTAAAAGAAGCACAGGAACCTGGCACCACACTAAACCACTCCTTCCATCCGACCTTC
++
+GHHHHHHHHDHHHHHGDGGGGGGGDHHHAHEEGGGGGGFGGEGGGHHHHHHHHHHHHHHHHFBHHBEEFEGGEGCH
+@HWUSI-EAS1737:7:1:19051:1194#CAGATC/1
+AAGTACCTGCTTTCAACGTGTTGAGGGTTGACATAGGTGCTTGAAGAACAGAATGTAACATTTTGTGGTGTAAAAT
++
+IIIIIIIIIIIIIIIIIIIGIIIIIIHIIHIIHIIIIGIIIIHIIIIIIIIIHIIHIIIIHIHIHFIHGGEIHHHI
+@HWUSI-EAS1737:7:1:10050:1205#CAGATC/1
+ATATGCACATTTTTAGAATTCCATTATCTTGAATACAAATAGGGAAATATCTTTTGGAAATTAAAATAATTTCAAA
++
+GHHHHHFHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+@HWUSI-EAS1737:7:1:16633:1206#CAGATC/1
+GGAGGAACAGTAAGCATGTATTTTCTGATGACGTATTTTCTGACAATTTCTATGGAAGAAGCTTCCCCCATGCCCT
++
+HHHHHHHHHHFHHFHHGHGHHHHHHHHHHHHHHHGHHHHHHHGHHGHHHHHHHHHGHHHHHHHHHHHGHHCFEGF>
diff -r 000000000000 -r 751f4938cf0d test-data/reads2.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/reads2.fastqsanger Tue Feb 05 15:23:18 2013 -0500
@@ -0,0 +1,20 @@
+@HWUSI-EAS1737:7:1:4411:1170#CAGATC/2
+....AGCAAAATCATTAATCATTAACCCCAAAAGCCACCAAAAAAGCTGCTTAATTATTCTCTTGAGTTGCTGAAT
++
+####)////-@CC@C@@@@@@@@@@CCCCCCC@@;@@@@C@C@@44@@@CC@@C@@@@CCC@@CC@@@@@@C@@@@
+@HWUSI-EAS1737:7:1:16187:1196#CAGATC/2
+.CCCAAACAAGAGCTAAAGCTCAAGTTTGTAAGCATCATAAAACCACCAAAGAGTTACCATACCAAAGCCCAGCAC
++
+#*,'&-*+*)*6636@@@@@:<<<<16544@@@@@@@@@@@@44@@@@@@@5@@@<<<<572777@@@@@@@@@@C
+@HWUSI-EAS1737:7:1:19051:1194#CAGATC/2
+.TTTTCTTGCAATAATAGTATGATTTTGAGGTTAAGGGTGCATGCTCTTCTAATGCAAAATATTGTATTTATTTAG
++
+#110071117CCCCCCCC@C@@C@@@C@@C@@@@@<::<<@@@@@@C@CC@C@C@CC@@@<44<<@@@C@C@444@
+@HWUSI-EAS1737:7:1:10050:1205#CAGATC/2
+GATGATAACTAACCTTTTAAACTCATCAACTGGACAGGTTTCATAGCGGCAGACATTAAAGGTACAGGTTCCTGGA
++
+IHIIIIHGIIIIIIIIIIIHIIIIIIHIIIHIIIIHIIIIIIHIIIIIIHDIHHGIIFIHIDBHHDDD@DIFFFEE
+@HWUSI-EAS1737:7:1:16633:1206#CAGATC/2
+GAAACCCTTGGCCAAAAACTACCTCTCTGTTGCAGGCTCCCTGCAGCTCGATCTCAACCGCATGCCCAAGCCAGCC
++
+IIIHIIHGIIHHIIIIGIEIGDIGGBGGGGIIIIIIH>IIIIGEIBBEGFBHHDBEDEFBE<3CEAC>???;-8??
diff -r 000000000000 -r 751f4938cf0d test-data/reads_unsync_1.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/reads_unsync_1.fastqsanger Tue Feb 05 15:23:18 2013 -0500
@@ -0,0 +1,20 @@
+@HWUSI-EAS1737:7:1:4411:1170#CAGATC/1
+AGAAGGATATGGTGAAGAAATAGCCTGCACTCAGAATGGCCAGATGTACTTAAACAGGGACATTTGGAAACCTGCC
++
+IIGHIIIGIIIIGIIIIFHGIIIIIIIHIIHIIHIIIIHHIIHIIIGIHIIHIIIIIIIHIHHIIIIIIIIIHIII
+@HWUSI-EAS1737:7:1:10050:1205#CAGATC/1
+ATATGCACATTTTTAGAATTCCATTATCTTGAATACAAATAGGGAAATATCTTTTGGAAATTAAAATAATTTCAAA
++
+GHHHHHFHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+@HWUSI-EAS1737:7:1:16187:1196#CAGATC/1
+TACGACTACGTGCGTGAGTTTAAAAGAAGCACAGGAACCTGGCACCACACTAAACCACTCCTTCCATCCGACCTTC
++
+GHHHHHHHHDHHHHHGDGGGGGGGDHHHAHEEGGGGGGFGGEGGGHHHHHHHHHHHHHHHHFBHHBEEFEGGEGCH
+@HWUSI-EAS1737:7:1:19051:1194#CAGATC/1
+AAGTACCTGCTTTCAACGTGTTGAGGGTTGACATAGGTGCTTGAAGAACAGAATGTAACATTTTGTGGTGTAAAAT
++
+IIIIIIIIIIIIIIIIIIIGIIIIIIHIIHIIHIIIIGIIIIHIIIIIIIIIHIIHIIIIHIHIHFIHGGEIHHHI
+@HWUSI-EAS1737:7:1:16633:1206#CAGATC/1
+GGAGGAACAGTAAGCATGTATTTTCTGATGACGTATTTTCTGACAATTTCTATGGAAGAAGCTTCCCCCATGCCCT
++
+HHHHHHHHHHFHHFHHGHGHHHHHHHHHHHHHHHGHHHHHHHGHHGHHHHHHHHHGHHHHHHHHHHHGHHCFEGF>
diff -r 000000000000 -r 751f4938cf0d test-data/reads_unsync_2.fastqsanger
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/reads_unsync_2.fastqsanger Tue Feb 05 15:23:18 2013 -0500
@@ -0,0 +1,16 @@
+@HWUSI-EAS1737:7:1:4411:1170#CAGATC/2
+....AGCAAAATCATTAATCATTAACCCCAAAAGCCACCAAAAAAGCTGCTTAATTATTCTCTTGAGTTGCTGAAT
++
+####)////-@CC@C@@@@@@@@@@CCCCCCC@@;@@@@C@C@@44@@@CC@@C@@@@CCC@@CC@@@@@@C@@@@
+@HWUSI-EAS1737:7:1:19051:1194#CAGATC/2
+.TTTTCTTGCAATAATAGTATGATTTTGAGGTTAAGGGTGCATGCTCTTCTAATGCAAAATATTGTATTTATTTAG
++
+#110071117CCCCCCCC@C@@C@@@C@@C@@@@@<::<<@@@@@@C@CC@C@C@CC@@@<44<<@@@C@C@444@
+@HWUSI-EAS1737:7:1:10050:1205#CAGATC/2
+GATGATAACTAACCTTTTAAACTCATCAACTGGACAGGTTTCATAGCGGCAGACATTAAAGGTACAGGTTCCTGGA
++
+IHIIIIHGIIIIIIIIIIIHIIIIIIHIIIHIIIIHIIIIIIHIIIIIIHDIHHGIIFIHIDBHHDDD@DIFFFEE
+@HWUSI-EAS1737:7:1:16633:1206#CAGATC/2
+GAAACCCTTGGCCAAAAACTACCTCTCTGTTGCAGGCTCCCTGCAGCTCGATCTCAACCGCATGCCCAAGCCAGCC
++
+IIIHIIHGIIHHIIIIGIEIGDIGGBGGGGIIIIIIH>IIIIGEIBBEGFBHHDBEDEFBE<3CEAC>???;-8??